Import pbbam_0.7.4+ds.orig.tar.gz

author Afif Elghraoui <afif@debian.org>

Sun, 22 Jan 2017 07:48:21 +0000 (07:48 +0000)

committer Afif Elghraoui <afif@debian.org>

Sun, 22 Jan 2017 07:48:21 +0000 (07:48 +0000)
author Afif Elghraoui <afif@debian.org>
Sun, 22 Jan 2017 07:48:21 +0000 (07:48 +0000)
committer Afif Elghraoui <afif@debian.org>
Sun, 22 Jan 2017 07:48:21 +0000 (07:48 +0000)
diff --git a/.gitignore b/.gitignore

new file mode 100644 (file)

index 0000000..2bb3384
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,16 @@
+*.o
+*.pico
+*.so
+*.a
+*.dylib
+*.pyc
+*~
+CMakeLists.txt.user
+bin/
+build/
+docs/Doxyfile
+lib/
+tests/bin/test_pbbam
+tests/data/test_group_query/group.fofn
+tests/src/TestData.h
+
diff --git a/.travis.yml b/.travis.yml

new file mode 100644 (file)

index 0000000..b1990e9
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,64 @@
+language: cpp
+compiler:
+  - gcc
+
+before_install:
+
+  # Travis's default installs of gcc, boost, & cmake currently lag behind the minimums we need.
+  # So we need to manually setup them up. 
+  #
+  #  - gcc 4.8 (current default on Travis is 4.7, which is no good for C++11 work)
+  #  - boost 1.55
+  #  - cmake 3.x
+  
+  # add external repos
+  - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test           # gcc
+  - sudo add-apt-repository -y ppa:boost-latest/ppa                  # boost
+  - sudo add-apt-repository -y ppa:george-edison55/precise-backports # cmake
+  
+  # remove existing cmake install
+  - sudo apt-get remove -qq cmake cmake-data
+  - sudo apt-get autoremove -qq
+  
+  # update apt 
+  - sudo apt-get update -y -qq
+
+  # install
+  - sudo apt-get install -y -qq g++-4.8 boost1.55 cmake-data cmake 
+  
+  # make sure we're using new gcc tools
+  - sudo update-alternatives --install /usr/bin/g++  g++  /usr/bin/g++-4.8  90
+  - sudo update-alternatives --install /usr/bin/gcc  gcc  /usr/bin/gcc-4.8  90 
+  - sudo update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-4.8 90
+
+  # prep zlib
+  - sudo apt-get install -y -qq zlib1g-dev
+
+  # prep htslib
+  - "cd .. && git clone https://github.com/PacificBiosciences/htslib.git && cd htslib && make && sudo make install; cd $TRAVIS_BUILD_DIR"
+
+  # prep GoogleTest 
+  - sudo apt-get install -y -qq libgtest-dev
+
+before_script:
+  # run cmake
+  - mkdir build 
+  - cd build
+  - cmake .. -DGTEST_SRC_DIR=/usr/src/gtest -DCMAKE_BUILD_TYPE=Debug
+    
+script:
+  # build & test
+  - make -j 3
+  - make test
+
+branches:
+  only:
+    - master
+    
+notifications:
+  recipients:
+    - dbarnett@pacb.com
+  email:
+    on_success: change
+    on_failure: always 
+   
diff --git a/CHANGELOG.md b/CHANGELOG.md

new file mode 100644 (file)

index 0000000..7704263
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,396 @@
+# PacBio::BAM - change log\r
+\r
+All notable changes to this project will be documented in this file.\r
+This project adheres to [Semantic Versioning](http://semver.org/). \r
+\r
+**NOTE:** The current series (0.y.z) is under initial development. Anything may\r
+change at any time. The public API should not be considered stable yet. Once we\r
+lock down a version 1.0.0, this will define a reference point & compatibility\r
+guarantees will be maintained within each major version series.\r
+\r
+## Active\r
+\r
+### Added\r
+- Default DataSet 'Version' attribute if none already present (currently 4.0.0)\r
+\r
+## [0.7.4] - 2016-11-18\r
+\r
+### Changed\r
+- Compatibility for merging BAM files no longer requires exact match of PacBioBAM\r
+version number (header @HD:pb tag). As long as both files meet the minimum \r
+supported version number, the merge is allowed.\r
+\r
+## [0.7.3] - 2016-11-11\r
+\r
+### Added\r
+- Support for S/P2-C2 chemistry and forthcoming 4.0 basecaller\r
+\r
+## [0.7.2] - 2016-11-10\r
+\r
+### Removed\r
+- SAM header version equality check for merging BAM files. PacBioBAM version \r
+number carries more meaning for PacBio data and thus will be the basis of \r
+ensuring compatible merging.\r
+\r
+## [0.7.1] - 2016-11-09\r
+\r
+### Added\r
+- (Unindexed) FASTA reader & FastaSequence data structure.\r
+- Missing unit tests for internal BAM tag access.\r
+- Chemistry data for basecaller v3.3.\r
+- Missing parsers for filtering barcode quality ("bq"), barcode forward ("bcf"), \r
+and barcode reverse ("bcr") from DataSetXML.\r
+- Integrated htslib into project.\r
+\r
+### Fixed\r
+- Reverse complement on padding base.\r
+\r
+## [0.7.0] - 2016-09-26 \r
+\r
+### Added\r
+- Clipping for CCS records\r
+\r
+### Fixed\r
+- Cached position data leaking across records while iterating.\r
+- Rolled back default pulse behavior in internal BAM API, to be backward-\r
+compatible with existing client code (for now at least). v0.6.0 introduced\r
+returning basecalled positions ONLY by default, rather than return ALL \r
+pulses. \r
+- Fixed crash when attempting to read from empty BAM/PBI files using the \r
+PbiFilter-enabled APIs.\r
+\r
+## [0.6.0] - 2016-09-13\r
+\r
+### Added\r
+- BamWriter writes to a BAM file with the target name plus a ".tmp" suffix. On\r
+successful completion (i.e. normal BamWriter destruction, not triggered by a\r
+thrown exception) the file is renamed to the actual requested filename.\r
+- PBI file creation follows the same temporary naming convention.\r
+- Support for barcode pair (forward, reverse) in DataSetXML filter.\r
+- Validation API & 'auto-validate' compile-time switch. \r
+- Added support for a batched QNAME whitelist filter in DataSet XML. Uses (new) \r
+Property name 'qname_file', with the value being the filepath containing the \r
+whitelist.\r
+- Exposed MD5 hashing to API.\r
+- Ability to remove base features from a ReadGroupInfo object.\r
+- Can construct an aggregate PbiRawData index object from a DataSet: essentially\r
+concatenates all PBI data within the dataset.\r
+- New SamWriter class to create SAM-formatted output of PacBio BAM data.\r
+- Extended APIs for accessing "internal BAM" data, including PulseBehavior\r
+switch for selecting between all pulses & basecalls only. \r
+\r
+### Fixed\r
+- Improper 'clip to reference' product for BamRecord in some cases.\r
+- Improper behavior in tag accessors (e.g. BamRecord::IPD()) on reverse strand-\r
+aligned reads (bug 31339).\r
+- Improper basecaller version parsing in ReadGroupInfo.\r
+\r
+### Changed\r
+- RecordType::POLYMERASE renamed to RecordType::ZMW to reflect changes in\r
+PacBio BAM spec v3.0.4\r
+- Refactored the 'virtual' reader classes - to match the new nomenclature,\r
+and to combine the virtual reader & composite readers behind a shared \r
+interface. The old class names still exist, as typedefs to the new ones, \r
+and the interfaces are completely source-compatible - so as not to break \r
+existing code. However, the old classes should be considered deprecated and \r
+the new ones preferred. Below is the mapping of old -> new:\r
+\r
+   VirtualPolymeraseBamRecord        ->  VirtualZmwBamRecord\r
+   VirtualPolymeraseReader           ->  ZmwReadStitcher\r
+   VirtualPolymeraseCompositeReader  ->  ZmwReadStitcher\r
+   ZmwWhitelistVirtualReader         ->  WhitelistedZmwReadStitcher\r
+\r
+\r
+## [0.5.0] - 2016-02-22\r
+\r
+### Added\r
+- Platform model tag added to read group as RG::PM\r
+- New scrap zmw type sz\r
+- pbmerge accepts DataSetXML as input - using top-level resource BAMs as input,\r
+applying filters, and generating a merged BAM. Also added FOFN support, instead\r
+of listing out BAMs as command line args.\r
+- PbiLocalContextFilter to allow filtering on subread local context.\r
+- PbiBuilder: multithreading & zlib compression-level tuning for PBI output\r
+\r
+### Fixed\r
+- Fixed mishandling of relative BAM filenames in the filename constructor for\r
+DataSet (e.g. DataSet ds("../data.bam")).\r
+\r
+## [0.4.5] - 2016-01-14\r
+\r
+### Changed\r
+- PbiFilterQuery (and any other PBI-backed query, e.g. ZmwQuery ) now throws if\r
+PBI file(s) missing insted of returning empty result.\r
+- GenomicIntervalQuery now throws if BAI file(s) missing instead of returning\r
+empty result.\r
+- BamFile will throw if file is truncated (e.g. missing the EOF block). Disable\r
+by defining PBBAM_NO_CHECK_EOF .\r
+\r
+## [0.4.4] - 2016-01-07\r
+\r
+### Added\r
+- bam2sam command line utility. The primary benefit is removing the dependency\r
+on samtools during tests, but also provides users a functioning BAM -> SAM\r
+converter in the absence of samtools.\r
+- pbmerge command line utility. Allows merging N BAM files into one, optionally\r
+creating the PBI file alongside.\r
+- Added BamRecord::Pkmean2 & Pkmid2, 2D equivalent of Pkmean/Pkmid, for internal\r
+BAMs.\r
+\r
+### Removed \r
+- samtools dependency\r
+\r
+## [0.4.3] - 2015-12-22\r
+\r
+### Added\r
+- Compile using ccache by default, if available. Can be manually disabled using\r
+-DPacBioBAM_use_ccache=OFF with cmake.\r
+- pbindexdump: command-line utility that converts PBI file data into human-\r
+readable formats. (JSON by default).\r
+\r
+### Changed\r
+- CMake option PacBioBAM_build_pbindex is being deprecated. Use\r
+PacBioBAM_build_tools instead.\r
+\r
+## [0.4.2] - 2015-12-22\r
+\r
+### Changed\r
+- BamFile::PacBioIndexExists & StandardIndexExists no longer check timestamps.\r
+Copying/moving files around can yield timestamps that are not helpful (no longer\r
+guaranteed that the .pbi will be "newer" than the .bam, even though no content\r
+changed). Added methods (e.g. bool BamFile::PacBioIndexIsNewer()) to do that\r
+lookup if needed, but it is no longer done automatically.\r
+\r
+## [0.4.1] - 2015-12-18\r
+\r
+### Added\r
+- BamRecord::HasNumPasses\r
+\r
+### Changed\r
+- VirtualPolymeraseBamRecord::VirtualRegionsTable(type) returns an empty vector\r
+of regions if none are associated with the requested type, instead of throwing.\r
+\r
+## [0.4.0] - 2015-12-15\r
+\r
+### Changed\r
+- Redesigned PbiFilter interface and backend. Previous implementation did not\r
+scale well as intermediate results were far too unwieldy. This redesign provides\r
+speedups of orders of magnitude in many cases.\r
+\r
+## [0.3.2] - 2015-12-10\r
+\r
+### Added \r
+- Support for ReadGroupInfo sequencing chemistry data.\r
+InvalidSequencingChemistryException thrown if an unsupported combination is\r
+encountered.\r
+- VirtualPolymeraseCompositeReader - for re-stitching records, across multiple\r
+resources (e.g. from DataSetXML). Reader respects DataSet filter criteria.\r
+\r
+## [0.3.1] - 2015-10-30\r
+\r
+### Added\r
+- ZmwWhitelistVirtualReader: similar to VirtualPolymeraseReader but restricts\r
+iteration to a whitelist of ZMW hole numbers, leveraging PBI index data for\r
+random-access.\r
+\r
+### Fixed\r
+- Fixed error in PBI construction, in which entire file sections (e.g.\r
+BarcodeData or MappedData) where being dropped when any one record lacked data.\r
+Correct behavior is to allow file section ommission if all records lack that\r
+data type.\r
+\r
+## [0.3.0] - 2015-10-29\r
+\r
+### Fixed\r
+- Improper reporting of current offset from multi-threaded BamWriter. This had\r
+the effect of creating broken PBIs that were written alongside the BAM. Added a\r
+flush step, which incurs a performance hit, but restores correctness.\r
+\r
+## [0.2.4] - 2015-10-26\r
+\r
+### Fixed\r
+- Empty PbiFilter now returns all records, instead of filtering away all records.\r
+\r
+## [0.2.3] - 2015-10-26\r
+\r
+### Added/Fixed\r
+- Syncing DataSetXML across APIs. Primary changes include output of Version\r
+attribute ("3.0.1") on appropriate elements, as well as resolution of namespace\r
+issues.\r
+\r
+## [0.2.2] - 2015-10-22\r
+\r
+### Added\r
+- Added BAI bin calculation to BamWriter::Write, to ensure maximal compatibility\r
+with downstream tools (e.g. 'samtools index'). A new BinCalculationMode enum\r
+flag in BamWriter constructor cotnrols whether this behavior is enabled[default]\r
+or not.\r
+\r
+## [0.2.1] - 2015-10-19\r
+\r
+### Added\r
+- Exposed the following classes to public API:\r
+  - BamReader\r
+  - BaiIndexedBamReader\r
+  - PbiIndexedBamReader\r
+  - GenomicIntervalCompositeBamReader\r
+  - PbiFilterCompositeBamReader\r
+\r
+## [0.2.0] - 2015-10-09\r
+\r
+### Changed\r
+- BAM spec v3.0.1 compliance. Previous (betas) versions of the BAM spec are not\r
+supported and will causean exception to be throw if encountered.\r
+- PBI lookup interface & backend, see PbiIndex.h & PbiLookupData.h for details.\r
+\r
+### Added \r
+- BamFile::PacBioIndexExists() & BamFile::StandardIndexExists() - query the\r
+existence of index files without auto-building them if they are missing, as in\r
+BamFile::Ensure*IndexExists().\r
+- GenomicInterval now accepts an htslib/samtools-style REGION string in the\r
+constructor: GenomicInterval("chr1:1000-2000"). Please note though, that pbbam\r
+uses 0-based coordinates throughout, whereas samtools expects 1-based. The above\r
+string is equivalent to "chr1:1001-2000" in samtools.\r
+- Built-in PBI filters. See PbiFlter.h & PbiFilterTypes.h for built-in filters\r
+and constructing composite filters. These can be used in conjunction with the\r
+new PbiFilterQuery, which takes a generic PbiFilter and applies that to a\r
+DataSet for iteration.\r
+- New built-in queries: BarcodeQuery, ReadAccuracyQuery, SubreadLengthQuery.\r
+These leverage the new filter API to construct a PbiFilter and apply to a\r
+DataSet.\r
+- Built-in BamRecord comparators that are STL-compatible. See Compare.h for full\r
+list. This allows for statements like the following, which sorts records by ZMW\r
+number:\r
+``` c++\r
+    vector<BamRecord> data;\r
+    std::sort(data.begin(), data.end(), Compare::Zmw());\r
+```\r
+- "exciseSoftClips" option to BamRecord::CigarData()\r
+\r
+## [0.1.0] - 2015-07-17\r
+\r
+### Changed\r
+- BAM spec v3.0b7 compliance\r
+ - Removal of 'M' as allowed CIGAR operation. Attempt to use such a CIGAR op\r
+ will throw an exception.\r
+ - Addition of IPD/PulseWidth codec version info in header\r
+  \r
+### Added\r
+- Auto-generation of UTC timestamp for DataSet objects\r
+- PbiBuilder - allows generation of PBI index data alongside generation or\r
+modification of BAM record data. This obviates the need to wait for a completed\r
+BAM, then go through the zlib decompression, etc.\r
+- Added DataSet::FromXml(string xml) to create DataSets from "raw" XML string,\r
+rather than building up using DataSet API or loading from existing file.\r
+- "pbindex" command line tool to generate ".pbi" files from BAM data. The\r
+executable is built by default, but can be disabled using the cmake option\r
+"-DPacBioBAM_build_pbindex=OFF".\r
+  \r
+### Fixed\r
+- PBI construction failing on CCS reads\r
+\r
+## [0.0.8] - 2015-07-02\r
+\r
+### Changed\r
+- Build system refactoring.\r
+\r
+## [0.0.7] - 2015-07-02\r
+\r
+### Added\r
+- PBI index lookup API. Not so much intended for client use directly, but will\r
+enable construction of higher-level semantic queries: grouping by, filtering,\r
+etc.\r
+- DataSet & PBI-aware queries (e.g. ZmwGroupQuery). More PBI-enabled queries to\r
+follow.\r
+- More flexibility in tag access. Samtools has a habit of performing a\r
+"shrink-to-fit" when it handles integer-valued tag data. Thus we cannot\r
+**guarantee** the binary type that our API will have to process. Safe\r
+conversions are allowed on integer-like data only. Under- or overflows in\r
+casting will trigger an exception. All other tag data types must be asked for\r
+explicitly, or else an exception will be raised, as before.\r
+- BamHeader::DeepCopy - allows creation of editable header data, without\r
+overwriting all shared instances\r
+\r
+### Fixed\r
+- XSD compliance for DataSet APIs.\r
+\r
+### Changed\r
+- The functionality provided by ZmwQuery (group by hole number), is now\r
+available using the ZmwGroupQuery object. The new ZmwQuery returns a single-\r
+record iterator (a la EntireFileQuery), but limited to a whitelist of requested\r
+hole numbers.\r
+\r
+### Removed\r
+- XSD non-compliant classes (e.g. ExternalDataReference)\r
+\r
+## [0.0.6] - 2015-06-07\r
+\r
+### Added\r
+\r
+- Accessor methods for pulse bam support:\r
+ - LabelQV()\r
+ - AltLabelQV()\r
+ - LabelTag()\r
+ - AltLabelTag()\r
+ - Pkmean()\r
+ - Pkmid()\r
+ - PrePulseFrames() only RC, no clipping\r
+ - PulseCallWidth() only RC, no clipping\r
+ - PulseCall() case-sensitive RC, no clipping\r
+ - IPDRaw() to avoid up and downscaling for stitching\r
+- BamRecord::ParseTagName and BamRecord::ParseTagString to convert a two \r
+  character tag string to a TagName enum and back. Allows a switch over tags.\r
+- VirtualPolymeraseReader to create VirtualPolymeraseBamRecord from a \r
+  subreads|hqregion+scraps.bam\r
+- VirtualRegion represents annotations of the polymerase reads, for adapters, \r
+  barcodes, lqregions, and hqregions.\r
+- ReadGroupInfo operator== \r
+\r
+### Fixed\r
+\r
+- Reimplemented QueryStart(int), QueryEnd(int), UpdateName(void), \r
+  ReadGroup(ReadGroupInfo&), ReadGroupId(std::string&);\r
+\r
+## [0.0.5] - 2015-05-29\r
+\r
+### Added\r
+\r
+- DataSet support. This includes XML I/O, basic dataset query/manipulation, and\r
+multi-BAM-file queries. New classes are located in <pbbam/dataset/>. DataSet-\r
+capable queries currently reside in the PacBio::BAM::staging namespace. These\r
+will be ported over to the main namespace once the support is stabilized and\r
+works seamlessly with either a single BamFile or DataSet object as input. (bug\r
+25941)\r
+- PBI support. This includes read/write raw data & building from a BamFile. The\r
+lookup API for random-access queries is under development, but the raw data is\r
+available - for creating PBI files & generating summary statistics. (bug 26025)\r
+- C# SWIG bindings, alongside existing Python and R wrappers.\r
+- LocalContextFlags support in BamRecord (bug 26623)\r
+\r
+### Fixed\r
+\r
+- BamRecord[Impl] map quality now  initialized with 255 (missing) value, instead\r
+of 0. (bug 26228)\r
+- ReadGroupId calculation. (bug 25940)\r
+  \r
+## [0.0.4] - 2015-04-22\r
+\r
+### Added\r
+\r
+- This changelog. Hope it helps.\r
+- Hook to set verbosity of underlying htslib warnings.\r
+- Grouped queries. (bug 26361)\r
+\r
+### Changed\r
+\r
+- Now using exceptions instead of return codes, output parameters, etc.\r
+- Removed "messy" shared_ptrs across interface (see especially BamHeader). These\r
+are now taken care of within the API, not exposed to client code.\r
+\r
+### Removed\r
+\r
+- BamReader \r
+\r
+### Fixed\r
+\r
+- ASCII tag output. (bug 26381)\r
diff --git a/CMakeLists.txt b/CMakeLists.txt

new file mode 100644 (file)

index 0000000..48eeee8
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,75 @@
+########################################################################
+# CMake build script for PacBioBAM library.
+########################################################################
+
+cmake_policy(SET CMP0048 NEW)  # lets us set version in project()
+project(PacBioBAM VERSION 0.7.4 LANGUAGES CXX C)
+cmake_minimum_required(VERSION 3.0)
+
+# project name & version
+set(PacBioBAM_NAME pbbam)
+set(PacBioBAM_VERSION
+  "${PacBioBAM_VERSION_MAJOR}.${PacBioBAM_VERSION_MINOR}.${PacBioBAM_VERSION_PATCH}"
+)
+
+# list build-time options
+option(PacBioBAM_build_docs    "Build PacBioBAM's API documentation."                   ON)
+option(PacBioBAM_build_tests   "Build PacBioBAM's unit tests."                          ON)
+option(PacBioBAM_build_shared  "Build PacBioBAM as shared library as well."             OFF)
+option(PacBioBAM_build_tools   "Build PacBioBAM command line utilities (e.g. pbindex)"  ON)
+option(PacBioBAM_wrap_csharp   "Build PacBioBAM with SWIG bindings for C#."             OFF)
+option(PacBioBAM_wrap_python   "Build PacBioBAM with SWIG bindings for Python."         OFF)
+option(PacBioBAM_wrap_r        "Build PacBioBAM with SWIG bindings for R."              OFF)
+option(PacBioBAM_use_modbuild  "Build PacBioBAM using Modular Build System."            OFF)
+option(PacBioBAM_use_ccache    "Build PacBioBAM using ccache, if available."            ON)
+option(PacBioBAM_auto_validate "Build PacBioBAM with auto-validation enabled."          OFF)
+
+if (PacBioBAM_wrap_csharp OR PacBioBAM_wrap_r OR PacBioBAM_wrap_python)
+    set(wrapping_swig TRUE)
+else()
+    set(wrapping_swig FALSE)
+endif()
+
+if(PacBioBAM_build_tests)
+    enable_testing()
+endif()
+
+# project paths
+set(PacBioBAM_RootDir       ${CMAKE_CURRENT_LIST_DIR})
+set(PacBioBAM_DocsDir       ${PacBioBAM_RootDir}/docs)
+set(PacBioBAM_IncludeDir    ${PacBioBAM_RootDir}/include)
+set(PacBioBAM_SourceDir     ${PacBioBAM_RootDir}/src)
+set(PacBioBAM_SwigSourceDir ${PacBioBAM_RootDir}/src/swig)
+set(PacBioBAM_TestsDir      ${PacBioBAM_RootDir}/tests)
+set(PacBioBAM_ThirdPartyDir ${PacBioBAM_RootDir}/third-party)
+set(PacBioBAM_ToolsDir      ${PacBioBAM_RootDir}/tools)
+
+if(NOT PacBioBAM_OutputDir)
+    set(PacBioBAM_OutputDir ${CMAKE_CURRENT_BINARY_DIR})
+else()
+    if(${wrapping_swig})
+        message(FATAL_ERROR "SWIG bindings not currently supported in modular build.")
+    endif()
+endif()
+set(PacBioBAM_BinDir    ${PacBioBAM_OutputDir}/bin)
+set(PacBioBAM_LibDir    ${PacBioBAM_OutputDir}/lib)
+
+set(GeneratedDir ${CMAKE_BINARY_DIR}/generated)
+set(GeneratedTestDataDir ${GeneratedDir}/data)
+file(MAKE_DIRECTORY ${PacBioBAM_BinDir})
+file(MAKE_DIRECTORY ${PacBioBAM_LibDir})
+file(MAKE_DIRECTORY ${GeneratedDir})
+file(MAKE_DIRECTORY ${GeneratedTestDataDir})
+
+# project configuration (keep this order)
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake ${CMAKE_MODULE_PATH})
+include(pbbam-ccache)
+include(pbbam-compilerflags)
+include(pbbam-libtype)
+include(pbbam-dependencies)
+
+# project components (keep this order)
+add_subdirectory(src)
+add_subdirectory(tools)
+add_subdirectory(docs)
+add_subdirectory(tests)
diff --git a/INSTALL.md b/INSTALL.md

new file mode 100644 (file)

index 0000000..86dddda
--- /dev/null
+++ b/INSTALL.md
@@ -0,0 +1,3 @@
+# PacBio::BAM - building & integrating\r
+\r
+Detailed build instructions can be found [here](http://pbbam.readthedocs.org/en/latest/getting_started.html).\r
diff --git a/LICENSE.txt b/LICENSE.txt

new file mode 100644 (file)

index 0000000..77e9557
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,34 @@
+Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted (subject to the limitations in the
+disclaimer below) provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following
+   disclaimer in the documentation and/or other materials provided
+   with the distribution.
+
+ * Neither the name of Pacific Biosciences nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
diff --git a/README.md b/README.md

new file mode 100644 (file)

index 0000000..046296e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,29 @@
+# pbbam
+
+[![Build Status](https://travis-ci.org/PacificBiosciences/pbbam.svg?branch=master)](https://travis-ci.org/PacificBiosciences/pbbam) [![Documentation Status](https://readthedocs.org/projects/pbbam/badge/?version=latest)](http://pbbam.readthedocs.org/en/latest/?badge=latest)
+ 
+As of the 3.0 release of SMRTanalysis, PacBio is embracing the industry standard BAM
+format for (both aligned and unaligned) basecall data files. We have also formulated
+a BAM companion file format (bam.pbi) enabling fast access to a richer set of per-read
+information as well as compatibility for software built around the legacy cmp.h5 format.
+ 
+The **pbbam** software package provides components to create, query, & edit PacBio BAM
+files and associated indices. These components include a core C++ library, bindings for
+additional languages, and command-line utilities.
+
+### Note:
+
+This library is **not** intended to be used as a general-purpose BAM utility - all input & output BAMs must adhere to the [PacBio BAM format specification](https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/BAM.rst). Non-PacBio BAMs will cause exceptions to be thrown.
+ 
+##  Documentation
+
+  - [Documentation Home](http://pbbam.readthedocs.org/en/latest/index.html)
+    - [Getting Started](http://pbbam.readthedocs.org/en/latest/getting_started.html)
+    - [C++ API Reference](http://pbbam.readthedocs.org/en/latest/api_reference.html)
+
+  - [Changelog](https://github.com/PacificBiosciences/pbbam/blob/master/CHANGELOG.md)
+
+## License
+
+ - [PacBio open source license](https://github.com/PacificBiosciences/pbbam/blob/master/LICENSE.txt)
+
diff --git a/cmake/FindCSharp.cmake b/cmake/FindCSharp.cmake

new file mode 100644 (file)

index 0000000..08d09a7
--- /dev/null
+++ b/cmake/FindCSharp.cmake
@@ -0,0 +1,72 @@
+#
+# A CMake Module for finding and using C# (.NET and Mono).
+#
+# The following variables are set:
+#   CSHARP_FOUND - set to ON if C# is found
+#   CSHARP_USE_FILE - the path to the C# use file
+#   CSHARP_TYPE - the type of the C# compiler (eg. ".NET" or "Mono")
+#   CSHARP_VERSION - the version of the C# compiler (eg. "v4.0" or "2.10.2")
+#   CSHARP_COMPILER - the path to the C# compiler executable (eg. "C:/Windows/Microsoft.NET/Framework/v4.0.30319/csc.exe" or "/usr/bin/gmcs")
+#   CSHARP_INTERPRETER - the path to interpreter needed to run CSharp executables
+#   CSHARP_PLATFORM - the C# target platform
+#   CSHARP_SDK - the SDK commandline switch (empty for .NET, for Mono eg. "/sdk:2" or "/sdk:4")
+#
+# This file is based on the work of GDCM:
+#   http://gdcm.svn.sf.net/viewvc/gdcm/trunk/CMake/FindCSharp.cmake
+# Copyright (c) 2006-2010 Mathieu Malaterre <mathieu.malaterre@gmail.com>
+#
+
+# TODO: ADD ABILITY TO SELECT WHICH C# COMPILER eg. .NET or Mono (if both exist). For the moment, .NET is selected above Mono.
+
+# Make sure find package macros are included
+include( FindPackageHandleStandardArgs )
+
+unset( CSHARP_COMPILER CACHE )
+unset( CSHARP_INTERPRETER CACHE )
+unset( CSHARP_TYPE CACHE )
+unset( CSHARP_VERSION CACHE )
+unset( CSHARP_FOUND CACHE )
+
+# By default use anycpu platform, allow the user to override
+set( CSHARP_PLATFORM "anycpu" CACHE STRING "C# target platform: x86, x64, anycpu, or itanium" )
+if( NOT ${CSHARP_PLATFORM} MATCHES "x86|x64|anycpu|itanium" )
+  message( FATAL_ERROR "The C# target platform '${CSHARP_PLATFORM}' is not valid. Please enter one of the following: x86, x64, anycpu, or itanium" )
+endif( )
+
+if( WIN32 )
+  find_package( DotNetFrameworkSdk )
+  if( NOT CSHARP_DOTNET_FOUND )
+    find_package( Mono )
+  endif( )
+else( UNIX )
+  find_package( Mono )
+endif( )
+
+if( CSHARP_DOTNET_FOUND )
+  set( CSHARP_TYPE ".NET" CACHE STRING "Using the .NET compiler" )
+  set( CSHARP_VERSION ${CSHARP_DOTNET_VERSION} CACHE STRING "C# .NET compiler version" FORCE )
+  set( CSHARP_COMPILER ${CSHARP_DOTNET_COMPILER_${CSHARP_DOTNET_VERSION}} CACHE STRING "Full path to .NET compiler" FORCE )
+  set( CSHARP_INTERPRETER "" CACHE INTERNAL "Interpretor not required for .NET" FORCE )
+elseif( CSHARP_MONO_FOUND )
+  set( CSHARP_TYPE "Mono" CACHE STRING "Using the Mono compiler" )
+  set( CSHARP_VERSION ${CSHARP_MONO_VERSION} CACHE STRING "C# Mono compiler version" FORCE )
+  set( CSHARP_COMPILER ${CSHARP_MONO_COMPILER_${CSHARP_MONO_VERSION}} CACHE STRING "Full path to Mono compiler" FORCE )
+  set( CSHARP_INTERPRETER ${CSHARP_MONO_INTERPRETER_${CSHARP_MONO_VERSION}} CACHE STRING "Full path to Mono interpretor" FORCE )
+  set( CSHARP_SDK "/sdk:4.5" CACHE STRING "C# Mono SDK commandline switch (e.g. /sdk:2, /sdk:4, /sdk:5)" )
+endif( )
+
+# Handle WIN32 specific issues
+if ( WIN32 )
+  if ( CSHARP_COMPILER MATCHES "bat" )
+    set( CSHARP_COMPILER "call ${CSHARP_COMPILER}" )
+  endif ( )
+endif( )
+
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(CSharp DEFAULT_MSG CSHARP_TYPE CSHARP_VERSION CSHARP_COMPILER)
+
+mark_as_advanced( CSHARP_TYPE CSHARP_VERSION CSHARP_COMPILER CSHARP_INTERPRETER CSHARP_PLATFORM CSHARP_SDK )
+
+# Set the USE_FILE path
+# http://public.kitware.com/Bug/view.php?id=7757
+get_filename_component( current_list_path ${CMAKE_CURRENT_LIST_FILE} PATH )
+set( CSHARP_USE_FILE ${current_list_path}/UseCSharp.cmake )
diff --git a/cmake/FindDotNetFrameworkSdk.cmake b/cmake/FindDotNetFrameworkSdk.cmake

new file mode 100644 (file)

index 0000000..8e12c70
--- /dev/null
+++ b/cmake/FindDotNetFrameworkSdk.cmake
@@ -0,0 +1,29 @@
+# Set paths and vars for .NET compilers 
+# This is hand-rolled because I had problems with the one from SimpleITK
+
+#
+# The following variables are set:
+#   CSHARP_DOTNET_FOUND
+#   CSHARP_DOTNET_COMPILER_${version} eg. "CSHARP_DOTNET_COMPILER_v4.0.30319"
+#   CSHARP_DOTNET_VERSION eg. "v4.0.30319"
+#   CSHARP_DOTNET_VERSIONS eg. "v2.0.50727, v3.5, v4.0.30319"
+#   DotNetFrameworkSdk_USE_FILE
+#
+#   CSHARP_PROJECT_BUILDER (xbuild/msbuild)
+
+set(framework_dir "C:/Windows/Microsoft.NET/Framework")
+
+set(CSHARP_DOTNET_VERSION "v4.0.30319")
+set(CSHARP_DOTNET_VERSIONS "")
+set(CSHARP_DOTNET_COMPILER_${CSHARP_DOTNET_VERSION} "${framework_dir}/${CSHARP_DOTNET_VERSION}/csc.exe")
+set(CSHARP_PROJECT_BUILDER "${framework_dir}/${CSHARP_DOTNET_VERSION}/MSBuild.exe")
+
+if(EXISTS ${CSHARP_DOTNET_COMPILER_${CSHARP_DOTNET_VERSION}})
+       set(CSHARP_DOTNET_FOUND 1)
+else()
+       set(CSHARP_DOTNET_FOUND 0)
+endif()
+
+# Set USE_FILE
+get_filename_component( current_list_path ${CMAKE_CURRENT_LIST_FILE} PATH )
+set( DotNetFrameworkSdk_USE_FILE ${current_list_path}/UseDotNetFrameworkSdk.cmake )
+\ No newline at end of file
diff --git a/cmake/FindMono.cmake b/cmake/FindMono.cmake

new file mode 100644 (file)

index 0000000..0fab116
--- /dev/null
+++ b/cmake/FindMono.cmake
@@ -0,0 +1,167 @@
+#
+# A CMake Module for finding Mono.
+#
+# The following variables are set:
+#   CSHARP_MONO_FOUND
+#   CSHARP_MONO_COMPILER_${version} eg. "CSHARP_MONO_COMPILER_2.10.2"
+#   CSHARP_MONO_INTERPRETOR_${version} eg. "CSHARP_MONO_INTERPRETOR_2.10.2"
+#   CSHARP_MONO_VERSION eg. "2.10.2"
+#   CSHARP_MONO_VERSIONS eg. "2.10.2, 2.6.7"
+#
+# Additional references can be found here:
+#   http://www.mono-project.com/Main_Page
+#   http://www.mono-project.com/CSharp_Compiler
+#   http://mono-project.com/FAQ:_Technical (How can I tell where the Mono runtime is installed)
+#
+# This file is based on the work of GDCM:
+#   http://gdcm.svn.sf.net/viewvc/gdcm/trunk/CMake/FindMono.cmake
+# Copyright (c) 2006-2010 Mathieu Malaterre <mathieu.malaterre@gmail.com>
+#
+
+set( csharp_mono_valid 1 )
+if( DEFINED CSHARP_MONO_FOUND )
+  # The Mono compiler has already been found
+  # It may have been reset by the user, verify it is correct
+  if( NOT DEFINED CSHARP_MONO_COMPILER_${CSHARP_MONO_VERSION} )
+    set( csharp_mono_version_user ${CSHARP_MONO_VERSION} )
+    set( csharp_mono_valid 0 )
+    set( CSHARP_MONO_FOUND 0 )
+    set( CSHARP_MONO_VERSION "CSHARP_MONO_VERSION-NOTVALID" CACHE STRING "C# Mono compiler version, choices: ${CSHARP_MONO_VERSIONS}" FORCE )
+    message( FATAL_ERROR "The C# Mono version '${csharp_mono_version_user}' is not valid. Please enter one of the following: ${CSHARP_MONO_VERSIONS}" )
+  endif( NOT DEFINED CSHARP_MONO_COMPILER_${CSHARP_MONO_VERSION} )
+endif( DEFINED CSHARP_MONO_FOUND )
+
+unset( CSHARP_MONO_VERSIONS CACHE ) # Clear versions
+if( WIN32 )
+  # Search for Mono on Win32 systems
+  # See http://mono-project.com/OldReleases and http://www.go-mono.com/mono-downloads/download.html
+  set( csharp_mono_bin_dirs )
+  set( csharp_mono_search_hints
+    "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.11.2;SdkInstallRoot]/bin"
+    "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.9;SdkInstallRoot]/bin"
+    "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.8;SdkInstallRoot]/bin"
+    "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.7;SdkInstallRoot]/bin"
+    "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.6;SdkInstallRoot]/bin"
+    "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.5;SdkInstallRoot]/bin"
+    "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.4;SdkInstallRoot]/bin"
+    "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.3;SdkInstallRoot]/bin"
+    "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.2;SdkInstallRoot]/bin"
+    "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10.1;SdkInstallRoot]/bin"
+    "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.10;SdkInstallRoot]/bin"
+    "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.8;SdkInstallRoot]/bin"
+    "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.6.7;SdkInstallRoot]/bin"
+    "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.6.4;SdkInstallRoot]/bin"
+    "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.6.3;SdkInstallRoot]/bin"
+    "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.6.1;SdkInstallRoot]/bin"
+    "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono\\2.6;SdkInstallRoot]/bin"
+  )
+  foreach( csharp_mono_search_hint ${csharp_mono_search_hints} )
+    get_filename_component( csharp_mono_bin_dir "${csharp_mono_search_hint}" ABSOLUTE )
+    if ( EXISTS "${csharp_mono_bin_dir}" )
+      set( csharp_mono_bin_dirs ${csharp_mono_bin_dirs} ${csharp_mono_bin_dir} )
+    endif ( EXISTS "${csharp_mono_bin_dir}" )
+  endforeach( csharp_mono_search_hint )
+  # TODO: Use HKLM_LOCAL_MACHINE\Software\Novell\Mono\DefaultCLR to specify default version
+  # get_filename_component( test "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Novell\\Mono;DefaultCLR]" NAME )
+
+  foreach ( csharp_mono_bin_dir ${csharp_mono_bin_dirs} )
+    string( REPLACE "\\" "/" csharp_mono_bin_dir ${csharp_mono_bin_dir} )
+    if (EXISTS "${csharp_mono_bin_dir}/dmcs.bat")
+      set( csharp_mono_executable "${csharp_mono_bin_dir}/dmcs.bat")
+    elseif (EXISTS "${csharp_mono_bin_dir}/gmcs.bat")
+      set( csharp_mono_executable "${csharp_mono_bin_dir}/gmcs.bat")
+    elseif (EXISTS "${csharp_mono_bin_dir}/mcs.bat")
+      set( csharp_mono_executable "${csharp_mono_bin_dir}/mcs.bat")
+    endif (EXISTS "${csharp_mono_bin_dir}/dmcs.bat")
+
+    if( csharp_mono_valid )
+      # Extract version number (eg. 2.10.2)
+      string(REGEX MATCH "([0-9]*)([.])([0-9]*)([.]*)([0-9]*)" csharp_mono_version_temp ${csharp_mono_bin_dir})
+      set( CSHARP_MONO_VERSION ${csharp_mono_version_temp} CACHE STRING "C# Mono compiler version" )
+      mark_as_advanced( CSHARP_MONO_VERSION )
+
+      # Add variable holding executable
+      set( CSHARP_MONO_COMPILER_${csharp_mono_version_temp} ${csharp_mono_executable} CACHE STRING "C# Mono compiler ${csharp_mono_version_temp}" FORCE )
+      mark_as_advanced( CSHARP_MONO_COMPILER_${csharp_mono_version_temp} )
+
+      # Set interpreter
+      if (EXISTS "${csharp_mono_bin_dir}/mono.exe")
+        set( CSHARP_MONO_INTERPRETER_${csharp_mono_version_temp} "${csharp_mono_bin_dir}/mono.exe" CACHE STRING "C# Mono interpreter ${csharp_mono_version_temp}" FORCE )
+        mark_as_advanced( CSHARP_MONO_INTERPRETER_${csharp_mono_version_temp} )
+      endif (EXISTS "${csharp_mono_bin_dir}/mono.exe")
+    endif( csharp_mono_valid )
+
+    # Create a list of supported compiler versions
+    if( NOT DEFINED CSHARP_MONO_VERSIONS )
+      set( CSHARP_MONO_VERSIONS "${csharp_mono_version_temp}" CACHE STRING "Available C# Mono compiler versions" FORCE )
+    else( NOT DEFINED CSHARP_MONO_VERSIONS )
+      set( CSHARP_MONO_VERSIONS "${CSHARP_MONO_VERSIONS}, ${csharp_mono_version_temp}"  CACHE STRING "Available C# Mono versions" FORCE )
+    endif( NOT DEFINED CSHARP_MONO_VERSIONS )
+    mark_as_advanced( CSHARP_MONO_VERSIONS )
+
+    # We found at least one Mono compiler version
+    set( CSHARP_MONO_FOUND 1 CACHE INTERNAL "Boolean indicating if C# Mono was found" )
+  endforeach( csharp_mono_bin_dir )
+
+else( UNIX )
+  # Search for Mono on non-Win32 systems
+  set( chsarp_mono_names "mcs" "mcs.exe" "dmcs" "dmcs.exe" "smcs" "smcs.exe" "gmcs" "gmcs.exe" )
+  set(
+    csharp_mono_paths
+    "/usr/bin/"
+    "/usr/local/bin/"
+    "/usr/lib/mono/2.0"
+    "/opt/novell/mono/bin"
+  )
+  find_program(
+    csharp_mono_compiler # variable is added to the cache, we removed it below
+    NAMES ${chsarp_mono_names}
+    PATHS ${csharp_mono_paths}
+  )
+
+  if( EXISTS ${csharp_mono_compiler} )
+    # Determine version
+    find_program(
+      csharp_mono_interpreter # variable is added to the cache, we removed it below
+      NAMES mono
+      PATHS ${csharp_mono_paths}
+    )
+    if ( EXISTS ${csharp_mono_interpreter} )
+      execute_process(
+        COMMAND ${csharp_mono_interpreter} -V
+        OUTPUT_VARIABLE csharp_mono_version_string
+      )
+      string( REGEX MATCH "([0-9]*)([.])([0-9]*)([.]*)([0-9]*)" csharp_mono_version_temp ${csharp_mono_version_string} )
+      set( CSHARP_MONO_INTERPRETER_${CSHARP_MONO_VERSION} ${csharp_mono_interpreter} CACHE STRING "C# Mono interpreter ${csharp_mono_version_temp}" FORCE )
+      mark_as_advanced( CSHARP_MONO_INTERPRETER_${CSHARP_MONO_VERSION} )
+    endif ( EXISTS ${csharp_mono_interpreter} )
+    unset( csharp_mono_interpreter CACHE )
+
+    # We found Mono compiler
+    set( CSHARP_MONO_VERSION ${csharp_mono_version_temp} CACHE STRING "C# Mono compiler version" )
+    mark_as_advanced( CSHARP_MONO_VERSION )
+    set( CSHARP_MONO_COMPILER_${CSHARP_MONO_VERSION} ${csharp_mono_compiler} CACHE STRING "C# Mono compiler ${CSHARP_MONO_VERSION}" FORCE )
+    mark_as_advanced( CSHARP_MONO_COMPILER_${CSHARP_MONO_VERSION} )
+    set( CSHARP_MONO_VERSIONS ${CSHARP_MONO_VERSION} CACHE STRING "Available C# Mono compiler versions" FORCE )
+    mark_as_advanced( CSHARP_MONO_VERSIONS )
+    set( CSHARP_MONO_FOUND 1 CACHE INTERNAL "Boolean indicating if C# Mono was found" )
+
+    # Assume xbuild is just xbuild.
+    set(CSHARP_PROJECT_BUILDER "xbuild")
+
+
+  endif( EXISTS ${csharp_mono_compiler} )
+
+  # Remove temp variable from cache
+  unset( csharp_mono_compiler CACHE )
+
+endif( WIN32 )
+
+if( CSHARP_MONO_FOUND )
+  # Report the found versions
+  message( STATUS "Found the following C# Mono versions: ${CSHARP_MONO_VERSIONS}" )
+endif( CSHARP_MONO_FOUND )
+
+# Set USE_FILE
+get_filename_component( current_list_path ${CMAKE_CURRENT_LIST_FILE} PATH )
+set( Mono_USE_FILE ${current_list_path}/UseMono.cmake )
diff --git a/cmake/FindR.cmake b/cmake/FindR.cmake

new file mode 100644 (file)

index 0000000..6ae4354
--- /dev/null
+++ b/cmake/FindR.cmake
@@ -0,0 +1,48 @@
+
+#
+# - This module locates an installed R distribution.
+#
+# Defines the following:
+#
+# R_INCLUDE_DIR      - Path to R include directory
+# R_LIBRARIES        - Path to R library
+# R_LIBRARY_BASE     -
+# R_COMMAND          - Path to R command
+# RSCRIPT_EXECUTABLE - Path to Rscript command
+#
+
+
+# Make sure find package macros are included
+include( FindPackageHandleStandardArgs )
+
+set(TEMP_CMAKE_FIND_APPBUNDLE ${CMAKE_FIND_APPBUNDLE})
+set(CMAKE_FIND_APPBUNDLE "NEVER")
+find_program(R_COMMAND R DOC "R executable.")
+if(R_COMMAND)
+  execute_process(WORKING_DIRECTORY . COMMAND ${R_COMMAND} RHOME OUTPUT_VARIABLE R_BASE_DIR OUTPUT_STRIP_TRAILING_WHITESPACE)
+  set(R_HOME ${R_BASE_DIR} CACHE PATH "R home directory obtained from R RHOME")
+  mark_as_advanced(R_HOME)
+endif(R_COMMAND)
+
+find_program(RSCRIPT_EXECUTABLE Rscript DOC "Rscript executable.")
+
+set(CMAKE_FIND_APPBUNDLE ${TEMP_CMAKE_FIND_APPBUNDLE})
+
+# R.h gets installed in all sorts of places -
+# ubuntu: /usr/share/R/include, RHEL/Fedora: /usr/include/R/R.h
+find_path(R_INCLUDE_DIR R.h PATHS ${R_INCLUDE_DIR_HINT} /usr/local/lib /usr/local/lib64 /usr/share /usr/include ${R_BASE_DIR} PATH_SUFFIXES include R R/include DOC "Path to file R.h")
+find_library(R_LIBRARY_BASE R PATHS ${R_BASE_DIR} PATH_SUFFIXES /lib DOC "R library (example libR.a, libR.dylib, etc.).")
+
+set(R_LIBRARIES ${R_LIBRARY_BASE})
+mark_as_advanced(RSCRIPT_EXECUTABLE R_LIBRARIES R_INCLUDE_DIR R_COMMAND R_LIBRARY_BASE)
+
+
+set( _REQUIRED_R_VARIABLES R_INCLUDE_DIR R_COMMAND )
+
+if( APPLE )
+  # On linux platform some times the libR.so is not available, however
+  # on apple a link error results if the library is linked.
+  list(  APPEND _REQUIRED_R_VARIABLES R_LIBRARIES R_LIBRARY_BASE )
+endif()
+
+find_package_handle_standard_args(R DEFAULT_MSG ${_REQUIRED_R_VARIABLES} )
diff --git a/cmake/PbbamTool.cmake b/cmake/PbbamTool.cmake

new file mode 100644 (file)

index 0000000..daed917
--- /dev/null
+++ b/cmake/PbbamTool.cmake
@@ -0,0 +1,23 @@
+include(CMakeParseArguments)
+
+function(create_pbbam_tool)
+
+    # parse args
+    set(oneValueArgs TARGET)
+    set(multiValueArgs SOURCES)
+    cmake_parse_arguments(create_pbbam_tool "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    # create executable
+    include_directories(
+        ${ToolsCommonDir}           # shared tool code
+        ${GeneratedDir}             # generated version headers
+        ${PacBioBAM_INCLUDE_DIRS}   # pbbam/htslib includes
+    )
+    add_executable(${create_pbbam_tool_TARGET} ${create_pbbam_tool_SOURCES})
+    set_target_properties(
+        ${create_pbbam_tool_TARGET} PROPERTIES
+        RUNTIME_OUTPUT_DIRECTORY ${PacBioBAM_BinDir}
+    )
+    target_link_libraries(${create_pbbam_tool_TARGET} pbbam)
+
+endfunction(create_pbbam_tool)
diff --git a/cmake/UseCSharp.cmake b/cmake/UseCSharp.cmake

new file mode 100644 (file)

index 0000000..dac4537
--- /dev/null
+++ b/cmake/UseCSharp.cmake
@@ -0,0 +1,111 @@
+# CMake Module for finding and using C# (.NET and Mono).
+#
+# The following global variables are assumed to exist:
+#   CSHARP_SOURCE_DIRECTORY - path to C# sources
+#   CSHARP_BINARY_DIRECTORY - path to place resultant C# binary files
+#
+# The following variables are set:
+#   CSHARP_TYPE - the type of the C# compiler (eg. ".NET" or "Mono")
+#   CSHARP_COMPILER - the path to the C# compiler executable (eg. "C:/Windows/Microsoft.NET/Framework/v4.0.30319/csc.exe")
+#   CSHARP_VERSION - the version number of the C# compiler (eg. "v4.0.30319")
+#
+# The following macros are defined:
+#   CSHARP_ADD_EXECUTABLE( name references [files] [output_dir] ) - Define C# executable with the given name
+#   CSHARP_ADD_LIBRARY( name references [files] [output_dir] ) - Define C# library with the given name
+#
+# Examples:
+#   CSHARP_ADD_EXECUTABLE( MyExecutable "" "Program.cs" )
+#   CSHARP_ADD_EXECUTABLE( MyExecutable "ref1.dll ref2.dll" "Program.cs File1.cs" )
+#   CSHARP_ADD_EXECUTABLE( MyExecutable "ref1.dll;ref2.dll" "Program.cs;File1.cs" )
+#
+# This file is based on the work of GDCM:
+#   http://gdcm.svn.sf.net/viewvc/gdcm/trunk/CMake/UseCSharp.cmake
+# Copyright (c) 2006-2010 Mathieu Malaterre <mathieu.malaterre@gmail.com>
+#
+
+# TODO: ADD SUPPORT FOR LINK LIBRARIES
+
+# Check something was found
+if( NOT CSHARP_COMPILER )
+  message( WARNING "A C# compiler executable was not found on your system" )
+endif( NOT CSHARP_COMPILER )
+
+# Include type-based USE_FILE
+if( CSHARP_TYPE MATCHES ".NET" )
+  include( ${DotNetFrameworkSdk_USE_FILE} )
+elseif ( CSHARP_TYPE MATCHES "Mono" )
+  include( ${Mono_USE_FILE} )
+endif ( CSHARP_TYPE MATCHES ".NET" )
+
+macro( CSHARP_ADD_LIBRARY name )
+  CSHARP_ADD_PROJECT( "library" ${name} ${ARGN} )
+endmacro( CSHARP_ADD_LIBRARY )
+
+macro( CSHARP_ADD_EXECUTABLE name )
+  CSHARP_ADD_PROJECT( "exe" ${name} ${ARGN} )
+endmacro( CSHARP_ADD_EXECUTABLE )
+
+# Private macro
+macro( CSHARP_ADD_PROJECT type name )
+  set( refs "/reference:System.dll" )
+  set( sources )
+  set( sources_dep )
+
+  if( ${type} MATCHES "library" )
+    set( output "dll" )
+  elseif( ${type} MATCHES "exe" )
+    set( output "exe" )
+  endif( ${type} MATCHES "library" )
+
+  # Step through each argument
+  foreach( it ${ARGN} )
+    if( ${it} MATCHES "(.*)(dll)" )
+       # Argument is a dll, add reference
+       list( APPEND refs /reference:${it} )
+    else( )
+      # Argument is a source file
+      if( EXISTS ${it} )
+        list( APPEND sources ${it} )
+        list( APPEND sources_dep ${it} )
+      elseif( EXISTS ${CSHARP_SOURCE_DIRECTORY}/${it} )
+        list( APPEND sources ${CSHARP_SOURCE_DIRECTORY}/${it} )
+        list( APPEND sources_dep ${CSHARP_SOURCE_DIRECTORY}/${it} )
+      elseif( ${it} MATCHES "[*]" )
+        # For dependencies, we need to expand wildcards
+        FILE( GLOB it_glob ${it} )
+        list( APPEND sources ${it} )
+        list( APPEND sources_dep ${it_glob} )
+      endif( )
+    endif ( )
+  endforeach( )
+
+  # Check we have at least one source
+  list( LENGTH sources_dep sources_length )
+  if ( ${sources_length} LESS 1 )
+    MESSAGE( SEND_ERROR "No C# sources were specified for ${type} ${name}" )
+  endif ()
+  list( SORT sources_dep )
+
+  # Perform platform specific actions
+  if (WIN32)
+    string( REPLACE "/" "\\" sources ${sources} )
+  else (UNIX)
+    string( REPLACE "\\" "/" sources ${sources} )
+  endif (WIN32)
+
+  # Add custom target and command
+  MESSAGE( STATUS "Adding C# ${type} ${name}: '${CSHARP_COMPILER} /t:${type} /out:${name}.${output} /platform:${CSHARP_PLATFORM} ${CSHARP_SDK} ${refs} ${sources}'" )
+  add_custom_command(
+    COMMENT "Compiling C# ${type} ${name}: '${CSHARP_COMPILER} /t:${type} /out:${name}.${output} /platform:${CSHARP_PLATFORM} ${CSHARP_SDK} ${refs} ${sources}'"
+    OUTPUT ${CSHARP_BINARY_DIRECTORY}/${name}.${output}
+    COMMAND ${CSHARP_COMPILER}
+    ARGS /t:${type} /out:${name}.${output} /platform:${CSHARP_PLATFORM} ${CSHARP_SDK} ${refs} ${sources}
+    WORKING_DIRECTORY ${CSHARP_BINARY_DIRECTORY}
+    DEPENDS ${sources_dep}
+  )
+  add_custom_target(
+    ${name} ALL
+    DEPENDS ${CSHARP_BINARY_DIRECTORY}/${name}.${output}
+    SOURCES ${sources_dep}
+  )
+endmacro( CSHARP_ADD_PROJECT )
diff --git a/cmake/UseDotNetFrameworkSdk.cmake b/cmake/UseDotNetFrameworkSdk.cmake

new file mode 100644 (file)

index 0000000..6be4027
--- /dev/null
+++ b/cmake/UseDotNetFrameworkSdk.cmake
@@ -0,0 +1,16 @@
+#
+# A CMake Module for using Mono.
+#
+# The following variables are set:
+#   (none)
+#
+# Additional references can be found here:
+#   http://www.mono-project.com/Main_Page
+#   http://www.mono-project.com/CSharp_Compiler
+#
+# This file is based on the work of GDCM:
+#   http://gdcm.svn.sf.net/viewvc/gdcm/trunk/CMake/FindMono.cmake
+# Copyright (c) 2006-2010 Mathieu Malaterre <mathieu.malaterre@gmail.com>
+#
+
+message( STATUS "Using .NET compiler version ${CSHARP_DOTNET_VERSION}" )
+\ No newline at end of file
diff --git a/cmake/UseMono.cmake b/cmake/UseMono.cmake

new file mode 100644 (file)

index 0000000..16a80ae
--- /dev/null
+++ b/cmake/UseMono.cmake
@@ -0,0 +1,16 @@
+#
+# A CMake Module for using Mono.
+#
+# The following variables are set:
+#   (none)
+#
+# Additional references can be found here:
+#   http://www.mono-project.com/Main_Page
+#   http://www.mono-project.com/CSharp_Compiler
+#
+# This file is based on the work of GDCM:
+#   http://gdcm.svn.sf.net/viewvc/gdcm/trunk/CMake/FindMono.cmake
+# Copyright (c) 2006-2010 Mathieu Malaterre <mathieu.malaterre@gmail.com>
+#
+
+message( STATUS "Using Mono compiler version ${CSHARP_MONO_VERSION}" )
diff --git a/cmake/pbbam-ccache.cmake b/cmake/pbbam-ccache.cmake

new file mode 100644 (file)

index 0000000..21b8ac5
--- /dev/null
+++ b/cmake/pbbam-ccache.cmake
@@ -0,0 +1,8 @@
+
+if(PacBioBAM_use_ccache)
+    find_program(CCACHE_FOUND ccache)
+    if(CCACHE_FOUND)
+        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
+        set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK    ccache)
+    endif()
+endif()
diff --git a/cmake/pbbam-compilerflags.cmake b/cmake/pbbam-compilerflags.cmake

new file mode 100644 (file)

index 0000000..fcfd321
--- /dev/null
+++ b/cmake/pbbam-compilerflags.cmake
@@ -0,0 +1,44 @@
+
+include(CheckCXXCompilerFlag)
+
+# C++11 check & enabling
+if (CMAKE_VERSION VERSION_LESS "3.1")
+    if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")    # clang
+    else()
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")        # gcc
+    endif()
+else() # 3.1+
+    set(CMAKE_CXX_STANDARD          11)
+    set(CMAKE_CXX_STANDARD_REQUIRED ON)
+endif()
+
+# shared CXX flags for src & tests
+if (MSVC)
+    set(PacBioBAM_CXX_FLAGS "/Wall")
+else()
+    set(PacBioBAM_CXX_FLAGS "-Wall")
+endif()
+
+# NOTE: -Wno-unused-local-typedefs used to quash clang warnings w/ Boost
+check_cxx_compiler_flag("-Wno-unused-local-typedefs" HAS_NO_UNUSED_LOCAL_TYPEDEFS)
+if(HAS_NO_UNUSED_LOCAL_TYPEDEFS)
+    set(PacBioBAM_CXX_FLAGS "${PacBioBAM_CXX_FLAGS} -Wno-unused-local-typedefs")
+endif()
+
+check_cxx_compiler_flag("-Wno-sign-compare" HAS_NO_SIGN_COMPARE)
+if(HAS_NO_SIGN_COMPARE)
+    set(PacBioBAM_CXX_FLAGS "${PacBioBAM_CXX_FLAGS} -Wno-sign-compare")
+endif()
+
+# Turn on windows-style filepath resolution.
+# We need to add this #define early (not just in the C# SWIG wrapper)
+if(WIN32 AND PacBioBAM_wrap_csharp)
+    add_definitions(-DPBBAM_WIN_FILEPATHS)
+endif()
+
+# For now, keep @rpath out of install names on OS X, as it causes SWIG
+# tests to fail.
+if(APPLE)
+    set(CMAKE_MACOSX_RPATH OFF)
+endif()
diff --git a/cmake/pbbam-dependencies.cmake b/cmake/pbbam-dependencies.cmake

new file mode 100644 (file)

index 0000000..c2e21e6
--- /dev/null
+++ b/cmake/pbbam-dependencies.cmake
@@ -0,0 +1,23 @@
+
+# pthreads
+find_package(Threads REQUIRED)
+
+# boost
+if(NOT Boost_INCLUDE_DIRS)
+    find_package(Boost REQUIRED)
+endif()
+
+# Winsock for htslib on Windows
+if(WIN32)
+    set(SOCKET_LIBRARIES "ws2_32")
+endif()
+
+# zlib
+if(NOT ZLIB_INCLUDE_DIRS OR NOT ZLIB_LIBRARIES)
+    find_package(ZLIB REQUIRED)
+endif()
+
+# htslib
+if(NOT HTSLIB_INCLUDE_DIRS OR NOT HTSLIB_LIBRARIES)
+    add_subdirectory(third-party/htslib external/htslib)
+endif()
diff --git a/cmake/pbbam-libtype.cmake b/cmake/pbbam-libtype.cmake

new file mode 100644 (file)

index 0000000..4b9c0dd
--- /dev/null
+++ b/cmake/pbbam-libtype.cmake
@@ -0,0 +1,21 @@
+
+# determine if we need a shared lib
+if(PacBioBAM_build_shared OR ${wrapping_swig})
+    set(BUILD_SHARED_LIBS ON)
+    set(htslib_build_shared ON CACHE BOOL "force htslibConfig to export proper library name")
+    set(PB_LIB_MODE SHARED)
+    set(PB_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX})
+else()
+    set(BUILD_SHARED_LIBS OFF)
+    set(PB_LIB_MODE STATIC)
+    set(PB_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
+endif()
+
+if(WIN32)
+    # Limit the number of DLLs we will have to bundle
+  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libgcc -static-libstdc++")
+  set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libgcc -static-libstdc++")
+endif()
+
+
+
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt

new file mode 100644 (file)

index 0000000..ff044b9
--- /dev/null
+++ b/docs/CMakeLists.txt
@@ -0,0 +1,11 @@
+find_package(Doxygen)
+
+if(DOXYGEN_FOUND)
+    configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in ${PacBioBAM_DocsDir}/Doxyfile @ONLY )
+    add_custom_target(doc
+        ${DOXYGEN_EXECUTABLE} ${PacBioBAM_DocsDir}/Doxyfile
+        WORKING_DIRECTORY ${PacBioBAM_DocsDir}
+        COMMENT "Generating API documentation with Doxygen"
+        VERBATIM
+    )
+endif()
diff --git a/docs/Doxyfile.in b/docs/Doxyfile.in

new file mode 100644 (file)

index 0000000..90f6f63
--- /dev/null
+++ b/docs/Doxyfile.in
@@ -0,0 +1,1602 @@
+# Doxyfile 1.6.3
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file 
+# that follow. The default is UTF-8 which is also the encoding used for all 
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the 
+# iconv built into libc) for the transcoding. See 
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded 
+# by quotes) that should identify the project.
+
+PROJECT_NAME           = @PacBioBAM_NAME@
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. 
+# This could be handy for archiving the generated documentation or 
+# if some version control system is used.
+
+PROJECT_NUMBER         = @PacBioBAM_VERSION@
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) 
+# base path where the generated documentation will be put. 
+# If a relative path is entered, it will be relative to the location 
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = @PacBioBAM_DocsDir@
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 
+# 4096 sub-directories (in 2 levels) under the output directory of each output 
+# format and will distribute the generated files over these directories. 
+# Enabling this option can be useful when feeding doxygen a huge amount of 
+# source files, where putting all generated files in the same directory would 
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all 
+# documentation generated by doxygen is written. Doxygen will use this 
+# information to generate all constant output in the proper language. 
+# The default language is English, other supported languages are: 
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, 
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, 
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English 
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, 
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak, 
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will 
+# include brief member descriptions after the members that are listed in 
+# the file and class documentation (similar to JavaDoc). 
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend 
+# the brief description of a member or function before the detailed description. 
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the 
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator 
+# that is used to form the text in various listings. Each string 
+# in this list, if found as the leading text of the brief description, will be 
+# stripped from the text and the result after processing the whole list, is 
+# used as the annotated text. Otherwise, the brief description is used as-is. 
+# If left blank, the following values are used ("$name" is automatically 
+# replaced with the name of the entity): "The $name class" "The $name widget" 
+# "The $name file" "is" "provides" "specifies" "contains" 
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       = "The $name class" \
+                         "The $name widget" \
+                         "The $name file" \
+                         is \
+                         provides \
+                         specifies \
+                         contains \
+                         represents \
+                         a \
+                         an \
+                         the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then 
+# Doxygen will generate a detailed section even if there is only a brief 
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all 
+# inherited members of a class in the documentation of that class as if those 
+# members were ordinary class members. Constructors, destructors and assignment 
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full 
+# path before files name in the file list and in the header files. If set 
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag 
+# can be used to strip a user-defined part of the path. Stripping is 
+# only done if one of the specified strings matches the left-hand part of 
+# the path. The tag can be used to show relative paths in the file list. 
+# If left blank the directory from which doxygen is run is used as the 
+# path to strip.
+
+STRIP_FROM_PATH        = 
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of 
+# the path mentioned in the documentation of a class, which tells 
+# the reader which header file to include in order to use a class. 
+# If left blank only the name of the header file containing the class 
+# definition is used. Otherwise one should specify the include paths that 
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    = @PacBioBAM_IncludeDir@
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter 
+# (but less readable) file names. This can be useful is your file systems 
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen 
+# will interpret the first line (until the first dot) of a JavaDoc-style 
+# comment as the brief description. If set to NO, the JavaDoc 
+# comments will behave just like regular Qt-style comments 
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will 
+# interpret the first line (until the first dot) of a Qt-style 
+# comment as the brief description. If set to NO, the comments 
+# will behave just like regular Qt-style comments (thus requiring 
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen 
+# treat a multi-line C++ special comment block (i.e. a block of //! or /// 
+# comments) as a brief description. This used to be the default behaviour. 
+# The new default is to treat a multi-line C++ comment block as a detailed 
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented 
+# member inherits the documentation from any documented member that it 
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce 
+# a new page for each member. If set to NO, the documentation of a member will 
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. 
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 1
+
+# This tag can be used to specify a number of aliases that acts 
+# as commands in the documentation. An alias has the form "name=value". 
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to 
+# put the command \sideeffect (or @sideeffect) in the documentation, which 
+# will result in a user-defined paragraph with heading "Side Effects:". 
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                =
+
+#samSpecURL=http://samtools.sourceforge.net/SAM1.pdf
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C 
+# sources only. Doxygen will then generate output that is more tailored for C. 
+# For instance, some of the names that are used will be different. The list 
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java 
+# sources only. Doxygen will then generate output that is more tailored for 
+# Java. For instance, namespaces will be presented as packages, qualified 
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran 
+# sources only. Doxygen will then generate output that is more tailored for 
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL 
+# sources. Doxygen will then generate output that is tailored for 
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it parses. 
+# With this tag you can assign which parser to use for a given extension. 
+# Doxygen has a built-in mapping, but you can override or extend it using this tag. 
+# The format is ext=language, where ext is a file extension, and language is one of 
+# the parsers supported by doxygen: IDL, Java, Javascript, C#, C, C++, D, PHP, 
+# Objective-C, Python, Fortran, VHDL, C, C++. For instance to make doxygen treat 
+# .inc files as Fortran files (default is PHP), and .f files as C (default is Fortran), 
+# use: inc=Fortran f=C. Note that for custom extensions you also need to set
+# FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING      = 
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want 
+# to include (a tag file for) the STL sources as input, then you should 
+# set this tag to YES in order to let doxygen match functions declarations and 
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. 
+# func(std::string) {}). This also make the inheritance and collaboration 
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = YES
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to 
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. 
+# Doxygen will parse them like normal C++ but will assume all classes use public 
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter 
+# and setter methods for a property. Setting this option to YES (the default) 
+# will make doxygen to replace the get and set methods by a property in the 
+# documentation. This will only work if the methods are indeed getting or 
+# setting a simple type. If this is not the case, or you want to show the 
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC 
+# tag is set to YES, then doxygen will reuse the documentation of the first 
+# member in the group (if any) for the other members of the group. By default 
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of 
+# the same type (for instance a group of public functions) to be put as a 
+# subgroup of that type (e.g. under the Public Functions section). Set it to 
+# NO to prevent subgrouping. Alternatively, this can be done per class using 
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum 
+# is documented as struct, union, or enum with the name of the typedef. So 
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct 
+# with name TypeT. When disabled the typedef will appear as a member of a file, 
+# namespace, or class. And the struct will be named TypeS. This can typically 
+# be useful for C code in case the coding convention dictates that all compound 
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to 
+# determine which symbols to keep in memory and which to flush to disk. 
+# When the cache is full, less often used symbols will be written to disk. 
+# For small to medium size projects (<1000 input files) the default value is 
+# probably good enough. For larger projects a too small cache size can cause 
+# doxygen to be busy swapping symbols to and from disk most of the time 
+# causing a significant performance penality. 
+# If the system has enough physical memory increasing the cache will improve the 
+# performance by keeping more symbols in memory. Note that the value works on 
+# a logarithmic scale so increasing the size by one will rougly double the 
+# memory usage. The cache size is given by this formula: 
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, 
+# corresponding to a cache size of 2^16 = 65536 symbols
+
+SYMBOL_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in 
+# documentation are documented, even if no documentation was available. 
+# Private class members and static file members will be hidden unless 
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class 
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file 
+# will be included in the documentation.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) 
+# defined locally in source files will be included in the documentation. 
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = NO
+
+# This flag is only useful for Objective-C code. When set to YES local 
+# methods, which are defined in the implementation section but not in 
+# the interface are included in the documentation. 
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be 
+# extracted and appear in the documentation as a namespace called 
+# 'anonymous_namespace{file}', where file will be replaced with the base 
+# name of the file that contains the anonymous namespace. By default 
+# anonymous namespace are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all 
+# undocumented members of documented classes, files or namespaces. 
+# If set to NO (the default) these members will be included in the 
+# various overviews, but no documentation section is generated. 
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all 
+# undocumented classes that are normally visible in the class hierarchy. 
+# If set to NO (the default) these classes will be included in the various 
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all 
+# friend (class|struct|union) declarations. 
+# If set to NO (the default) these declarations will be included in the 
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any 
+# documentation blocks found inside the body of a function. 
+# If set to NO (the default) these blocks will be appended to the 
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation 
+# that is typed after a \internal command is included. If the tag is set 
+# to NO (the default) then the documentation will be excluded. 
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate 
+# file names in lower-case letters. If set to YES upper-case letters are also 
+# allowed. This is useful if you have classes or files whose names only differ 
+# in case and if your file system supports case sensitive file names. Windows 
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = NO
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen 
+# will show members with their full class and namespace scopes in the 
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen 
+# will put a list of the files that are included by a file in the documentation 
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen 
+# will list include files with double quotes in the documentation 
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] 
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen 
+# will sort the (detailed) documentation of file and class members 
+# alphabetically by member name. If set to NO the members will appear in 
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the 
+# brief documentation of file, namespace and class members alphabetically 
+# by member name. If set to NO (the default) the members will appear in 
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the 
+# hierarchy of group names into alphabetical order. If set to NO (the default) 
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be 
+# sorted by fully-qualified names, including namespaces. If set to 
+# NO (the default), the class list will be sorted only by class name, 
+# not including the namespace part. 
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. 
+# Note: This option applies only to the class list, not to the 
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or 
+# disable (NO) the todo list. This list is created by putting \todo 
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or 
+# disable (NO) the test list. This list is created by putting \test 
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or 
+# disable (NO) the bug list. This list is created by putting \bug 
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or 
+# disable (NO) the deprecated list. This list is created by putting 
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional 
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       = 
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines 
+# the initial value of a variable or define consists of for it to appear in 
+# the documentation. If the initializer consists of more lines than specified 
+# here it will be hidden. Use a value of 0 to hide initializers completely. 
+# The appearance of the initializer of individual variables and defines in the 
+# documentation can be controlled using \showinitializer or \hideinitializer 
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated 
+# at the bottom of the documentation of classes and structs. If set to YES the 
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# If the sources in your project are distributed over multiple directories 
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy 
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES       = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. 
+# This will remove the Files entry from the Quick Index and from the 
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the 
+# Namespaces page.  This will remove the Namespaces entry from the Quick Index 
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that 
+# doxygen should invoke to get the current version for each file (typically from 
+# the version control system). Doxygen will invoke the program by executing (via 
+# popen()) the command <command> <input-file>, where <command> is the value of 
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file 
+# provided by doxygen. Whatever the program writes to standard output 
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    = 
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed by 
+# doxygen. The layout file controls the global structure of the generated output files 
+# in an output format independent way. The create the layout file that represents 
+# doxygen's defaults, run doxygen with the -l option. You can optionally specify a 
+# file name after the option, if omitted DoxygenLayout.xml will be used as the name 
+# of the layout file.
+
+LAYOUT_FILE            = 
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated 
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are 
+# generated by doxygen. Possible values are YES and NO. If left blank 
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings 
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will 
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for 
+# potential errors in the documentation, such as not documenting some 
+# parameters in a documented function, or documenting parameters that 
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# This WARN_NO_PARAMDOC option can be abled to get warnings for 
+# functions that are documented, but have no documentation for their parameters 
+# or return value. If set to NO (the default) doxygen will only warn about 
+# wrong or incomplete parameter documentation, but not about the absence of 
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that 
+# doxygen can produce. The string should contain the $file, $line, and $text 
+# tags, which will be replaced by the file and line number from which the 
+# warning originated and the warning text. Optionally the format may contain 
+# $version, which will be replaced by the version of the file (if it could 
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning 
+# and error messages should be written. If left blank the output is written 
+# to stderr.
+
+WARN_LOGFILE           = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain 
+# documented source files. You may enter file names like "myfile.cpp" or 
+# directories like "/usr/src/myproject". Separate the files or directories 
+# with spaces.
+
+INPUT                  = @PacBioBAM_IncludeDir@
+
+# This tag can be used to specify the character encoding of the source files 
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is 
+# also the default input encoding. Doxygen uses libiconv (or the iconv built 
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for 
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the 
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
+# and *.h) to filter out the source-files in the directories. If left 
+# blank the following patterns are tested: 
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx 
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+
+FILE_PATTERNS          = *.c \
+                         *.cc \
+                         *.cxx \
+                         *.cpp \
+                         *.c++ \
+                         *.d \
+                         *.java \
+                         *.ii \
+                         *.ixx \
+                         *.ipp \
+                         *.i++ \
+                         *.inl \
+                         *.h \
+                         *.hh \
+                         *.hxx \
+                         *.hpp \
+                         *.h++ \
+                         *.idl \
+                         *.odl \
+                         *.cs \
+                         *.php \
+                         *.php3 \
+                         *.inc \
+                         *.m \
+                         *.mm \
+                         *.dox \
+                         *.py \
+                         *.f90 \
+                         *.f \
+                         *.vhd \
+                         *.vhdl
+
+# The RECURSIVE tag can be used to turn specify whether or not subdirectories 
+# should be searched for input files as well. Possible values are YES and NO. 
+# If left blank NO is used.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should 
+# excluded from the INPUT source files. This way you can easily exclude a 
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE                = @PacBioBAM_IncludeDir@/pbbam/internal 
+
+# The EXCLUDE_SYMLINKS tag can be used select whether or not files or 
+# directories that are symbolic links (a Unix filesystem feature) are excluded 
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the 
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude 
+# certain files from those directories. Note that the wildcards are matched 
+# against the file with absolute path, so to exclude all test directories 
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       = 
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names 
+# (namespaces, classes, functions, etc.) that should be excluded from the 
+# output. The symbol name can be a fully qualified name, a word, or if the 
+# wildcard * is used, a substring. Examples: ANamespace, AClass, 
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        = pugi, PacBio::BAM::internal
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or 
+# directories that contain example code fragments that are included (see 
+# the \include command).
+
+EXAMPLE_PATH           = examples 
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the 
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
+# and *.h) to filter out the source-files in the directories. If left 
+# blank all files are included.
+
+EXAMPLE_PATTERNS       = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be 
+# searched for input files to be used with the \include or \dontinclude 
+# commands irrespective of the value of the RECURSIVE tag. 
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or 
+# directories that contain image that are included in the documentation (see 
+# the \image command).
+
+IMAGE_PATH             = 
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should 
+# invoke to filter for each input file. Doxygen will invoke the filter program 
+# by executing (via popen()) the command <filter> <input-file>, where <filter> 
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an 
+# input file. Doxygen will then use the output that the filter program writes 
+# to standard output.  If FILTER_PATTERNS is specified, this tag will be 
+# ignored.
+
+INPUT_FILTER           = 
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern 
+# basis.  Doxygen will compare the file name with each pattern and apply the 
+# filter if there is a match.  The filters are a list of the form: 
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further 
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER 
+# is applied to all files.
+
+FILTER_PATTERNS        = 
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using 
+# INPUT_FILTER) will be used to filter the input files when producing source 
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will 
+# be generated. Documented entities will be cross-referenced with these sources. 
+# Note: To get rid of all source code in the generated output, make sure also 
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body 
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct 
+# doxygen to hide any special comment blocks from generated source code 
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES 
+# then for each documented function all documented 
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES 
+# then for each documented function all documented entities 
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) 
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from 
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will 
+# link to the source code.  Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code 
+# will point to the HTML generated by the htags(1) tool instead of doxygen 
+# built-in source browser. The htags tool is part of GNU's global source 
+# tagging system (see http://www.gnu.org/software/global/global.html). You 
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen 
+# will generate a verbatim copy of the header file for each class for 
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index 
+# of all compounds will be generated. Enable this if the project 
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then 
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns 
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all 
+# classes will be put under the same header in the alphabetical index. 
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that 
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will 
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for 
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank 
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for 
+# each generated HTML page. If it is left blank doxygen will generate a 
+# standard header.
+
+HTML_HEADER            = 
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for 
+# each generated HTML page. If it is left blank doxygen will generate a 
+# standard footer.
+
+HTML_FOOTER            = 
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading 
+# style sheet that is used by each HTML page. It can be used to 
+# fine-tune the look of the HTML output. If the tag is left blank doxygen 
+# will generate a default style sheet. Note that doxygen will try to copy 
+# the style sheet file to the HTML output directory, so don't put your own 
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        = 
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML 
+# page will contain the date and time when the page was generated. Setting 
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP         = YES
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, 
+# files or namespaces will be aligned in HTML using tables. If set to 
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML 
+# documentation will contain sections that can be hidden and shown after the 
+# page has loaded. For this to work a browser that supports 
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox 
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS  = YES
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files 
+# will be generated that can be used as input for Apple's Xcode 3 
+# integrated development environment, introduced with OSX 10.5 (Leopard). 
+# To create a documentation set, doxygen will generate a Makefile in the 
+# HTML output directory. Running make will produce the docset in that 
+# directory and running "make install" will install the docset in 
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find 
+# it at startup. 
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more information.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the 
+# feed. A documentation feed provides an umbrella under which multiple 
+# documentation sets from a single provider (such as a company or product suite) 
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that 
+# should uniquely identify the documentation set bundle. This should be a 
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen 
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files 
+# will be generated that can be used as input for tools like the 
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) 
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can 
+# be used to specify the file name of the resulting .chm file. You 
+# can add a path in front of the file if the result should not be 
+# written to the html output directory.
+
+CHM_FILE               = 
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can 
+# be used to specify the location (absolute path including file name) of 
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run 
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           = 
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag 
+# controls if a separate .chi index file is generated (YES) or that 
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING 
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file 
+# content.
+
+CHM_INDEX_ENCODING     = 
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag 
+# controls whether a binary table of contents is generated (YES) or a 
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members 
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and QHP_VIRTUAL_FOLDER 
+# are set, an additional index file will be generated that can be used as input for 
+# Qt's qhelpgenerator to generate a Qt Compressed Help (.qch) of the generated 
+# HTML documentation.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can 
+# be used to specify the file name of the resulting .qch file. 
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE               = 
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating 
+# Qt Help Project output. For more information please see 
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating 
+# Qt Help Project output. For more information please see 
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to add. 
+# For more information please see 
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME   = 
+
+# The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the custom filter to add.For more information please see 
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS  = 
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this project's 
+# filter section matches. 
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS  = 
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can 
+# be used to specify the location of Qt's qhelpgenerator. 
+# If non-empty doxygen will try to run qhelpgenerator on the generated 
+# .qhp file.
+
+QHG_LOCATION           = 
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files  
+# will be generated, which together with the HTML files, form an Eclipse help  
+# plugin. To install this plugin and make it available under the help contents 
+# menu in Eclipse, the contents of the directory containing the HTML and XML 
+# files needs to be copied into the plugins directory of eclipse. The name of 
+# the directory within the plugins directory should be the same as 
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin 
+# the directory name containing the HTML and XML files should also have 
+# this name.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at 
+# top of each HTML page. The value NO (the default) enables the index and 
+# the value YES disables it.
+
+DISABLE_INDEX          = NO
+
+# This tag can be used to set the number of enum values (range [1..20]) 
+# that doxygen will group on one line in the generated HTML documentation.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index 
+# structure should be generated to display hierarchical information. 
+# If the tag value is set to YES, a side panel will be generated 
+# containing a tree-like index structure (just like the one that 
+# is generated for HTML Help). For this to work a browser that supports 
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). 
+# Windows users are probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW      = NO
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, 
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES       = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be 
+# used to set the initial width (in pixels) of the frame in which the tree 
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+# Use this tag to change the font size of Latex formulas included 
+# as images in the HTML documentation. The default is 10. Note that 
+# when you change the font size after a successful doxygen run you need 
+# to manually remove any form_*.png images from the HTML output directory 
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE       = 10
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript 
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should 
+# typically be disabled. For large projects the javascript based search engine 
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index 
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvances is that it is more difficult to setup 
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will 
+# generate Latex output.
+
+GENERATE_LATEX         = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be 
+# invoked. If left blank `latex' will be used as the default command name. 
+# Note that when enabling USE_PDFLATEX this option is only used for 
+# generating bitmaps for formulas in the HTML output, but not in the 
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to 
+# generate index for LaTeX. If left blank `makeindex' will be used as the 
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact 
+# LaTeX documents. This may be useful for small projects and may help to 
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used 
+# by the printer. Possible values are: a4, a4wide, letter, legal and 
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4wide
+
+# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX 
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         = 
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for 
+# the generated latex document. The header should contain everything until 
+# the first chapter. If it is left blank doxygen will generate a 
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           = 
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated 
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will 
+# contain links (just like the HTML output) instead of page references 
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of 
+# plain latex in the generated Makefile. Set this option to YES to get a 
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. 
+# command to the generated LaTeX files. This will instruct LaTeX to keep 
+# running if errors occur, instead of asking the user for help. 
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not 
+# include the index chapters (such as File Index, Compound Index, etc.) 
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE      = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output 
+# The RTF output is optimized for Word 97 and may not look very pretty with 
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact 
+# RTF documents. This may be useful for small projects and may help to 
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated 
+# will contain hyperlink fields. The RTF file will 
+# contain links (just like the HTML output) instead of page references. 
+# This makes the output suitable for online browsing using WORD or other 
+# programs which support those fields. 
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's 
+# config file, i.e. a series of assignments. You only have to provide 
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    = 
+
+# Set optional variables used in the generation of an rtf document. 
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will 
+# generate man pages
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to 
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output, 
+# then it will generate one additional man file for each entity 
+# documented in the real man page(s). These additional files 
+# only source the real man page, but without them the man command 
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will 
+# generate an XML file that captures the structure of 
+# the code including all documentation.
+
+GENERATE_XML           = YES
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema, 
+# which can be used by a validating XML parser to check the 
+# syntax of the XML files.
+
+XML_SCHEMA             = 
+
+# The XML_DTD tag can be used to specify an XML DTD, 
+# which can be used by a validating XML parser to check the 
+# syntax of the XML files.
+
+XML_DTD                = 
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will 
+# dump the program listings (including syntax highlighting 
+# and cross-referencing information) to the XML output. Note that 
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will 
+# generate an AutoGen Definitions (see autogen.sf.net) file 
+# that captures the structure of the code including all 
+# documentation. Note that this feature is still experimental 
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will 
+# generate a Perl module file that captures the structure of 
+# the code including all documentation. Note that this 
+# feature is still experimental and incomplete at the 
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate 
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able 
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be 
+# nicely formatted so it can be parsed by a human reader.  This is useful 
+# if you want to understand what is going on.  On the other hand, if this 
+# tag is set to NO the size of the Perl module output will be much smaller 
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file 
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. 
+# This is useful so different doxyrules.make files included by the same 
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will 
+# evaluate all C-preprocessor directives found in the sources and include 
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro 
+# names in the source code. If set to NO (the default) only conditional 
+# compilation will be performed. Macro expansion can be done in a controlled 
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES 
+# then the macro expansion is limited to the macros specified with the 
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files 
+# in the INCLUDE_PATH (see below) will be search if a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that 
+# contain include files that are not input files but should be processed by 
+# the preprocessor.
+
+INCLUDE_PATH           = 
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard 
+# patterns (like *.h and *.hpp) to filter out the header-files in the 
+# directories. If left blank, the patterns specified with FILE_PATTERNS will 
+# be used.
+
+INCLUDE_FILE_PATTERNS  = 
+
+# The PREDEFINED tag can be used to specify one or more macro names that 
+# are defined before the preprocessor is started (similar to the -D option of 
+# gcc). The argument of the tag is a list of macros of the form: name 
+# or name=definition (no spaces). If the definition and the = are 
+# omitted =1 is assumed. To prevent a macro definition from being 
+# undefined via #undef or recursively expanded use the := operator 
+# instead of the = operator.
+
+PREDEFINED             = 
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then 
+# this tag can be used to specify a list of macro names that should be expanded. 
+# The macro definition that is found in the sources will be used. 
+# Use the PREDEFINED tag if you want to use a different macro definition.
+
+EXPAND_AS_DEFINED      = 
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then 
+# doxygen's preprocessor will remove all function-like macros that are alone 
+# on a line, have an all uppercase name, and do not end with a semicolon. Such 
+# function macros are typically used for boiler-plate code, and will confuse 
+# the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles. 
+# Optionally an initial location of the external documentation 
+# can be added for each tagfile. The format of a tag file without 
+# this location is as follows: 
+#   TAGFILES = file1 file2 ... 
+# Adding location for the tag files is done as follows: 
+#   TAGFILES = file1=loc1 "file2 = loc2" ... 
+# where "loc1" and "loc2" can be relative or absolute paths or 
+# URLs. If a location is present for each tag, the installdox tool 
+# does not have to be run to correct the links. 
+# Note that each tag file must have a unique name 
+# (where the name does NOT include the path) 
+# If a tag file is not located in the directory in which doxygen 
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES               = 
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create 
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       = 
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed 
+# in the class index. If set to NO only the inherited external classes 
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed 
+# in the modules index. If set to NO, only the current project's groups will 
+# be listed.
+
+EXTERNAL_GROUPS        = NO
+
+# The PERL_PATH should be the absolute path and name of the perl script 
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will 
+# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base 
+# or super classes. Setting the tag to NO turns the diagrams off. Note that 
+# this option is superseded by the HAVE_DOT option below. This is only a 
+# fallback. It is recommended to install and use dot, since it yields more 
+# powerful graphs.
+
+CLASS_DIAGRAMS         = NO
+
+# You can define message sequence charts within doxygen comments using the \msc 
+# command. Doxygen will then run the mscgen tool (see 
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the 
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where 
+# the mscgen tool resides. If left empty the tool is assumed to be found in the 
+# default search path.
+
+MSCGEN_PATH            = 
+
+# If set to YES, the inheritance and collaboration graphs will hide 
+# inheritance and usage relations if the target is undocumented 
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is 
+# available from the path. This tool is part of Graphviz, a graph visualization 
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section 
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = NO
+
+# By default doxygen will write a font called FreeSans.ttf to the output 
+# directory and reference it in all dot files that doxygen generates. This 
+# font does not include all possible unicode characters however, so when you need 
+# these (or just want a differently looking font) you can specify the font name 
+# using DOT_FONTNAME. You need need to make sure dot is able to find the font, 
+# which can be done by putting it in a standard location or by setting the 
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory 
+# containing the font.
+
+DOT_FONTNAME           = FreeSans
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. 
+# The default size is 10pt.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the output directory to look for the 
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a 
+# different font using DOT_FONTNAME you can set the path where dot 
+# can find it using this tag.
+
+DOT_FONTPATH           = 
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for each documented class showing the direct and 
+# indirect inheritance relations. Setting this tag to YES will force the 
+# the CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for each documented class showing the direct and 
+# indirect implementation dependencies (inheritance, containment, and 
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and 
+# collaboration diagrams in a style similar to the OMG's Unified Modeling 
+# Language.
+
+UML_LOOK               = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the 
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT 
+# tags are set to YES then doxygen will generate a graph for each documented 
+# file showing the direct and indirect include dependencies of the file with 
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and 
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each 
+# documented header file showing the documented files that directly or 
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then 
+# doxygen will generate a call dependency graph for every global function 
+# or class method. Note that enabling this option will significantly increase 
+# the time of a run. So in most cases it will be better to enable call graphs 
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then 
+# doxygen will generate a caller dependency graph for every global function 
+# or class method. Note that enabling this option will significantly increase 
+# the time of a run. So in most cases it will be better to enable caller 
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen 
+# will graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES 
+# then doxygen will show the dependencies a directory has on other directories 
+# in a graphical way. The dependency relations are determined by the #include 
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images 
+# generated by dot. Possible values are png, jpg, or gif 
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT       = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be 
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               = 
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that 
+# contain dot files that are included in the documentation (see the 
+# \dotfile command).
+
+DOTFILE_DIRS           = 
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of 
+# nodes that will be shown in the graph. If the number of nodes in a graph 
+# becomes larger than this value, doxygen will truncate the graph, which is 
+# visualized by representing a node as a red box. Note that doxygen if the 
+# number of direct children of the root node in a graph is already larger than 
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note 
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the 
+# graphs generated by dot. A depth value of 3 means that only nodes reachable 
+# from the root by following a path via at most 3 edges will be shown. Nodes 
+# that lay further from the root node will be omitted. Note that setting this 
+# option to 1 or 2 may greatly reduce the computation time needed for large 
+# code bases. Also note that the size of a graph can be further restricted by 
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent 
+# background. This is disabled by default, because dot on Windows does not 
+# seem to support this out of the box. Warning: Depending on the platform used, 
+# enabling this option may lead to badly anti-aliased labels on the edges of 
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output 
+# files in one run (i.e. multiple -o and -T options on the command line). This 
+# makes dot run faster, but since only newer versions of dot (>1.8.10) 
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will 
+# generate a legend page explaining the meaning of the various boxes and 
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will 
+# remove the intermediate dot files that are used to generate 
+# the various graphs.
+
+DOT_CLEANUP            = YES
diff --git a/docs/Makefile b/docs/Makefile

new file mode 100644 (file)

index 0000000..14e0fb1
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,168 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = build
+SOURCEDIR        = source
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) $(SOURCEDIR) 
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) $(SOURCEDIR)
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext fig
+
+help:
+       @echo "Please use \`make <target>' where <target> is one of"
+       @echo "  html       to make standalone HTML files"
+       @echo "  dirhtml    to make HTML files named index.html in directories"
+       @echo "  singlehtml to make a single large HTML file"
+       @echo "  pickle     to make pickle files"
+       @echo "  json       to make JSON files"
+       @echo "  htmlhelp   to make HTML files and a HTML help project"
+       @echo "  qthelp     to make HTML files and a qthelp project"
+       @echo "  devhelp    to make HTML files and a Devhelp project"
+       @echo "  epub       to make an epub"
+       @echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+       @echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+       @echo "  text       to make text files"
+       @echo "  man        to make manual pages"
+       @echo "  texinfo    to make Texinfo files"
+       @echo "  info       to make Texinfo files and run them through makeinfo"
+       @echo "  gettext    to make PO message catalogs"
+       @echo "  changes    to make an overview of all changed/added/deprecated items"
+       @echo "  linkcheck  to check all external links for integrity"
+       @echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+       -rm -rf $(BUILDDIR)/*
+
+html: basefig MANY_CLUSTER.png
+       $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+       @echo
+       @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+       $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+       @echo
+       @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+       $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+       @echo
+       @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+       $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+       @echo
+       @echo "Build finished; now you can process the pickle files."
+
+json:
+       $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+       @echo
+       @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+       $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+       @echo
+       @echo "Build finished; now you can run HTML Help Workshop with the" \
+             ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+       $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+       @echo
+       @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+             ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+       @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pbtoolkits.qhcp"
+       @echo "To view the help file:"
+       @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pbtoolkits.qhc"
+
+devhelp:
+       $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+       @echo
+       @echo "Build finished."
+       @echo "To view the help file:"
+       @echo "# mkdir -p $$HOME/.local/share/devhelp/pbtoolkits"
+       @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pbtoolkits"
+       @echo "# devhelp"
+
+epub:
+       $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+       @echo
+       @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+       $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+       @echo
+       @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+       @echo "Run \`make' in that directory to run these through (pdf)latex" \
+             "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+       $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+       @echo "Running LaTeX files through pdflatex..."
+       $(MAKE) -C $(BUILDDIR)/latex all-pdf
+       @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+       $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+       @echo
+       @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+       $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+       @echo
+       @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+       $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+       @echo
+       @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+       @echo "Run \`make' in that directory to run these through makeinfo" \
+             "(use \`make info' here to do that automatically)."
+
+info:
+       $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+       @echo "Running Texinfo files through makeinfo..."
+       make -C $(BUILDDIR)/texinfo info
+       @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+       $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+       @echo
+       @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+       $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+       @echo
+       @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+       $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+       @echo
+       @echo "Link check complete; look for any errors in the above output " \
+             "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+       $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+       @echo "Testing of doctests in the sources finished, look at the " \
+             "results in $(BUILDDIR)/doctest/output.txt."
+
+basefig:
+       dot -Tpng $(SOURCEDIR)/dependencies.dot > $(SOURCEDIR)/$@
+       grep -v "\"pbsmrtpipe\" ->" $(SOURCEDIR)/dependencies.dot  \
+               | grep -v "> \"pbcore\"" \
+               | sed 's/All/Sparse/' > $(SOURCEDIR)/sparse_dependencies.dot  
+       dot -Tpng $(SOURCEDIR)/sparse_dependencies.dot \
+               > $(SOURCEDIR)/sparse_dependencies.png
+
+%.png: basefig
+       grep -v $* $(SOURCEDIR)/sparse_dependencies.dot | \
+       grep -v \? | sed 's/Sparse dependencies/Module bundles/' | \
+       dot -Tpng > $(SOURCEDIR)/$@
+
diff --git a/docs/examples/code/BarcodeQuery.txt b/docs/examples/code/BarcodeQuery.txt

new file mode 100644 (file)

index 0000000..3fe8fce
--- /dev/null
+++ b/docs/examples/code/BarcodeQuery.txt
@@ -0,0 +1,17 @@
+// using C++11 range-based for loop
+BarcodeQuery query(42, dataset);
+for (const BamRecord& r : query) {
+    assert(r.HasBarcodes());
+    assert(r.BarcodeForward() == 42 || r.barcodeReverse() == 42);
+}
+
+// OR
+
+// using iterators directly
+BarcodeQuery query(42, dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    assert(iter->HasBarcodes());
+    assert(iter->BarcodeForward() == 42 || iter->barcodeReverse() == 42);
+} 
diff --git a/docs/examples/code/Compare.txt b/docs/examples/code/Compare.txt

new file mode 100644 (file)

index 0000000..deecd8d
--- /dev/null
+++ b/docs/examples/code/Compare.txt
@@ -0,0 +1,3 @@
+// sort on increasing ZMW hole number
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::Zmw());
diff --git a/docs/examples/code/Compare_AlignedEnd.txt b/docs/examples/code/Compare_AlignedEnd.txt

new file mode 100644 (file)

index 0000000..d34ed67
--- /dev/null
+++ b/docs/examples/code/Compare_AlignedEnd.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::AlignedEnd());
diff --git a/docs/examples/code/Compare_AlignedStart.txt b/docs/examples/code/Compare_AlignedStart.txt

new file mode 100644 (file)

index 0000000..68de3e2
--- /dev/null
+++ b/docs/examples/code/Compare_AlignedStart.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::AlignedStart());
diff --git a/docs/examples/code/Compare_AlignedStrand.txt b/docs/examples/code/Compare_AlignedStrand.txt

new file mode 100644 (file)

index 0000000..6c22cdc
--- /dev/null
+++ b/docs/examples/code/Compare_AlignedStrand.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::AlignedStrand());
diff --git a/docs/examples/code/Compare_BarcodeForward.txt b/docs/examples/code/Compare_BarcodeForward.txt

new file mode 100644 (file)

index 0000000..1967341
--- /dev/null
+++ b/docs/examples/code/Compare_BarcodeForward.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::BarcodeForward());
diff --git a/docs/examples/code/Compare_BarcodeQuality.txt b/docs/examples/code/Compare_BarcodeQuality.txt

new file mode 100644 (file)

index 0000000..144f483
--- /dev/null
+++ b/docs/examples/code/Compare_BarcodeQuality.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::BarcodeQuality());
diff --git a/docs/examples/code/Compare_BarcodeReverse.txt b/docs/examples/code/Compare_BarcodeReverse.txt

new file mode 100644 (file)

index 0000000..9d3b245
--- /dev/null
+++ b/docs/examples/code/Compare_BarcodeReverse.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::BarcodeReverse());
diff --git a/docs/examples/code/Compare_FullName.txt b/docs/examples/code/Compare_FullName.txt

new file mode 100644 (file)

index 0000000..4b392b9
--- /dev/null
+++ b/docs/examples/code/Compare_FullName.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::FullName());
diff --git a/docs/examples/code/Compare_LocalContextFlag.txt b/docs/examples/code/Compare_LocalContextFlag.txt

new file mode 100644 (file)

index 0000000..aeab944
--- /dev/null
+++ b/docs/examples/code/Compare_LocalContextFlag.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::LocalContextFlag());
diff --git a/docs/examples/code/Compare_MapQuality.txt b/docs/examples/code/Compare_MapQuality.txt

new file mode 100644 (file)

index 0000000..fe22821
--- /dev/null
+++ b/docs/examples/code/Compare_MapQuality.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::MapQuality());
diff --git a/docs/examples/code/Compare_MovieName.txt b/docs/examples/code/Compare_MovieName.txt

new file mode 100644 (file)

index 0000000..cddcb64
--- /dev/null
+++ b/docs/examples/code/Compare_MovieName.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::MovieName());
diff --git a/docs/examples/code/Compare_NumDeletedBases.txt b/docs/examples/code/Compare_NumDeletedBases.txt

new file mode 100644 (file)

index 0000000..aa6dd4b
--- /dev/null
+++ b/docs/examples/code/Compare_NumDeletedBases.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::NumDeletedBases());
diff --git a/docs/examples/code/Compare_NumInsertedBases.txt b/docs/examples/code/Compare_NumInsertedBases.txt

new file mode 100644 (file)

index 0000000..917d87f
--- /dev/null
+++ b/docs/examples/code/Compare_NumInsertedBases.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::NumInsertedBases());
diff --git a/docs/examples/code/Compare_NumMatches.txt b/docs/examples/code/Compare_NumMatches.txt

new file mode 100644 (file)

index 0000000..47e3081
--- /dev/null
+++ b/docs/examples/code/Compare_NumMatches.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::NumMatches());
diff --git a/docs/examples/code/Compare_NumMismatches.txt b/docs/examples/code/Compare_NumMismatches.txt

new file mode 100644 (file)

index 0000000..12affb1
--- /dev/null
+++ b/docs/examples/code/Compare_NumMismatches.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::NumMismatches());
diff --git a/docs/examples/code/Compare_QueryEnd.txt b/docs/examples/code/Compare_QueryEnd.txt

new file mode 100644 (file)

index 0000000..d664d28
--- /dev/null
+++ b/docs/examples/code/Compare_QueryEnd.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::QueryEnd());
diff --git a/docs/examples/code/Compare_QueryStart.txt b/docs/examples/code/Compare_QueryStart.txt

new file mode 100644 (file)

index 0000000..12f6244
--- /dev/null
+++ b/docs/examples/code/Compare_QueryStart.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::QueryStart());
diff --git a/docs/examples/code/Compare_ReadAccuracy.txt b/docs/examples/code/Compare_ReadAccuracy.txt

new file mode 100644 (file)

index 0000000..9454309
--- /dev/null
+++ b/docs/examples/code/Compare_ReadAccuracy.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReadAccuracy());
diff --git a/docs/examples/code/Compare_ReadGroupId.txt b/docs/examples/code/Compare_ReadGroupId.txt

new file mode 100644 (file)

index 0000000..dab3497
--- /dev/null
+++ b/docs/examples/code/Compare_ReadGroupId.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReadGroupId());
diff --git a/docs/examples/code/Compare_ReadGroupNumericId.txt b/docs/examples/code/Compare_ReadGroupNumericId.txt

new file mode 100644 (file)

index 0000000..5ad8f9d
--- /dev/null
+++ b/docs/examples/code/Compare_ReadGroupNumericId.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReadGroupNumericId());
diff --git a/docs/examples/code/Compare_ReferenceEnd.txt b/docs/examples/code/Compare_ReferenceEnd.txt

new file mode 100644 (file)

index 0000000..ed42d05
--- /dev/null
+++ b/docs/examples/code/Compare_ReferenceEnd.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReferenceEnd());
diff --git a/docs/examples/code/Compare_ReferenceId.txt b/docs/examples/code/Compare_ReferenceId.txt

new file mode 100644 (file)

index 0000000..5628427
--- /dev/null
+++ b/docs/examples/code/Compare_ReferenceId.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReferenceId());
diff --git a/docs/examples/code/Compare_ReferenceName.txt b/docs/examples/code/Compare_ReferenceName.txt

new file mode 100644 (file)

index 0000000..1f76e7e
--- /dev/null
+++ b/docs/examples/code/Compare_ReferenceName.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReferenceName());
diff --git a/docs/examples/code/Compare_ReferenceStart.txt b/docs/examples/code/Compare_ReferenceStart.txt

new file mode 100644 (file)

index 0000000..0ccaf36
--- /dev/null
+++ b/docs/examples/code/Compare_ReferenceStart.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReferenceStart());
diff --git a/docs/examples/code/Compare_TypeFromOperator.txt b/docs/examples/code/Compare_TypeFromOperator.txt

new file mode 100644 (file)

index 0000000..afb0848
--- /dev/null
+++ b/docs/examples/code/Compare_TypeFromOperator.txt
@@ -0,0 +1,2 @@
+Compare::Type type = Compare::TypeFromOperator("!=");
+assert(type == Compare::NOT_EQUAL);
diff --git a/docs/examples/code/Compare_TypeToName.txt b/docs/examples/code/Compare_TypeToName.txt

new file mode 100644 (file)

index 0000000..c44e1cb
--- /dev/null
+++ b/docs/examples/code/Compare_TypeToName.txt
@@ -0,0 +1,2 @@
+string name = Compare::TypeToName(Compare::LESS_THAN);
+assert(name = "Compare::LESS_THAN");
diff --git a/docs/examples/code/Compare_Zmw.txt b/docs/examples/code/Compare_Zmw.txt

new file mode 100644 (file)

index 0000000..b02c426
--- /dev/null
+++ b/docs/examples/code/Compare_Zmw.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::Zmw());
diff --git a/docs/examples/code/EntireFileQuery.txt b/docs/examples/code/EntireFileQuery.txt

new file mode 100644 (file)

index 0000000..d3fcc2c
--- /dev/null
+++ b/docs/examples/code/EntireFileQuery.txt
@@ -0,0 +1,15 @@
+// using C++11 range-based for loop
+EntireFileQuery query(dataset);
+for (const BamRecord& record : query) {
+    // ... do stuff ...
+}
+
+// OR
+
+// using iterators
+EntireFileQuery query(dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    // ... do stuff ...
+}  
diff --git a/docs/examples/code/EntireFileQuery_BamFilename.txt b/docs/examples/code/EntireFileQuery_BamFilename.txt

new file mode 100644 (file)

index 0000000..484db61
--- /dev/null
+++ b/docs/examples/code/EntireFileQuery_BamFilename.txt
@@ -0,0 +1,4 @@
+EntireFileQuery query("foo.bam");
+for (const BamRecord& record : query) {
+    // do stuff
+}
diff --git a/docs/examples/code/EntireFileQuery_NonConst.txt b/docs/examples/code/EntireFileQuery_NonConst.txt

new file mode 100644 (file)

index 0000000..a0a092e
--- /dev/null
+++ b/docs/examples/code/EntireFileQuery_NonConst.txt
@@ -0,0 +1,4 @@
+EntireFileQuery query("foo.bam");
+for (BamRecord& record : query) {
+    // ok to modify 'record' here
+} 
diff --git a/docs/examples/code/GenomicIntervalQuery.txt b/docs/examples/code/GenomicIntervalQuery.txt

new file mode 100644 (file)

index 0000000..651f254
--- /dev/null
+++ b/docs/examples/code/GenomicIntervalQuery.txt
@@ -0,0 +1,16 @@
+// using C++11 range-based for loop
+GenomicIntervalQuery query(GenomicInterval("chr1:1000-2000"), dataset);
+for (const BamRecord& record : query) {
+    // ... do stuff ...
+}
+
+// OR
+
+// using iterators directly
+GenomicIntervalQuery query(GenomicInterval("chr1:1000-2000"), dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    // ... do stuff ...
+}
+
diff --git a/docs/examples/code/GenomicIntervalQuery_Reuse.txt b/docs/examples/code/GenomicIntervalQuery_Reuse.txt

new file mode 100644 (file)

index 0000000..339ae95
--- /dev/null
+++ b/docs/examples/code/GenomicIntervalQuery_Reuse.txt
@@ -0,0 +1,8 @@
+DataSet ds("data.xml");
+GenomicIntervalQuery query(GenomicInterval(), ds);
+for (const GenomicInterval& interval : intervals) {
+    query.Interval(interval);
+    for (const BamRecord& record : query) {}
+        // do stuff
+    }
+}
+\ No newline at end of file
diff --git a/docs/examples/code/PbiAlignedEndFilter.txt b/docs/examples/code/PbiAlignedEndFilter.txt

new file mode 100644 (file)

index 0000000..bac1a46
--- /dev/null
+++ b/docs/examples/code/PbiAlignedEndFilter.txt
@@ -0,0 +1,4 @@
+PbiFilterQuery query(PbiAlignedEndFilter{3000, Compare::GREATER_THAN});
+for (const BamRecord& record : query) {
+    assert(record.AlignedEnd() > 3000);
+}
diff --git a/docs/examples/code/PbiAlignedLengthFilter.txt b/docs/examples/code/PbiAlignedLengthFilter.txt

new file mode 100644 (file)

index 0000000..38dc3ff
--- /dev/null
+++ b/docs/examples/code/PbiAlignedLengthFilter.txt
@@ -0,0 +1,4 @@
+PbiFilterQuery query(PbiAlignedLengthFilter{1000, Compare::GREATER_THAN});
+for (const BamRecord& record : query) {
+    assert((record.AlignedEnd() - record.AlignedStart()) > 1000);
+}
diff --git a/docs/examples/code/PbiAlignedStartFilter.txt b/docs/examples/code/PbiAlignedStartFilter.txt

new file mode 100644 (file)

index 0000000..b78bb2c
--- /dev/null
+++ b/docs/examples/code/PbiAlignedStartFilter.txt
@@ -0,0 +1,4 @@
+PbiFilterQuery query(PbiAlignedStartFilter{3000, Compare::GREATER_THAN});
+for (const BamRecord& record : query) {
+    assert(record.AlignedStart() > 3000);
+}
diff --git a/docs/examples/code/PbiAlignedStrandFilter.txt b/docs/examples/code/PbiAlignedStrandFilter.txt

new file mode 100644 (file)

index 0000000..9f9a885
--- /dev/null
+++ b/docs/examples/code/PbiAlignedStrandFilter.txt
@@ -0,0 +1,5 @@
+PbiFilterQuery query(PbiAlignedStrandFilter{Strand::FORWARD});
+for (const BamRecord& record : query) {
+    assert(record.AlignedStrand() == Strand::FORWARD);
+}
+
diff --git a/docs/examples/code/PbiBarcodeFilter.txt b/docs/examples/code/PbiBarcodeFilter.txt

new file mode 100644 (file)

index 0000000..c7ce5cb
--- /dev/null
+++ b/docs/examples/code/PbiBarcodeFilter.txt
@@ -0,0 +1,17 @@
+// single value
+PbiFilter filter{ PbiBarcodeFilter{17} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    const auto barcodes = record.Barcodes();
+    assert(barcodes.first == 17 || barcodes.second == 17);
+}
+
+// whitelist
+vector<int16_t> whitelist = { 50, 100 };
+PbiFilter filter{ PbiBarcodeFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    const auto barcodes = record.Barcodes();
+    assert(barcodes.first == 50  || barcodes.second == 50 ||
+           barcodes.first == 100 || barcodes.second == 100);
+}
diff --git a/docs/examples/code/PbiBarcodeForwardFilter.txt b/docs/examples/code/PbiBarcodeForwardFilter.txt

new file mode 100644 (file)

index 0000000..a6c12fd
--- /dev/null
+++ b/docs/examples/code/PbiBarcodeForwardFilter.txt
@@ -0,0 +1,15 @@
+// single value
+PbiFilter filter{ PbiBarcodeForwardFilter{50} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeForward() == 50);
+}
+
+// whitelist
+vector<int16_t> whitelist = { 50, 100 };
+PbiFilter filter{ PbiBarcodeForwardFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeForward() == 50 || record.BarcodeForward() == 100);
+}
+
diff --git a/docs/examples/code/PbiBarcodeQualityFilter.txt b/docs/examples/code/PbiBarcodeQualityFilter.txt

new file mode 100644 (file)

index 0000000..34311d0
--- /dev/null
+++ b/docs/examples/code/PbiBarcodeQualityFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiBarcodeQualityFilter{42, Compare::GREATER_THAN_EQUAL} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeQuality() >= 42);
+}
diff --git a/docs/examples/code/PbiBarcodeReverseFilter.txt b/docs/examples/code/PbiBarcodeReverseFilter.txt

new file mode 100644 (file)

index 0000000..24134f8
--- /dev/null
+++ b/docs/examples/code/PbiBarcodeReverseFilter.txt
@@ -0,0 +1,15 @@
+// single value
+PbiFilter filter{ PbiBarcodeReverseFilter{50} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeReverse() == 50);
+}
+
+// whitelist
+vector<int16_t> whitelist = { 50, 100 };
+PbiFilter filter{ PbiBarcodeReverseFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeReverse() == 50 || record.BarcodeReverse() == 100);
+}
+
diff --git a/docs/examples/code/PbiBarcodesFilter.txt b/docs/examples/code/PbiBarcodesFilter.txt

new file mode 100644 (file)

index 0000000..a655c57
--- /dev/null
+++ b/docs/examples/code/PbiBarcodesFilter.txt
@@ -0,0 +1,6 @@
+PbiFilter filter{ PbiBarcodesFilter{17, 18} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeForward() == 17 && 
+           record.BarcodeReverse() == 18);
+}
diff --git a/docs/examples/code/PbiBuilder_WithReader.txt b/docs/examples/code/PbiBuilder_WithReader.txt

new file mode 100644 (file)

index 0000000..e2748c2
--- /dev/null
+++ b/docs/examples/code/PbiBuilder_WithReader.txt
@@ -0,0 +1,30 @@
+// To simply create a PBI file from BAM, the following is the easiest method:
+//
+#include <pbbam/BamFile.h>
+#include <pbbam/PbiFile.h>
+
+BamFile bamFile("data.bam");
+PbiFile::CreateFrom(bamFile);
+
+
+// However if you need to perform additional operations while reading the BAM file, 
+// you can do something like the following:
+//
+{
+    BamFile bamFile("data.bam");
+    PbiBuilder builder(bamFile.PacBioIndexFilename(), 
+                       bamFile.Header().Sequences().size());
+    BamReader reader(bamFile);
+    BamRecord b;
+    int64_t offset = reader.VirtualTell(); // first record's vOffset
+    while (reader.GetNext(b)) {
+
+        // store PBI recrod entry & get next record's vOffset
+        builder.AddRecord(b, offset);
+        offset = reader.VirtualTell();
+   
+        // ... additional stuff as needed ...
+    }
+
+} // <-- PBI data will only be written here, as PbiBuilder goes out of scope
+
diff --git a/docs/examples/code/PbiBuilder_WithWriter.txt b/docs/examples/code/PbiBuilder_WithWriter.txt

new file mode 100644 (file)

index 0000000..0c7d6d1
--- /dev/null
+++ b/docs/examples/code/PbiBuilder_WithWriter.txt
@@ -0,0 +1,12 @@
+BamWriter writer(...);
+PbiBuilder pbiBuilder(...);
+int64_t vOffset;
+BamRecord record;
+while (...) {
+
+    // ... populate record data ...
+
+    // write record to BAM and add PBI entry
+    writer.Write(record, &vOffset);
+    pbiBuilder.AddRecord(record, vOffset);
+}
diff --git a/docs/examples/code/PbiFilterQuery.txt b/docs/examples/code/PbiFilterQuery.txt

new file mode 100644 (file)

index 0000000..4914eab
--- /dev/null
+++ b/docs/examples/code/PbiFilterQuery.txt
@@ -0,0 +1,22 @@
+// setup filter
+PbiFilter filter;
+filter.Add(PbiZmwFilter(42));
+filter.Add(PbiReadAccuracyFilter(0.9, Compare::GREATER_THAN_EQUAL));
+
+// using C++11 range-based for loop
+PbiFilterQuery query(filter, dataset);
+for (const BamRecord& r : query) {
+    assert(r.HoleNumber() == 42);
+    assert(r.ReadAccuracy() >= 0.9);
+}
+
+// OR
+
+// using iterators directly
+PbiFilterQuery query(filter, dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    assert(iter->HoleNumber() == 42);
+    assert(iter->ReadAccuracy() >= 0.9);
+} 
diff --git a/docs/examples/code/PbiFilter_Composition.txt b/docs/examples/code/PbiFilter_Composition.txt

new file mode 100644 (file)

index 0000000..22cc6ff
--- /dev/null
+++ b/docs/examples/code/PbiFilter_Composition.txt
@@ -0,0 +1,8 @@
+// (f1 && f2) || f3
+
+PbiFilter f1;
+PbiFilter f2;
+PbiFilter intersect_f1_f2 = PbiFilter::Intersection(f1, f2);
+
+PbiFilter f3;
+PbiFilter final = PbiFilter::Union(intersect_f1_f2, f3);
diff --git a/docs/examples/code/PbiFilter_CustomFilter.txt b/docs/examples/code/PbiFilter_CustomFilter.txt

new file mode 100644 (file)

index 0000000..f9cdd21
--- /dev/null
+++ b/docs/examples/code/PbiFilter_CustomFilter.txt
@@ -0,0 +1,21 @@
+struct MyCustomFilter
+{
+    bool Accepts(const PbiRawData& index, const size_t row) const
+    {
+        // Look up data for record at the provided row. Do any calculations
+        // necessary, then return whether that record passes your 
+        // filter criteria. 
+        
+        return true;
+    }
+};
+
+// use in composite filters
+PbiFilter f;
+f.Add(PbiMovieNameFilter("foo"));
+f.Add(MyCustomFilter());
+
+// pass directly to PbiFilterQuery
+PbiFilterQuery query(MyCustomFilter(), "foo.bam");
+for (const BamRecord& record : query)
+    // ... do stuff ...
diff --git a/docs/examples/code/PbiFilter_Interface.txt b/docs/examples/code/PbiFilter_Interface.txt

new file mode 100644 (file)

index 0000000..0fea900
--- /dev/null
+++ b/docs/examples/code/PbiFilter_Interface.txt
@@ -0,0 +1 @@
+bool Accepts(const PbiRawData& index, const size_t row) const;
diff --git a/docs/examples/code/PbiFilter_Intersection_Copy.txt b/docs/examples/code/PbiFilter_Intersection_Copy.txt

new file mode 100644 (file)

index 0000000..ec0a7ac
--- /dev/null
+++ b/docs/examples/code/PbiFilter_Intersection_Copy.txt
@@ -0,0 +1,3 @@
+PbiFilter result{ PbiFilter::INTERSECT };
+result.Add(filters);
+return result;
diff --git a/docs/examples/code/PbiFilter_Intersection_Move.txt b/docs/examples/code/PbiFilter_Intersection_Move.txt

new file mode 100644 (file)

index 0000000..2b06106
--- /dev/null
+++ b/docs/examples/code/PbiFilter_Intersection_Move.txt
@@ -0,0 +1,3 @@
+PbiFilter result{ PbiFilter::INTERSECT };
+result.Add(std::move(filters));
+return result;
diff --git a/docs/examples/code/PbiFilter_Union_Copy.txt b/docs/examples/code/PbiFilter_Union_Copy.txt

new file mode 100644 (file)

index 0000000..7e2a192
--- /dev/null
+++ b/docs/examples/code/PbiFilter_Union_Copy.txt
@@ -0,0 +1,3 @@
+PbiFilter result{ PbiFilter::UNION };
+result.Add(filters);
+return result;
diff --git a/docs/examples/code/PbiFilter_Union_Move.txt b/docs/examples/code/PbiFilter_Union_Move.txt

new file mode 100644 (file)

index 0000000..2e98d91
--- /dev/null
+++ b/docs/examples/code/PbiFilter_Union_Move.txt
@@ -0,0 +1,3 @@
+PbiFilter result{ PbiFilter::UNION };
+result.Add(std::move(filters));
+return result;
diff --git a/docs/examples/code/PbiIdentityFilter.txt b/docs/examples/code/PbiIdentityFilter.txt

new file mode 100644 (file)

index 0000000..6fcb8d0
--- /dev/null
+++ b/docs/examples/code/PbiIdentityFilter.txt
@@ -0,0 +1,6 @@
+// single value
+PbiFilter filter{ PbiIdentityFilter{ 0.5, Compare::GREATER_THAN_EQUAL } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    // ... at least 50% of record was aligned ...
+}
diff --git a/docs/examples/code/PbiLocalContextFilter.txt b/docs/examples/code/PbiLocalContextFilter.txt

new file mode 100644 (file)

index 0000000..0aaa3eb
--- /dev/null
+++ b/docs/examples/code/PbiLocalContextFilter.txt
@@ -0,0 +1,22 @@
+
+// --------------------
+// has adapter_before
+// --------------------
+
+PbiFilter filter{ PbiLocalContextFilter{LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    const bool hasAdapterBefore = (record.LocalContextFlags() & LocalContextFlags::ADAPTER_BEFORE) != 0;
+    assert(hasAdapterBefore);
+}
+
+// ----------------------------------
+// has any adapters, barcodes, etc.
+// ----------------------------------
+
+PbiFilter filter{ PbiLocalContextFilter{LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    const bool hasContext = (record.LocalContextFlags() != LocalContextFlags::NO_LOCAL_CONTEXT);
+    assert(hasContext);
+}
diff --git a/docs/examples/code/PbiMapQualityFilter.txt b/docs/examples/code/PbiMapQualityFilter.txt

new file mode 100644 (file)

index 0000000..67fb5dc
--- /dev/null
+++ b/docs/examples/code/PbiMapQualityFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiMapQualityFilter{75, Compare::GREATER_THAN_EQUAL} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.MapQuality() >= 75);
+} 
diff --git a/docs/examples/code/PbiMovieNameFilter.txt b/docs/examples/code/PbiMovieNameFilter.txt

new file mode 100644 (file)

index 0000000..dd124e2
--- /dev/null
+++ b/docs/examples/code/PbiMovieNameFilter.txt
@@ -0,0 +1,14 @@
+// single value
+PbiFilter filter{ PbiMovieFilter{ "foo" } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.MovieName() == "foo");
+}
+
+// whitelist
+vector<string> whitelist = { "foo", "bar" };
+PbiFilter filter{ PbiMovieNameFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.MovieName() == "foo" || record.MovieName() == "bar");
+}
diff --git a/docs/examples/code/PbiNumDeletedBasesFilter.txt b/docs/examples/code/PbiNumDeletedBasesFilter.txt

new file mode 100644 (file)

index 0000000..e1e3d1f
--- /dev/null
+++ b/docs/examples/code/PbiNumDeletedBasesFilter.txt
@@ -0,0 +1,6 @@
+PbiFilter filter{ PbiNumDeletedBasesFilter{50, Compare::LESS_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.NumDeletedBases() < 50);
+}
+
diff --git a/docs/examples/code/PbiNumInsertedBasesFilter.txt b/docs/examples/code/PbiNumInsertedBasesFilter.txt

new file mode 100644 (file)

index 0000000..ab385e4
--- /dev/null
+++ b/docs/examples/code/PbiNumInsertedBasesFilter.txt
@@ -0,0 +1,6 @@
+PbiFilter filter{ PbiNumInsertedBasesFilter{50, Compare::LESS_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.NumInsertedBases() < 50);
+}
+
diff --git a/docs/examples/code/PbiNumMatchesFilter.txt b/docs/examples/code/PbiNumMatchesFilter.txt

new file mode 100644 (file)

index 0000000..4e1b97d
--- /dev/null
+++ b/docs/examples/code/PbiNumMatchesFilter.txt
@@ -0,0 +1,6 @@
+PbiFilter filter{ PbiNumMatchesFilter{2000, Compare::GREATER_THAN_EQUAL} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.NumMatches() >= 2000);
+}
+
diff --git a/docs/examples/code/PbiNumMismatchesFilter.txt b/docs/examples/code/PbiNumMismatchesFilter.txt

new file mode 100644 (file)

index 0000000..690e4a1
--- /dev/null
+++ b/docs/examples/code/PbiNumMismatchesFilter.txt
@@ -0,0 +1,6 @@
+PbiFilter filter{ PbiNumMismatchesFilter{500, Compare::LESS_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.NumMismatches() < 500);
+}
+
diff --git a/docs/examples/code/PbiQueryEndFilter.txt b/docs/examples/code/PbiQueryEndFilter.txt

new file mode 100644 (file)

index 0000000..f85166b
--- /dev/null
+++ b/docs/examples/code/PbiQueryEndFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiQueryEndFilter{3000, Compare::GREATER_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.QueryEnd() > 3000);
+} 
diff --git a/docs/examples/code/PbiQueryLengthFilter.txt b/docs/examples/code/PbiQueryLengthFilter.txt

new file mode 100644 (file)

index 0000000..123412a
--- /dev/null
+++ b/docs/examples/code/PbiQueryLengthFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiQueryLengthFilter{2000, Compare::GREATER_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert( (record.QueryEnd() - record.QueryStart()) > 2000 );
+}
diff --git a/docs/examples/code/PbiQueryNameFilter.txt b/docs/examples/code/PbiQueryNameFilter.txt

new file mode 100644 (file)

index 0000000..f1e51c7
--- /dev/null
+++ b/docs/examples/code/PbiQueryNameFilter.txt
@@ -0,0 +1,15 @@
+// single value
+PbiFilter filter{ PbiQueryNameFilter{ "movie_1/42/100_200" } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.FullName() == "movie_1/42/100_200");
+}
+
+// whitelist
+vector<string> whitelist = { "movie_1/42/100_200", "movie_3/24/300_500" };
+PbiFilter filter{ PbiQueryNameFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.FullName() == "movie_1/42/100_200" || 
+           record.FullName() == "movie_3/24/300_500");
+}
diff --git a/docs/examples/code/PbiQueryStartFilter.txt b/docs/examples/code/PbiQueryStartFilter.txt

new file mode 100644 (file)

index 0000000..56353df
--- /dev/null
+++ b/docs/examples/code/PbiQueryStartFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiQueryStartFilter{3000, Compare::GREATER_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.QueryStart() > 3000);
+} 
diff --git a/docs/examples/code/PbiReadAccuracyFilter.txt b/docs/examples/code/PbiReadAccuracyFilter.txt

new file mode 100644 (file)

index 0000000..dd2df32
--- /dev/null
+++ b/docs/examples/code/PbiReadAccuracyFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiReadAccuracyFilter{0.8, Compare::GREATER_THAN_EQUAL} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadAccuracy() >= 0.8);
+}
diff --git a/docs/examples/code/PbiReadGroupFilter.txt b/docs/examples/code/PbiReadGroupFilter.txt

new file mode 100644 (file)

index 0000000..9af096d
--- /dev/null
+++ b/docs/examples/code/PbiReadGroupFilter.txt
@@ -0,0 +1,64 @@
+// -------------------------
+// numeric ID
+// -------------------------
+
+// single value
+PbiFilter filter{ PbiReadGroupFilter{ 2458765 } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroupNumericId() == 2458765);
+}
+
+// whitelist
+vector<int32_t> whitelist = { 2458765, -32143 };
+PbiFilter filter{ PbiReadGroupFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroupNumericId() == 2458765 ||
+           record.ReadGroupNumericId() == -32143);
+}
+
+// -------------------------
+// printable ID
+// -------------------------
+
+// single value 
+PbiFilter filter{ PbiReadGroupFilter{ "12B33F00" } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroupId() == "12B33F00");
+}
+
+// whitelist
+vector<string> whitelist = { "12B33F00", "123ABC77" };
+PbiFilter filter{ PbiReadGroupFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroupId() == "12B33F00" ||
+           record.ReadGroupId() == "123ABC77");
+}
+
+
+// -------------------------
+// read group 
+// -------------------------
+
+BamFile file("foo.bam");
+BamHeader header = file.Header();
+assert(header.ReadGroups().size() > 1);
+
+// single value 
+PbiFilter filter{ PbiReadGroupFilter{ header.ReadGroups()[0] } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroup() == header.ReadGroups()[0]);
+}
+
+// whitelist
+vector<ReadGroupInfo> whitelist = { header.ReadGroups()[0], header.ReadGroups()[1] };
+PbiFilter filter{ PbiReadGroupFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroup() == header.ReadGroups()[0] ||
+           record.ReadGroup() == header.ReadGroups()[1]);
+}
diff --git a/docs/examples/code/PbiReferenceEndFilter.txt b/docs/examples/code/PbiReferenceEndFilter.txt

new file mode 100644 (file)

index 0000000..ce005c6
--- /dev/null
+++ b/docs/examples/code/PbiReferenceEndFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiReferenceEndFilter{ 2000 } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceEnd() == 2000);
+}
diff --git a/docs/examples/code/PbiReferenceIdFilter.txt b/docs/examples/code/PbiReferenceIdFilter.txt

new file mode 100644 (file)

index 0000000..d963d28
--- /dev/null
+++ b/docs/examples/code/PbiReferenceIdFilter.txt
@@ -0,0 +1,16 @@
+// single value
+PbiFilter filter{ PbiReferenceIdFilter{ 4 } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceId() == 4);
+}
+
+// whitelist
+vector<int32_t> whitelist = { 0, 1 };
+PbiFilter filter{ PbiReferenceIdFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceId() == 0 || 
+           record.ReferenceId() == 1);
+}
+
diff --git a/docs/examples/code/PbiReferenceNameFilter.txt b/docs/examples/code/PbiReferenceNameFilter.txt

new file mode 100644 (file)

index 0000000..c86b14a
--- /dev/null
+++ b/docs/examples/code/PbiReferenceNameFilter.txt
@@ -0,0 +1,15 @@
+// single value
+PbiFilter filter{ PbiReferenceNameFilter{ "chr1" } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceName() == "chr1");
+}
+
+// whitelist
+vector<string> whitelist = { "chr1", "chr5" };
+PbiFilter filter{ PbiReferenceNameFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceName() == "chr1" ||
+           record.ReferenceName() == "chr5");
+}
diff --git a/docs/examples/code/PbiReferenceStartFilter.txt b/docs/examples/code/PbiReferenceStartFilter.txt

new file mode 100644 (file)

index 0000000..d3ffdbb
--- /dev/null
+++ b/docs/examples/code/PbiReferenceStartFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiReferenceStartFilter{ 2000 } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceStart() == 2000);
+}
diff --git a/docs/examples/code/PbiZmwFilter.txt b/docs/examples/code/PbiZmwFilter.txt

new file mode 100644 (file)

index 0000000..c63a804
--- /dev/null
+++ b/docs/examples/code/PbiZmwFilter.txt
@@ -0,0 +1,16 @@
+// single value
+PbiFilter filter{ PbiZmwFilter{ 4000 } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.HoleNumber() == 4000);
+}
+
+// whitelist
+vector<int32_t> whitelist = { 4000, 8000 };
+PbiFilter filter{ PbiZmwFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.HoleNumber() == 4000 || 
+           record.HoleNumber() == 8000);
+}
+
diff --git a/docs/examples/code/ReadAccuracyQuery.txt b/docs/examples/code/ReadAccuracyQuery.txt

new file mode 100644 (file)

index 0000000..5b0404f
--- /dev/null
+++ b/docs/examples/code/ReadAccuracyQuery.txt
@@ -0,0 +1,15 @@
+// using C++11 range-based for loop
+ReadAccuracyQuery query(0.9, Compare::GREATER_THAN_EQUAL, dataset);
+for (const BamRecord& r : query) {
+    assert(r.ReadAccuracy() >= 0.9);
+}
+
+// OR
+
+// using iterators directly
+ReadAccuracyQuery query(0.9, Compare::GREATER_THAN_EQUAL, dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    assert(iter->ReadAccuracy() >= 0.9);
+} 
diff --git a/docs/examples/code/SubreadLengthQuery.txt b/docs/examples/code/SubreadLengthQuery.txt

new file mode 100644 (file)

index 0000000..466a1d9
--- /dev/null
+++ b/docs/examples/code/SubreadLengthQuery.txt
@@ -0,0 +1,15 @@
+// using C++11 range-based for loop
+SubreadLengthQuery query(500, Compare::GREATER_THAN_EQUAL, dataset);
+for (const BamRecord& r : query) {
+    assert((r.QueryEnd() - r.QueryStart()) >= 500);  
+}
+
+// OR
+
+// using iterators directly
+SubreadLengthQuery query(500, Compare::GREATER_THAN_EQUAL, dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    assert((iter->QueryEnd() - iter->QueryStart()) >= 500);
+} 
diff --git a/docs/examples/code/Tag_AsciiCtor.txt b/docs/examples/code/Tag_AsciiCtor.txt

new file mode 100644 (file)

index 0000000..057d22f
--- /dev/null
+++ b/docs/examples/code/Tag_AsciiCtor.txt
@@ -0,0 +1,10 @@
+// One-step construction
+// 
+// This is useful in situations that require a const Tag.
+//
+const auto t = Tag('A', TagModifier::ASCII_CHAR);
+
+// or two-step construction
+auto t = Tag('A');
+t.Modifier(TagModifier::ASCII_CHAR);
+
diff --git a/docs/examples/code/WhitelistedZmwReadStitcher.txt b/docs/examples/code/WhitelistedZmwReadStitcher.txt

new file mode 100644 (file)

index 0000000..a94c27b
--- /dev/null
+++ b/docs/examples/code/WhitelistedZmwReadStitcher.txt
@@ -0,0 +1,6 @@
+vector<int32_t> zmws = { ... };
+WhitelistedZmwReadStitcher reader(zmws, "primary.bam", "scraps.bam");
+while(reader.HasNext()) {
+    auto virtualRecord = reader.Next();
+    // ... do stuff ...
+}
diff --git a/docs/examples/code/ZmwGroupQuery.txt b/docs/examples/code/ZmwGroupQuery.txt

new file mode 100644 (file)

index 0000000..1d728ac
--- /dev/null
+++ b/docs/examples/code/ZmwGroupQuery.txt
@@ -0,0 +1,23 @@
+bool allHoleNumbersEqual(const vector<BamRecord>& group) 
+{
+    if (group.empty()) 
+        return true;
+    const auto firstHoleNumber = group[0].HoleNumber();
+    for (size_t i = 1; i < group.size(); ++i) {
+       if (group[i].HoleNumber() != firstHoleNumber)
+           return false;
+    }
+    return true;
+}
+
+vector<int32_t> whitelist = { 50, 100 };
+ZmwGroupQuery query(whitelist, dataset);
+for(const vector<BamRecord>& group : query) {
+
+    assert(allHoleNumbersEqual(group));
+
+    for (const BamRecord& record : group) {
+        assert(record.HoleNumber() == 50 ||
+               record.HoleNumber() == 100);
+    }
+}
diff --git a/docs/examples/code/ZmwQuery.txt b/docs/examples/code/ZmwQuery.txt

new file mode 100644 (file)

index 0000000..59c22c4
--- /dev/null
+++ b/docs/examples/code/ZmwQuery.txt
@@ -0,0 +1,6 @@
+vector<int32_t> whitelist = { 50, 100 };
+ZmwQuery query(whitelist, dataset);
+for (const BamRecord& record : query) {
+    assert(record.HoleNumber() == 50 ||
+           record.HoleNumber() == 100);
+}
diff --git a/docs/examples/plaintext/AlignmentPrinterOutput.txt b/docs/examples/plaintext/AlignmentPrinterOutput.txt

new file mode 100644 (file)

index 0000000..21d948b
--- /dev/null
+++ b/docs/examples/plaintext/AlignmentPrinterOutput.txt
@@ -0,0 +1,13 @@
+Read        : singleInsertion2
+Reference   : lambda_NEB3011
+
+Read-length : 49
+Concordance : 0.96
+
+5210 : GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGG : 5249
+       |||||||| ||||||||||||||||||| |||||||||||
+   0 : GGCTGCAG-GTACAGCGGTCAGGAGGCCAATTGATGCCGG :   39
+
+5249 : ACTGGCTGAT : 5259
+       ||||||||||
+  39 : ACTGGCTGAT :   49
diff --git a/docs/examples/plaintext/PbiFilter_DataSetXmlFilters.txt b/docs/examples/plaintext/PbiFilter_DataSetXmlFilters.txt

new file mode 100644 (file)

index 0000000..5b5e8c2
--- /dev/null
+++ b/docs/examples/plaintext/PbiFilter_DataSetXmlFilters.txt
@@ -0,0 +1,14 @@
+<Filters>
+  <Filter>
+    <Properties>
+      <Property />  # A
+      <Property />  # B
+    </Properties>
+  </Filter>
+  <Filter>
+    <Properties>
+      <Property />  # C
+      <Property />  # D
+    </Properties> 
+  </Filter>
+</Filters>
diff --git a/docs/source/api/Accuracy.rst b/docs/source/api/Accuracy.rst

new file mode 100644 (file)

index 0000000..f88b722
--- /dev/null
+++ b/docs/source/api/Accuracy.rst
@@ -0,0 +1,11 @@
+Accuracy
+========
+
+.. code-block:: cpp
+
+   #include <pbbam/Accuracy.h>
+
+.. doxygenclass:: PacBio::BAM::Accuracy
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/AlignmentPrinter.rst b/docs/source/api/AlignmentPrinter.rst

new file mode 100644 (file)

index 0000000..ef0b191
--- /dev/null
+++ b/docs/source/api/AlignmentPrinter.rst
@@ -0,0 +1,11 @@
+AlignmentPrinter
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/AlignmentPrinter.h>
+
+.. doxygenclass:: PacBio::BAM::AlignmentPrinter 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/AlignmentSet.rst b/docs/source/api/AlignmentSet.rst

new file mode 100644 (file)

index 0000000..1817962
--- /dev/null
+++ b/docs/source/api/AlignmentSet.rst
@@ -0,0 +1,11 @@
+AlignmentSet
+============
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::AlignmentSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BaiIndexedBamReader.rst b/docs/source/api/BaiIndexedBamReader.rst

new file mode 100644 (file)

index 0000000..aab136f
--- /dev/null
+++ b/docs/source/api/BaiIndexedBamReader.rst
@@ -0,0 +1,11 @@
+BaiIndexedBamReader
+===================
+
+.. code-block:: cpp
+
+   #include <pbbam/BaiIndexedBamReader.h>
+
+.. doxygenclass:: PacBio::BAM::BaiIndexedBamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamFile.rst b/docs/source/api/BamFile.rst

new file mode 100644 (file)

index 0000000..c7e48fb
--- /dev/null
+++ b/docs/source/api/BamFile.rst
@@ -0,0 +1,11 @@
+BamFile
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/BamFile.h>
+
+.. doxygenclass:: PacBio::BAM::BamFile
+   :members:
+   :protected-members:
+   :undoc-members:
diff --git a/docs/source/api/BamHeader.rst b/docs/source/api/BamHeader.rst

new file mode 100644 (file)

index 0000000..6cf06af
--- /dev/null
+++ b/docs/source/api/BamHeader.rst
@@ -0,0 +1,11 @@
+BamHeader
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/BamHeader.h>
+
+.. doxygenclass:: PacBio::BAM::BamHeader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamReader.rst b/docs/source/api/BamReader.rst

new file mode 100644 (file)

index 0000000..e0b6f3c
--- /dev/null
+++ b/docs/source/api/BamReader.rst
@@ -0,0 +1,11 @@
+BamReader
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/BamReader.h>
+
+.. doxygenclass:: PacBio::BAM::BamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamRecord.rst b/docs/source/api/BamRecord.rst

new file mode 100644 (file)

index 0000000..a749775
--- /dev/null
+++ b/docs/source/api/BamRecord.rst
@@ -0,0 +1,17 @@
+BamRecord
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/BamRecord.h>
+
+.. doxygenenum:: PacBio::BAM::ClipType
+
+.. doxygenenum:: PacBio::BAM::RecordType
+
+.. doxygenenum:: PacBio::BAM::FrameEncodingType
+
+.. doxygenclass:: PacBio::BAM::BamRecord
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamRecordBuilder.rst b/docs/source/api/BamRecordBuilder.rst

new file mode 100644 (file)

index 0000000..ce477b4
--- /dev/null
+++ b/docs/source/api/BamRecordBuilder.rst
@@ -0,0 +1,11 @@
+BamRecordBuilder
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/BamRecordBuilder.h>
+
+.. doxygenclass:: PacBio::BAM::BamRecordBuilder
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamRecordImpl.rst b/docs/source/api/BamRecordImpl.rst

new file mode 100644 (file)

index 0000000..92b6759
--- /dev/null
+++ b/docs/source/api/BamRecordImpl.rst
@@ -0,0 +1,11 @@
+BamRecordImpl
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/BamRecordImpl.h>
+
+.. doxygenclass:: PacBio::BAM::BamRecordImpl
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamRecordView.rst b/docs/source/api/BamRecordView.rst

new file mode 100644 (file)

index 0000000..2bc8fc4
--- /dev/null
+++ b/docs/source/api/BamRecordView.rst
@@ -0,0 +1,11 @@
+BamRecordView
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/BamRecord.h>
+
+.. doxygenclass:: PacBio::BAM::BamRecordView
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamTagCodec.rst b/docs/source/api/BamTagCodec.rst

new file mode 100644 (file)

index 0000000..9307421
--- /dev/null
+++ b/docs/source/api/BamTagCodec.rst
@@ -0,0 +1,11 @@
+BamTagCodec
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/BamTagCodec.h>
+
+.. doxygenclass:: PacBio::BAM::BamTagCodec
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamWriter.rst b/docs/source/api/BamWriter.rst

new file mode 100644 (file)

index 0000000..2e2951b
--- /dev/null
+++ b/docs/source/api/BamWriter.rst
@@ -0,0 +1,11 @@
+BamWriter
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/BamWriter.h>
+
+.. doxygenclass:: PacBio::BAM::BamWriter
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BarcodeLookupData.rst b/docs/source/api/BarcodeLookupData.rst

new file mode 100644 (file)

index 0000000..2dac47d
--- /dev/null
+++ b/docs/source/api/BarcodeLookupData.rst
@@ -0,0 +1,11 @@
+BarcodeLookupData
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiLookupData.h>
+
+.. doxygenclass:: PacBio::BAM::BarcodeLookupData 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BarcodeQuery.rst b/docs/source/api/BarcodeQuery.rst

new file mode 100644 (file)

index 0000000..5836059
--- /dev/null
+++ b/docs/source/api/BarcodeQuery.rst
@@ -0,0 +1,11 @@
+BarcodeQuery
+============
+
+.. code-block:: cpp
+
+   #include <pbbam/BarcodeQuery.h>
+
+.. doxygenclass:: PacBio::BAM::BarcodeQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BarcodeSet.rst b/docs/source/api/BarcodeSet.rst

new file mode 100644 (file)

index 0000000..a7ee056
--- /dev/null
+++ b/docs/source/api/BarcodeSet.rst
@@ -0,0 +1,11 @@
+BarcodeSet
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::BarcodeSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BasicLookupData.rst b/docs/source/api/BasicLookupData.rst

new file mode 100644 (file)

index 0000000..b991fdf
--- /dev/null
+++ b/docs/source/api/BasicLookupData.rst
@@ -0,0 +1,11 @@
+BasicLookupData
+===============
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiLookupData.h>
+
+.. doxygenclass:: PacBio::BAM::BasicLookupData
+   :members:
+   :protected-members:
+   :undoc-members:
diff --git a/docs/source/api/Cigar.rst b/docs/source/api/Cigar.rst

new file mode 100644 (file)

index 0000000..cea30d5
--- /dev/null
+++ b/docs/source/api/Cigar.rst
@@ -0,0 +1,11 @@
+Cigar
+=====
+
+.. code-block:: cpp
+
+   #include <pbbam/Cigar.h>
+
+.. doxygenclass:: PacBio::BAM::Cigar
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/CigarOperation.rst b/docs/source/api/CigarOperation.rst

new file mode 100644 (file)

index 0000000..856400a
--- /dev/null
+++ b/docs/source/api/CigarOperation.rst
@@ -0,0 +1,13 @@
+CigarOperation
+==============
+
+.. code-block:: cpp
+
+   #include <pbbam/CigarOperation.h>
+   
+.. doxygenenum:: PacBio::BAM::CigarOperationType   
+
+.. doxygenclass:: PacBio::BAM::CigarOperation
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Compare.rst b/docs/source/api/Compare.rst

new file mode 100644 (file)

index 0000000..bb28a7e
--- /dev/null
+++ b/docs/source/api/Compare.rst
@@ -0,0 +1,8 @@
+Compare
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/Compare.h>
+
+.. doxygenfile:: Compare.h
+\ No newline at end of file
diff --git a/docs/source/api/Config.rst b/docs/source/api/Config.rst

new file mode 100644 (file)

index 0000000..c4be9e4
--- /dev/null
+++ b/docs/source/api/Config.rst
@@ -0,0 +1,8 @@
+Config
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/Conifig.h>
+
+.. doxygenfile:: Config.h
+\ No newline at end of file
diff --git a/docs/source/api/ConsensusAlignmentSet.rst b/docs/source/api/ConsensusAlignmentSet.rst

new file mode 100644 (file)

index 0000000..bc5a7e5
--- /dev/null
+++ b/docs/source/api/ConsensusAlignmentSet.rst
@@ -0,0 +1,11 @@
+ConsensusAlignmentSet
+=====================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ConsensusAlignmentSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ConsensusReadSet.rst b/docs/source/api/ConsensusReadSet.rst

new file mode 100644 (file)

index 0000000..846698d
--- /dev/null
+++ b/docs/source/api/ConsensusReadSet.rst
@@ -0,0 +1,11 @@
+ConsensusReadSet
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ConsensusReadSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ContigSet.rst b/docs/source/api/ContigSet.rst

new file mode 100644 (file)

index 0000000..96bb20b
--- /dev/null
+++ b/docs/source/api/ContigSet.rst
@@ -0,0 +1,11 @@
+ContigSet
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ContigSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/DataSet.rst b/docs/source/api/DataSet.rst

new file mode 100644 (file)

index 0000000..8b3f0db
--- /dev/null
+++ b/docs/source/api/DataSet.rst
@@ -0,0 +1,11 @@
+DataSet
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSet.h>
+
+.. doxygenclass:: PacBio::BAM::DataSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/DataSetBase.rst b/docs/source/api/DataSetBase.rst

new file mode 100644 (file)

index 0000000..f23fbb5
--- /dev/null
+++ b/docs/source/api/DataSetBase.rst
@@ -0,0 +1,11 @@
+DataSetBase
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::DataSetBase
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/DataSetMetadata.rst b/docs/source/api/DataSetMetadata.rst

new file mode 100644 (file)

index 0000000..eea260d
--- /dev/null
+++ b/docs/source/api/DataSetMetadata.rst
@@ -0,0 +1,11 @@
+DataSetMetadata
+===============
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::DataSetMetadata
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/EntireFileQuery.rst b/docs/source/api/EntireFileQuery.rst

new file mode 100644 (file)

index 0000000..4e7b86b
--- /dev/null
+++ b/docs/source/api/EntireFileQuery.rst
@@ -0,0 +1,11 @@
+EntireFileQuery
+===============
+
+.. code-block:: cpp
+
+   #include <pbbam/EntireFileQuery.h>
+
+.. doxygenclass:: PacBio::BAM::EntireFileQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ExtensionElement.rst b/docs/source/api/ExtensionElement.rst

new file mode 100644 (file)

index 0000000..980303e
--- /dev/null
+++ b/docs/source/api/ExtensionElement.rst
@@ -0,0 +1,11 @@
+ExtensionElement
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ExtensionElement
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Extensions.rst b/docs/source/api/Extensions.rst

new file mode 100644 (file)

index 0000000..6704807
--- /dev/null
+++ b/docs/source/api/Extensions.rst
@@ -0,0 +1,11 @@
+Extensions
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::Extensions
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ExternalResource.rst b/docs/source/api/ExternalResource.rst

new file mode 100644 (file)

index 0000000..03ab0d3
--- /dev/null
+++ b/docs/source/api/ExternalResource.rst
@@ -0,0 +1,11 @@
+ExternalResource
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ExternalResource
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ExternalResources.rst b/docs/source/api/ExternalResources.rst

new file mode 100644 (file)

index 0000000..bd72ea4
--- /dev/null
+++ b/docs/source/api/ExternalResources.rst
@@ -0,0 +1,11 @@
+ExternalResources
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ExternalResources
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/FileIndex.rst b/docs/source/api/FileIndex.rst

new file mode 100644 (file)

index 0000000..c117214
--- /dev/null
+++ b/docs/source/api/FileIndex.rst
@@ -0,0 +1,11 @@
+FileIndex
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::FileIndex
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/FileIndices.rst b/docs/source/api/FileIndices.rst

new file mode 100644 (file)

index 0000000..b25720c
--- /dev/null
+++ b/docs/source/api/FileIndices.rst
@@ -0,0 +1,11 @@
+FileIndices
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::FileIndices
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Filter.rst b/docs/source/api/Filter.rst

new file mode 100644 (file)

index 0000000..6faa8aa
--- /dev/null
+++ b/docs/source/api/Filter.rst
@@ -0,0 +1,11 @@
+Filter
+======
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::Filter
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Filters.rst b/docs/source/api/Filters.rst

new file mode 100644 (file)

index 0000000..7ea1620
--- /dev/null
+++ b/docs/source/api/Filters.rst
@@ -0,0 +1,11 @@
+Filters
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::Filters
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Frames.rst b/docs/source/api/Frames.rst

new file mode 100644 (file)

index 0000000..cf260f2
--- /dev/null
+++ b/docs/source/api/Frames.rst
@@ -0,0 +1,11 @@
+Frames
+======
+
+.. code-block:: cpp
+
+   #include <pbbam/Frames.h>
+
+.. doxygenclass:: PacBio::BAM::Frames
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/GenomicInterval.rst b/docs/source/api/GenomicInterval.rst

new file mode 100644 (file)

index 0000000..811b83a
--- /dev/null
+++ b/docs/source/api/GenomicInterval.rst
@@ -0,0 +1,11 @@
+GenomicInterval
+===============
+
+.. code-block:: cpp
+
+   #include <pbbam/GenomicInterval.h>
+
+.. doxygenclass:: PacBio::BAM::GenomicInterval
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/GenomicIntervalCompositeBamReader.rst b/docs/source/api/GenomicIntervalCompositeBamReader.rst

new file mode 100644 (file)

index 0000000..f658621
--- /dev/null
+++ b/docs/source/api/GenomicIntervalCompositeBamReader.rst
@@ -0,0 +1,11 @@
+GenomicIntervalCompositeBamReader
+=================================
+
+.. code-block:: cpp
+
+   #include <pbbam/CompositeBamReader.h>
+
+.. doxygenclass:: PacBio::BAM::GenomicIntervalCompositeBamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/GenomicIntervalQuery.rst b/docs/source/api/GenomicIntervalQuery.rst

new file mode 100644 (file)

index 0000000..7bae558
--- /dev/null
+++ b/docs/source/api/GenomicIntervalQuery.rst
@@ -0,0 +1,11 @@
+GenomicIntervalQuery
+====================
+
+.. code-block:: cpp
+
+   #include <pbbam/GenomicIntervalQuery.h>
+
+.. doxygenclass:: PacBio::BAM::GenomicIntervalQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/HdfSubreadSet.rst b/docs/source/api/HdfSubreadSet.rst

new file mode 100644 (file)

index 0000000..88bf008
--- /dev/null
+++ b/docs/source/api/HdfSubreadSet.rst
@@ -0,0 +1,11 @@
+HdfSubreadSet
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::HdfSubreadSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/IndexResultBlock.rst b/docs/source/api/IndexResultBlock.rst

new file mode 100644 (file)

index 0000000..fac804a
--- /dev/null
+++ b/docs/source/api/IndexResultBlock.rst
@@ -0,0 +1,17 @@
+IndexResultBlock
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiBasicTypes.h>
+
+.. doxygenstruct:: PacBio::BAM::IndexResultBlock
+   :members:
+   :protected-members:
+   :undoc-members:
+   
+.. doxygentypedef:: PacBio::BAM::IndexResultBlocks
+
+.. doxygentypedef:: PacBio::BAM::IndexList
+   
+.. doxygentypedef:: PacBio::BAM::IndexRange
+\ No newline at end of file
diff --git a/docs/source/api/IndexedFastaReader.rst b/docs/source/api/IndexedFastaReader.rst

new file mode 100644 (file)

index 0000000..7c46064
--- /dev/null
+++ b/docs/source/api/IndexedFastaReader.rst
@@ -0,0 +1,11 @@
+IndexedFastaReader
+==================
+
+.. code-block:: cpp
+
+   #include <pbbam/IndexedFastaReader.h>
+
+.. doxygenclass:: PacBio::BAM::IndexedFastaReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Interval.rst b/docs/source/api/Interval.rst

new file mode 100644 (file)

index 0000000..f506a19
--- /dev/null
+++ b/docs/source/api/Interval.rst
@@ -0,0 +1,11 @@
+Interval
+========
+
+.. code-block:: cpp
+
+   #include <pbbam/Interval.h>
+
+.. doxygenclass:: PacBio::BAM::Interval
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/InvalidSequencingChemistryException.rst b/docs/source/api/InvalidSequencingChemistryException.rst

new file mode 100644 (file)

index 0000000..d521ecc
--- /dev/null
+++ b/docs/source/api/InvalidSequencingChemistryException.rst
@@ -0,0 +1,11 @@
+InvalidSequencingChemistryException
+===================================
+
+.. code-block:: cpp
+
+   #include <pbbam/exception/InvalidSequencingChemistryException.h>
+
+.. doxygenclass:: PacBio::BAM::InvalidSequencingChemistryException
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/LocalContextFlags.rst b/docs/source/api/LocalContextFlags.rst

new file mode 100644 (file)

index 0000000..8cd63be
--- /dev/null
+++ b/docs/source/api/LocalContextFlags.rst
@@ -0,0 +1,8 @@
+LocalContextFlags
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/LocalContextFlags.h>
+
+.. doxygenenum:: PacBio::BAM::LocalContextFlags
diff --git a/docs/source/api/MappedLookupData.rst b/docs/source/api/MappedLookupData.rst

new file mode 100644 (file)

index 0000000..7cf3c8b
--- /dev/null
+++ b/docs/source/api/MappedLookupData.rst
@@ -0,0 +1,11 @@
+MappedLookupData
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiLookupData.h>
+
+.. doxygenclass:: PacBio::BAM::MappedLookupData 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/NamespaceInfo.rst b/docs/source/api/NamespaceInfo.rst

new file mode 100644 (file)

index 0000000..c7613ec
--- /dev/null
+++ b/docs/source/api/NamespaceInfo.rst
@@ -0,0 +1,11 @@
+NamespaceInfo
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetXsd.h>
+
+.. doxygenclass:: PacBio::BAM::NamespaceInfo
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/NamespaceRegistry.rst b/docs/source/api/NamespaceRegistry.rst

new file mode 100644 (file)

index 0000000..2f8f9a7
--- /dev/null
+++ b/docs/source/api/NamespaceRegistry.rst
@@ -0,0 +1,11 @@
+NamespaceRegistry
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetXsd.h>
+
+.. doxygenclass:: PacBio::BAM::NamespaceRegistry
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/OrderedLookup.rst b/docs/source/api/OrderedLookup.rst

new file mode 100644 (file)

index 0000000..d5b81b6
--- /dev/null
+++ b/docs/source/api/OrderedLookup.rst
@@ -0,0 +1,11 @@
+OrderedLookup
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiLookupData.h>
+
+.. doxygenclass:: PacBio::BAM::OrderedLookup
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Orientation.rst b/docs/source/api/Orientation.rst

new file mode 100644 (file)

index 0000000..e9bbc42
--- /dev/null
+++ b/docs/source/api/Orientation.rst
@@ -0,0 +1,8 @@
+Orientation
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/Orientation.h>
+
+.. doxygenenum:: PacBio::BAM::Orientation
diff --git a/docs/source/api/ParentTool.rst b/docs/source/api/ParentTool.rst

new file mode 100644 (file)

index 0000000..e2ffa1b
--- /dev/null
+++ b/docs/source/api/ParentTool.rst
@@ -0,0 +1,11 @@
+ParentTool
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ParentTool
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiBuilder.rst b/docs/source/api/PbiBuilder.rst

new file mode 100644 (file)

index 0000000..d795d0f
--- /dev/null
+++ b/docs/source/api/PbiBuilder.rst
@@ -0,0 +1,11 @@
+PbiBuilder
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiBuilder.h>
+
+.. doxygenclass:: PacBio::BAM::PbiBuilder
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiFile.rst b/docs/source/api/PbiFile.rst

new file mode 100644 (file)

index 0000000..5a8b85a
--- /dev/null
+++ b/docs/source/api/PbiFile.rst
@@ -0,0 +1,14 @@
+PbiFile
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiFile.h>
+
+.. doxygenenum:: PacBio::BAM::PbiFile::Section
+
+.. doxygentypedef:: PacBio::BAM::PbiFile::Sections
+
+.. doxygenenum:: PacBio::BAM::PbiFile::VersionEnum
+
+.. doxygenfunction:: PacBio::BAM::PbiFile::CreateFrom
diff --git a/docs/source/api/PbiFilter.rst b/docs/source/api/PbiFilter.rst

new file mode 100644 (file)

index 0000000..261498b
--- /dev/null
+++ b/docs/source/api/PbiFilter.rst
@@ -0,0 +1,11 @@
+PbiFilter
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiFilter.h>
+
+.. doxygenclass:: PacBio::BAM::PbiFilter
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiFilterCompositeBamReader.rst b/docs/source/api/PbiFilterCompositeBamReader.rst

new file mode 100644 (file)

index 0000000..7a69df3
--- /dev/null
+++ b/docs/source/api/PbiFilterCompositeBamReader.rst
@@ -0,0 +1,11 @@
+PbiFilterCompositeBamReader
+===========================
+
+.. code-block:: cpp
+
+   #include <pbbam/CompositeBamReader.h>
+
+.. doxygenclass:: PacBio::BAM::PbiFilterCompositeBamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiFilterQuery.rst b/docs/source/api/PbiFilterQuery.rst

new file mode 100644 (file)

index 0000000..75bbc12
--- /dev/null
+++ b/docs/source/api/PbiFilterQuery.rst
@@ -0,0 +1,11 @@
+PbiFilterQuery
+==============
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiFilterQuery.h>
+
+.. doxygenclass:: PacBio::BAM::PbiFilterQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiFilterTypes.rst b/docs/source/api/PbiFilterTypes.rst

new file mode 100644 (file)

index 0000000..052389b
--- /dev/null
+++ b/docs/source/api/PbiFilterTypes.rst
@@ -0,0 +1,8 @@
+PbiFilterTypes
+==============
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiFilterTypes.h>
+
+.. doxygenfile:: PbiFilterTypes.h
+\ No newline at end of file
diff --git a/docs/source/api/PbiIndex.rst b/docs/source/api/PbiIndex.rst

new file mode 100644 (file)

index 0000000..811bc68
--- /dev/null
+++ b/docs/source/api/PbiIndex.rst
@@ -0,0 +1,11 @@
+PbiIndex
+========
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiIndex.h>
+
+.. doxygenclass:: PacBio::BAM::PbiIndex
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiIndexedBamReader.rst b/docs/source/api/PbiIndexedBamReader.rst

new file mode 100644 (file)

index 0000000..5450c8a
--- /dev/null
+++ b/docs/source/api/PbiIndexedBamReader.rst
@@ -0,0 +1,11 @@
+PbiIndexedBamReader
+===================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiIndexedBamReader.h>
+
+.. doxygenclass:: PacBio::BAM::PbiIndexedBamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiRawBarcodeData.rst b/docs/source/api/PbiRawBarcodeData.rst

new file mode 100644 (file)

index 0000000..c72ebfb
--- /dev/null
+++ b/docs/source/api/PbiRawBarcodeData.rst
@@ -0,0 +1,11 @@
+PbiRawBarcodeData
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiRawBarcodeData
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiRawBasicData.rst b/docs/source/api/PbiRawBasicData.rst

new file mode 100644 (file)

index 0000000..2282387
--- /dev/null
+++ b/docs/source/api/PbiRawBasicData.rst
@@ -0,0 +1,11 @@
+PbiRawBasicData
+===============
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiRawBasicData
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiRawData.rst b/docs/source/api/PbiRawData.rst

new file mode 100644 (file)

index 0000000..1a974e8
--- /dev/null
+++ b/docs/source/api/PbiRawData.rst
@@ -0,0 +1,11 @@
+PbiRawData
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiRawData
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiRawMappedData.rst b/docs/source/api/PbiRawMappedData.rst

new file mode 100644 (file)

index 0000000..42e1de1
--- /dev/null
+++ b/docs/source/api/PbiRawMappedData.rst
@@ -0,0 +1,11 @@
+PbiRawMappedData
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiRawMappedData
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiRawReferenceData.rst b/docs/source/api/PbiRawReferenceData.rst

new file mode 100644 (file)

index 0000000..460cde4
--- /dev/null
+++ b/docs/source/api/PbiRawReferenceData.rst
@@ -0,0 +1,11 @@
+PbiRawReferenceData
+===================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiRawReferenceData
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiReferenceEntry.rst b/docs/source/api/PbiReferenceEntry.rst

new file mode 100644 (file)

index 0000000..472e586
--- /dev/null
+++ b/docs/source/api/PbiReferenceEntry.rst
@@ -0,0 +1,11 @@
+PbiReferenceEntry
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiReferenceEntry
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Position.rst b/docs/source/api/Position.rst

new file mode 100644 (file)

index 0000000..3c945f2
--- /dev/null
+++ b/docs/source/api/Position.rst
@@ -0,0 +1,10 @@
+Position
+========
+
+.. code-block:: cpp
+
+   #include <pbbam/Position.h>
+
+.. doxygentypedef:: PacBio::BAM::Position
+
+.. doxygenvariable:: PacBio::BAM::UnmappedPosition
+\ No newline at end of file
diff --git a/docs/source/api/ProgramInfo.rst b/docs/source/api/ProgramInfo.rst

new file mode 100644 (file)

index 0000000..b58c93a
--- /dev/null
+++ b/docs/source/api/ProgramInfo.rst
@@ -0,0 +1,11 @@
+ProgramInfo
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/ProgramInfo.h>
+
+.. doxygenclass:: PacBio::BAM::ProgramInfo
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/QNameQuery.rst b/docs/source/api/QNameQuery.rst

new file mode 100644 (file)

index 0000000..b549436
--- /dev/null
+++ b/docs/source/api/QNameQuery.rst
@@ -0,0 +1,11 @@
+QNameQuery
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/QNameQuery.h>
+
+.. doxygenclass:: PacBio::BAM::QNameQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/QualityValue.rst b/docs/source/api/QualityValue.rst

new file mode 100644 (file)

index 0000000..3520c5a
--- /dev/null
+++ b/docs/source/api/QualityValue.rst
@@ -0,0 +1,11 @@
+QualityValue
+============
+
+.. code-block:: cpp
+
+   #include <pbbam/QualityValue.h>
+
+.. doxygenclass:: PacBio::BAM::QualityValue
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/QualityValues.rst b/docs/source/api/QualityValues.rst

new file mode 100644 (file)

index 0000000..8f6dfa5
--- /dev/null
+++ b/docs/source/api/QualityValues.rst
@@ -0,0 +1,11 @@
+QualityValues
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/QualityValues.h>
+
+.. doxygenclass:: PacBio::BAM::QualityValues
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ReadAccuracyQuery.rst b/docs/source/api/ReadAccuracyQuery.rst

new file mode 100644 (file)

index 0000000..abfd1e6
--- /dev/null
+++ b/docs/source/api/ReadAccuracyQuery.rst
@@ -0,0 +1,11 @@
+ReadAccuracyQuery
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/ReadAccuracyQuery.h>
+
+.. doxygenclass:: PacBio::BAM::ReadAccuracyQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ReadGroupInfo.rst b/docs/source/api/ReadGroupInfo.rst

new file mode 100644 (file)

index 0000000..7fb4f69
--- /dev/null
+++ b/docs/source/api/ReadGroupInfo.rst
@@ -0,0 +1,21 @@
+ReadGroupInfo
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/ReadGroupInfo.h>
+
+.. doxygenenum:: PacBio::BAM::BaseFeature
+
+.. doxygenenum:: PacBio::BAM::FrameCodec
+
+.. doxygenenum:: PacBio::BAM::BarcodeModeType
+
+.. doxygenenum:: PacBio::BAM::BarcodeQualityType
+
+.. doxygenclass:: PacBio::BAM::ReadGroupInfo
+   :members:
+   :protected-members:
+   :undoc-members:
+   
+.. doxygenfunction:: PacBio::BAM::MakeReadGroupId
+\ No newline at end of file
diff --git a/docs/source/api/ReferenceLookupData.rst b/docs/source/api/ReferenceLookupData.rst

new file mode 100644 (file)

index 0000000..20316fc
--- /dev/null
+++ b/docs/source/api/ReferenceLookupData.rst
@@ -0,0 +1,11 @@
+ReferenceLookupData
+===================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiLookupData.h>
+
+.. doxygenclass:: PacBio::BAM::ReferenceLookupData
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ReferenceSet.rst b/docs/source/api/ReferenceSet.rst

new file mode 100644 (file)

index 0000000..22e4703
--- /dev/null
+++ b/docs/source/api/ReferenceSet.rst
@@ -0,0 +1,11 @@
+ReferenceSet
+============
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ReferenceSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/SamTagCodec.rst b/docs/source/api/SamTagCodec.rst

new file mode 100644 (file)

index 0000000..4f8d65d
--- /dev/null
+++ b/docs/source/api/SamTagCodec.rst
@@ -0,0 +1,11 @@
+SamTagCodec
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/SamTagCodec.h>
+
+.. doxygenclass:: PacBio::BAM::SamTagCodec
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/SequenceInfo.rst b/docs/source/api/SequenceInfo.rst

new file mode 100644 (file)

index 0000000..393d5bb
--- /dev/null
+++ b/docs/source/api/SequenceInfo.rst
@@ -0,0 +1,11 @@
+SequenceInfo
+============
+
+.. code-block:: cpp
+
+   #include <pbbam/SequenceInfo.h>
+
+.. doxygenclass:: PacBio::BAM::SequenceInfo
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/SequentialCompositeBamReader.rst b/docs/source/api/SequentialCompositeBamReader.rst

new file mode 100644 (file)

index 0000000..31ed3b1
--- /dev/null
+++ b/docs/source/api/SequentialCompositeBamReader.rst
@@ -0,0 +1,11 @@
+SequentialCompositeBamReader
+============================
+
+.. code-block:: cpp
+
+   #include <pbbam/CompositeBamReader.h>
+
+.. doxygenclass:: PacBio::BAM::SequentialCompositeBamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Strand.rst b/docs/source/api/Strand.rst

new file mode 100644 (file)

index 0000000..4978f72
--- /dev/null
+++ b/docs/source/api/Strand.rst
@@ -0,0 +1,8 @@
+Strand
+======
+
+.. code-block:: cpp
+
+   #include <pbbam/Strand.h>
+
+.. doxygenenum:: PacBio::BAM::Strand 
diff --git a/docs/source/api/SubDataSets.rst b/docs/source/api/SubDataSets.rst

new file mode 100644 (file)

index 0000000..d179065
--- /dev/null
+++ b/docs/source/api/SubDataSets.rst
@@ -0,0 +1,11 @@
+SubDataSets
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::SubDataSets
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/SubreadLengthQuery.rst b/docs/source/api/SubreadLengthQuery.rst

new file mode 100644 (file)

index 0000000..23000b3
--- /dev/null
+++ b/docs/source/api/SubreadLengthQuery.rst
@@ -0,0 +1,11 @@
+SubreadLengthQuery
+==================
+
+.. code-block:: cpp
+
+   #include <pbbam/SubreadLengthQuery.h>
+
+.. doxygenclass:: PacBio::BAM::SubreadLengthQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/SubreadSet.rst b/docs/source/api/SubreadSet.rst

new file mode 100644 (file)

index 0000000..bfc3c13
--- /dev/null
+++ b/docs/source/api/SubreadSet.rst
@@ -0,0 +1,11 @@
+SubreadSet
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::SubreadSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Tag.rst b/docs/source/api/Tag.rst

new file mode 100644 (file)

index 0000000..50b85c7
--- /dev/null
+++ b/docs/source/api/Tag.rst
@@ -0,0 +1,15 @@
+Tag
+===
+
+.. code-block:: cpp
+
+   #include <pbbam/Tag.h>
+
+.. doxygenenum:: PacBio::BAM::TagDataType
+
+.. doxygenenum:: PacBio::BAM::TagModifier
+
+.. doxygenclass:: PacBio::BAM::Tag
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/TagCollection.rst b/docs/source/api/TagCollection.rst

new file mode 100644 (file)

index 0000000..1314b13
--- /dev/null
+++ b/docs/source/api/TagCollection.rst
@@ -0,0 +1,11 @@
+TagCollection
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/TagCollection.h>
+
+.. doxygenclass:: PacBio::BAM::TagCollection
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/UnorderedLookup.rst b/docs/source/api/UnorderedLookup.rst

new file mode 100644 (file)

index 0000000..718e4e7
--- /dev/null
+++ b/docs/source/api/UnorderedLookup.rst
@@ -0,0 +1,11 @@
+UnorderedLookup
+===============
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiLookupData.h>
+
+.. doxygenclass:: PacBio::BAM::UnorderedLookup
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/VirtualPolymeraseBamRecord.rst b/docs/source/api/VirtualPolymeraseBamRecord.rst

new file mode 100644 (file)

index 0000000..06d5531
--- /dev/null
+++ b/docs/source/api/VirtualPolymeraseBamRecord.rst
@@ -0,0 +1,11 @@
+VirtualPolymeraseBamRecord
+==========================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualPolymeraseBamRecord.h>
+
+.. doxygenclass:: PacBio::BAM::VirtualPolymeraseBamRecord
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/VirtualPolymeraseCompositeReader.rst b/docs/source/api/VirtualPolymeraseCompositeReader.rst

new file mode 100644 (file)

index 0000000..e6cab4e
--- /dev/null
+++ b/docs/source/api/VirtualPolymeraseCompositeReader.rst
@@ -0,0 +1,11 @@
+VirtualPolymeraseCompositeReader
+================================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualPolymeraseCompositeReader.h>
+
+.. doxygenclass:: PacBio::BAM::VirtualPolymeraseCompositeReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/VirtualPolymeraseReader.rst b/docs/source/api/VirtualPolymeraseReader.rst

new file mode 100644 (file)

index 0000000..14a46e8
--- /dev/null
+++ b/docs/source/api/VirtualPolymeraseReader.rst
@@ -0,0 +1,11 @@
+VirtualPolymeraseReader
+=======================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualPolymeraseReader.h>
+
+.. doxygenclass:: PacBio::BAM::VirtualPolymeraseReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/VirtualRegion.rst b/docs/source/api/VirtualRegion.rst

new file mode 100644 (file)

index 0000000..7a09846
--- /dev/null
+++ b/docs/source/api/VirtualRegion.rst
@@ -0,0 +1,11 @@
+VirtualRegion
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualRegion.h>
+
+.. doxygenclass:: PacBio::BAM::VirtualRegion
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/VirtualRegionType.rst b/docs/source/api/VirtualRegionType.rst

new file mode 100644 (file)

index 0000000..4279200
--- /dev/null
+++ b/docs/source/api/VirtualRegionType.rst
@@ -0,0 +1,8 @@
+VirtualRegionType
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualRegionType.h>
+
+.. doxygenenum:: PacBio::BAM::VirtualRegionType
diff --git a/docs/source/api/VirtualRegionTypeMap.rst b/docs/source/api/VirtualRegionTypeMap.rst

new file mode 100644 (file)

index 0000000..eebe637
--- /dev/null
+++ b/docs/source/api/VirtualRegionTypeMap.rst
@@ -0,0 +1,11 @@
+VirtualRegionTypeMap
+====================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualRegionTypeMap.h>
+
+.. doxygenclass:: PacBio::BAM::VirtualRegionTypeMap
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ZmwGroupQuery.rst b/docs/source/api/ZmwGroupQuery.rst

new file mode 100644 (file)

index 0000000..01fc18a
--- /dev/null
+++ b/docs/source/api/ZmwGroupQuery.rst
@@ -0,0 +1,11 @@
+ZmwGroupQuery
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/ZmwGroupQuery.h>
+
+.. doxygenclass:: PacBio::BAM::ZmwGroupQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ZmwQuery.rst b/docs/source/api/ZmwQuery.rst

new file mode 100644 (file)

index 0000000..375fcb0
--- /dev/null
+++ b/docs/source/api/ZmwQuery.rst
@@ -0,0 +1,11 @@
+ZmwQuery
+========
+
+.. code-block:: cpp
+
+   #include <pbbam/ZmwQuery.h>
+
+.. doxygenclass:: PacBio::BAM::ZmwQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ZmwWhitelistVirtualReader.rst b/docs/source/api/ZmwWhitelistVirtualReader.rst

new file mode 100644 (file)

index 0000000..95d2d1a
--- /dev/null
+++ b/docs/source/api/ZmwWhitelistVirtualReader.rst
@@ -0,0 +1,11 @@
+ZmwWhitelistVirtualReader
+=========================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/ZmwWhitelistVirtualReader.h>
+
+.. doxygenclass:: PacBio::BAM::ZmwWhitelistVirtualReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api_reference.rst b/docs/source/api_reference.rst

new file mode 100644 (file)

index 0000000..354c0de
--- /dev/null
+++ b/docs/source/api_reference.rst
@@ -0,0 +1,12 @@
+.. _api_reference:
+
+C++ API Reference
+=================
+
+Watch this space for more recipes & how-tos. 
+
+.. toctree::
+   :maxdepth: 1
+   :glob:
+
+   api/*
diff --git a/docs/source/commandline_utilities.rst b/docs/source/commandline_utilities.rst

new file mode 100644 (file)

index 0000000..7f1bdaf
--- /dev/null
+++ b/docs/source/commandline_utilities.rst
@@ -0,0 +1,15 @@
+.. _command_line:
+
+Command Line Utilities
+======================
+
+In addition to the main library and wrappers, pbbam also provides a few basic
+utilities for working with PacBio indices (".pbi" files).
+
+.. toctree::
+   :maxdepth: 1
+
+   tools/bam2sam
+   tools/pbindex
+   tools/pbindexdump
+   tools/pbmerge
diff --git a/docs/source/conf.py b/docs/source/conf.py

new file mode 100755 (executable)

index 0000000..c1de190
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,332 @@
+# -*- coding: utf-8 -*-
+#
+# pbbam documentation build configuration file, created by
+# sphinx-quickstart on Fri Dec  4 10:08:52 2015.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+import shlex
+import re
+import subprocess
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# get RTD to run doxygen first, per http://breathe.readthedocs.org/en/latest/readthedocs.html
+# but... we generate our actual Doxyfile via CMake in a normal build,
+# so we need to create one here, subbing actual values
+read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
+if read_the_docs_build:
+
+    # fetch directory info
+    this_dir = os.path.abspath(os.getcwd())
+    docs_dir = os.path.abspath(os.path.join(this_dir, '..'))
+    root_dir = os.path.abspath(os.path.join(docs_dir, '..'))
+    include_dir = os.path.abspath(os.path.join(root_dir, 'include'))
+
+    # get project version
+    version = ''
+    with open(os.path.abspath(os.path.join(root_dir, 'CMakeLists.txt')), 'r') as cmakeFile:
+        for line in cmakeFile:
+            if line.startswith('project'):
+                version = re.search(r'VERSION\s*([\d.]+)', line).group(1)
+                break
+
+    # read Doxyfile.in, replace markers with real values, and write Doxyfile
+    inDoxyfile = open(os.path.abspath(os.path.join(docs_dir, 'Doxyfile.in')), 'r')
+    configIn   = inDoxyfile.read()
+    configOut  = re.sub('@PacBioBAM_NAME@',       'pbbam', \
+                 re.sub('@PacBioBAM_VERSION@',    version, \
+                 re.sub('@PacBioBAM_DocsDir@',    docs_dir, \
+                 re.sub('@PacBioBAM_IncludeDir@', include_dir, configIn)))) 
+    outDoxyfile = open(os.path.abspath(os.path.join(docs_dir, 'Doxyfile')), 'w')
+    #print(configOut, outDoxyfile)
+    print >>outDoxyfile, configOut
+    outDoxyfile.close()
+
+    # now run Doxygen
+    subprocess.call('cd ..; doxygen', shell=True)
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['breathe']
+#extensions = [
+#    'sphinx.ext.autodoc',
+ #   'sphinx.ext.coverage',
+ #   'breathe',
+#]
+
+# Setup Breathe extension varialbes
+breathe_projects = { 'pbbam' : os.path.join(os.getcwd(), '..', 'xml') + os.path.sep }
+breathe_default_project = 'pbbam'
+breathe_default_members = ('members', 'undoc-members')
+breathe_implementation_filename_extensions = [ '.cpp', '.inl' ]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'pbbam'
+copyright = u'2015, Derek Barnett'
+author = u'Derek Barnett'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '0.7.4'
+# The full version, including alpha/beta/rc tags.
+release = '0.7.4'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = []
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'pacbio-theme'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+html_theme_path = ['.']
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Language to be used for generating the HTML full-text search index.
+# Sphinx supports the following languages:
+#   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
+#   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
+#html_search_language = 'en'
+
+# A dictionary with options for the search language support, empty by default.
+# Now only 'ja' uses this config value
+#html_search_options = {'type': 'default'}
+
+# The name of a javascript file (relative to the configuration directory) that
+# implements a search results scorer. If empty, the default will be used.
+#html_search_scorer = 'scorer.js'
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'pbbamdoc'
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+
+# Latex figure (float) alignment
+#'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+  (master_doc, 'pbbam.tex', u'pbbam Documentation',
+   u'Derek Barnett', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'pbbam', u'pbbam Documentation',
+     [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+  (master_doc, 'pbbam', u'pbbam Documentation',
+   author, 'pbbam', 'One line description of project.',
+   'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst

new file mode 100644 (file)

index 0000000..6860f9f
--- /dev/null
+++ b/docs/source/getting_started.rst
@@ -0,0 +1,144 @@
+
+.. _getting_started:
+
+Getting Started
+===============
+
+.. _getting_started-requirements:
+
+Requirements
+------------
+
+These components will almost certainly already be on your system. 
+ 
+* `gcc`_ (4.8+) OR `clang`_ (v3.1+)
+* pthreads
+* zlib
+
+Double-check your compiler version, to be sure it is compatible.
+
+.. code-block:: console
+
+   $ g++ -v    
+   $ clang -v  
+
+Additional requirements:
+
+* `Boost`_ (1.55+)
+* `CMake`_ (3.0+)
+* `Google Test`_
+* `htslib`_ (PacBio fork)
+
+For additional languages:
+
+* `SWIG`_ (3.0.5+)
+
+For building API documentation locally:
+
+* `Doxygen`_
+
+For maximal convenience, install htslib and google test in the same parent directory you plan to install pbbam.
+
+.. _Boost: http://www.boost.org/
+.. _clang: http://clang.llvm.org/
+.. _CMake: https://cmake.org/
+.. _Doxygen: http://www.stack.nl/~dimitri/doxygen/
+.. _gcc: https://gcc.gnu.org/
+.. _Google Test: https://github.com/google/googletest
+.. _htslib: https://github.com/PacificBiosciences/htslib.git 
+.. _SWIG: http://www.swig.org/
+
+.. _getting_started-build:
+
+Clone & Build
+-------------
+
+.. note::
+
+   The following steps are for building the C++ library and command-line utilities. 
+   If you are integrating pbbam into a C#, Python, or R project, take a look at the 
+   instructions for :ref:`additional languages <swig_bindings>`.
+
+The basic steps for obtaining pbbam and building it from source are as follows:
+
+.. code-block:: console
+
+   $ git clone https://github.com/PacificBiosciences/pbbam.git
+   $ cd pbbam
+   $ mkdir build
+   $ cd build
+   $ cmake ..
+   $ make -j 4    # compiles using 4 threads
+
+Output:
+
+  * Library   : <pbbam_root>/lib
+  * Headers   : <pbbam_root>/include
+  * Utilities : <pbbam_root>/bin
+ 
+You may need to set a few options on the cmake command, to point to dependencies' install locations. 
+Common installation-related options include:
+
+  * HTSLIB_ROOTDIR
+  * GTEST_SRC_DIR
+  
+Add these using the '-D' argument, like this:
+
+.. code-block:: console
+
+   $ cmake .. -DHTSLIB_ROOTDIR="path/to/htslib"
+ 
+To run the test suite, run:
+
+.. code-block:: console
+
+   $ make test
+
+To build a local copy of the (Doxygen-style) API documentation, run:
+
+.. code-block:: console
+
+   $ make doc
+   
+And then open <pbbam_root>/docs/html/index.html in your favorite browser.
+
+.. _getting_started-integrate:
+
+Integrate
+---------
+
+CMake-based projects
+````````````````````
+
+For CMake-based projects that will "ship with" or otherwise live alongside pbbam, you can 
+use the approach described here.
+
+Before defining your library or executable, add the following:
+
+.. code-block:: cmake
+
+   add_subdirectory(<path/to/pbbam> external/build/pbbam)
+
+When it's time to run "make" this will ensure that pbbam will be built, inside your own project's 
+build directory. After this point in the CMakeLists.txt file(s), a few variables will be available 
+that can be used to setup your include paths and library linking targets:
+
+.. code-block:: cmake
+
+   include_directories( 
+       ${PacBioBAM_INCLUDE_DIRS} 
+       # other includes that your project needs
+   )
+
+   add_executable(foo)
+   
+   target_link_libraries(foo 
+       ${PacBioBAM_LIBRARIES}
+       # other libs that your project needs
+   )
+
+Non-CMake projects
+``````````````````
+
+If you're using something other than CMake for your project's build system, then you need to point 
+it to pbbam's include directory & library, as well as those of its dependencies (primarily htslib). 
diff --git a/docs/source/index.rst b/docs/source/index.rst

new file mode 100644 (file)

index 0000000..426c3c5
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,33 @@
+.. pbbam documentation master file, created by
+   sphinx-quickstart on Fri Dec  4 10:08:52 2015.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+.. _home:
+
+pbbam documentation
+===================
+
+As of the 3.0 release of SMRTanalysis, PacBio is embracing the industry standard BAM 
+format for (both aligned and unaligned) basecall data files. We have also formulated 
+a BAM companion file format (bam.pbi) enabling fast access to a richer set of per-read 
+information as well as compatibility for software built around the legacy cmp.h5 format.
+
+The **pbbam** software package provides components to create, query, & edit PacBio BAM
+files and associated indices. These components include a core C++ library, bindings for 
+additional languages, and command-line utilities.
+
+.. toctree::
+   :maxdepth: 1
+
+   getting_started
+   api_reference
+   swig_bindings
+   commandline_utilities
+
+
+Search:
+
+* :ref:`genindex`
+* :ref:`search`
+
diff --git a/docs/source/pacbio-theme/static/headerGradient.jpg b/docs/source/pacbio-theme/static/headerGradient.jpg

new file mode 100644 (file)

index 0000000..883f147

Binary files /dev/null and b/docs/source/pacbio-theme/static/headerGradient.jpg differ
diff --git a/docs/source/pacbio-theme/static/pacbio.css b/docs/source/pacbio-theme/static/pacbio.css

new file mode 100644 (file)

index 0000000..b4ab87f
--- /dev/null
+++ b/docs/source/pacbio-theme/static/pacbio.css
@@ -0,0 +1,238 @@
+/**
+ * Sphinx stylesheet -- default theme
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+ 
+@import url("basic.css");
+ 
+/* -- page layout ----------------------------------------------------------- */
+ 
+body {
+    font-family: Arial, sans-serif;
+    font-size: 100%;
+    background-color: #555;
+    color: #555;
+    margin: 0;
+    padding: 0;
+    min-width: 500px;
+    max-width: 956px;
+    margin: 0 auto;
+}
+
+div.documentwrapper {
+    float: left;
+    width: 100%;
+}
+
+div.bodywrapper {
+    margin: 0 0 0 230px;
+}
+
+hr{
+    border: 1px solid #B1B4B6;
+    
+}
+ 
+div.document {
+    background-color: #eee;
+}
+ 
+div.body {
+    background-color: #ffffff;
+    color: #3E4349;
+    padding: 30px 30px 30px 30px;
+    font-size: 0.8em;
+}
+ 
+div.footer {
+    color: #555;
+       background-color: #fff;
+    padding: 13px 0;
+    text-align: center;
+    font-size: 75%;
+
+}
+div.footer a {
+    color: #444;
+    text-decoration: underline;
+}
+ 
+div.related {
+    background: #fff url(headerGradient.jpg);
+    line-height: 80px;
+    color: #fff;
+    font-size: 0.80em;
+    height: 79px;
+    z-index: -1;
+}
+
+div.related ul {
+    background: url(pacbioLogo.png) 10px no-repeat;
+    padding: 0 0 0 200px;
+}
+ 
+div.related a {
+    color: #E2F3CC;
+}
+ 
+div.sphinxsidebar {
+    font-size: 0.75em;
+    line-height: 1.5em;
+}
+
+div.sphinxsidebarwrapper{
+    padding: 20px 0;
+}
+ 
+div.sphinxsidebar h3,
+div.sphinxsidebar h4 {
+    font-family: Arial, sans-serif;
+    color: #222;
+    font-size: 1.2em;
+    font-weight: bold;
+    margin: 0;
+    padding: 5px 10px 0 10px;
+}
+
+div.sphinxsidebar h4{
+    font-size: 1.1em;
+}
+ 
+div.sphinxsidebar h3 a {
+    color: #444;
+}
+ 
+ 
+div.sphinxsidebar p {
+    color: #888;
+    padding: 0px 20px;
+       margin-top: 5px;
+}
+ 
+div.sphinxsidebar p.topless {
+}
+ 
+div.sphinxsidebar ul {
+    margin: 5px 20px 10px 20px;
+    padding: 0;
+    color: #000;
+}
+ 
+div.sphinxsidebar a {
+    color: #444;
+}
+ 
+div.sphinxsidebar input {
+    border: 1px solid #ccc;
+    font-family: sans-serif;
+    font-size: 1em;
+}
+
+div.sphinxsidebar input[type=text]{
+    margin-left: 20px;
+}
+ 
+/* -- body styles ----------------------------------------------------------- */
+ 
+a {
+    color: #005B81;
+    text-decoration: none;
+}
+ 
+a:hover {
+    color: #E32E00;
+    text-decoration: underline;
+}
+ 
+div.body h1,
+div.body h2,
+div.body h3,
+div.body h4,
+div.body h5,
+div.body h6 {
+    font-family: Arial, sans-serif;
+    font-weight: bold;
+    color: #264868;
+    margin: 30px 0px 10px 0px;
+    padding: 5px 0 5px 0px;
+}
+ 
+div.body h1 { border-top: 20px solid white; margin-top: 0; font-size: 180%; font-weight: normal; }
+div.body h2 { font-size: 125%; }
+div.body h3 { font-size: 110%; }
+div.body h4 { font-size: 100%; }
+div.body h5 { font-size: 100%; }
+div.body h6 { font-size: 100%; }
+ 
+a.headerlink {
+    color: #c60f0f;
+    font-size: 0.8em;
+    padding: 0 4px 0 4px;
+    text-decoration: none;
+}
+ 
+a.headerlink:hover {
+    background-color: #c60f0f;
+    color: white;
+}
+ 
+div.body p, div.body dd, div.body li {
+    line-height: 1.5em;
+    font-size: 1em;
+}
+ 
+div.admonition p.admonition-title + p {
+    display: inline;
+}
+
+div.highlight{
+    background-color: white;
+}
+
+div.note {
+    background-color: #eee;
+    border: 1px solid #ccc;
+}
+ 
+div.seealso {
+    background-color: #ffc;
+    border: 1px solid #ff6;
+}
+ 
+div.topic {
+    background-color: #eee;
+}
+ 
+div.warning {
+    background-color: #ffe4e4;
+    border: 1px solid #f66;
+}
+ 
+p.admonition-title {
+    display: inline;
+}
+ 
+p.admonition-title:after {
+    content: ":";
+}
+ 
+pre {
+    padding: 10px;
+    background-color: White;
+    color: #222;
+    line-height: 1.2em;
+    border: 1px solid #C6C9CB;
+    font-size: 1.2em;
+    margin: 1.5em 0 1.5em 0;
+    -webkit-box-shadow: 1px 1px 1px #d8d8d8;
+    -moz-box-shadow: 1px 1px 1px #d8d8d8;
+}
+ 
+tt {
+    background-color: #ecf0f3;
+    color: #222;
+    padding: 1px 2px;
+    font-size: 1.2em;
+    font-family: monospace;
+}
+
diff --git a/docs/source/pacbio-theme/static/pacbioLogo.png b/docs/source/pacbio-theme/static/pacbioLogo.png

new file mode 100644 (file)

index 0000000..b2e4887

Binary files /dev/null and b/docs/source/pacbio-theme/static/pacbioLogo.png differ
diff --git a/docs/source/pacbio-theme/static/pygments.css b/docs/source/pacbio-theme/static/pygments.css

new file mode 100644 (file)

index 0000000..4588cde
--- /dev/null
+++ b/docs/source/pacbio-theme/static/pygments.css
@@ -0,0 +1,55 @@
+.c { color: #999988; font-style: italic } /* Comment */
+.k { font-weight: bold } /* Keyword */
+.o { font-weight: bold } /* Operator */
+.cm { color: #999988; font-style: italic } /* Comment.Multiline */
+.cp { color: #999999; font-weight: bold } /* Comment.preproc */
+.c1 { color: #999988; font-style: italic } /* Comment.Single */
+.gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
+.ge { font-style: italic } /* Generic.Emph */
+.gr { color: #aa0000 } /* Generic.Error */
+.gh { color: #999999 } /* Generic.Heading */
+.gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
+.go { color: #111 } /* Generic.Output */
+.gp { color: #555555 } /* Generic.Prompt */
+.gs { font-weight: bold } /* Generic.Strong */
+.gu { color: #aaaaaa } /* Generic.Subheading */
+.gt { color: #aa0000 } /* Generic.Traceback */
+.kc { font-weight: bold } /* Keyword.Constant */
+.kd { font-weight: bold } /* Keyword.Declaration */
+.kp { font-weight: bold } /* Keyword.Pseudo */
+.kr { font-weight: bold } /* Keyword.Reserved */
+.kt { color: #445588; font-weight: bold } /* Keyword.Type */
+.m { color: #009999 } /* Literal.Number */
+.s { color: #bb8844 } /* Literal.String */
+.na { color: #008080 } /* Name.Attribute */
+.nb { color: #999999 } /* Name.Builtin */
+.nc { color: #445588; font-weight: bold } /* Name.Class */
+.no { color: #ff99ff } /* Name.Constant */
+.ni { color: #800080 } /* Name.Entity */
+.ne { color: #990000; font-weight: bold } /* Name.Exception */
+.nf { color: #990000; font-weight: bold } /* Name.Function */
+.nn { color: #555555 } /* Name.Namespace */
+.nt { color: #000080 } /* Name.Tag */
+.nv { color: purple } /* Name.Variable */
+.ow { font-weight: bold } /* Operator.Word */
+.mf { color: #009999 } /* Literal.Number.Float */
+.mh { color: #009999 } /* Literal.Number.Hex */
+.mi { color: #009999 } /* Literal.Number.Integer */
+.mo { color: #009999 } /* Literal.Number.Oct */
+.sb { color: #bb8844 } /* Literal.String.Backtick */
+.sc { color: #bb8844 } /* Literal.String.Char */
+.sd { color: #bb8844 } /* Literal.String.Doc */
+.s2 { color: #bb8844 } /* Literal.String.Double */
+.se { color: #bb8844 } /* Literal.String.Escape */
+.sh { color: #bb8844 } /* Literal.String.Heredoc */
+.si { color: #bb8844 } /* Literal.String.Interpol */
+.sx { color: #bb8844 } /* Literal.String.Other */
+.sr { color: #808000 } /* Literal.String.Regex */
+.s1 { color: #bb8844 } /* Literal.String.Single */
+.ss { color: #bb8844 } /* Literal.String.Symbol */
+.bp { color: #999999 } /* Name.Builtin.Pseudo */
+.vc { color: #ff99ff } /* Name.Variable.Class */
+.vg { color: #ff99ff } /* Name.Variable.Global */
+.vi { color: #ff99ff } /* Name.Variable.Instance */
+.il { color: #009999 } /* Literal.Number.Integer.Long */
+
diff --git a/docs/source/pacbio-theme/theme.conf b/docs/source/pacbio-theme/theme.conf

new file mode 100644 (file)

index 0000000..dd24a1a
--- /dev/null
+++ b/docs/source/pacbio-theme/theme.conf
@@ -0,0 +1,4 @@
+[theme]
+inherit = default 
+stylesheet = pacbio.css
+pygments_style = tango
diff --git a/docs/source/requirements.txt b/docs/source/requirements.txt

new file mode 100644 (file)

index 0000000..cd6467e
--- /dev/null
+++ b/docs/source/requirements.txt
@@ -0,0 +1 @@
+breathe
diff --git a/docs/source/swig_bindings.rst b/docs/source/swig_bindings.rst

new file mode 100644 (file)

index 0000000..e9dc33a
--- /dev/null
+++ b/docs/source/swig_bindings.rst
@@ -0,0 +1,257 @@
+.. _swig_bindings:
+
+Additional Languages
+====================
+
+pbbam uses SWIG to generate bindings for other languages. Currently this includes support for C#, Python, and R.
+
+These bindings are disabled by default. See the entry below for your target language to configure pbbam & integrate
+the bindings into your project.
+
+.. _swig_bindings-csharp:
+
+C#
+------
+
+Building
+````````
+
+To build the support for C#, you need to tell CMake to enable it before building:
+
+.. code-block:: console
+
+   $ cmake .. -DPacBioBAM_wrap_csharp
+   $ make
+
+The 'make' step will build relevant libraries/wrappers, and then run a simple program using them, 
+as a quick sanity-check. 
+
+After building, the libraries and wrappers can be found under the pbbam/lib/csharp directory. 
+
+API Example
+```````````
+
+.. code-block:: c#
+
+   using PacBio.BAM;
+
+   namespace TestStuff
+   {
+       public class TestPbbam
+       {
+           public static void TestZmwQuery()
+           {
+               var d = new DataSet("foo.bam");
+               var q = new ZmwQuery(new IntList {1, 2, 3}, d);
+               var q2 = new ZmwQuery(new IntList { 14743 }, d);
+               if (0 != q.Count() || 4 != q2.Count())
+               {
+                   throw new Exception("ZmwQuery not working");
+               }
+               Console.WriteLine("TestZmwQuery - OK!");
+           }
+       }
+   }
+
+.. _swig_bindings-python:
+
+Python
+------
+
+Building
+````````
+
+To build the support for Python, you need to tell CMake to enable it:
+
+.. code-block:: console
+
+   $ cmake .. -DPacBioBAM_wrap_python
+   $ make
+
+The 'make' step will build relevant libraries/wrappers, and then run a simple program using them, 
+as a quick sanity-check. 
+
+After building, the libraries and wrappers can be found in the pbbam/lib/python directory. 
+'make test' will also include some Python-side unit tests as well.
+
+To use the PacBioBam module, you can set your PYTHONPATH before invoking your script:
+
+.. code-block:: console
+
+   $ PYTHONPATH="path/to/pbbam/lib/python" python myScript.py
+
+Or otherwise configure your environment to find the PacBioBam module. 
+
+API Example
+```````````
+
+.. code-block:: python
+
+   import PacBioBam
+   
+   try:
+       file = PacBioBam.BamFile('foo.bam')
+       writer = PacBioBam.BamWriter('new.bam', file.Header())
+       dataset = PacBioBam.DataSet(file)
+       entireFile = PacBioBam.EntireFileQuery(dataset)
+       for record in PacBioBam.Iterate(entireFile):
+           writer.Write(record)
+   except RuntimeError:
+       # found error
+   
+Python-Specific Notes
+`````````````````````
+   
+Iteration
+.........
+
+Iteration over dataset queries in Python will likely need to use the PacBioBam.Iterate() method. Thus
+file iteration loops will look something like the following:
+
+.. code-block:: python
+       
+   entireFile = PacBioBam.EntireFileQuery("input.bam")
+   for record in PacBioBam.Iterate(entireFile):
+       foo.bar(record)
+
+Exception Handling
+..................
+   
+Exceptions are used widely by the C++ library. To handle them from Python, you can use try blocks, looking for
+any RuntimeError:
+
+.. code-block:: python
+
+   try:
+       file = PacBioBam.BamFile("does_not_exist.bam")
+   except RuntimeError: 
+       print("caught expected error")
+   
+.. _swig_bindings-r:
+
+R
+------
+
+Building
+````````
+
+To build the support for R, you need to tell CMake to enable it:
+
+.. code-block:: console
+
+   $ cmake .. -DPacBioBAM_wrap_r
+   $ make
+   
+The 'make' step will build relevant libraries/wrappers, and then run a simple program using them, 
+as a quick sanity-check. 
+
+After building, the libraries and wrappers can be found in the pbbam/lib/R directory. 
+'make test' will also include some R-side unit tests as well.   
+
+To use the PacBioBam module in your script, nothing should be needed up front - simply invoke 'R' as normal. 
+You'll do the dynamic load of the R module near the beginning of your script:
+
+.. code-block:: r
+
+   # load pbbam R library
+   lib_path <- "path/to/pbbam/lib/R"
+   pbbam_libname <- paste(lib_path, "PacBioBam",   sep="/")
+   pbbam_wrapper <- paste(lib_path, "PacBioBam.R", sep="/")
+   dyn.load(paste(pbbam_libname, .Platform$dynlib.ext, sep=""))
+   source(pbbam_wrapper)
+   cacheMetaData(1) 
+
+
+API Example
+```````````
+
+.. code-block:: r
+
+   # load pbbam R library
+   lib_path <- "path/to/pbbam/lib/R"
+   pbbam_libname <- paste(lib_path, "PacBioBam",   sep="/")
+   pbbam_wrapper <- paste(lib_path, "PacBioBam.R", sep="/")
+   dyn.load(paste(pbbam_libname, .Platform$dynlib.ext, sep=""))
+   source(pbbam_wrapper)
+   cacheMetaData(1)    
+  
+   # sample method
+   copyFileAndFetchRecordNames <-function(inputFn, outputFn) {
+       
+       result <- tryCatch(
+       {
+           file   <- BamFile(inputFn)
+           writer <- BamWriter(outputFn, file$Header())
+           ds     <- DataSet(file)
+            
+           entireFile <- EntireFileQuery(ds)
+           iter <- entireFile$begin()
+           end  <- entireFile$end()
+                       
+           while ( iter$'__ne__'(end) ) {
+               record <- iter$value()
+                
+               names_in <- c(names_in, record$FullName())
+               writer$Write(record)
+               iter$incr()
+            }
+            writer$TryFlush()
+            return(names_in)
+        },
+        error = function(e) {
+            # handle error 
+            return(list())
+        })
+        return(result)
+   }
+
+R-Specific Notes
+````````````````
+
+Iteration
+.........
+
+To compare iterators, you'll need to explicitly use the '__eq__' or '__ne__' methods. Thus iterating over
+a data query, will look something like this:
+
+.. code-block:: r
+
+   iter <- query$begin()
+   end  <- query$end()
+   while ( iter$'__ne__'(end) ) {
+       record <- iter$value() 
+       
+       # do stuff with record
+   }
+   
+operator[]
+..........  
+   
+In C++, operator[] can be used in some classes to directly access elements in a sequence, e.g. Cigar string
+
+.. code-block:: cpp
+
+   CigarOperation op = cigar[0]; 
+   
+For the R wrapper, if you want to do the same sort of thing, you'll need to use the '__getitem__' method. 
+Please note that these are **0-based** indices, not 1-based as in much of R. 
+
+.. code-block:: r
+
+   op <- cigar$'__getitem__'(0) 
+   
+Exception Handling
+..................
+
+Exceptions are used widely by the C++ library. To handle them from R, you can use the 'tryCatch' block, listening for 
+'error' type exceptions.
+
+ .. code-block:: r
+ 
+    result <- tryCatch(
+    {
+        f <- BamFile("does_not_exist.bam") # this statement will throw
+    },
+    error = function(e) {
+        print(paste("caught expected erorr: ",e))
+    })
diff --git a/docs/source/tools/bam2sam.rst b/docs/source/tools/bam2sam.rst

new file mode 100644 (file)

index 0000000..4577686
--- /dev/null
+++ b/docs/source/tools/bam2sam.rst
@@ -0,0 +1,21 @@
+.. _bam2sam:
+
+bam2sam
+=======
+
+::
+
+  Usage: bam2sam [options] [input]
+
+  bam2sam converts a BAM file to SAM. It is essentially a stripped-down 'samtools
+  view', mostly useful for testing/debugging without requiring samtools. Input BAM
+  file is read from a file or stdin, and SAM output is written to stdout.
+
+  Options:
+    -h, --help            show this help message and exit
+    --version             show program's version number and exit
+
+  Options:
+    input               Input BAM file. If not provided, stdin will be used as input.
+    --no-header         Omit header from output.
+    --header-only       Print only the header (no records).
diff --git a/docs/source/tools/pbindex.rst b/docs/source/tools/pbindex.rst

new file mode 100644 (file)

index 0000000..e7c491f
--- /dev/null
+++ b/docs/source/tools/pbindex.rst
@@ -0,0 +1,18 @@
+.. _pbindex:
+
+pbindex
+=======
+
+::
+
+  Usage: pbindex <input>
+
+  pbindex creates a index file that enables random-access to PacBio-specific data
+  in BAM files. Generated index filename will be the same as input BAM plus .pbi suffix.
+
+  Options:
+    -h, --help            show this help message and exit
+    --version             show program's version number and exit
+
+  Input/Output:
+    input                 Input BAM file
diff --git a/docs/source/tools/pbindexdump.rst b/docs/source/tools/pbindexdump.rst

new file mode 100644 (file)

index 0000000..6829064
--- /dev/null
+++ b/docs/source/tools/pbindexdump.rst
@@ -0,0 +1,233 @@
+.. _pbindexdump:
+
+pbindexdump
+===========
+
+::
+
+  Usage: pbindexdump [options] [input]
+
+  pbindexdump prints a human-readable view of PBI data to stdout.
+
+  Options:
+    -h, --help            show this help message and exit
+    --version             show program's version number and exit
+
+  Input/Output:
+    input               Input PBI file. If not provided, stdin will be used as input.
+    --format=STRING     Output format, one of:
+                            json, cpp
+
+                        json: pretty-printed JSON [default]
+
+                        cpp: copy/paste-able C++ code that can be used to
+                        construct the equivalent PacBio::BAM::PbiRawData object
+
+  JSON Formatting:
+    --json-indent-level=INT
+                        JSON indent level [4]
+    --json-raw          Prints fields in a manner that more closely reflects the
+                        PBI file format - presenting data as per-field columns,
+                        not per-record objects.
+
+JSON Output Schemas
+-------------------
+
+Normal JSON:
+
+.. code-block:: JSON
+
+    {
+      "type": "object",
+      "properties": {
+        "fileSections": {
+          "type": "array",
+          "items": { "type": "string" },
+        },
+        "numReads": { "type": "integer" },
+        "reads": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "aEnd": { "type": "integer" },
+              "aStart": { "type": "integer" },
+              "bcForward": { "type": "integer" },
+              "bcQuality": { "type": "integer" },
+              "bcReverse": { "type": "integer" },
+              "contextFlag": { "type": "integer" },
+              "fileOffset": { "type": "integer" },
+              "holeNumber": { "type": "integer" },
+              "mapQuality": { "type": "integer" },
+              "nM": { "type": "integer" },
+              "nMM": { "type": "integer" },
+              "qEnd": { "type": "integer" },
+              "qStart": { "type": "integer" },
+              "readQuality": { "type": "number" },
+              "reverseStrand": { "type": "integer" },
+              "rgId": { "type": "integer" },
+              "tEnd": { "type": "integer" },
+              "tId": { "type": "integer" },
+              "tStart: { "type": "integer" }
+            },
+            "required": [
+              "contextFlag",
+              "fileOffset",
+              "holeNumber",
+              "qEnd",
+              "qStart",
+              "readQuality",
+              "rgId"
+            ]
+          }
+        },
+        "references": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "beginRow": { "type": "integer" },
+              "endRow": { "type": "integer" },
+              "tId": { "type": "integer" }
+            },
+            "required" : [ "beginRow", "endRow","tId" ]
+          }
+        }q
+        "version": { "type": "string" }
+      },
+      "required": [
+        "fileSections",
+        "numReads",
+        "reads",
+        "version"
+      ]
+    }
+
+"Raw" JSON:
+
+.. code-block:: JSON
+
+    {
+      "type": "object",
+      "properties": {
+        "barcodeData" : {
+          "type" : "object",
+          "properties: {
+            "bcForward" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "bcQuality" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "bcReverse" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            }
+          }
+        },
+        "basicData" : {
+          "type" : "object",
+          "properties: {
+            "contextFlag" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "fileOffset" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "holeNumber" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "qEnd" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "qStart" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "readQuality" : {
+              "type": "array",
+              "items" : { "type": "number" }
+            },
+            "rgId : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            }
+          }
+        },
+        "fileSections": {
+          "type": "array",
+          "items": { "type": "string" },
+        },
+        "mappedData" : {
+          "type" : "object",
+          "properties: {
+            "aEnd" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "aStart" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "mapQuality" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "nM" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "nMM" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "readQuality" : {
+              "type": "array",
+              "items" : { "type": "number" }
+            },
+            "reverseStrand" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "tEnd" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "tId" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "tStart" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            }
+          }
+        },
+        "numReads": { "type": "integer" },
+        "references": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "beginRow": { "type": "integer" },
+              "endRow": { "type": "integer" },
+              "tId": { "type": "integer" }
+            },
+            "required" : [ "beginRow", "endRow","tId" ]
+          }
+        },
+        "version" : { "type": "string" }
+      },
+      "required": [
+        "fileSections",
+        "numReads",
+        "basicData",
+        "version"
+      ]
+    }
diff --git a/docs/source/tools/pbmerge.rst b/docs/source/tools/pbmerge.rst

new file mode 100644 (file)

index 0000000..937ec56
--- /dev/null
+++ b/docs/source/tools/pbmerge.rst
@@ -0,0 +1,30 @@
+.. _pbmerge:
+
+pbmerge
+=======
+
+::
+
+  Usage: pbmerge [options] [-o <out.bam>] <INPUT>
+
+  pbmerge merges PacBio BAM files. If the input is DataSetXML, any filters will be
+  applied. If no output filename is specified, new BAM will be written to stdout.
+
+  Options:
+  -h, --help            show this help message and exit
+  --version             show program's version number and exit
+
+  Input/Output:
+    -o output           Output BAM filename.
+    --no-pbi            Set this option to skip PBI index file creation. PBI
+                        creation is automatically skipped if no output filename
+                        is provided.
+    INPUT               Input may be one of:
+                            DataSetXML, list of BAM files, or FOFN
+
+                            fofn: pbmerge -o merged.bam bams.fofn
+
+                            bams: pbmerge -o merged.bam 1.bam 2.bam 3.bam
+
+                            xml:  pbmerge -o merged.bam foo.subreadset.xml
+
diff --git a/docs/specs/pbbam.rst b/docs/specs/pbbam.rst

new file mode 100644 (file)

index 0000000..6842371
--- /dev/null
+++ b/docs/specs/pbbam.rst
@@ -0,0 +1,631 @@
+=================================================================
+**pbbam Software Design & Functional Specification**
+=================================================================
+| *Version 0.1*
+| *Pacific Biosciences Engineering Group*
+| *Jan 29, 2016*
+
+1. Revision History
+===================
+
++-------------+---------------+--------------------+---------------------------+
+| **Date**    | **Revision**  | **Author(s)**      | **Comments**              |
++=============+===============+====================+===========================+
+| 01-29-2016  | 0.1           | Derek Barnett      | Initial draft created     |
+|             |               |                    |                           |
++-------------+---------------+--------------------+---------------------------+
+
+2. Introduction
+===============
+
+2.1. Document Specification Identifier
+--------------------------------------
+
++-----------------------------------+------------------------------------------+
+| **Document Specification Prefix** | **Description**                          |
++===================================+==========================================+
+| FS\_SA\_PBBAM\_                   | Functional spec for pbbam                |
++-----------------------------------+------------------------------------------+
+
+2.2. Purpose
+------------
+
+This document is intended to describe the requirements and interface of the pbbam
+library, which provides functionality for creating, querying, and editing PacBio
+BAM files and associated file formats.
+
+2.3. Scope of Document
+----------------------
+
+This document covers the expected usage of the pbbam library, as well as any
+desired or required performance characteristics with respect to quality or speed.
+
+This document does not provide installation instructions or API documentation.
+
+2.4. Glossary of Terms
+----------------------
+
+The table below specifies only terms specific to this document, and skips
+acronyms/terms that are specified in `Pacific Biosciences Software Glossary`_.
+
+.. _Pacific Biosciences Software Glossary: http://smrtanalysis-docs/pb_sw_glossary.html
+
++------------------+-----------------------------------------------------------+
+| **Acronym/Term** | **Description**                                           |
++==================+===========================================================+
+| API              | Application Programming Interface - a set of routines,    |
+|                  | protocols, and tools for building software applications.  |
+|                  | In this document , this will consist of one or more       |
+|                  | cooperating libraries that specify data structures,       |
+|                  | methods, etc. for use within a target programming         |
+|                  | language.                                                 |
++------------------+-----------------------------------------------------------+
+| Client           | An application that uses the library.                     |
++------------------+-----------------------------------------------------------+
+| I/O              | Input/output of data.                                     |
++------------------+-----------------------------------------------------------+
+
+2.5. References
+---------------
+
++-------------+------------------------------+--------------------------------------+
+| **Ref No.** | **Document Name, Link**      | **Description**                      |
++=============+==============================+======================================+
+| (1)         | `BAM format`_                | General SAM/BAM specification        |
++-------------+------------------------------+--------------------------------------+
+| (2)         | `PacBio BAM`_                | PacBio BAM specification             |
++-------------+------------------------------+--------------------------------------+
+| (3)         | `PacBio BAM index`_          | PacBio BAM index specification       |
++-------------+------------------------------+--------------------------------------+
+| (4)         | `DataSet XML`_               | PacBio DataSet XML specification     |
++-------------+------------------------------+--------------------------------------+
+| (5)         | `Software Style Guide`_      | PacBio coding standards              |
++-------------+------------------------------+--------------------------------------+
+| (6)         | `SMRT Analysis`_             | General SMRT Analysis infrastructure |
++-------------+------------------------------+--------------------------------------+
+
+.. _BAM format: https://samtools.github.io/hts-specs/SAMv1.pdf
+.. _PacBio BAM: http://pacbiofileformats.readthedocs.org/en/3.0/BAM.html
+.. _PacBio BAM index: http://pacbiofileformats.readthedocs.org/en/3.0/PacBioBamIndex.html
+.. _DataSet XML: https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/DataSet.rst
+.. _Software Style Guide: http://smrtanalysis-docs/_downloads/PBISoftwareStyleGuide.doc
+.. _SMRT Analysis: http://smrtanalysis-docs/smrt_docs.html
+
+3. Software Overview
+====================
+
+3.1. Product Description
+------------------------
+
+As of the 3.0 release of SMRTanalysis, PacBio is embracing the industry standard
+`BAM format`_ (1) for (both aligned and unaligned) basecall data files. We have
+also formulated a BAM companion file format (.bam.pbi) enabling fast access to a
+richer set of per-read information as well as compatibility for software built
+around the legacy cmp.h5 format.
+
+The pbbam library provides components to create, query, & transform PacBio BAM
+data: sequence files and their associated indices. This includes a core C++
+library as well as bindings for additional programming languages.
+
+3.2. Product Functional Capabilities
+------------------------------------
+
+The library must be able to read and write BAM files that conform to the
+`PacBio BAM`_ specification (2). BAM records must be editable e.g. adding
+alignment information. Random access must be supported, whether by genomic
+region or by filtering record features. To this end, the library will be able to
+read, write, and create associated index files - both the standard BAM index
+(.bai) and the `PacBio BAM index`_ (.pbi) (3). In addition to working with
+individual files, datasets of related BAM files will be supported. These are
+described in a `DataSet XML`_ document. (4)
+
+3.3. User Characteristics
+-------------------------
+
++---------------------+--------------------------------------------------------+
+| **User Class/Role** | **User Knowledge and Skill Levels**                    |
++=====================+========================================================+
+| Developer           | Competence in one or more programming languages        |
+|                     | supported (C++, R, Python, C#). No knowledge of        |
+|                     | molecular biology wet lab techniques required.         |
++---------------------+--------------------------------------------------------+
+
+3.4. User Operations and Practices
+----------------------------------
+
+Developer users will interact with the software by incorporating the library
+into a client application.
+
+3.5. Operating Environment
+--------------------------
+
+The software is intended to be run in a Linux or OSX environment, with ideally 4
+or more cores.
+
+3.6. Design and Implementation Constraints
+------------------------------------------
+
+Currently there are no constraints outside the operating environment and speed
+requirements. In particular, as the library will be used for writing the BAM
+files coming off a Sequel instrument, it should be able to keep pace.
+
+3.7. Assumptions and Dependencies
+---------------------------------
+
+Input routines for the library will expect to receive files that conform to the
+`PacBio BAM`_ (2) or `DataSet XML`_ (4) specifications.
+
+The pbbam library depends on Boost, zlib, and htslib libraries.
+
+3.8. Other Software
+-------------------
+
+Output PacBio BAMs will be compatible with the `PacBio BAM`_ specification (2)
+and thus compatible with the general `BAM format`_ specification (1). This
+ensures that a wide variety of downstream tools can interact with data files.
+
+The software uses `CMake`_ as its build system.
+
+The core C++ API relies on the following 3rd party components:
+
+* `zlib`_
+* `htslib`_
+* `Boost`_ (header-only modules)
+
+Wrapper APIs for additional languages (Python, R, C#) are generated by `SWIG`_.
+
+API documentation is generated via `Doxygen`_.
+
+.. _CMake: https://cmake.org/
+.. _zlib: http://www.zlib.net/
+.. _htslib: https://github.com/samtools/htslib
+.. _Boost: http://www.boost.org/
+.. _SWIG: http://www.swig.org/
+.. _Doxygen: http://www.stack.nl/~dimitri/doxygen/
+
+4. External Interfaces
+======================
+
+4.1. User Interfaces
+--------------------
+
+N/A
+
+4.2. Software Interfaces
+------------------------
+
+pbbam will require the following software:
+
+* `htslib`_ & `zlib`_ - provides low-level handling of compressed BAM data
+* `Boost`_ - provides utility classes
+
+Incoming data from upstream components will be compliant with
+PacBio BAM format - see `PacBio BAM`_ specification (2) for more detail.
+
+4.3. Hardware Interfaces
+------------------------
+
+N/A
+
+4.4. Communications Interfaces
+------------------------------
+
+N/A
+
+5. Functional Requirements
+==========================
+
+5.1. Query BAM data by genomic region
+-----------------------------------------
+
+5.1.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall allow client applications to query data, limited to some genomic
+region of interest.
+
+5.1.2. Inputs
+~~~~~~~~~~~~~
+
+* BAM file(s) or DataSet XML
+* a standard index (.bai) for each source BAM file
+* genomic interval (e.g. "chr1:1000-2000")
+
+5.1.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Obtain an `htslib`_ "iterator" object for a given file and region. This will be
+wrapped by pbbam to hide the low-level nature of this type, as well as handling
+memory lifetime.
+
+5.1.4. Outputs
+~~~~~~~~~~~~~~
+
+Iterator providing access to individual BAM records from the input data sources,
+which are aligned to the requested genomic interval.
+
+For example:
+
+.. code:: c++
+
+    GenomicIntervalQuery query(interval, dataset);
+    for (const BamRecord& record : query) {
+        // ... do stuff ...
+    }
+
+
+5.1.5. Regulatory Compliance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+5.2. Query BAM data by filter criteria
+-----------------------------------------
+
+5.2.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall allow client applications to query data, limited to some filter
+criteria (e.g. only reads from ZMW hole number 200 with a read quality of >0.5).
+
+5.2.2. Inputs
+~~~~~~~~~~~~~
+
+* BAM file(s) or DataSet XML
+* a `PacBio BAM index`_ (.pbi) for each source BAM file
+* filters supported by data contained in the PBI
+
+5.2.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Query PBI files(s) for records that match the provided filter criteria. Merge
+contiguous runs of records into record blocks, to minimize seeks. Advancing the
+iterator either reads the next read from the current block or seeks to the next
+block and fetches the next record.
+
+5.2.4. Outputs
+~~~~~~~~~~~~~~
+
+Iterator providing access to individual BAM records from the input data sources,
+which satisfy the requested filter criteria.
+
+For example:
+
+.. code:: c++
+
+    PbiFilterQuery query(filter, dataset);
+    for (const BamRecord& record : query) {
+        // ... do stuff ...
+    }
+
+5.2.5. Regulatory Compliance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+5.3. Write PacBio BAM data
+------------------------------------------
+
+5.3.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall be able to write `PacBio BAM`_ files conforming to the specification.
+
+5.3.2. Inputs
+~~~~~~~~~~~~~
+
+* filename
+* header information
+* BAM records
+
+5.3.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Create file handle for the provided filename, output initial header information.
+As records are passed in, write to file. Upon completion, flush any buffers and
+close file handle.
+
+Multithreading, provided by `htslib`_, will be utilized where possible to speed
+up the compression process - often then main bottleneck of BAM throughput.
+
+5.3.4. Outputs
+~~~~~~~~~~~~~~
+
+BAM file conforming to the `PacBio BAM`_ specification.
+
+5.3.5. Regulatory Compliance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+5.4. Create PacBio BAM index file
+------------------------------------------
+
+5.4.1. Description
+~~~~~~~~~~~~~~~~~~
+
+Much of PacBio BAM data processing relies on the presence of a `PacBio BAM index`_
+file. pbbam shall be able to generate this file type for a `PacBio BAM`_ file.
+
+5.4.2. Inputs
+~~~~~~~~~~~~~
+
+`PacBio BAM`_ file
+
+5.4.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Read through the input BAM records, storing the values relevant to a PBI index.
+At end of file, write the index contents to a file and close.
+
+5.4.4. Outputs
+~~~~~~~~~~~~~~
+
+`PacBio BAM index`_ file
+
+5.4.5. Regulatory Compliance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+6. Non-Functional Requirements
+==============================
+
+6.1. Performance Requirements
+-----------------------------
+
+Since pbbam will be used to write all BAM files coming off a Sequel device, the
+library must keep pace with data generation requirements.
+
+** come back to this, hard numbers ?? **
+
+6.2. Safety Requirements
+------------------------
+
+N/A
+
+6.3. Security Requirements
+--------------------------
+
+N/A
+
+6.4. Quality Attributes
+-----------------------
+
+6.4.1. Availability
+~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+6.4.2. Integrity
+~~~~~~~~~~~~~~~~
+
+Inputs and outputs shall adhere to the PacBio BAM or DataSet XML specifications.
+Files that do not meet this requirement will raise exceptions and will not be
+accepted.
+
+6.4.3. Interoperability
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Inputs and outputs shall adhere to the PacBio BAM or DataSet XML specifications.
+
+6.4.4. Reliability
+~~~~~~~~~~~~~~~~~~
+
+The developed software shall meet the overall product reliability requirements.
+
+6.4.5. Robustness
+~~~~~~~~~~~~~~~~~
+
+pbbam will raise exceptions upon encountering failure cases, allowing client
+applications to recover or report the error to a UI.
+
+6.4.6. Usability
+~~~~~~~~~~~~~~~~
+
+pbbam shall have comprehensive API documentation, available both on- and offline.
+Further documentation will be provided for installation, API usage tips, etc.
+
+Raised exceptions shall carry as much information as possible so that client
+applications can respond with appropriate actions or display useful messages.
+
+6.4.7. Maintainability
+~~~~~~~~~~~~~~~~~~~~~~
+
+The source code of the software covered in this functional specification shall
+adhere to the PacBio `Software Style Guide`_ (9) work instruction, to guarantee
+high quality of code that facilitates maintainability.
+
+6.4.8. Customizability
+~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+6.5. Business Rules
+-------------------
+
+N/A
+
+6.6. Installation and Upgrade
+-----------------------------
+
+Installation and Upgrade of this software will be handled as part of the SMRT
+Analysis subsystem. See `SMRT Analysis`_ (6) specifications for more detail.
+
+Additionally, the library may be built independently, either from internal
+version control (Perforce) or from the public-facing Github repository. In
+either case, `CMake`_ is used to drive the build process.
+
+6.7. Administration
+-------------------
+
+N/A
+
+6.8. User Documentation
+-----------------------
+
+pbbam shall have comprehensive API documentation, available both on- and offline.
+Further documentation will be provided for installation, API usage tips, etc.
+
+The "offline" API documentation may be built directly from the source code, using
+`Doxygen`_. Online documentation will be generated via a continuous integration
+server, thus ensuring it is always pointing to the current codebase.
+
+7. High Level Design
+====================
+
+7.1. Top Level Context
+----------------------
+
+The pbbam library is intended to be linked in with client applications,
+providing programmatic access to data files.
+
+7.2. Use Cases
+--------------
+
+Primary use cases for pbbam include:
+
+* BAM file creation
+* BAM file query - iterable access to various subsets of data
+
+8. Detailed Design
+==================
+
+8.1. Structural Representation
+------------------------------
+
+ *image(s) here*
+
+8.2. Behavioral Representation
+------------------------------
+
+This section provides behavioral (dynamic) representation of how the
+elements of the system realize the required use cases.
+
+Describe how the significant subsystems and classes interact with each
+other to realize the architecturally significant use cases.
+
+Provide a link to a file containing Sequence Diagram or Activity Diagram, when applicable.
+The link may be provided with use of 'image' directive.
+
+Sequence Diagram shows one use case scenario, executed by class model,
+with sequence of operations over period of time (time increased from top
+to bottom). It shows interactions between objects, but does not show
+relationships between them.
+
+Activity Diagram is a virtual representation of the sequential flow and
+control logic of a set of related activities or actions. It is a type of
+flowchart, frequently called Swim Lane Diagram, because activities of
+each entity are presented within its swim lane.
+
+Note: You may use http://wsd tool to auto-generate a sequence diagram from
+a descriptive text file, save the diagram to the wsd site, get link to the image,
+and add this link to the document with use of 'image' directive.
+
+8.3. Information Storage
+------------------------
+
+pbbam software requires no persistent storage outside of availability of input
+and output during analysis.
+
+8.4. Technology Overview
+------------------------
+
+pbbam is implemented in C++-11 and should perform as designed on any UNIX-like
+operating system (Linux distributions, Apple OSX, etc.).
+
+8.5. SOUP Components
+--------------------
+
+pbbam utilizes CMake for its build system. The C++ library uses the following
+3rd-party software components: Boost, htslib, & zlib. Wrappers for additional
+languages are generated using SWIG.
+
+8.6. Deployment and Configuration
+---------------------------------
+
+Please refer to `SMRT Analysis`_ (6) documentation
+
+9. Automated Tests
+==================
+
+9.1. Unit Testing
+-----------------
+
+The library shall have unit tests for all classes & components.
+
+9.2. Performance Testing
+------------------------
+
+Unit tests may evaluate performance requirements as desired.
+
+9.3. Regression Testing
+-----------------------
+
+As its role is primarily in data I/O, pbbam has no "scientific quality/validity"
+metrics that would indicate a regression. Instead, passing its unit tests and
+end-to-end tests will indicate that a regression has not been introduced.
+
+These tests will be run after each check-in and nightly.
+
+10. Requirements Traceability Matrices
+======================================
+
+This section provides traces from requirements specified in PRD/DIR documents to the
+requirements covered in this functional specification, and from these
+functional requirements to corresponding Test Cases/Procedures.
+
+10.1. HPQC Functional Specifications
+------------------------------------
+
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+
+| **PBI_ID**  | **Name**                  | **Description**                                   | **Comment** | **Metric** | **Owner** | **PRD/DIR Path**                          |
++=============+===========================+===================================================+=============+============+===========+===========================================+
+| 5.1         | Query BAM data by         | pbbam shall allow client applications to query    |             |            | dbarnett  |                                           |
+|             | genomic region            | data, limited to some genomic region of interest. |             |            |           |                                           |
+|             |                           |                                                   |             |            |           |                                           |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+
+| 5.2         | Query BAM data by         | pbbam shall allow client applications to query    |             |            | dbarnett  |                                           |
+|             | filter criteria           | data, limited to some filter criteria (e.g. only  |             |            |           |                                           |
+|             |                           | reads from ZMW hole number 200 with a read        |             |            |           |                                           |
+|             |                           | quality of >0.5).                                 |             |            |           |                                           |
+|             |                           |                                                   |             |            |           |                                           |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+
+| 5.3         | Write PacBio BAM data     | pbbam shall be able to write files conforming to  |             |            | dbarnett  |                                           |
+|             |                           | the `PacBio BAM`_ specifictation.                 |             |            |           |                                           |
+|             |                           |                                                   |             |            |           |                                           |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+
+| 5.4         | Create PacBio BAM index   | Much of PacBio BAM data processing relies on the  |             |            | dbarnett  |                                           |
+|             | file                      | presence of a `PacBio BAM index`_ file. pbbam     |             |            |           |                                           |
+|             |                           | shall be able to generate this file type for a    |             |            |           |                                           |
+|             |                           | `PacBio BAM`_ file.                               |             |            |           |                                           |
+|             |                           |                                                   |             |            |           |                                           |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+
+
+10.2. Automated Tests Coverage
+------------------------------
+
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| **FS Item** | **FS Item Title**         | **Use Case Description**                           | **Test Case Name/ID**                                            |
++=============+===========================+====================================================+==================================================================+
+| 5.1         | Query BAM data by         | pbbam shall allow client applications to query     | TODO                                                             |
+|             | genomic region            | data, limited to some genomic region of interest.  |                                                                  |
+|             |                           |                                                    |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.2         | Query BAM data by         | pbbam shall allow client applications to query     | TODO                                                             |
+|             | filter criteria           | data, limited to some filter criteria (e.g. only   |                                                                  |
+|             |                           | reads from ZMW hole number 200 with a read         |                                                                  |
+|             |                           | quality of >0.5).                                  |                                                                  |
+|             |                           |                                                    |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.3         | Write PacBio BAM data     | pbbam shall be able to write files conforming to   | TODO                                                             |
+|             |                           | the `PacBio BAM`_ specifictation.                  |                                                                  |
+|             |                           |                                                    |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.4         | Create PacBio BAM index   | Much of PacBio BAM data processing relies on the   | TODO                                                             |
+|             | file                      | presence of a `PacBio BAM index`_ file. pbbam      |                                                                  |
+|             |                           | shall be able to generate this file type for a     |                                                                  |
+|             |                           | `PacBio BAM`_ file.                                |                                                                  |
+|             |                           |                                                    |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+
diff --git a/docs/specs/pbbam_structure.png b/docs/specs/pbbam_structure.png

new file mode 100755 (executable)

index 0000000..40f50cf

Binary files /dev/null and b/docs/specs/pbbam_structure.png differ
diff --git a/docs/specs/pbbam_updated_release3_2.rst b/docs/specs/pbbam_updated_release3_2.rst

new file mode 100755 (executable)

index 0000000..72d9b76
--- /dev/null
+++ b/docs/specs/pbbam_updated_release3_2.rst
@@ -0,0 +1,618 @@
+=============================================================
+**Pbbam Core API Software Design & Functional Specification**
+=============================================================
+| *Version 0.2*
+| *Pacific Biosciences Engineering Group*
+| *Oct 17, 2016*
+
+1. Revision History
+===================
+
++-------------+---------------+--------------------+---------------------------------+
+| **Date**    | **Revision**  | **Author(s)**      | **Comments**                    |
++=============+===============+====================+=================================+
+| 01-29-2016  | 0.1           | Derek Barnett      | Initial draft created           |
+|             |               |                    |                                 |
++-------------+---------------+--------------------+---------------------------------+
+| 10-17-2016  | 0.2           | Derek Barnett      | Added behavioral representation |
+|             |               |                    | and structural representation   |
+|             |               |                    | diagram                         |
++-------------+---------------+--------------------+---------------------------------+
+
+2. Introduction
+===============
+
+2.1. Document Specification Identifier
+--------------------------------------
+
++-----------------------------------+------------------------------------------+
+| **Document Specification Prefix** | **Description**                          |
++===================================+==========================================+
+| FS\_SA\_PBBAM\_                   | Functional spec for pbbam                |
++-----------------------------------+------------------------------------------+
+
+2.2. Purpose
+------------
+
+This document is intended to describe the requirements and interface of the pbbam
+library, which provides functionality for creating, querying, and editing PacBio
+BAM files and associated file formats.
+
+2.3. Scope of Document
+----------------------
+
+This document covers the expected usage of the pbbam library, as well as any
+desired or required performance characteristics with respect to quality or speed.
+
+This document does not provide installation instructions or API documentation.
+
+2.4. Glossary of Terms
+----------------------
+
+The table below specifies only terms specific to this document, and skips
+acronyms/terms that are specified in `Pacific Biosciences Software Glossary`_.
+
+.. _Pacific Biosciences Software Glossary: http://smrtanalysis-docs/pb_sw_glossary.html
+
++------------------+-----------------------------------------------------------+
+| **Acronym/Term** | **Description**                                           |
++==================+===========================================================+
+| API              | Application Programming Interface - a set of routines,    |
+|                  | protocols, and tools for building software applications.  |
+|                  | In this document, this will consist of one or more        |
+|                  | cooperating libraries that specify data structures,       |
+|                  | methods, etc. for use within a target programming         |
+|                  | language.                                                 |
++------------------+-----------------------------------------------------------+
+| Client           | An application that uses the library.                     |
++------------------+-----------------------------------------------------------+
+| I/O              | Input/output of data.                                     |
++------------------+-----------------------------------------------------------+
+
+2.5. References
+---------------
+
++-------------+------------------------------+--------------------------------------+
+| **Ref No.** | **Document Name, Link**      | **Description**                      |
++=============+==============================+======================================+
+| (1)         | `BAM format`_                | General SAM/BAM specification        |
++-------------+------------------------------+--------------------------------------+
+| (2)         | `PacBio BAM`_                | PacBio BAM specification             |
++-------------+------------------------------+--------------------------------------+
+| (3)         | `PacBio BAM index`_          | PacBio BAM index specification       |
++-------------+------------------------------+--------------------------------------+
+| (4)         | `DataSet XML`_               | PacBio DataSet XML specification     |
++-------------+------------------------------+--------------------------------------+
+| (5)         | `Software Style Guide`_      | PacBio coding standards              |
++-------------+------------------------------+--------------------------------------+
+| (6)         | `SMRT Analysis`_             | General SMRT Analysis infrastructure |
++-------------+------------------------------+--------------------------------------+
+
+.. _BAM format: https://samtools.github.io/hts-specs/SAMv1.pdf
+.. _PacBio BAM: http://pacbiofileformats.readthedocs.org/en/3.0/BAM.html
+.. _PacBio BAM index: http://pacbiofileformats.readthedocs.org/en/3.0/PacBioBamIndex.html
+.. _DataSet XML: https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/DataSet.rst
+.. _Software Style Guide: http://smrtanalysis-docs/_downloads/PBISoftwareStyleGuide.doc
+.. _SMRT Analysis: http://smrtanalysis-docs/smrt_docs.html
+
+3. Software Overview
+====================
+
+3.1. Software Module Description
+--------------------------------
+
+As of the 3.0 release of SMRT Analysis, PacBio is embracing the industry standard
+`BAM format`_ (1) for (both aligned and unaligned) basecall data files. We have
+also formulated a BAM companion file format (.bam.pbi) enabling fast access to a
+richer set of per-read information as well as compatibility for software built
+around the legacy cmp.h5 format.
+
+The pbbam library provides components to create, query, & transform PacBio BAM
+data: sequence files and their associated indices. This includes a core C++
+library as well as bindings for additional programming languages.
+
+3.2. Software Module Functional Capabilities
+--------------------------------------------
+
+The library must be able to read and write BAM files that conform to the
+`PacBio BAM`_ specification (2). BAM records must be editable e.g. adding
+alignment information. Random access must be supported, whether by genomic
+region or by filtering record features. To this end, the library will be able to
+read, write, and create associated index files - both the standard BAM index
+(.bai) and the `PacBio BAM index`_ (.pbi) (3). In addition to working with
+individual files, datasets of related BAM files will be supported. These are
+described in a `DataSet XML`_ document. (4)
+
+3.3. User Characteristics
+-------------------------
+
++---------------------+--------------------------------------------------------+
+| **User Class/Role** | **User Knowledge and Skill Levels**                    |
++=====================+========================================================+
+| Developer           | Competence in one or more programming languages        |
+|                     | supported (C++, R, Python, C#). No knowledge of        |
+|                     | molecular biology wet lab techniques required.         |
++---------------------+--------------------------------------------------------+
+
+3.4. User Operations and Practices
+----------------------------------
+
+Developer users will interact with the software by incorporating the library
+into a client application.
+
+3.5. Operating Environment
+--------------------------
+
+The software is intended to be run in a Linux or OSX environment, with ideally 4
+or more cores.
+
+3.6. General Constraints
+------------------------
+
+Currently there are no constraints outside the operating environment and speed
+requirements. In particular, as the library will be used for writing the BAM
+files coming off a Sequel instrument, it should be able to keep pace.
+
+3.7. Assumptions and Dependencies
+---------------------------------
+
+Input routines for the library will expect to receive files that conform to the
+`PacBio BAM`_ (2) or `DataSet XML`_ (4) specifications.
+
+The pbbam library depends on Boost, zlib, and htslib libraries.
+
+3.8. Other Software
+-------------------
+
+Output PacBio BAMs will be compatible with the `PacBio BAM`_ specification (2)
+and thus compatible with the general `BAM format`_ specification (1). This
+ensures that a wide variety of downstream tools can interact with data files.
+
+The software uses `CMake`_ as its build system.
+
+The core C++ API relies on the following 3rd party components:
+
+* `zlib`_
+* `htslib`_
+* `Boost`_ (header-only modules)
+
+Wrapper APIs for additional languages (Python, R, C#) are generated by `SWIG`_.
+
+API documentation is generated via `Doxygen`_.
+
+.. _CMake: https://cmake.org/
+.. _zlib: http://www.zlib.net/
+.. _htslib: https://github.com/samtools/htslib
+.. _Boost: http://www.boost.org/
+.. _SWIG: http://www.swig.org/
+.. _Doxygen: http://www.stack.nl/~dimitri/doxygen/
+
+4. External Interfaces
+======================
+
+4.1. User Interfaces
+--------------------
+
+N/A
+
+4.2. Software Interfaces
+------------------------
+
+pbbam will require the following software:
+
+* `htslib`_ & `zlib`_ - provides low-level handling of compressed BAM data
+* `Boost`_ - provides utility classes
+
+Incoming data from upstream components will be compliant with
+PacBio BAM format - see `PacBio BAM`_ specification (2) for more detail.
+
+4.3. Hardware Interfaces
+------------------------
+
+N/A
+
+4.4. Communications Interfaces
+------------------------------
+
+N/A
+
+5. Functional Requirements
+==========================
+
+5.1. Query BAM data by genomic region
+-------------------------------------
+
+5.1.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall allow client applications to query data, limited to some genomic
+region of interest.
+
+5.1.2. Inputs
+~~~~~~~~~~~~~
+
+* BAM file(s) or DataSet XML
+* a standard index (.bai) for each source BAM file
+* genomic interval (e.g. "chr1:1000-2000")
+
+5.1.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Obtain an `htslib`_ "iterator" object for a given file and region. This will be
+wrapped by pbbam to hide the low-level nature of this type, as well as handling
+memory lifetime.
+
+5.1.4. Outputs
+~~~~~~~~~~~~~~
+
+Iterator providing access to individual BAM records from the input data sources,
+which are aligned to the requested genomic interval.
+
+For example:
+
+.. code:: c++
+
+    GenomicIntervalQuery query(interval, dataset);
+    for (const BamRecord& record : query) {
+        // ... use record data ...
+    }
+
+
+5.2. Query BAM data by filter criteria
+--------------------------------------
+
+5.2.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall allow client applications to query data, limited to some filter
+criteria (e.g. only reads from ZMW hole number 200 with a read quality of >0.5).
+
+5.2.2. Inputs
+~~~~~~~~~~~~~
+
+* BAM file(s) or DataSet XML
+* a `PacBio BAM index`_ (.pbi) for each source BAM file
+* filters supported by data contained in the PBI
+
+5.2.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Query PBI files(s) for records that match the provided filter criteria. Merge
+contiguous runs of records into record blocks, to minimize seeks. Advancing the
+iterator either reads the next read from the current block or seeks to the next
+block and fetches the next record.
+
+5.2.4. Outputs
+~~~~~~~~~~~~~~
+
+Iterator providing access to individual BAM records from the input data sources,
+which satisfy the requested filter criteria.
+
+For example:
+
+.. code:: c++
+
+    PbiFilterQuery query(filter, dataset);
+    for (const BamRecord& record : query) {
+        // ... do stuff ...
+    }
+
+
+5.3. Write PacBio BAM data
+--------------------------
+
+5.3.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall be able to write `PacBio BAM`_ files conforming to the specification.
+
+5.3.2. Inputs
+~~~~~~~~~~~~~
+
+* filename
+* header information
+* BAM records
+
+5.3.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Create file handle for the provided filename, output initial header information.
+As records are passed in, write to file. Upon completion, flush any buffers and
+close file handle.
+
+Multithreading, provided by `htslib`_, will be utilized where possible to speed
+up the compression process - often then main bottleneck of BAM throughput.
+
+5.3.4. Outputs
+~~~~~~~~~~~~~~
+
+BAM file conforming to the `PacBio BAM`_ specification.
+
+5.4. Create PacBio BAM index file
+---------------------------------
+
+5.4.1. Description
+~~~~~~~~~~~~~~~~~~
+
+Much of PacBio BAM data processing relies on the presence of a `PacBio BAM index`_
+file. pbbam shall be able to generate this file type for a `PacBio BAM`_ file.
+
+5.4.2. Inputs
+~~~~~~~~~~~~~
+
+`PacBio BAM`_ file
+
+5.4.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Read through the input BAM records, storing the values relevant to a PBI index.
+At end of file, write the index contents to a file and close.
+
+5.4.4. Outputs
+~~~~~~~~~~~~~~
+
+`PacBio BAM index`_ file
+
+6. Non-Functional Requirements
+==============================
+
+6.1. Performance Requirements
+-----------------------------
+
+Since pbbam will be used to write all BAM files coming off a Sequel instrument, the
+library must keep pace with data generation requirements.
+
+6.2. Safety Requirements
+------------------------
+
+N/A
+
+6.3. Security Requirements
+--------------------------
+
+N/A
+
+6.4. Quality Attributes
+-----------------------
+
+6.4.1. Availability
+~~~~~~~~~~~~~~~~~~~
+
+The developed software shall meet the overall product availability requirements.
+
+6.4.2. Data Integrity
+~~~~~~~~~~~~~~~~~~~~~
+
+Inputs and outputs shall adhere to the PacBio BAM or DataSet XML specifications.
+Files that do not meet this requirement will raise exceptions and will not be
+accepted.
+
+6.4.3. Interoperability
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Inputs and outputs shall adhere to the PacBio BAM or DataSet XML specifications.
+
+6.4.4. Reliability
+~~~~~~~~~~~~~~~~~~
+
+The developed software shall meet the overall product reliability requirements.
+
+6.4.5. Robustness
+~~~~~~~~~~~~~~~~~
+
+pbbam will raise exceptions upon encountering failure cases, allowing client
+applications to recover or report the error to a UI.
+
+6.4.6. Usability
+~~~~~~~~~~~~~~~~
+
+pbbam shall have comprehensive API documentation, available both on- and offline.
+Further documentation will be provided for installation, API usage tips, etc.
+
+Raised exceptions shall carry as much information as possible so that client
+applications can respond with appropriate actions or display useful messages.
+
+6.4.7. Maintainability
+~~~~~~~~~~~~~~~~~~~~~~
+
+The source code of the software covered in this functional specification shall
+adhere to the PacBio `Software Style Guide`_ (9) work instruction, to guarantee
+high quality of code that facilitates maintainability.
+
+6.4.8. Customizability
+~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+6.4.9. Compatibility
+~~~~~~~~~~~~~~~~~~~~
+
+pbbam shall support backward compatibility of the API and BAM format versions
+in order not to break existing clients.
+
+6.5. Business Rules
+-------------------
+
+N/A
+
+6.6. Compliance Requirements
+----------------------------
+
+N/A
+
+6.7. Alarms and Error Handling
+------------------------------
+
+Raised exceptions shall carry as much information as possible so that client
+applications can respond with appropriate actions or display useful messages.
+
+6.8. Persistence Requirements
+-----------------------------
+
+pbbam software requires no persistent storage outside of availability of input
+and output during analysis.
+
+6.9. Installation and Upgrade
+-----------------------------
+
+Installation and Upgrade of this software will be handled as part of the SMRT
+Analysis subsystem. See `SMRT Analysis`_ (6) specifications for more detail.
+
+Additionally, the library may be built independently, either from internal
+version control (Perforce) or from the public-facing Github repository. In
+either case, `CMake`_ is used to drive the build process.
+
+6.10. Administration and Maintenance
+------------------------------------
+
+N/A
+
+6.11. User Documentation
+------------------------
+
+pbbam shall have comprehensive API documentation, available both on- and offline.
+Further documentation will be provided for installation, API usage tips, etc.
+
+The "offline" API documentation may be built directly from the source code, using
+`Doxygen`_. Online documentation will be generated via a continuous integration
+server, thus ensuring it is always pointing to the current codebase.
+
+7. High Level Design
+====================
+
+7.1. Top Level Context
+----------------------
+
+The pbbam library is intended to be linked in with client applications,
+providing programmatic access to data files.
+
+7.2. Use Cases
+--------------
+
+Primary use cases for pbbam include:
+
+* BAM file creation
+* BAM file query - iterable access to various subsets of data
+
+8. Detailed Design
+==================
+
+8.1. Structural Representation
+------------------------------
+
+.. image:: ./pbbam_structure.png
+
+8.2. Behavioral Representation
+------------------------------
+
+The typical access pattern involves a client query against BAM data, optionally
+described in DataSet XML. The query may involve some number of filter criteria.
+
+pbbam queries the associated index files (*.pbi) to pre-determine which records
+pass filtering criteria and where they reside on disk. The client code is given
+an iterable object, such that each iteration of the main access loop returns a
+valid BAM record for analysis, modification, etc.
+
+8.3. Information Storage
+------------------------
+
+pbbam software requires no persistent storage outside of availability of input
+and output during analysis.
+
+8.4. Technology Overview
+------------------------
+
+pbbam is implemented in C++-11 and should perform as designed on any UNIX-like
+operating system (Linux distributions, Apple OSX, etc.).
+
+8.5. SOUP Components
+--------------------
+
+pbbam utilizes CMake for its build system. The C++ library uses the following
+3rd-party software components: `Boost`_, `htslib`_, & `zlib`_. Wrappers for additional
+languages are generated using SWIG.
+
+8.6. Deployment and Configuration
+---------------------------------
+
+Please refer to `SMRT Analysis`_ (6) documentation
+
+9. Automated Tests
+==================
+
+9.1. Unit Testing
+-----------------
+
+The library shall have unit tests for all classes & components.
+
+9.2. Performance Testing
+------------------------
+
+Unit tests may evaluate performance requirements as desired.
+
+9.3. Regression Testing
+-----------------------
+
+As its role is primarily in data I/O, pbbam has no "scientific quality/validity"
+metrics that would indicate a regression. Instead, passing its unit tests and
+end-to-end tests will indicate that a regression has not been introduced.
+
+These tests will be run after each check-in and nightly.
+
+10. Requirements Traceability Matrices
+======================================
+
+This section provides traces from requirements specified in PRD/DIR documents to the
+requirements covered in this functional specification, and from these
+functional requirements to corresponding Test Cases/Procedures.
+
+10.1. HPQC Functional Specifications
+------------------------------------
+
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+
+| **PBI_ID**  | **Name**                  | **Description**                                   | **Comment** | **Metric** | **Owner** | **PRD/DIR Path**                                 |
++=============+===========================+===================================================+=============+============+===========+==================================================+
+| 5.1         | Query BAM data by         | pbbam shall allow client applications to query    |             | Yes        | dbarnett  | \\DIR\\Functionality\\Software\Common\APIs\\     |
+|             | genomic region            | data, limited to some genomic region of interest. |             |            |           | Software shall provide an API to allow 3rd       |
+|             |                           |                                                   |             |            |           | party software to extract all run information    |
+|             |                           |                                                   |             |            |           | including summary reports and locations          |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+
+| 5.2         | Query BAM data by         | pbbam shall allow client applications to query    |             | Yes        | dbarnett  | \\DIR\\Functionality\\Software\Common\APIs\\     |
+|             | filter criteria           | data, limited to some filter criteria (e.g. only  |             |            |           | Software shall provide an API to allow 3rd       |
+|             |                           | reads from ZMW hole number 200 with a read        |             |            |           | party software to extract all run information    |
+|             |                           | quality of >0.5).                                 |             |            |           | including summary reports and locations          |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+
+| 5.3         | Write PacBio BAM data     | pbbam shall be able to write files conforming to  |             | Yes        | dbarnett  | \\DIR\\Functionality\\Software\\PostProcessing\\ |
+|             |                           | the `PacBio BAM`_ specification.                  |             |            |           | Software shall provide base files including      |
+|             |                           |                                                   |             |            |           | kinetic information in industry standard format  |
+|             |                           |                                                   |             |            |           | such as SAM/BAM using current specifications     |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+
+| 5.4         | Create PacBio BAM index   | Much of PacBio BAM data processing relies on the  |             | Yes        | dbarnett  | \\DIR\\Functionality\\Software\\PostProcessing\\ |
+|             | file                      | presence of a `PacBio BAM index`_ file. pbbam     |             |            |           | Software shall provide base files including      |
+|             |                           | shall be able to generate this file type for a    |             |            |           | kinetic information in industry standard format  |
+|             |                           | `PacBio BAM`_ file.                               |             |            |           | such as SAM/BAM using current specifications     |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+
+
+10.2. Automated Tests Coverage
+------------------------------
+
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| **FS Item** | **FS Item Title**         | **Use Case Description**                           | **Test Case Name/ID**                                            |
++=============+===========================+====================================================+==================================================================+
+| 5.1         | Query BAM data by         | pbbam shall allow client applications to query     | See section 9.1. Unit Testing.                                   |
+|             | genomic region            | data, limited to some genomic region of interest.  |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.2         | Query BAM data by         | pbbam shall allow client applications to query     | See section 9.1. Unit Testing.                                   |
+|             | filter criteria           | data, limited to some filter criteria (e.g. only   |                                                                  |
+|             |                           | reads from ZMW hole number 200 with a read         |                                                                  |
+|             |                           | quality of >0.5).                                  |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.3         | Write PacBio BAM data     | pbbam shall be able to write files conforming to   | See section 9.1. Unit Testing.                                   |
+|             |                           | the `PacBio BAM`_ specification.                   |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.4         | Create PacBio BAM index   | Much of PacBio BAM data processing relies on the   | See section 9.1. Unit Testing.                                   |
+|             | file                      | presence of a `PacBio BAM index`_ file. pbbam      |                                                                  |
+|             |                           | shall be able to generate this file type for a     |                                                                  |
+|             |                           | `PacBio BAM`_ file.                                |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+
diff --git a/include/pbbam/Accuracy.h b/include/pbbam/Accuracy.h

new file mode 100644 (file)

index 0000000..f1db014
--- /dev/null
+++ b/include/pbbam/Accuracy.h
@@ -0,0 +1,89 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Accuracy.h
+/// \brief Defines the Accuracy class.
+//
+// Author: Derek Barnett
+
+#ifndef ACCURACY_H
+#define ACCURACY_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The Accuracy class represents the expected accuracy of a BamRecord.
+///
+/// Values are clamped to fall within [0,1].
+///
+class PBBAM_EXPORT Accuracy
+{
+public:
+    static const float MIN; ///< Minimum valid accuracy value [0.0]
+    static const float MAX; ///< Maximum valid accuracy value [1.0]
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// Constructs an Accuracy object from a floating-point number.
+    ///
+    /// \note This is not an \b explicit ctor, to make it as easy as
+    ///       possible to use in numeric operations. We really just want
+    ///       to make sure that the acceptable range is respected.
+    ///
+    Accuracy(float accuracy);
+    Accuracy(const Accuracy& other);
+    ~Accuracy(void);
+
+    /// \}
+
+public:
+    /// \returns Accuracy as float primitive
+    operator float(void) const;
+
+private:
+    float accuracy_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/Accuracy.inl"
+
+#endif // ACCURACY_H
diff --git a/include/pbbam/AlignmentPrinter.h b/include/pbbam/AlignmentPrinter.h

new file mode 100644 (file)

index 0000000..4dda6cd
--- /dev/null
+++ b/include/pbbam/AlignmentPrinter.h
@@ -0,0 +1,110 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file AlignmentPrinter.h
+/// \brief Defines the AlignmentPrinter class.
+//
+// Author: Armin Töpfer
+
+#ifndef ALIGNMENTPRINTER_H
+#define ALIGNMENTPRINTER_H
+
+#include <memory>
+#include <string>
+#include "pbbam/BamRecord.h"
+#include "pbbam/IndexedFastaReader.h"
+#include "pbbam/Orientation.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamRecord;
+
+/// \brief The AlignmentPrinter class "pretty-prints" an alignment with respect
+///        to its associated reference sequence.
+///
+/// Example output:
+/// \verbinclude plaintext/AlignmentPrinterOutput.txt
+///
+class AlignmentPrinter
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// Constructs the alignment printer with an associated FASTA file reader.
+    ///
+    /// \param[in] ifr FASTA reader
+    ///
+    /// \throws std::runtime_error if FASTA file cannot be opened for reading.
+    ///
+    AlignmentPrinter(const IndexedFastaReader& ifr);
+
+    AlignmentPrinter(void) = delete;
+    AlignmentPrinter(const AlignmentPrinter&) = delete;
+    AlignmentPrinter(AlignmentPrinter&&) = default;
+    AlignmentPrinter& operator=(const AlignmentPrinter&) = delete;
+    AlignmentPrinter& operator=(AlignmentPrinter&&) = default;
+    ~AlignmentPrinter(void) = default;
+
+    /// \}
+
+public:
+    /// \name Printing
+    /// \{
+
+    /// Pretty-prints an aligned BamRecord to std::string.
+    ///
+    /// \note The current implementation includes ANSI escape sequences for
+    ///       coloring terminal output. Future versions of this method will
+    ///       likely make this optional.
+    ///
+    /// \returns formatted string containing the alignment and summary
+    ///          information
+    ///
+    std::string Print(const BamRecord& record,
+                      const Orientation orientation = Orientation::GENOMIC);
+
+    /// \}
+
+private:
+       const std::unique_ptr<IndexedFastaReader> ifr_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // ALIGNMENTPRINTER_H
diff --git a/include/pbbam/BaiIndexedBamReader.h b/include/pbbam/BaiIndexedBamReader.h

new file mode 100644 (file)

index 0000000..7441c69
--- /dev/null
+++ b/include/pbbam/BaiIndexedBamReader.h
@@ -0,0 +1,130 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BaiIndexedBamReader.h
+/// \brief Defines the BaiIndexedBamReader class.
+//
+// Author: Derek Barnett
+
+#ifndef BAIINDEXEDBAMREADER_H
+#define BAIINDEXEDBAMREADER_H
+
+#include "pbbam/BamReader.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/GenomicInterval.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal { struct BaiIndexedBamReaderPrivate; }
+
+/// \brief The BaiIndexedBamReader class provides read-only iteration over %BAM
+///        records, bounded by a particular genomic interval.
+///
+/// The SAM/BAM standard index (*.bai) is used to allow random-access operations.
+///
+class PBBAM_EXPORT BaiIndexedBamReader : public BamReader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Constructs %BAM reader, bounded by a genomic interval.
+    ///
+    /// All reads that overlap the interval will be available.
+    ///
+    /// \param[in] interval iteration will be bounded by this GenomicInterval.
+    /// \param[in] filename input %BAM filename
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.bai) fails to open
+    ///         for reading, or if the interval is invalid
+    ///
+    BaiIndexedBamReader(const GenomicInterval& interval,
+                        const std::string& filename);
+
+    /// \brief Constructs BAM reader, bounded by a genomic interval.
+    ///
+    /// All reads that overlap the interval will be available.
+    ///
+    /// \param[in] interval iteration will be bounded by this GenomicInterval.
+    /// \param[in] bamFile input BamFile object
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.bai) fails to open
+    ///         for reading, or if the interval is invalid
+    ///
+    BaiIndexedBamReader(const GenomicInterval& interval, const BamFile& bamFile);
+
+    /// \brief Constructs %BAM reader, bounded by a genomic interval.
+    ///
+    /// All reads that overlap the interval will be available.
+    ///
+    /// \param[in] interval iteration will be bounded by this GenomicInterval.
+    /// \param[in] bamFile input BamFile object
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.bai) fails to open
+    ///         for reading, or if the interval is invalid
+    ///
+    BaiIndexedBamReader(const GenomicInterval& interval, BamFile&& bamFile);
+
+    /// \}
+
+public:
+    /// \name Random-Access
+    /// \{
+
+    /// \returns the current GenomicInterval in use by this reader
+    const GenomicInterval& Interval(void) const;
+
+    /// \brief Sets a new genomic interval on the reader.
+    ///
+    /// \param[in] interval
+    /// \returns reference to this reader
+    ///
+    BaiIndexedBamReader& Interval(const GenomicInterval& interval);
+
+    /// \}
+
+protected:
+    int ReadRawData(BGZF* bgzf, bam1_t* b);
+
+private:
+    std::unique_ptr<internal::BaiIndexedBamReaderPrivate> d_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // BAIINDEXEDBAMREADER_H
diff --git a/include/pbbam/BamFile.h b/include/pbbam/BamFile.h

new file mode 100644 (file)

index 0000000..d7c6811
--- /dev/null
+++ b/include/pbbam/BamFile.h
@@ -0,0 +1,218 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamFile.h
+/// \brief Defines the BamFile class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMFILE_H
+#define BAMFILE_H
+
+#include "pbbam/Config.h"
+#include "pbbam/BamHeader.h"
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal { class BamFilePrivate; }
+
+/// \brief The BamFile class represents a %BAM file.
+///
+/// It provides access to header metadata and methods for finding/creating
+/// associated index files.
+///
+class PBBAM_EXPORT BamFile
+{
+public:
+
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a BamFile object on the provided \p filename &
+    ///        loads header information.
+    ///
+    /// \param[in] filename %BAM filename
+    /// \throws std::exception on failure to open %BAM file for reading
+    ///
+    BamFile(const std::string& filename);
+
+    BamFile(const BamFile& other);
+    BamFile(BamFile&& other);
+    BamFile& operator=(const BamFile& other);
+    BamFile& operator=(BamFile&& other);
+    ~BamFile(void);
+
+    /// \}
+
+public:
+
+    /// \name Index & Filename Methods
+    /// \{
+
+    /// \brief Creates a ".pbi" file for this %BAM file.
+    ///
+    /// \note Existing index file will be overwritten. Use
+    ///       EnsurePacBioIndexExists() if this is not desired.
+    ///
+    /// \throws if PBI file could not be properly created and/or
+    ///         written to disk
+    ///
+    void CreatePacBioIndex(void) const;
+
+    /// \brief Creates a ".bai" file for this %BAM file.
+    ///
+    /// \note Existing index file will be overwritten. Use
+    ///       EnsureStandardIndexExists() if this is not desired.
+    ///
+    /// \throws if BAI file could not be properly created (e.g. this
+    ///         %BAM is not coordinate-sorted) or could not be written to disk
+    ///
+    void CreateStandardIndex(void) const;
+
+    /// \brief Creates a ".pbi" file if one does not exist or is older than its
+    ///        %BAM file.
+    ///
+    /// Equivalent to:
+    /// \code{.cpp}
+    ///    if (!file.PacBioIndexExists())
+    ///        file.CreatePacBioIndex();
+    /// \endcode
+    ///
+    /// \note As of v0.4.02+, no timestamp check is performed. Previously we requr
+    /// with an additional timestamp check.
+    ///
+    /// \throws if PBI file could not be properly created and/or
+    ///         written to disk
+    ///
+    void EnsurePacBioIndexExists(void) const;
+
+    /// \brief Creates a ".bai" file if one does not exist or is older than its
+    ///        %BAM file.
+    ///
+    /// Equivalent to:
+    /// \code{.cpp}
+    ///    if (!file.StandardIndexExists())
+    ///        file.CreateStandardIndex();
+    /// \endcode
+    ///
+    /// \note As of v0.4.2, no timestamp check is performed.
+    ///
+    /// \throws if BAI file could not be properly created (e.g. this
+    ///         %BAM is not coordinate-sorted) or could not be written to disk
+    ///
+    void EnsureStandardIndexExists(void) const;
+
+    /// \returns %BAM filename
+    std::string Filename(void) const;
+
+    /// \returns true if %BAM file has EOF marker (empty BGZF block). Streamed
+    ///          input (filename: "-")
+    bool HasEOF(void) const;
+
+    /// \returns true if ".pbi" exists and is newer than this %BAM file.
+    bool PacBioIndexExists(void) const;
+
+    /// \returns filename of %PacBio index file (".pbi")
+    /// \note No guarantee is made on the existence of this file.
+    ///       This method simply returns the expected filename.
+    std::string PacBioIndexFilename(void) const;
+
+    /// \returns true if ".pbi" has a more recent timestamp than this file
+    bool PacBioIndexIsNewer(void) const;
+
+    /// \returns true if ".bai" exists
+    bool StandardIndexExists(void) const;
+
+    /// \note No guarantee is made on the existence of this file.
+    ///       This method simply returns the expected filename.
+    std::string StandardIndexFilename(void) const;
+
+    /// \returns true if ".bai" has a more recent timestamp than this file
+    bool StandardIndexIsNewer(void) const;
+
+    /// \}
+
+public:
+    /// \name File Header Data
+    /// \{
+
+    /// \returns true if header metadata has this reference name
+    bool HasReference(const std::string& name) const;
+
+    /// \returns const reference to BamHeader containing the file's metadata
+    const BamHeader& Header(void) const;
+
+    /// \returns true if file is a %PacBio %BAM file (i.e. has non-empty version
+    ///          associated with header "pb" tag)
+    bool IsPacBioBAM(void) const;
+
+    /// \returns ID for reference \p name (can be used for e.g.
+    ///          GenomicIntervalQuery), or -1 if not found
+    int ReferenceId(const std::string& name) const;
+
+    /// \return name of reference matching \p id, empty string if not found
+    std::string ReferenceName(const int id) const;
+
+    /// \returns length of requested reference \p name. 0 if not found
+    uint32_t ReferenceLength(const std::string& name) const;
+
+    /// \returns length of requested reference \p id. 0 if not found
+    uint32_t ReferenceLength(const int id) const;
+
+    /// \}
+
+public:
+    /// \name Additional Attributes
+    /// \{
+
+    /// \returns virtual offset of first alignment. Intended mostly for internal
+    ///          use. Note that this is a BGZF \b virtual offset, not a
+    ///          'normal' file position.
+    ///
+    int64_t FirstAlignmentOffset(void) const;
+
+    /// \}
+
+private:
+    std::unique_ptr<internal::BamFilePrivate> d_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // BAMFILE_H
diff --git a/include/pbbam/BamHeader.h b/include/pbbam/BamHeader.h

new file mode 100644 (file)

index 0000000..eada466
--- /dev/null
+++ b/include/pbbam/BamHeader.h
@@ -0,0 +1,418 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamHeader.h
+/// \brief Defines the BamHeader class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMHEADER_H
+#define BAMHEADER_H
+
+#include "pbbam/Config.h"
+#include "pbbam/ProgramInfo.h"
+#include "pbbam/ReadGroupInfo.h"
+#include "pbbam/SequenceInfo.h"
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal { class BamHeaderPrivate; }
+
+/// \brief The BamHeader class represents the header section of the %BAM file.
+///
+/// It provides metadata about the file including file version, reference
+/// sequences, read groups, comments, etc.
+///
+/// A BamHeader may be fetched from a BamFile to view an existing file's
+/// metadata. Or one may be created/edited for use with writing to a new file
+/// (via BamWriter).
+///
+/// \note A particular BamHeader is likely to be re-used in lots of places
+///       throughout the library, for read-only purposes. For this reason, even
+///       though a BamHeader may be returned by value, it is essentially a thin
+///       wrapper for a shared-pointer to the actual data. This means, though,
+///       that if you need to edit an existing BamHeader for use with a
+///       BamWriter, please consider using BamHeader::DeepCopy. Otherwise any
+///       modifications will affect all BamHeaders that are sharing its
+///       underlying data.
+///
+class PBBAM_EXPORT BamHeader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    BamHeader(void);
+    BamHeader(const std::string& samHeaderText);
+    BamHeader(const BamHeader& other);
+    BamHeader(BamHeader&& other);
+    BamHeader& operator=(const BamHeader& other);
+    BamHeader& operator=(BamHeader&& other);
+    ~BamHeader(void);
+
+    /// \brief Detaches underlying data from the shared-pointer, returning a
+    ///        independent copy of the header contents.
+    ///
+    /// This ensures that any modifications to the newly returned BamHeader do
+    /// not affect other BamHeader objects that were sharing its underlying data.
+    ///
+    BamHeader DeepCopy(void) const;
+
+    /// \}
+
+public:
+    /// \name Operators
+    /// \{
+
+    /// \brief Merges another header with this one.
+    ///
+    /// Headers must be compatible for merging. This means that their Version,
+    /// SortOrder, PacBioBamVersion (and in the case of aligned BAM data,
+    /// Sequences) must all match. If not, an exception will be thrown.
+    ///
+    /// \param[in] other  header to merge with this one
+    /// \returns reference to this header
+    ///
+    /// \throws std::runtime_error if the headers are not compatible
+    ///
+    BamHeader& operator+=(const BamHeader& other);
+
+    /// \brief Creates a new, merged header.
+    ///
+    /// Headers must be compatible for merging. This means that their Version,
+    /// SortOrder, PacBioBamVersion (and in the case of aligned BAM data,
+    /// Sequences) must all match. If not, an exception will be thrown.
+    ///
+    /// Both original headers (this header and \p other) will not be modified.
+    ///
+    /// \param[in] other  header to merge with this one
+    /// \returns merged header
+    ///
+    /// \throws std::runtime_error if the headers are not compatible
+    ///
+    BamHeader operator+(const BamHeader& other) const;
+
+    /// \}
+
+public:
+    /// \name General Attributes
+    /// \{
+
+    /// \returns the %PacBio %BAM version number (\@HD:pb)
+    ///
+    /// \note This is different from the SAM/BAM version number
+    /// \sa BamHeader::Version.
+    ///
+    std::string PacBioBamVersion(void) const;
+
+    /// \returns the sort order used
+    ///
+    /// Valid values: "unknown", "unsorted", "queryname", or "coordinate"
+    ///
+    std::string SortOrder(void) const;
+
+    /// \returns the SAM/BAM version number (\@HD:VN)
+    ///
+    /// \note This is different from the %PacBio %BAM version number
+    /// \sa BamHeader::PacBioBamVersion
+    ///
+    std::string Version(void) const;
+
+    /// \}
+
+public:
+    /// \name Read Groups
+    /// \{
+
+    /// \returns true if the header contains a read group with \p id (\@RG:ID)
+    bool HasReadGroup(const std::string& id) const;
+
+    /// \returns a ReadGroupInfo object representing the read group matching
+    ///          \p id (\@RG:ID)
+    /// \throws std::runtime_error if \p id is unknown
+    ///
+    ReadGroupInfo ReadGroup(const std::string& id) const;
+
+    /// \returns vector of read group IDs listed in this header
+    std::vector<std::string> ReadGroupIds(void) const;
+
+    /// \returns vector of ReadGroupInfo objects, representing all read groups
+    ///          listed in this header
+    ///
+    std::vector<ReadGroupInfo> ReadGroups(void) const;
+
+    /// \}
+
+public:
+    /// \name Sequences
+    /// \{
+
+    /// \returns true if header contains a sequence with \p name (\@SQ:SN)
+    bool HasSequence(const std::string& name) const;
+
+    /// \returns number of sequences (\@SQ entries) stored in this header
+    size_t NumSequences(void) const;
+
+    /// \returns numeric ID for sequence matching \p name (\@SQ:SN)
+    ///
+    /// This is the numeric ID used elsewhere throughout the API.
+    ///
+    /// \throws std::runtime_error if \p name is unknown
+    /// \sa BamReader::ReferenceId, PbiReferenceIdFilter,
+    ///     PbiRawMappedData::tId_
+    ///
+    int32_t SequenceId(const std::string& name) const;
+
+    /// \returns the length of the sequence (\@SQ:LN, e.g. chromosome length) at
+    ///          index \p id
+    ///
+    /// \sa SequenceInfo::Length, BamHeader::SequenceId
+    ///
+    std::string SequenceLength(const int32_t id) const;
+
+    /// \returns the name of the sequence (\@SQ:SN) at index \p id
+    ///
+    /// \sa SequenceInfo::Name, BamHeader::SequenceId
+    ///
+    std::string SequenceName(const int32_t id) const;
+
+    /// \returns vector of sequence names (\@SQ:SN) stored in this header
+    ///
+    /// Position in the vector is equivalent to SequenceId.
+    ///
+    std::vector<std::string> SequenceNames(void) const;
+
+    /// \returns SequenceInfo object at index \p id
+    ///
+    /// \throws std::out_of_range if \p is an invalid or unknown index
+    /// \sa BamHeader::SequenceId
+    ///
+    SequenceInfo Sequence(const int32_t id) const;
+
+    /// \returns SequenceInfo for the sequence matching \p name
+    SequenceInfo Sequence(const std::string& name) const;
+
+    /// \returns vector of SequenceInfo objects representing the sequences
+    ///          (\@SQ entries) stored in this header
+    ///
+    std::vector<SequenceInfo> Sequences(void) const;
+
+    /// \}
+
+public:
+    /// \name Programs
+    /// \{
+
+    /// \returns true if this header contains a program entry with ID (\@PG:ID)
+    ///          matching \p id
+    ///
+    bool HasProgram(const std::string& id) const;
+
+    /// \returns ProgramInfo object for the program entry matching \p id
+    /// \throws std::runtime_error if \p id is unknown
+    ///
+    ProgramInfo Program(const std::string& id) const;
+
+    /// \returns vector of program IDs (\@PG:ID)
+    std::vector<std::string> ProgramIds(void) const;
+
+    /// \returns vector of ProgramInfo objects representing program entries
+    ///          (\@PG) stored in this heder
+    ///
+    std::vector<ProgramInfo> Programs(void) const;
+
+    /// \}
+
+public:
+    /// \name Comments
+    /// \{
+
+    /// \returns vector of comment (\@CO) strings
+    std::vector<std::string> Comments(void) const;
+
+    /// \}
+
+public:
+    /// \name Conversion Methods
+    /// \{
+
+    /// \returns SAM-header-formatted string representing this header's data
+    std::string ToSam(void) const;
+
+    /// \}
+
+public:
+
+    /// \name General Attributes
+    /// \{
+
+    /// \brief Sets this header's PacBioBAM version number (\@HD:pb).
+    ///
+    /// \returns reference to this object
+    /// \throws std::runtime_error if version number cannot be parsed or
+    ///         is less than the minimum version allowed.
+    ///
+    BamHeader& PacBioBamVersion(const std::string& version);
+
+    /// \brief Sets this header's sort order label (\@HD:SO).
+    ///
+    /// Valid values: "unknown", "unsorted", "queryname", or "coordinate"
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& SortOrder(const std::string& order);
+
+    /// \brief Sets this header's SAM/BAM version number (\@HD:VN).
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& Version(const std::string& version);
+
+    /// \}
+
+public:
+    /// \name Read Groups
+    /// \{
+
+    /// \brief Appends a read group entry (\@RG) to this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& AddReadGroup(const ReadGroupInfo& readGroup);
+
+    /// \brief Removes all read group entries from this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& ClearReadGroups(void);
+
+    /// \brief Replaces this header's list of read group entries with those in
+    ///        \p readGroups.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& ReadGroups(const std::vector<ReadGroupInfo>& readGroups);
+
+    /// \}
+
+public:
+    /// \name Sequences
+    /// \{
+
+    /// \brief Appends a sequence entry (\@SQ) to this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& AddSequence(const SequenceInfo& sequence);
+
+    /// \brief Removes all sequence entries from this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& ClearSequences(void);
+
+    /// \brief Replaces this header's list of sequence entries with those in
+    ///       \p sequences.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& Sequences(const std::vector<SequenceInfo>& sequences);
+
+    /// \}
+
+public:
+    /// \name Programs
+    /// \{
+
+    /// \brief Appends a program entry (\@PG) to this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& AddProgram(const ProgramInfo& pg);
+
+    /// \brief Removes all program entries from this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& ClearPrograms(void);
+
+    /// \brief Replaces this header's list of program entries with those in
+    ///        \p programs.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& Programs(const std::vector<ProgramInfo>& programs);
+
+    /// \}
+
+public:
+    /// \name Comments
+    /// \{
+
+    /// \brief Appends a comment (\@CO) to this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& AddComment(const std::string& comment);
+
+    /// \brief Removes all comments from this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& ClearComments(void);
+
+    /// \brief Replaces this header's list of comments with those in \p comments.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& Comments(const std::vector<std::string>& comments);
+
+    /// \}
+
+private:
+    PBBAM_SHARED_PTR<internal::BamHeaderPrivate> d_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/BamHeader.inl"
+
+#endif // BAMHEADER_H
diff --git a/include/pbbam/BamReader.h b/include/pbbam/BamReader.h

new file mode 100644 (file)

index 0000000..774a2ec
--- /dev/null
+++ b/include/pbbam/BamReader.h
@@ -0,0 +1,191 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamReader.h
+/// \brief Defines the BamReader class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMREADER_H
+#define BAMREADER_H
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/Config.h"
+#include "pbbam/GenomicInterval.h"
+
+#include <htslib/sam.h>
+#include <memory>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal { struct BamReaderPrivate; }
+
+/// \brief The BamReader class provides basic read-access to a %BAM file.
+///
+/// The base-class implementation provides a sequential read-through of BAM
+/// records. Derived classes may implement other access schemes (e.g. genomic
+/// region, PBI-enabled record filtering).
+///
+class PBBAM_EXPORT BamReader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Opens BAM file for reading.
+    ///
+    /// \param[in] fn %BAM filename
+    /// \throws std::runtime_error if failed to open
+    ///
+    explicit BamReader(const std::string& fn);
+
+    /// \brief Opens BAM file for reading.
+    ///
+    /// \param[in] bamFile BamFile object
+    /// \throws std::runtime_error if failed to open
+    ///
+    explicit BamReader(const BamFile& bamFile);
+
+    /// \brief Opens BAM file for reading.
+    ///
+    /// \param[in] bamFile BamFile object
+    /// \throws std::runtime_error if failed to open
+    ///
+    explicit BamReader(BamFile&& bamFile);
+
+    virtual ~BamReader(void);
+
+    /// \}
+
+public:
+    /// \name BAM File Attributes
+    /// \{
+
+    /// \returns the underlying BamFile
+    const BamFile& File(void) const;
+
+    /// \returns %BAM filename
+    std::string Filename(void) const;
+
+    /// \returns BamHeader object from %BAM header contents
+    const BamHeader& Header(void) const;
+
+    /// \}
+
+public:
+    /// \name BAM File I/O
+    /// \{
+
+    /// \brief Fetches the "next" %BAM record.
+    ///
+    /// Default implementation will read records until EOF. Derived readers may
+    /// use additional criteria to decide which record is "next" and when
+    /// reading is done.
+    ///
+    /// \param[out] record  next BamRecord object. Should not be used if method
+    ///                     returns false.
+    ///
+    /// \returns true if record was read successfully. Returns false if EOF (or
+    ///          end of iterator in derived readers). False is not an error,
+    ///          it indicates "end of data".
+    ///
+    /// \throws std::runtime_error if failed to read from file (e.g. possible
+    ///         truncated or corrupted file).
+    ///
+    bool GetNext(BamRecord& record);
+
+    /// \brief Seeks to virtual offset in %BAM.
+    ///
+    /// \note This is \b NOT a normal file offset, but the virtual offset used
+    ///       in %BAM indexing.
+    ///
+    /// \throws std::runtime_error if failed to seek
+    ///
+    void VirtualSeek(int64_t virtualOffset);
+
+    /// \returns current (virtual) file position.
+    ///
+    /// \note This is \b NOT a normal file offset, but the virtual offset used
+    ///       in %BAM indexing.
+    ///
+    int64_t VirtualTell(void) const;
+
+    /// \}
+
+protected:
+    /// \name BAM File I/O
+    /// \{
+
+    /// \brief Helper method for access to underlying BGZF stream pointer.
+    ///
+    /// Useful for derived readers' contact points with htslib methods.
+    ///
+    /// \returns BGZF stream pointer
+    ///
+    BGZF* Bgzf(void) const;
+
+    /// \brief Performs the actual raw read of the next record from the BAM
+    ///        file.
+    ///
+    /// Default implementation will read records, sequentially, until EOF.
+    /// Derived readers may use additional criteria to decide which record is
+    ///  "next" and when reading is done.
+    ///
+    /// Return value should be equivalent to htslib's bam_read1():
+    ///     >= 0 : normal
+    ///       -1 : EOF (not an error)
+    ///     < -1 : error
+    ///
+    /// \param[in]  bgzf BGZF stream pointer
+    /// \param[out] b    %BAM record pointer
+    /// \returns integer status code, see description
+    ///
+    virtual int ReadRawData(BGZF* bgzf, bam1_t* b);
+
+    /// \}
+
+private:
+    std::unique_ptr<internal::BamReaderPrivate> d_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // BAMREADER_H
diff --git a/include/pbbam/BamRecord.h b/include/pbbam/BamRecord.h

new file mode 100644 (file)

index 0000000..a642a9f
--- /dev/null
+++ b/include/pbbam/BamRecord.h
@@ -0,0 +1,1284 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamRecord.h
+/// \brief Defines the BamRecord class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORD_H
+#define BAMRECORD_H
+
+#include "pbbam/Accuracy.h"
+#include "pbbam/Frames.h"
+#include "pbbam/BamRecordImpl.h"
+#include "pbbam/BamHeader.h"
+#include "pbbam/ClipType.h"
+#include "pbbam/FrameEncodingType.h"
+#include "pbbam/LocalContextFlags.h"
+#include "pbbam/Orientation.h"
+#include "pbbam/PulseBehavior.h"
+#include "pbbam/ReadGroupInfo.h"
+#include "pbbam/RecordType.h"
+#include "pbbam/Strand.h"
+#include "pbbam/QualityValues.h"
+#include "pbbam/virtual/VirtualRegionType.h"
+#include "pbbam/ZmwType.h"
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+
+class BamRecordMemory;
+class Pulse2BaseCache;
+
+} // namespace internal
+
+/// \brief The BamRecord class represents a %PacBio %BAM record.
+///
+/// %PacBio %BAM records are extensions of normal SAM/BAM records. Thus in
+/// addition to normal fields like bases, qualities, mapping coordinates, etc.,
+/// tags are used extensively to annotate records with additional
+/// PacBio-specific data.
+///
+/// Mapping and clipping APIs are provided as well to ensure that such
+/// operations "trickle down" to all data fields properly.
+///
+/// \sa https://samtools.github.io/hts-specs/SAMv1.pdf
+///     for more information on standard %BAM data, and
+///     https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/BAM.rst
+///     for more information on %PacBio %BAM fields.
+///
+class PBBAM_EXPORT BamRecord
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    BamRecord(void);
+    BamRecord(const BamHeader& header);
+    BamRecord(const BamRecordImpl& impl);
+    BamRecord(BamRecordImpl&& impl);
+    BamRecord(const BamRecord& other);
+    BamRecord(BamRecord&& other);
+    BamRecord& operator=(const BamRecord& other);
+    BamRecord& operator=(BamRecord&& other);
+    virtual ~BamRecord(void);
+
+    /// \}
+
+public:
+    /// \name General Data
+    /// \{
+
+    /// \returns this record's full name
+    /// \sa BamRecordImpl::Name
+    ///
+    std::string FullName(void) const;
+
+    /// \returns shared pointer to this record's associated BamHeader
+    BamHeader Header(void) const;
+
+    /// \returns ZMW hole number
+    /// \throws if missing zm tag & record name does not contain hole number
+    ///
+    int32_t HoleNumber(void) const;
+
+    /// \returns this record's LocalContextFlags
+    PacBio::BAM::LocalContextFlags LocalContextFlags(void) const;
+
+    /// \returns this record's movie name
+    std::string MovieName(void) const;
+
+    /// \returns "number of complete passes of the insert"
+    int32_t NumPasses(void) const;
+
+    /// \returns the record's query end position, or Sequence().length() if not
+    ///          stored
+    /// \note QueryEnd is in polymerase read coordinates, NOT genomic
+    ///       coordinates.
+    ///
+    Position QueryEnd(void) const;
+
+    /// \returns the record's query start position, or 0 if not stored
+    ///
+    /// \note QueryStart is in polymerase read coordinates, NOT genomic
+    ///       coordinates.
+    ///
+    Position QueryStart(void) const;
+
+    /// \returns this record's expected read accuracy [0, 1000]
+    Accuracy ReadAccuracy(void) const;
+
+    /// \returns ReadGroupInfo object for this record
+    ReadGroupInfo ReadGroup(void) const;
+
+    /// \returns string ID of this record's read group
+    /// \sa ReadGroupInfo::Id
+    ///
+    std::string ReadGroupId(void) const;
+
+    /// \returns integer value for this record's read group ID
+    int32_t ReadGroupNumericId(void) const;
+
+    /// \returns this scrap record's scrap region type
+    VirtualRegionType ScrapRegionType(void) const;
+
+    /// \returns this scrap record's scrap ZMW type
+    ZmwType ScrapZmwType(void) const;
+
+    /// \returns this record's average signal-to-noise for each of A, C, G,
+    ///          and T
+    ///
+    std::vector<float> SignalToNoise(void) const;
+
+    /// \returns this record's type
+    /// \sa RecordType
+    RecordType Type(void) const;
+
+    /// \}
+
+public:
+    /// \name Mapping Data
+    /// \{
+
+    /// \returns the record's aligned end position
+    ///
+    /// \note AlignedEnd is in polymerase read coordinates, NOT genomic
+    ///       coordinates.
+    ///
+    Position AlignedEnd(void) const;
+
+    /// \returns the record's aligned start position
+    ///
+    /// \note AlignedStart is in polymerase read coordinates, NOT genomic
+    ///       coordinates.
+    ///
+    Position AlignedStart(void) const;
+
+    /// \returns the record's strand as a Strand enum value
+    Strand AlignedStrand(void) const;
+
+    /// \returns the record's CIGAR data as a Cigar object
+    ///
+    /// \param[in] exciseAllClips   if true, remove all clipping operations
+    ///                             (hard & soft) [default:false]
+    ///
+    Cigar CigarData(bool exciseAllClips = false) const;
+
+    /// \returns true if this record was mapped by aligner
+    bool IsMapped(void) const;
+
+    /// \returns this record's mapping quality. A value of 255 indicates
+    ///          "unknown"
+    ///
+    uint8_t MapQuality(void) const;
+
+    /// \returns the number of deleted bases (relative to reference)
+    size_t NumDeletedBases(void) const;
+
+    /// \returns the number of inserted bases (relative to reference)
+    size_t NumInsertedBases(void) const;
+
+    /// \returns the number of matching bases (sum of '=' CIGAR op lengths)
+    size_t NumMatches(void) const;
+
+    /// \returns a tuple containing NumMatches (first) and NumMismatches
+    ///         (second)
+    ///
+    std::pair<size_t, size_t> NumMatchesAndMismatches(void) const;
+
+    /// \returns the number of mismatching bases (sum of 'X' CIGAR op lengths)
+    size_t NumMismatches(void) const;
+
+    /// \returns this record's reference ID, or -1 if unmapped.
+    ///
+    /// \note This is only a valid identifier within this %BAM file
+    ///
+    int32_t ReferenceId(void) const;
+
+    /// \returns this record's reference name.
+    ///
+    /// \throws an exception if unmapped record.
+    ///
+    std::string ReferenceName(void) const;
+
+    /// \returns the record's reference end position, or UnmappedPosition if
+    ///          unmapped
+    ///
+    /// \note ReferenceEnd is in reference coordinates, NOT polymerase read
+    ///       coordinates.
+    ///
+    Position ReferenceEnd(void) const;
+
+    /// \returns the record's reference start position, or UnmappedPosition if
+    ///          unmapped
+    ///
+    /// \note ReferenceStart is in reference coordinates, NOT polymerase read
+    ///       coordinates.
+    ///
+    Position ReferenceStart(void) const;
+
+    /// \}
+
+public:
+    /// \name Barcode Data
+    /// \{
+
+    /// \returns forward barcode id
+    ///
+    /// \throws std::runtime_error if barcode data is absent or malformed.
+    /// \sa HasBarcodes
+    ///
+    int16_t BarcodeForward(void) const;
+
+    /// \returns barcode call confidence (Phred-scaled posterior probability
+    ///          of correct barcode call)
+    ///
+    /// \sa HasBarcodeQuality
+    ///
+    uint8_t BarcodeQuality(void) const;
+
+    /// \returns reverse barcode id
+    ///
+    /// \throws std::runtime_error if barcode data is absent or malformed.
+    /// \sa HasBarcodes
+    ///
+    int16_t BarcodeReverse(void) const;
+
+    /// \returns the forward and reverse barcode ids
+    ///
+    /// \throws std::runtime_error if barcode data is absent or malformed.
+    /// \sa HasBarcodes
+    ///
+    std::pair<int16_t,int16_t> Barcodes(void) const;
+
+    /// \}
+
+public:
+    /// \name Auxiliary Data Queries
+    /// \{
+
+    /// \returns true if this record has AltLabelQV data
+    bool HasAltLabelQV(void) const;
+
+    /// \returns true if this record has AltLabelTag data
+    bool HasAltLabelTag(void) const;
+
+    /// \returns true if this record has Barcode data
+    bool HasBarcodes(void) const;
+
+    /// \returns true is this record has BarcodeQuality data
+    bool HasBarcodeQuality(void) const;
+
+    /// \returns true if this record has DeletionQV data
+    bool HasDeletionQV(void) const;
+
+    /// \returns true if this record has DeletionTag data
+    bool HasDeletionTag(void) const;
+
+    /// \returns true if this record has a HoleNumber
+    bool HasHoleNumber(void) const;
+
+    /// \returns true if this record has InsertionQV data
+    bool HasInsertionQV(void) const;
+
+    /// \returns true if this record has IPD data
+    bool HasIPD(void) const;
+
+    /// \returns true if this record has LabelQV data
+    bool HasLabelQV(void) const;
+
+    /// \returns true if this record has LocalContextFlags (absent in CCS)
+    bool HasLocalContextFlags(void) const;
+
+    /// \returns true if this record has MergeQV data
+    bool HasMergeQV(void) const;
+
+    /// \returns true if this record has NumPasses data
+    bool HasNumPasses(void) const;
+
+    /// \returns true if this record has Pkmean data
+    bool HasPkmean(void) const;
+
+    /// \returns true if this record has Pkmid data
+    bool HasPkmid(void) const;
+
+    /// \returns true if this record has Pkmean2 data
+    bool HasPkmean2(void) const;
+
+    /// \returns true if this record has Pkmid2 data
+    bool HasPkmid2(void) const;
+
+    /// \returns true if this record has PreBaseFrames aka IPD data
+    bool HasPreBaseFrames(void) const;
+
+    /// \returns true if this record has PrePulseFrames data
+    bool HasPrePulseFrames(void) const;
+
+    /// \returns true if this record has PulseCall data
+    bool HasPulseCall(void) const;
+
+    /// \returns true if this record has PulseCallWidth data
+    bool HasPulseCallWidth(void) const;
+
+    /// \returns true if this record has PulseMergeQV data
+    bool HasPulseMergeQV(void) const;
+
+    /// \returns true if this record has PulseWidth data
+    bool HasPulseWidth(void) const;
+
+    /// \returns true if this record has ReadAccuracyTag data
+    bool HasReadAccuracy(void) const;
+
+    /// \returns true if this record has QueryEnd data
+    bool HasQueryEnd(void) const;
+
+    /// \returns true if this record has QueryStart data
+    bool HasQueryStart(void) const;
+
+    /// \returns true if this record has ScrapRegionType data (only in SCRAP)
+    bool HasScrapRegionType(void) const;
+
+    /// \returns true if this record has scrap ZMW type data (only in SCRAP)
+    bool HasScrapZmwType(void) const;
+
+    /// \returns true if this record has signal-to-noise data (absent in
+    ///          POLYMERASE)
+    ///
+    bool HasSignalToNoise(void) const;
+
+    /// \returns true if this record has StartFrame data
+    bool HasStartFrame(void) const;
+
+    /// \returns true if this record has SubstitutionQV data
+    bool HasSubstitutionQV(void) const;
+
+    /// \returns true if this record has SubstitutionTag data
+    bool HasSubstitutionTag(void) const;
+
+    /// \}
+
+public:
+    /// \name Sequence & Tag Data
+    /// \{
+
+    /// \brief Fetches this record's AltLabelTag values ("pt" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new gap chars will be '-' and padding chars will be '*'.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    ///
+    /// \returns AltLabelTags string
+    ///
+    std::string AltLabelTag(Orientation orientation = Orientation::NATIVE,
+                            bool aligned = false,
+                            bool exciseSoftClips = false,
+                            PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's DeletionTag values ("dt" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new gap chars will be '-' and padding chars will be '*'.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns DeletionTag string
+    ///
+    std::string DeletionTag(Orientation orientation = Orientation::NATIVE,
+                            bool aligned = false,
+                            bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's DNA sequence (SEQ field).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new gap chars will be '-' and padding chars will be '*'.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns sequence string
+    ///
+    std::string Sequence(const Orientation orientation = Orientation::NATIVE,
+                         bool aligned = false,
+                         bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's SubstitutionTag values ("st" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new gap chars will be '-' and padding chars will be '*'.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns SubstitutionTags string
+    ///
+    std::string SubstitutionTag(Orientation orientation = Orientation::NATIVE,
+                                bool aligned = false,
+                                bool exciseSoftClips = false) const;
+
+    /// \}
+
+public:
+    /// \name Quality Data
+    /// \{
+
+    /// \brief Fetches this record's AltLabelQV values ("pv" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation     Orientation of output.
+    ///
+    /// \returns AltLabelQV as QualityValues object
+    ///
+    QualityValues AltLabelQV(Orientation orientation = Orientation::NATIVE,
+                             bool aligned = false,
+                             bool exciseSoftClips = false,
+                             PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's DeletionQV values ("dq" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns DeletionQV as QualityValues object
+    ///
+    QualityValues DeletionQV(Orientation orientation = Orientation::NATIVE,
+                             bool aligned = false,
+                             bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's InsertionQV values ("iq" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns InsertionQVs as QualityValues object
+    ///
+    QualityValues InsertionQV(Orientation orientation = Orientation::NATIVE,
+                              bool aligned = false,
+                              bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's LabelQV values ("pq" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation     Orientation of output.
+    ///
+    /// \returns LabelQV as QualityValues object
+    ///
+    QualityValues LabelQV(Orientation orientation = Orientation::NATIVE,
+                          bool aligned = false,
+                          bool exciseSoftClips = false,
+                          PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's MergeQV values ("mq" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns MergeQV as QualityValues object
+    ///
+    QualityValues MergeQV(Orientation orientation = Orientation::NATIVE,
+                          bool aligned = false,
+                          bool exciseSoftClips = false) const;
+
+    /// \brief Fetches  this record's %BAM quality values (QUAL field).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns %BAM qualities as QualityValues object
+    ///
+    QualityValues Qualities(Orientation orientation = Orientation::NATIVE,
+                            bool aligned = false,
+                            bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's SubstitutionQV values ("sq" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns SubstitutionQV as QualityValues object
+    ///
+    QualityValues SubstitutionQV(Orientation orientation = Orientation::NATIVE,
+                                 bool aligned = false,
+                                 bool exciseSoftClips = false) const;
+
+    /// \}
+
+public:
+    /// \name Pulse Data
+    /// \{
+
+    /// \brief Fetches this record's IPD values ("ip" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new frames will have a value of 0;
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns IPD as Frames object
+    ///
+    Frames IPD(Orientation orientation = Orientation::NATIVE,
+               bool aligned = false,
+               bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's IPD values ("ip" tag), but does not upscale.
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns IPD as Frames object
+    ///
+    Frames IPDRaw(Orientation orientation = Orientation::NATIVE) const;
+
+    /// \brief Fetches this record's Pkmean values ("pa" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns Pkmean as vector<float> object
+    ///
+    std::vector<float> Pkmean(Orientation orientation = Orientation::NATIVE,
+                              bool aligned = false,
+                              bool exciseSoftClips = false,
+                              PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's Pkmid values ("pm" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns Pkmid as vector<float> object
+    ///
+    std::vector<float> Pkmid(Orientation orientation = Orientation::NATIVE,
+                             bool aligned = false,
+                             bool exciseSoftClips = false,
+                             PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's Pkmean2 values ("pi" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns Pkmean as vector<float> object
+    ///
+    std::vector<float> Pkmean2(Orientation orientation = Orientation::NATIVE,
+                               bool aligned = false,
+                               bool exciseSoftClips = false,
+                               PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's Pkmid2 values ("ps" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns Pkmid as vector<float> object
+    ///
+    std::vector<float> Pkmid2(Orientation orientation = Orientation::NATIVE,
+                              bool aligned = false,
+                              bool exciseSoftClips = false,
+                              PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's PreBaseFrames aka IPD values ("ip" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new frames will have a value of 0;
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns IPD as Frames object
+    ///
+    Frames PreBaseFrames(Orientation orientation = Orientation::NATIVE,
+                         bool aligned = false,
+                         bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's PrePulseFrames values ("pd" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns PrePulseFrames as Frames object
+    ///
+    Frames PrePulseFrames(Orientation orientation = Orientation::NATIVE,
+                          bool aligned = false,
+                          bool exciseSoftClips = false,
+                          PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's PulseCall values ("pc" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns PulseCalls string
+    ///
+    std::string PulseCall(Orientation orientation = Orientation::NATIVE,
+                          bool aligned = false,
+                          bool exciseSoftClips = false,
+                          PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's PulseCallWidth values ("px" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns PulseCallWidth as Frames object
+    ///
+    Frames PulseCallWidth(Orientation orientation = Orientation::NATIVE,
+                          bool aligned = false,
+                          bool exciseSoftClips = false,
+                          PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetch this record's PulseMergeQV values ("pg" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns PulseMergeQV as QualityValues object
+    ///
+    QualityValues PulseMergeQV(Orientation orientation = Orientation::NATIVE,
+                               bool aligned = false,
+                               bool exciseSoftClips = false,
+                               PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's PulseWidth values ("pw" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new frames will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns PulseWidths as Frames object
+    ///
+    Frames PulseWidth(Orientation orientation = Orientation::NATIVE,
+                      bool aligned = false,
+                      bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's PulseWidth values ("pw" tag), but does not
+    ///        upscale.
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns PulseWidth as Frames object
+    ///
+    Frames PulseWidthRaw(Orientation orientation = Orientation::NATIVE,
+                         bool aligned = false,
+                         bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's StartFrame values ("sf" tag).
+    ///
+    /// \param[in] orientation     Orientation of output
+    ///
+    /// \returns StartFrame as uint32_t vector
+    ///
+    std::vector<uint32_t> StartFrame(Orientation orientation = Orientation::NATIVE,
+                                     bool aligned = false,
+                                     bool exciseSoftClips = false,
+                                     PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \}
+
+public:
+    /// \name Low-Level Access & Operations
+    /// \{
+
+    /// \warning This method should be considered temporary and avoided as much
+    ///          as possible. Direct access to the internal object is likely to
+    ///          disappear as BamRecord interface matures.
+    ///
+    /// \returns const reference to underlying BamRecordImpl object
+    ///
+    const BamRecordImpl& Impl(void) const;
+
+    /// \warning This method should be considered temporary and avoided as much
+    ///          as possible. Direct access to the internal object is likely to
+    ///          disappear as BamRecord interface matures.
+    ///
+    /// \returns reference to underlying BamRecordImpl object
+    ///
+    BamRecordImpl& Impl(void);
+
+    /// \}
+
+public:
+    /// \name General Data
+    /// \{
+
+    /// \brief Sets this record's ZMW hole number.
+    ///
+    /// \param[in] holeNumber
+    /// \returns reference to this record
+    ///
+    BamRecord& HoleNumber(const int32_t holeNumber);
+
+    /// \brief Sets this record's local context flags
+    ///
+    /// \param[in] flags
+    /// \returns reference to this record
+    ///
+    BamRecord& LocalContextFlags(const PacBio::BAM::LocalContextFlags flags);
+
+    /// \brief Sets this record's "number of complete passes of the insert".
+    ///
+    /// \param[in] numPasses
+    /// \returns reference to this record
+    ///
+    BamRecord& NumPasses(const int32_t numPasses);
+
+    /// \brief Sets this record's query end position.
+    ///
+    /// \note Changing this will modify the name of non-CCS records.
+    ///
+    /// \param[in] pos
+    /// \returns reference to this record
+    ///
+    BamRecord& QueryEnd(const PacBio::BAM::Position pos);
+
+    /// \brief Sets this record's query start position.
+    ///
+    /// \note Changing this will modify the name of non-CCS records.
+    ///
+    /// \param[in] pos
+    /// \returns reference to this record
+    ///
+    BamRecord& QueryStart(const PacBio::BAM::Position pos);
+
+    /// \brief Sets this record's expected read accuracy [0, 1000]
+    ///
+    /// \param[in] accuracy
+    /// \returns reference to this record
+    ///
+    BamRecord& ReadAccuracy(const Accuracy& accuracy);
+
+    /// \brief Attaches this record to the provided read group, changing the
+    ///        record name & 'RG' tag.
+    ///
+    /// \param[in] rg
+    /// \returns reference to this record
+    ///
+    BamRecord& ReadGroup(const ReadGroupInfo& rg);
+
+    /// \brief Attaches this record to the provided read group, changing the
+    ///        record name & 'RG' tag.
+    ///
+    /// \param[in] id
+    /// \returns reference to this record
+    ///
+    BamRecord& ReadGroupId(const std::string& id);
+
+    /// \brief Sets this scrap record's ScrapRegionType
+    ///
+    /// \param[in] type
+    /// \returns reference to this record
+    ///
+    BamRecord& ScrapRegionType(const VirtualRegionType type);
+
+    /// \brief Sets this scrap record's ScrapRegionType
+    ///
+    /// \param[in] type character equivalent of VirtualRegionType
+    /// \returns reference to this record
+    ///
+    BamRecord& ScrapRegionType(const char type);
+
+    /// \brief Sets this scrap record's ScrapZmwType
+    ///
+    /// \param[in] type
+    /// \returns reference to this record
+    ///
+    BamRecord& ScrapZmwType(const ZmwType type);
+
+    /// \brief Sets this scrap record's ScrapZmwType
+    ///
+    /// \param[in] type character equivalent of ZmwType
+    /// \returns reference to this record
+    ///
+    BamRecord& ScrapZmwType(const char type);
+
+    /// \brief Sets this record's average signal-to-noise in each of A, C, G,
+    ///        and T
+    ///
+    /// \param[in] snr average signal-to-noise of A, C, G, and T (in this order)
+    /// \returns reference to this record
+    ///
+    BamRecord& SignalToNoise(const std::vector<float>& snr);
+
+    /// \}
+
+public:
+    /// \name Barcode Data
+    /// \{
+
+    /// \brief Sets this record's barcode IDs ('bc' tag)
+    ///
+    /// \param[in] barcodeIds
+    /// \returns reference to this record
+    ///
+    BamRecord& Barcodes(const std::pair<int16_t, int16_t>& barcodeIds);
+
+    /// \brief Sets this record's barcode quality ('bq' tag)
+    ///
+    /// \param[in] quality Phred-scaled confidence call
+    /// \returns reference to this record
+    ///
+    BamRecord& BarcodeQuality(const uint8_t quality);
+
+    /// \}
+
+public:
+    /// \name Sequence & Tag Data
+    /// \{
+
+    /// \brief Sets this record's AltLabelTag values ("at" tag).
+    ///
+    /// \param[in] tags
+    /// \returns reference to this record
+    ///
+    BamRecord& AltLabelTag(const std::string& tags);
+
+    /// \brief Sets this record's DeletionTag values ("dt" tag).
+    ///
+    /// \param[in] tags
+    /// \returns reference to this record
+    ///
+    BamRecord& DeletionTag(const std::string& tags);
+
+    /// \brief Sets this record's SubstitutionTag values ("st" tag).
+    ///
+    /// \param[in] tags
+    /// \returns reference to this record
+    ///
+    BamRecord& SubstitutionTag(const std::string& tags);
+
+    /// \}
+
+public:
+    /// \name Quality Data
+    /// \{
+
+    /// \brief Sets this record's AltLabelQV values ("pv" tag).
+    ///
+    /// \param[in] altLabelQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& AltLabelQV(const QualityValues& altLabelQVs);
+
+    /// \brief Sets this record's DeletionQV values ("dq" tag).
+    ///
+    /// \param[in] deletionQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& DeletionQV(const QualityValues& deletionQVs);
+
+    /// \brief Sets this record's InsertionQV values ("iq" tag).
+    ///
+    /// \param[in] insertionQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& InsertionQV(const QualityValues& insertionQVs);
+
+    /// \brief Sets this record's LabelQV values ("pq" tag).
+    ///
+    /// \param[in] labelQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& LabelQV(const QualityValues& labelQVs);
+
+    /// \brief Sets this record's MergeQV values ("mq" tag).
+    ///
+    /// \param[in] mergeQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& MergeQV(const QualityValues& mergeQVs);
+
+    /// \brief Sets this record's SubstitutionQV values ("sq" tag).
+    ///
+    /// \param[in] substitutionQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& SubstitutionQV(const QualityValues& substitutionQVs);
+
+    /// \}
+
+public:
+    /// \name Pulse Data
+    /// \{
+
+    /// \brief Sets this record's IPD values ("ip" tag).
+    ///
+    /// \param[in] frames
+    /// \param[in] encoding specify how to encode the data (8-bit lossy, or
+    ///                     16-bit lossless)
+    /// \returns reference to this record
+    ///
+    BamRecord& IPD(const Frames& frames,
+                   const FrameEncodingType encoding);
+
+    /// \brief Sets this record's Pkmean values ("pm" tag).
+    ///
+    /// \param[in] photons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmean(const std::vector<float>& photons);
+
+    /// \brief Sets this record's Pkmean values ("pm" tag).
+    ///
+    /// \param[in] encodedPhotons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmean(const std::vector<uint16_t>& encodedPhotons);
+
+    /// \brief Sets this record's Pkmid values ("pa" tag).
+    ///
+    /// \param[in] photons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmid(const std::vector<float>& photons);
+
+    /// \brief Sets this record's Pkmid values ("pa" tag).
+    ///
+    /// \param[in] encodedPhotons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmid(const std::vector<uint16_t>& encodedPhotons);
+
+    /// \brief Sets this record's Pkmean2 values ("ps" tag).
+    ///
+    /// \param[in] photons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmean2(const std::vector<float>& photons);
+
+    /// \brief Sets this record's Pkmean2 values ("ps" tag).
+    ///
+    /// \param[in] encodedPhotons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmean2(const std::vector<uint16_t>& encodedPhotons);
+
+    /// \brief Sets this record's Pkmid2 values ("pi" tag).
+    ///
+    /// \param[in] photons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmid2(const std::vector<float>& photons);
+
+    /// \brief Sets this record's Pkmid2 values ("pi" tag).
+    ///
+    /// \param[in] encodedPhotons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmid2(const std::vector<uint16_t>& encodedPhotons);
+
+    /// \brief Sets this record's PreBaseFrames aka IPD values ("ip" tag).
+    ///
+    /// \param[in] frames
+    /// \param[in] encoding specify how to encode the data (8-bit lossy, or
+    ///                     16-bit lossless)
+    /// \returns reference to this record
+    ///
+    BamRecord& PreBaseFrames(const Frames& frames,
+                             const FrameEncodingType encoding);
+
+    /// \brief Sets this record's PrePulseFrames values ("pd" tag).
+    ///
+    /// \param[in] frames
+    /// \param[in] encoding specify how to encode the data (8-bit lossy, or
+    ///                     16-bit lossless)
+    /// \returns reference to this record
+    ///
+    BamRecord& PrePulseFrames(const Frames& frames,
+                              const FrameEncodingType encoding);
+
+    /// \brief Sets this record's PulseCall values ("pc" tag).
+    ///
+    /// \param[in] tags
+    /// \returns reference to this record
+    ///
+    BamRecord& PulseCall(const std::string& tags);
+
+    /// \brief Sets this record's PulseCallWidth values ("px" tag).
+    ///
+    /// \param[in] frames
+    /// \param[in] encoding specify how to encode the data (8-bit lossy, or
+    ///                     16-bit lossless)
+    /// \returns reference to this record
+    ///
+    BamRecord& PulseCallWidth(const Frames& frames,
+                              const FrameEncodingType encoding);
+
+    /// \brief Sets this record's PulseMergeQV values ("pg" tag).
+    ///
+    /// \param[in] pulseMergeQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& PulseMergeQV(const QualityValues& pulseMergeQVs);
+
+    /// \brief Sets this record's PulseWidth values ("pw" tag).
+    ///
+    /// \param[in] frames
+    /// \param[in] encoding specify how to encode the data (8-bit lossy, or
+    ///                     16-bit lossless)
+    /// \returns reference to this record
+    ///
+    BamRecord& PulseWidth(const Frames& frames,
+                          const FrameEncodingType encoding);
+
+    /// \brief Sets this record's StartFrame values ("sf" tag).
+    ///
+    /// \param[in] startFrame
+    /// \returns reference to this record
+    ///
+    BamRecord& StartFrame(const std::vector<uint32_t>& startFrame);
+
+    /// \}
+
+public:
+    /// \name Low-Level Access & Operations
+    /// \{
+
+    /// \brief Resets cached aligned start/end.
+    ///
+    /// \note This method should not be needed in most client code. It exists
+    ///       primarily as a hook for internal reading loops (queries, index
+    ///       build, etc.) It's essentially a workaround and will likely be
+    ///       removed from the API.
+    ///
+    void ResetCachedPositions(void) const;
+
+    /// \brief Resets cached aligned start/end.
+    ///
+    /// \note This method should not be needed in most client code. It exists
+    ///       primarily as a hook for internal reading loops (queries, index
+    ///       build, etc.) It's essentially a workaround and will likely be
+    ///       removed from the API.
+    ///
+    void ResetCachedPositions(void);
+
+    /// \brief Updates the record's name (BamRecord::FullName) to reflect
+    ///        modifications to name components (movie name, ZMW hole number,
+    ///        etc.)
+    ///
+    void UpdateName(void);
+
+    /// \}
+
+public:
+    /// \name Pulse Data
+    /// \{
+
+    static const float photonFactor;
+
+    static std::vector<uint16_t> EncodePhotons(const std::vector<float>& data);
+
+    /// \}
+
+public:
+    /// \name Clipping & Mapping
+    /// \{
+
+    /// Creates a copied record from input, with clipping applied
+    static BamRecord Clipped(const BamRecord& input,
+                             const ClipType clipType,
+                             const PacBio::BAM::Position start,
+                             const PacBio::BAM::Position end);
+
+    /// Creates a copied record from input, with mapping applied
+    static BamRecord Mapped(const BamRecord& input,
+                            const int32_t referenceId,
+                            const Position refStart,
+                            const Strand strand,
+                            const Cigar& cigar,
+                            const uint8_t mappingQuality);
+
+    /// Applies clipping to this record
+    BamRecord& Clip(const ClipType clipType,
+                    const PacBio::BAM::Position start,
+                    const PacBio::BAM::Position end);
+
+    /// Creates a copied record from this one, with clipping applied
+    BamRecord Clipped(const ClipType clipType,
+                      const PacBio::BAM::Position start,
+                      const PacBio::BAM::Position end) const;
+
+    /// Applies mapping to this record
+    BamRecord& Map(const int32_t referenceId,
+                   const Position refStart,
+                   const Strand strand,
+                   const Cigar& cigar,
+                   const uint8_t mappingQuality);
+
+    /// Creates a copied record from this one, with mapping applied
+    BamRecord Mapped(const int32_t referenceId,
+                     const Position refStart,
+                     const Strand strand,
+                     const Cigar& cigar,
+                     const uint8_t mappingQuality) const;
+    /// \}
+
+private:
+    BamRecordImpl impl_;
+
+public:
+    /// public & mutable so that queries can directly set the header info,
+    /// even on a record that is const from client code's perspective
+    mutable BamHeader header_;
+
+private:
+    /// \internal
+    /// cached positions (mutable to allow lazy-calc in const methods)
+    mutable Position alignedStart_;
+    mutable Position alignedEnd_;
+
+private:
+    /// \internal
+    /// pulse to bam mapping cache
+    mutable std::unique_ptr<internal::Pulse2BaseCache> p2bCache_;
+
+private:
+    ///\internal
+    /// clipping methods
+
+    void ClipFields(const size_t clipPos, const size_t clipLength);
+    BamRecord& ClipToQuery(const PacBio::BAM::Position start,
+                           const PacBio::BAM::Position end);
+    BamRecord& ClipToReference(const PacBio::BAM::Position start,
+                               const PacBio::BAM::Position end);
+    BamRecord& ClipToReferenceForward(const PacBio::BAM::Position start,
+                                      const PacBio::BAM::Position end);
+    BamRecord& ClipToReferenceReverse(const PacBio::BAM::Position start,
+                                      const PacBio::BAM::Position end);
+
+private:
+    ///\internal
+    /// raw tag data fetching
+
+    // sequence tags
+    std::string FetchBasesRaw(const BamRecordTag tag) const;
+    std::string FetchBases(const BamRecordTag tag,
+                           const Orientation orientation = Orientation::NATIVE,
+                           const bool aligned = false,
+                           const bool exciseSoftClips = false,
+                           const PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    // frame tags
+    Frames FetchFramesRaw(const BamRecordTag tag) const;
+    Frames FetchFrames(const BamRecordTag tag,
+                       const Orientation orientation = Orientation::NATIVE,
+                       const bool aligned = false,
+                       const bool exciseSoftClips = false,
+                       const PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    // pulse tags
+    std::vector<float> FetchPhotonsRaw(const BamRecordTag tag) const;
+    std::vector<float> FetchPhotons(const BamRecordTag tag,
+                                    const Orientation orientation = Orientation::NATIVE,
+                                    const bool aligned = false,
+                                    const bool exciseSoftClips = false,
+                                    const PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    // QV tags
+    QualityValues FetchQualitiesRaw(const BamRecordTag tag) const;
+    QualityValues FetchQualities(const BamRecordTag tag,
+                                 const Orientation orientation = Orientation::NATIVE,
+                                 const bool aligned = false,
+                                 const bool exciseSoftClips = false,
+                                 const PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    // UInt tags (e.g. start frame)
+    std::vector<uint32_t> FetchUIntsRaw(const BamRecordTag tag) const;
+    std::vector<uint32_t> FetchUInts(const BamRecordTag tag,
+                                     const Orientation orientation = Orientation::NATIVE,
+                                     const bool aligned = false,
+                                     const bool exciseSoftClips = false,
+                                     const PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+private:
+    ///\internal
+    /// marked const to allow calling from const methods
+    /// but updates our mutable cached values
+    void CalculateAlignedPositions(void) const;
+    void CalculatePulse2BaseCache(void) const;
+
+    friend class internal::BamRecordMemory;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/BamRecord.inl"
+
+#endif // BAMRECORD_H
diff --git a/include/pbbam/BamRecordBuilder.h b/include/pbbam/BamRecordBuilder.h

new file mode 100644 (file)

index 0000000..c6ff877
--- /dev/null
+++ b/include/pbbam/BamRecordBuilder.h
@@ -0,0 +1,316 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamRecordBuilder.h
+/// \brief Defines the BamRecordBuilder class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORDBUILDER_H
+#define BAMRECORDBUILDER_H
+
+#include "pbbam/BamRecord.h"
+#include "pbbam/BamHeader.h"
+#include "pbbam/Config.h"
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The BamRecordBuilder class provides a helper utility for building
+///        BamRecords.
+///
+/// This class provides a mechanism for building up %BAM data and
+/// lazy-encoding/constructing the actual BamRecord. Currently, the methods here
+/// really only support  filling in the low-level SAM/BAM-style fields, not so
+/// much the PacBio-specific fields.
+///
+class PBBAM_EXPORT BamRecordBuilder
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty %BAM record builder.
+    BamRecordBuilder(void);
+
+    /// \brief Creates an empty %BAM record builder, with header info to apply
+    ///        to built records.
+    ///
+    /// \param[in] header   BamHeader object
+    ///
+    explicit BamRecordBuilder(const BamHeader& header);
+
+    /// \brief Creates record builder with inital record data.
+    ///
+    /// \param[in] prototype    data from this record will be used to seed the
+    ///                         builder
+    ///
+    BamRecordBuilder(const BamRecord& prototype);
+
+    BamRecordBuilder(const BamRecordBuilder& other);
+    BamRecordBuilder(BamRecordBuilder&& other);
+    BamRecordBuilder& operator=(const BamRecordBuilder& other);
+    BamRecordBuilder& operator=(BamRecordBuilder&& other);
+    ~BamRecordBuilder(void);
+
+    /// \}
+
+public:
+    /// \name Record-Building
+    /// \{
+
+    /// \brief Builds a BamRecord from current builder attributes.
+    ///
+    /// \returns newly-built BamRecord object
+    ///
+    BamRecord Build(void) const;
+
+    /// \brief Replaces an existing BamRecord's data with current builder
+    ///        attributes.
+    ///
+    /// \param[out] record resulting record
+    /// \returns true if successful
+    ///
+    bool BuildInPlace(BamRecord& record) const;
+
+    /// \brief Resets builder attributes to default values.
+    ///
+    void Reset(void);
+
+    /// \brief Resets builder attributes with \p prototype's data.
+    ///
+    /// \param[in] prototype
+    ///
+    void Reset(const BamRecord& prototype);
+
+    /// \brief Resets builder attributes with \p prototype's data.
+    ///
+    /// \param[in] prototype
+    ///
+    void Reset(BamRecord&& prototype);
+
+    /// \}
+
+public:
+
+    /// \name Core Attribute Setup
+    /// \{
+
+    /// \brief Sets the record's (BAI) index bin ID.
+    ///
+    /// \param[in] bin BAI index bin ID.
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Bin(const uint32_t bin);
+
+    /// \brief Sets this record's alignment flag, using a raw integer.
+    ///
+    /// \param[in] flag raw alignment flag
+    /// \returns reference to this record
+    ///
+    BamRecordBuilder& Flag(const uint32_t flag);
+
+    /// \brief Sets this record's insert size.
+    ///
+    /// \param[in] iSize insert size
+    /// \returns reference to this record
+    ///
+    BamRecordBuilder& InsertSize(const int32_t iSize);
+
+    /// \brief Sets this record's map quality.
+    ///
+    /// \param[in] mapQual mapping quality - value of 255 indicates "unknown"
+    /// \returns reference to this record
+    ///
+    BamRecordBuilder& MapQuality(const uint8_t mapQual);
+
+    /// \brief Sets this record's mate's mapped position.
+    ///
+    /// \param[in] pos mapped position. A value of -1 indicates unmapped.
+    /// \returns reference to this record
+    ///
+    BamRecordBuilder& MatePosition(const int32_t pos);
+
+    /// \brief Sets this record's mate's mapped reference ID
+    ///
+    /// \param[in] id reference ID. A value of -1 indicates unmapped.
+    /// \returns reference to this record
+    ///
+    BamRecordBuilder& MateReferenceId(const int32_t id);
+
+    /// \brief Sets this record's mapped position.
+    ///
+    /// \param[in] pos mapped position. A value of -1 indicates unmapped.
+    /// \returns reference to this record
+    ///
+    BamRecordBuilder& Position(const int32_t pos);
+
+    /// \brief Sets this record's mapped reference ID
+    ///
+    /// \param[in] id reference ID. A value of -1 indicates unmapped.
+    /// \returns reference to this record
+    ///
+    BamRecordBuilder& ReferenceId(const int32_t id);
+
+    /// \}
+
+public:
+    /// \name Alignment Flag Setup
+    /// \{
+
+    /// \brief Sets whether this record is a PCR/optical duplicate
+    BamRecordBuilder& SetDuplicate(bool ok);
+
+    /// \brief Sets whether this record failed quality controls
+    BamRecordBuilder& SetFailedQC(bool ok);
+
+    /// \brief Sets whether this record is the first mate of a pair.
+    BamRecordBuilder& SetFirstMate(bool ok);
+
+    /// \brief Sets whether this record was aligned.
+    BamRecordBuilder& SetMapped(bool ok);
+
+    /// \brief Sets whether this record's mate was aligned.
+    BamRecordBuilder& SetMateMapped(bool ok);
+
+    /// \brief Sets whether this record's mate mapped to reverse strand.
+    BamRecordBuilder& SetMateReverseStrand(bool ok);
+
+    /// \brief Sets whether this record came from paired-end sequencing.
+    BamRecordBuilder& SetPaired(bool ok);
+
+    /// \brief Sets whether this record is a read's primary alignment.
+    BamRecordBuilder& SetPrimaryAlignment(bool ok);
+
+    /// \brief Sets whether this record & its mate were properly mapped, per the
+    ///        aligner.
+    ///
+    BamRecordBuilder& SetProperPair(bool ok);
+
+    /// \brief Sets whether this record mapped to reverse strand.
+    BamRecordBuilder& SetReverseStrand(bool ok);
+
+    /// \brief Sets whether this record is the second mate of a pair.
+    BamRecordBuilder& SetSecondMate(bool ok);
+
+    /// \brief Sets whether this record is a supplementary alignment.
+    BamRecordBuilder& SetSupplementaryAlignment(bool ok);
+
+    /// \}
+
+public:
+    /// \name Variable-Length Data Setup
+    /// \{
+
+    /// \brief Sets the record's CIGAR data.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Cigar(const PacBio::BAM::Cigar& cigar);
+
+    /// \brief Sets the record's CIGAR data.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Cigar(PacBio::BAM::Cigar&& cigar);
+
+    /// \brief Sets the record's name.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Name(const std::string& name);
+
+    /// \brief Sets the record's name.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Name(std::string&& name);
+
+    /// \brief Sets the record's qualities.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Qualities(const std::string& qualities);
+
+    /// \brief Sets the record's qualities.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Qualities(std::string&& qualities);
+
+    /// \brief Sets the record's sequence.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Sequence(const std::string& sequence);
+
+    /// \brief Sets the record's sequence.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Sequence(std::string&& sequence);
+
+    /// \brief Sets the record's tags.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Tags(const TagCollection& tags);
+
+    /// \brief Sets the record's tags.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Tags(TagCollection&& tags);
+
+    /// \}
+
+private:
+    BamHeader header_;
+    bam1_core_t core_;
+    std::string name_;
+    std::string sequence_;
+    std::string qualities_;
+    PacBio::BAM::Cigar cigar_;
+    TagCollection tags_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/BamRecordBuilder.inl"
+
+#endif // BAMRECORDBUILDER_H
diff --git a/include/pbbam/BamRecordImpl.h b/include/pbbam/BamRecordImpl.h

new file mode 100644 (file)

index 0000000..2c72367
--- /dev/null
+++ b/include/pbbam/BamRecordImpl.h
@@ -0,0 +1,634 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamRecordImpl.h
+/// \brief Defines the BamRecordImpl class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORDIMPL_H
+#define BAMRECORDIMPL_H
+
+#include "pbbam/BamRecordTag.h"
+#include "pbbam/Cigar.h"
+#include "pbbam/Config.h"
+#include "pbbam/Position.h"
+#include "pbbam/QualityValues.h"
+#include "pbbam/TagCollection.h"
+#include <htslib/sam.h>
+#include <map>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal { class BamRecordMemory; }
+
+/// \brief The BamRecordImpl class holds all data necessary for creating,
+///        querying or editing a generic %BAM record.
+///
+/// For PacBio-specific extensions and convenience methods, see BamRecord.
+///
+/// \note This class is mostly an internal implementation detail and will
+///       likely be removed from the public API in the future. Please use
+///       BamRecord as much as possible.
+///
+class PBBAM_EXPORT BamRecordImpl
+{
+public:
+
+    /// These flags describe the alignment status of the record.
+    enum AlignmentFlag
+    {
+        PAIRED              = 0x0001    ///< Record comes from paired-end sequencing
+      , PROPER_PAIR         = 0x0002    ///< Each mate of a pair was properly aligned ("proper" as determined by aligner)
+      , UNMAPPED            = 0x0004    ///< Record was not mapped by aligner
+      , MATE_UNMAPPED       = 0x0008    ///< Record's mate was not mapped by aligner
+      , REVERSE_STRAND      = 0x0010    ///< Record was aligned to reverse strand (Sequence() is reverse-complemented)
+      , MATE_REVERSE_STRAND = 0x0020    ///< Record's mate was aligned to reverse strand (mate's Sequence() is reverse-complemented)
+      , MATE_1              = 0x0040    ///< Record is first mate of pair
+      , MATE_2              = 0x0080    ///< Record is second mate of pair
+      , SECONDARY           = 0x0100    ///< Record is a secondary alignment
+      , FAILED_QC           = 0x0200    ///< Record failed quality controls
+      , DUPLICATE           = 0x0400    ///< Record is a PCR/optical duplicate
+      , SUPPLEMENTARY       = 0x0800    ///< Record is a supplementary alignment
+    };
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    BamRecordImpl(void);
+    BamRecordImpl(const BamRecordImpl& other);
+    BamRecordImpl(BamRecordImpl&& other);
+    BamRecordImpl& operator=(const BamRecordImpl& other);
+    BamRecordImpl& operator=(BamRecordImpl&& other);
+    virtual ~BamRecordImpl(void);
+
+    /// \}
+
+public:
+    /// \name Core Data
+    /// \{
+
+    /// \returns this record's assigned (BAI) index bin ID.
+    uint32_t Bin(void) const;
+
+    /// \returns this record's alignment flag, in raw integer form.
+    uint32_t Flag(void) const;
+
+    /// \returns this record's insert size
+    int32_t InsertSize(void) const;
+
+    /// \returns this record's mapping quality. A value of 255 indicates "unknown"
+    uint8_t MapQuality(void) const;
+
+    /// \returns this record's mate's mapped position, or -1 if unmapped
+    PacBio::BAM::Position MatePosition(void) const;
+
+    /// \returns this record's mate's mapped reference ID, or -1 if unmapped
+    int32_t MateReferenceId(void) const;
+
+    /// \returns this record's mapped position, or -1 if unmapped
+    PacBio::BAM::Position Position(void) const;
+
+    /// \returns this record's mate's mapped reference ID, or -1 if unmapped
+    int32_t ReferenceId(void) const;
+
+    /// Sets the record's (BAI) index bin ID.
+    ///
+    /// \param[in] bin BAI index bin ID.
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& Bin(uint32_t bin);
+
+    /// Sets this record's alignment flag, using a raw integer.
+    ///
+    /// \param[in] flag raw alignment flag
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& Flag(uint32_t flag);
+
+    /// Sets this record's insert size.
+    ///
+    /// \param[in] iSize insert size
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& InsertSize(int32_t iSize);
+
+    /// Sets this record's map quality.
+    ///
+    /// \param[in] mapQual mapping quality - value of 255 indicates "unknown"
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& MapQuality(uint8_t mapQual);
+
+    /// Sets this record's mate's mapped position.
+    ///
+    /// \param[in] pos mapped position. A value of -1 indicates unmapped.
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& MatePosition(PacBio::BAM::Position pos);
+
+    /// Sets this record's mate's mapped reference ID
+    ///
+    /// \param[in] id reference ID. A value of -1 indicates unmapped.
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& MateReferenceId(int32_t id);
+
+    /// Sets this record's mapped position.
+    ///
+    /// \param[in] pos mapped position. A value of -1 indicates unmapped.
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& Position(PacBio::BAM::Position pos);
+
+    /// Sets this record's mapped reference ID
+    ///
+    /// \param[in] id reference ID. A value of -1 indicates unmapped.
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& ReferenceId(int32_t id);
+
+    /// \}
+
+public:
+    /// \name Alignment Flags
+    /// \{
+
+    /// \returns true if this record is a PCR/optical duplicate
+    bool IsDuplicate(void) const;
+
+    /// \returns true if this record failed quality controls
+    bool IsFailedQC(void) const;
+
+    /// \returns true if this record is the first mate of a pair
+    bool IsFirstMate(void) const;
+
+    /// \returns true if this record was mapped by aligner
+    bool IsMapped(void) const;
+
+    /// \returns true if this record's mate was mapped by aligner
+    bool IsMateMapped(void) const;
+
+    /// \returns true if this record's mate was mapped to the reverse strand
+    bool IsMateReverseStrand(void) const;
+
+    /// \returns true if this record comes from paired-end sequencing
+    bool IsPaired(void) const;
+
+    /// \returns true if this record is a read's primary alignment
+    bool IsPrimaryAlignment(void) const;
+
+    /// \returns true if this record & its mate were properly aligned
+    bool IsProperPair(void) const;
+
+    /// \returns true if this record was mapped to the reverse strand
+    bool IsReverseStrand(void) const;
+
+    /// \returns true if this record is the second mate of a pair
+    bool IsSecondMate(void) const;
+
+    /// \returns true if this record is a supplementary alignment
+    bool IsSupplementaryAlignment(void) const;
+
+    /// Sets whether this record is a PCR/optical duplicate
+    BamRecordImpl& SetDuplicate(bool ok);
+
+    /// Sets whether this record failed quality controls
+    BamRecordImpl& SetFailedQC(bool ok);
+
+    /// Sets whether this record is the first mate of a pair.
+    BamRecordImpl& SetFirstMate(bool ok);
+
+    /// Sets whether this record was aligned.
+    BamRecordImpl& SetMapped(bool ok);
+
+    /// Sets whether this record's mate was aligned.
+    BamRecordImpl& SetMateMapped(bool ok);
+
+    /// Sets whether this record's mate mapped to reverse strand.
+    BamRecordImpl& SetMateReverseStrand(bool ok);
+
+    /// Sets whether this record came from paired-end sequencing.
+    BamRecordImpl& SetPaired(bool ok);
+
+    /// Sets whether this record is a read's primary alignment.
+    BamRecordImpl& SetPrimaryAlignment(bool ok);
+
+    /// Sets whether this record & its mate were properly mapped, per the aligner.
+    BamRecordImpl& SetProperPair(bool ok);
+
+    /// Sets whether this record mapped to reverse strand.
+    BamRecordImpl& SetReverseStrand(bool ok);
+
+    /// Sets whether this record is the second mate of a pair.
+    BamRecordImpl& SetSecondMate(bool ok);
+
+    /// Sets whether this record is a supplementary alignment.
+    BamRecordImpl& SetSupplementaryAlignment(bool ok);
+
+    /// \}
+
+public:
+    /// \name Variable-length Data (sequence, qualities, etc.)
+    /// \{
+
+    /// \returns the record's CIGAR data as a Cigar object
+    Cigar CigarData(void) const;
+
+    /// Sets the record's CIGAR data using a Cigar object
+    ///
+    /// \param[in] cigar PacBio::BAM::Cigar object
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& CigarData(const Cigar& cigar);
+
+    /// Sets the record's CIGAR data using a CIGAR-formatted string.
+    ///
+    /// \param[in] cigarString CIGAR-formatted string
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& CigarData(const std::string& cigarString);
+
+    // TODO: CIGAR iterator - Cigar only or here as well ??
+
+    /// \returns the record's query name
+    std::string Name(void) const;
+
+    /// Sets the record's "query name".
+    ///
+    /// \param name new name
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& Name(const std::string& name);
+
+    /// \returns the record's quality values (phred-style ASCII)
+    ///
+    /// \note Usually Qualities().size() == Sequence.size(). However, in
+    ///       some data sets, the quality values are not provided. In that
+    ///       case, this method will return an empty container.
+    ///
+    QualityValues Qualities(void) const;
+
+    /// \returns the record's DNA sequence.
+    std::string Sequence(void) const;
+
+    size_t SequenceLength(void) const;
+
+    /// \brief Sets the record's DNA sequence and quality values
+    ///
+    /// This is an overloaded function. Sets the DNA sequence and quality
+    /// values, using the length of \p sequence.
+    ///
+    /// \note When using this overload (and \p qualities is non-empty), the
+    ///       lengths of \p sequence and \p qualities \b must be equal.
+    ///
+    /// \todo How to handle mismatched lengths?
+    ///
+    /// \param[in] sequence  std::string containing DNA sequence
+    /// \param[in] qualities std::string containing ASCII quality values
+    ///
+    /// \returns reference to this record.
+    ///
+    /// \sa SetSequenceAndQualities(const char* sequence,
+    ///     const size_t sequenceLength, const char* qualities)
+    ///
+    BamRecordImpl& SetSequenceAndQualities(const std::string& sequence,
+                                           const std::string& qualities = std::string());
+
+    /// \brief Sets the record's DNA sequence and quality values.
+    ///
+    /// The \p sequence must consist of IUPAC nucleotide codes {=ACMGRSVTWYHKDBN}.
+    /// The \p qualities, if not empty, must consist of 'phred'-style ASCII
+    /// quality values. \p qualities may be an empty string or NULL pointer in
+    /// cases where there are no such data available.
+    ///
+    /// \param[in] sequence         C-string containing DNA sequence
+    /// \param[in] sequenceLength   length of DNA sequence
+    /// \param[in] qualities        C-string containing 'phred-style' ASCII
+    ///                             quality values
+    ///
+    /// \note \p sequence does \b NOT have to be NULL-terminated. Length is
+    ///       explicitly determined by the value of \p sequenceLength provided.
+    ///
+    /// \returns reference to this record.
+    ///
+    BamRecordImpl& SetSequenceAndQualities(const char* sequence,
+                                           const size_t sequenceLength,
+                                           const char* qualities = 0);
+
+    /// \brief Sets the record's DNA sequence and quality values.
+    ///
+    /// The \p encodedSequence should be preencoded/packed into the BAM binary
+    /// format. The \p qualities, if not empty, must consist of 'phred'-style
+    /// ASCII quality values. \p qualities may be an empty string or NULL
+    /// pointer in cases where there are no such data available.
+    ///
+    /// \param[in] encodedSequence      C-string containing BAM-format-encoded
+    ///                                 DNA sequence
+    /// \param[in] rawSequenceLength    length of DNA sequence (not the encoded
+    ///                                 length)
+    /// \param[in] qualities            C-string containing 'phred-style' ASCII
+    ///                                 quality values
+    ///
+    /// \note \p encodedSequence does \b NOT have to be NULL-terminated. Length
+    ///       is explicitly determined by the value of \p sequenceLength
+    ///       provided.
+    ///
+    /// \returns reference to this record.
+    ///
+    /// \sa SetSequenceAndQualities(const char* sequence,
+    ///     const size_t sequenceLength, const char* qualities)
+    ///
+    BamRecordImpl& SetPreencodedSequenceAndQualities(const char* encodedSequence,
+                                                     const size_t rawSequenceLength,
+                                                     const char* qualities = 0);
+
+    /// \}
+
+public:
+    /// \name Tag Data
+    /// \{
+
+    /// \returns record's full tag data as a TagCollection object
+    TagCollection Tags(void) const;
+
+    /// \brief Sets the record's full tag data via a TagCollection object
+    ///
+    BamRecordImpl& Tags(const TagCollection& tags);
+
+    /// \brief Adds a new tag to this record.
+    ///
+    /// \param[in] tagName  2-character tag name.
+    /// \param[in] value    Tag object that describes the type & value of data
+    ///                     to be added
+    ///
+    /// \note Any value that can be used to implicitly construct a Tag is valid.
+    /// \code
+    ///     string s;
+    ///     vector<uint32_t> v;
+    ///     record.AddTag("XX", s); // will add a string-type tag
+    ///     record.AddTag("YY", v); // will add a uint32-array-type tag
+    /// \endcode
+    ///
+    /// \returns true if tag was successfully added.
+    ///
+    bool AddTag(const std::string& tagName,
+                const Tag& value);
+
+    /// \brief Adds a new tag to this record.
+    ///
+    /// This is an overloaded method.
+    ///
+    /// \param[in] tag      BamRecordTag enum
+    /// \param[in] value    Tag object that describes the type & value of data
+    ///                     to be added
+    /// \returns true if tag was successfully added.
+    ///
+    bool AddTag(const BamRecordTag tag,
+                const Tag& value);
+
+    /// \brief Adds a new tag to this record, with an optional modifier.
+    ///
+    /// \param[in] tagName              2-character tag name.
+    /// \param[in] value                Tag object that describes the type &
+    ///                                 value of data to be added
+    /// \param[in] additionalModifier   optional extra modifier (for explicit
+    ///                                 modification of an otherwise const Tag)
+    ///
+    /// \note Any value that can be used to implicitly construct a Tag is valid.
+    /// \code
+    ///     char c;
+    ///     string h;
+    ///     record.AddTag("XX", c, TagModifier::ASCII_CHAR); // will add a char-type tag
+    ///     record.AddTag("YY", h, TagModifier::HEX_STRING); // will add a hex string-type tag
+    /// \endcode
+    ///
+    /// \returns true if tag was successfully added.
+    ///
+    bool AddTag(const std::string& tagName,
+                const Tag& value,
+                const TagModifier additionalModifier);
+
+    /// \brief Adds a new tag to this record, with an optional modifier.
+    ///
+    /// This is an overloaded method.
+    ///
+    /// \param[in] tag                  BamRecordTag enum.
+    /// \param[in] value                Tag object that describes the type &
+    ///                                 value of data to be added
+    /// \param[in] additionalModifier   optional extra modifier (for explicit
+    ///                                 modification of an otherwise const Tag)
+    ///
+    /// \returns true if tag was successfully added.
+    ///
+    bool AddTag(const BamRecordTag tag,
+                const Tag& value,
+                const TagModifier additionalModifier);
+
+    /// \brief Edits an existing tag on this record.
+    ///
+    /// \param[in] tagName      2-character tag name. Name must be present
+    ///                         (see HasTag)
+    /// \param[in] newValue     Tag object that describes the type & value of
+    ///                         new data to be added
+    ///
+    /// \note Any value that can be used to implicitly construct a Tag is valid.
+    /// \code
+    ///     string s;
+    ///     vector<uint32_t> v;
+    ///     record.EditTag("XX", s); // will overwrite tag XX with a string-type tag
+    ///     record.EditTag("YY", v); // will overwrite tag YY with a uint32-array-type tag
+    /// \endcode
+    ///
+    /// \returns true if tag was successfully edited.
+    ///
+    bool EditTag(const std::string& tagName,
+                 const Tag& newValue);
+
+    /// \brief Edits an existing tag on this record.
+    ///
+    /// This is an overloaded method.
+    ///
+    /// \param[in] tag          BamRecordTag enum
+    /// \param[in] newValue     Tag object that describes the type & value of
+    ///                         new data to be added
+    ///
+    /// \returns true if tag was successfully edited.
+    ///
+    bool EditTag(const BamRecordTag tag,
+                 const Tag& newValue);
+
+    /// \brief Edits an existing tag on this record.
+    ///
+    /// \param[in] tagName              2-character tag name. Name must be
+    ///                                 present (see HasTag)
+    /// \param[in] value                Tag object that describes the type &
+    ///                                 value of new data to be added
+    /// \param[in] additionalModifier   optional extra modifier (for explicit
+    ///                                 modification of an otherwise const Tag)
+    ///
+    /// \note Any value that can be used to implicitly construct a Tag is valid.
+    /// \code
+    ///     char c;
+    ///     string h;
+    ///     record.EditTag("XX", c, TagModifier::ASCII_CHAR); // will overwrite tag XX with a char-type tag
+    ///     record.EditTag("YY", h, TagModifier::HEX_STRING); // will overwrite tag YY with a hex string-type tag
+    /// \endcode
+    ///
+    /// \returns true if tag was successfully edited.
+    ///
+    bool EditTag(const std::string& tagName,
+                 const Tag& value,
+                 const TagModifier additionalModifier);
+
+    /// \brief Edits an existing tag on this record.
+    ///
+    /// This is an overloaded method.
+    ///
+    /// \param[in] tag                  BamRecordTag enum
+    /// \param[in] value                Tag object that describes the type &
+    ///                                 value of new data to be added
+    /// \param[in] additionalModifier   optional extra modifier (for explicit
+    ///                                 modification of an otherwise const Tag)
+    ///
+    /// \returns true if tag was successfully edited.
+    ///
+    bool EditTag(const BamRecordTag tag,
+                 const Tag& value,
+                 const TagModifier additionalModifier);
+
+
+    /// \returns true if a tag with this name is present in this record.
+    bool HasTag(const std::string& tagName) const;
+
+    /// \returns true if this tag is present in this record.
+    ///
+    /// This is an overloaded method.
+    ///
+    bool HasTag(const BamRecordTag tag) const;
+
+    /// \brief Removes an existing tag from this record.
+    ///
+    /// \param[in] tagName  2-character tag name.
+    ///
+    /// \returns true if tag was actaully removed (i.e. false if tagName
+    ///          previously unknown)
+    /// \sa HasTag
+    ///
+    bool RemoveTag(const std::string& tagName);
+
+    /// \brief Removes an existing tag from this record.
+    ///
+    /// This is an overloaded method.
+    ///
+    /// \param[in] tag  BamRecordTag enum
+    ///
+    /// \returns true if tag was actaully removed (i.e. false if tagName
+    ///          previously unknown)
+    /// \sa HasTag
+    ///
+    bool RemoveTag(const BamRecordTag tag);
+
+    /// \brief Fetches a tag from this record.
+    ///
+    /// \param[in] tagName  2-character tag name.
+    ///
+    /// \returns Tag object for the requested name. If name is unknown, a
+    ///          default constructed Tag is returned (Tag::IsNull() is true).
+    ///
+    Tag TagValue(const std::string& tagName) const;
+
+    /// \brief Fetches a tag from this record.
+    ///
+    /// This is an overloaded method
+    ///
+    /// \param[in] tag  BamRecordTag enum
+    ///
+    /// \returns Tag object for the requested name. If name is unknown, a
+    ///          default constructed Tag is returned (Tag::IsNull() is true).
+    ///
+    Tag TagValue(const BamRecordTag tag) const;
+
+    // change above to Tag();
+
+//    template<typename T>
+//    T TagValue(const std::string& tagName) const;
+
+
+    /// \}
+
+private:
+    // returns a BamRecordImpl object, with a deep copy of @rawData contents
+    static BamRecordImpl FromRawData(const PBBAM_SHARED_PTR<bam1_t>& rawData);
+
+    // internal memory setup/expand methods
+    void InitializeData(void);
+    void MaybeReallocData(void);
+    void UpdateTagMap(void) const; // allowed to be called from const methods
+                                   // (lazy update on request)
+
+    // internal tag helper methods
+    bool AddTagImpl(const std::string& tagName,
+                    const Tag& value,
+                    const TagModifier additionalModifier);
+    bool RemoveTagImpl(const std::string& tagName);
+    int TagOffset(const std::string& tagName) const;
+
+    // core seq/qual logic shared by the public API
+    BamRecordImpl& SetSequenceAndQualitiesInternal(const char* sequence,
+                                                      const size_t sequenceLength,
+                                                      const char* qualities,
+                                                      bool isPreencoded);
+
+private:
+
+    // data members
+    PBBAM_SHARED_PTR<bam1_t> d_;
+    mutable std::map<uint16_t, int> tagOffsets_;
+
+    // friends
+    friend class internal::BamRecordMemory;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/BamRecordImpl.inl"
+
+#endif // BAMRECORDIMPL_H
diff --git a/include/pbbam/BamRecordTag.h b/include/pbbam/BamRecordTag.h

new file mode 100644 (file)

index 0000000..93768ca
--- /dev/null
+++ b/include/pbbam/BamRecordTag.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamRecordTag.h
+/// \brief Defines the BamRecordTag enum.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORDTAG_H
+#define BAMRECORDTAG_H
+
+namespace PacBio {
+namespace BAM {
+
+enum class BamRecordTag
+{
+    ALT_LABEL_QV
+  , ALT_LABEL_TAG
+  , BARCODE_QUALITY
+  , BARCODES
+  , CONTEXT_FLAGS
+  , DELETION_QV
+  , DELETION_TAG
+  , HOLE_NUMBER
+  , INSERTION_QV
+  , IPD
+  , LABEL_QV
+  , MERGE_QV
+  , NUM_PASSES
+  , PKMEAN
+  , PKMEAN_2
+  , PKMID
+  , PKMID_2
+  , PRE_PULSE_FRAMES
+  , PULSE_CALL
+  , PULSE_CALL_WIDTH
+  , PULSE_MERGE_QV
+  , PULSE_WIDTH
+  , QUERY_END
+  , QUERY_START
+  , READ_ACCURACY
+  , READ_GROUP
+  , SCRAP_REGION_TYPE
+  , SCRAP_ZMW_TYPE
+  , SNR
+  , START_FRAME
+  , SUBSTITUTION_QV
+  , SUBSTITUTION_TAG
+
+  //
+  // not tags per se, but faking these here to simplify data fetching
+  //
+  , QUAL
+  , SEQ
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // BAMRECORDTAG_H
diff --git a/include/pbbam/BamRecordView.h b/include/pbbam/BamRecordView.h

new file mode 100644 (file)

index 0000000..afbebad
--- /dev/null
+++ b/include/pbbam/BamRecordView.h
@@ -0,0 +1,168 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamRecordView.h
+/// \brief Defines the BamRecordView class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORDVIEW_H
+#define BAMRECORDVIEW_H
+
+#include "pbbam/BamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief Provides a re-usable "view" onto a BamRecord
+///
+/// This class acts a convenience wrapper for working with per-base BamRecord
+/// data. Most of these BamRecord methods take a list of parameters, to adjust
+/// how the underlying data are presented to client code. Often these parameters
+/// will be re-used for each BamRecord method call. Thus, to simplify such
+/// client code, a BamRecordView can be used to state those parameters once, and
+/// then simply request the desired fields.
+///
+/// \internal
+/// \todo Sync up method names with BamRecord
+/// \endinternal
+///
+class PBBAM_EXPORT BamRecordView
+{
+public:
+    /// \brief Constructs a view onto \p record using the supplied parameters.
+    ///
+    /// For frame or QV data, if \p aligned is true, a value of 0 (Accuracy or
+    /// QualityValue) will be used at each inserted or padded base location.
+    ///
+    /// \param[in] record           BamRecord data source.
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    BamRecordView(const BamRecord& record,
+                  const Orientation orientation,
+                  const bool aligned,
+                  const bool exciseSoftClips,
+                  const PulseBehavior pulseBehavior = PulseBehavior::ALL);
+
+public:
+
+    /// \returns BamRecord::AltLabelQV with this view's parameters applied
+    QualityValues AltLabelQVs(void) const;
+
+    /// \returns BamRecord::AltLabelTag with this view's parameters applied
+    std::string AltLabelTags(void) const;
+
+    /// \returns BamRecord::DeletionQV with this view's parameters applied
+    QualityValues DeletionQVs(void) const;
+
+    /// \returns BamRecord::DeletionTag with this view's parameters applied
+    std::string DeletionTags(void) const;
+
+    /// \returns BamRecord::InsertionQV with this view's parameters applied
+    QualityValues InsertionQVs(void) const;
+
+    /// \returns BamRecord::IPD with this view's parameters applied
+    Frames IPD(void) const;
+
+    /// \returns BamRecord::LabelQV with this view's parameters applied
+    QualityValues LabelQVs(void) const;
+
+    /// \returns BamRecord::MergeQV with this view's parameters applied
+    QualityValues MergeQVs(void) const;
+
+    /// \returns BamRecord::PulseMergeQV with this view's parameters applied
+    QualityValues PulseMergeQVs(void) const;
+
+    /// \returns BamRecord::Pkmean with this view's parameters applied
+    std::vector<float> Pkmean(void) const;
+
+    /// \returns BamRecord::Pkmid with this view's parameters applied
+    std::vector<float> Pkmid(void) const;
+
+    /// \returns BamRecord::Pkmean2 with this view's parameters applied
+    std::vector<float> Pkmean2(void) const;
+
+    /// \returns BamRecord::Pkmid2 with this view's parameters applied
+    std::vector<float> Pkmid2(void) const;
+
+    /// \returns BamRecord::PreBaseFrames with this view's parameters applied
+    Frames PrebaseFrames(void) const;
+
+    /// \returns BamRecord::PrePulseFrames with this view's parameters applied
+    Frames PrePulseFrames(void) const;
+
+    /// \returns BamRecord::PulseCalls with this view's parameters applied
+    std::string PulseCalls(void) const;
+
+    /// \returns BamRecord::PulseCallWidth with this view's parameters applied
+    Frames PulseCallWidth(void) const;
+
+    /// \returns BamRecord::PulseWidths with this view's parameters applied
+    Frames PulseWidths(void) const;
+
+    /// \returns BamRecord::Qualities with this view's parameters applied
+    QualityValues Qualities(void) const;
+
+    /// \returns BamRecord::Sequence with this view's parameters applied
+    std::string Sequence(void) const;
+
+    /// \returns BamRecord::StartFrame with this view's parameters applied
+    std::vector<uint32_t> StartFrames(void) const;
+
+    /// \returns BamRecord::SubstitutionQV with this view's parameters applied
+    QualityValues SubstitutionQVs(void) const;
+
+    /// \returns BamRecord::SubstitutionTag with this view's parameters applied
+    std::string SubstitutionTags(void) const;
+
+private:
+    const BamRecord& record_;
+    Orientation orientation_;
+    bool aligned_;
+    bool exciseSoftClips_;
+    PulseBehavior pulseBehavior_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/BamRecordView.inl"
+
+#endif // BAMRECORDVIEW_H
diff --git a/include/pbbam/BamTagCodec.h b/include/pbbam/BamTagCodec.h

new file mode 100644 (file)

index 0000000..9126900
--- /dev/null
+++ b/include/pbbam/BamTagCodec.h
@@ -0,0 +1,124 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamTagCodec.h
+/// \brief Defines the BamTagCodec class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMTAGCODEC_H
+#define BAMTAGCODEC_H
+
+#include "pbbam/Config.h"
+#include "pbbam/TagCollection.h"
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The BamTagCodec class provides binary encoding/decoding of %BAM tag
+///        data.
+///
+/// \note BamTagCodec is mostly an implementation and/or testing detail, and may
+///       be removed from the public API.
+///
+class PBBAM_EXPORT BamTagCodec
+{
+public:
+    /// \name Tag Collection Methods
+    /// \{
+
+    /// \brief Creates a TagCollection from raw BAM data.
+    ///
+    /// \param[in] data     BAM-formatted (binary) tag data
+    /// \returns TagCollection containing tag data
+    ///
+    static TagCollection Decode(const std::vector<uint8_t>& data);
+
+    /// \brief Creates binary BAM data from a TagCollection.
+    ///
+    /// \param[in] tags     TagCollection containing tag data
+    /// \returns vector of bytes (encoded BAM data)
+    ///
+    static std::vector<uint8_t> Encode(const PacBio::BAM::TagCollection& tags);
+
+    /// \}
+
+public:
+    /// \name Per-Tag Methods
+    /// \{
+
+    /// \brief Determines the SAM/BAM tag code for a Tag.
+    ///
+    /// \param[in] tag                  Tag object to check
+    /// \param[in] additionalModifier   optional extra modifier (allows explicit
+    ///                                 modification of an otherwise const Tag)
+    ///
+    /// \returns the SAM/BAM single char code for tag type
+    ///
+    static uint8_t TagTypeCode(const PacBio::BAM::Tag& tag,
+                               const TagModifier& additionalModifier = TagModifier::NONE);
+
+    /// \brief Encodes a single Tag's contents in %BAM binary
+    ///
+    /// \note This method does \b NOT encode the tag name & tag type. It does
+    ///       include the element type for array-type tags.
+    ///
+    /// \param[in] tag                  Tag object containing data to encode
+    /// \param[in] additionalModifier   optional extra modifier (allows explicit
+    ///                                 modification of an otherwise const Tag)
+    ///
+    /// \returns vector of bytes (encoded BAM data)
+    ///
+    static std::vector<uint8_t> ToRawData(const PacBio::BAM::Tag& tag,
+                                          const TagModifier& additionalModifier = TagModifier::NONE);
+
+    /// \brief Creates a Tag object from binary BAM data.
+    ///
+    /// \param[in] rawData      raw BAM bytes (assumed to be the result of
+    ///                         htslib's bam_aux_get())
+    ///
+    /// \returns resulting Tag object
+    ///
+    static PacBio::BAM::Tag FromRawData(uint8_t* rawData);
+
+    /// \}
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // BAMTAGCODEC_H
diff --git a/include/pbbam/BamWriter.h b/include/pbbam/BamWriter.h

new file mode 100644 (file)

index 0000000..b12df9b
--- /dev/null
+++ b/include/pbbam/BamWriter.h
@@ -0,0 +1,210 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamWriter.h
+/// \brief Defines the BamWriter class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMWRITER_H
+#define BAMWRITER_H
+
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/Config.h"
+#include "pbbam/IRecordWriter.h"
+#include <htslib/sam.h>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+class BamFile;
+
+namespace internal { class BamWriterPrivate; }
+
+/// \brief The BamWriter class provides a writing interface for creating
+///        new %BAM files.
+///
+/// \note The underlying buffered data may not be flushed to the file until the
+///       destructor is called. Trying to access the file (reading, stat-ing,
+///       indexing, etc.) before the BamWriter is destroyed yields undefined
+///       behavior. Enclose the BamWriter in some form of local scope (curly
+///       braces, a separate function, etc.) to ensure that its destructor is
+///       called before proceeding to read-based operations.
+///
+/// \code{.cpp}
+///  {
+///     BamWriter w(...);
+///     // write data
+///  }
+///  // now safe to access the new file
+/// \endcode
+///
+///
+class PBBAM_EXPORT BamWriter : public IRecordWriter
+{
+public:
+    /// \brief This enum allows you to control the compression level of the
+    ///        output %BAM file.
+    ///
+    /// Values are equivalent to zlib compression levels. See its documentation
+    /// for more details: http://www.zlib.net/manual.html
+    ///
+    enum CompressionLevel
+    {
+        CompressionLevel_0 = 0
+      , CompressionLevel_1 = 1
+      , CompressionLevel_2 = 2
+      , CompressionLevel_3 = 3
+      , CompressionLevel_4 = 4
+      , CompressionLevel_5 = 5
+      , CompressionLevel_6 = 6
+      , CompressionLevel_7 = 7
+      , CompressionLevel_8 = 8
+      , CompressionLevel_9 = 9
+
+      , DefaultCompression = -1
+      , NoCompression      = CompressionLevel_0
+      , FastCompression    = CompressionLevel_1
+      , BestCompression    = CompressionLevel_9
+    };
+
+    /// \brief This enum allows you to control whether BAI bin numbers are
+    ///        calculated for output records.
+    /// 
+    /// For most cases, the default behavior (ON) should be retained for maximum
+    /// compatibility with downstream tools (e.g. samtools index). Disabling bin
+    /// calculation should only be used if all records are known to never be
+    /// mapped, and even then only if profiling revelas the calculation to
+    /// affect extremely performance-sensitive, "critical paths".
+    ///
+    enum BinCalculationMode
+    {
+        BinCalculation_ON = 0
+      , BinCalculation_OFF
+    };
+
+public:
+
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Opens a %BAM file for writing & writes the header information.
+    ///
+    /// The error status will be set if either operation fails.
+    ///
+    /// \note Set \p filename to "-" for stdout.
+    ///
+    /// \param[in] filename         path to output %BAM file
+    /// \param[in] header           BamHeader object
+    /// \param[in] compressionLevel zlib compression level
+    /// \param[in] numThreads       number of threads for compression. If set to
+    ///                             0, BamWriter will attempt to determine a
+    ///                             reasonable estimate. If set to 1, this will
+    ///                             force single-threaded execution. No checks
+    ///                             are made against an upper limit.
+    ///
+    /// \param[in] binCalculationMode BAI bin calculation mode. The default
+    ///            behavior will ensure proper bin numbers are provided for all
+    ///            records written. This extra step may turned off when bin
+    ///            numbers are not needed. Though if in doubt, keep the default.
+    ///
+    /// \throws std::runtmie_error if there was a problem opening the file for
+    ///         writing or if an error occurred while writing the header
+    ///
+    BamWriter(const std::string& filename,
+              const BamHeader& header,
+              const BamWriter::CompressionLevel compressionLevel = BamWriter::DefaultCompression,
+              const size_t numThreads = 4,
+              const BinCalculationMode binCalculationMode = BamWriter::BinCalculation_ON);
+
+    /// Fully flushes all buffered data & closes file.
+    ~BamWriter(void);
+
+    /// \}
+
+public:
+
+    /// \name Data Writing & Resource Management
+    /// \{
+
+    /// \brief Try to flush any buffered data to file.
+    ///
+    /// \note The underlying implementation doesn't necessarily flush buffered
+    ///       data immediately, especially in a multithreaded writer situation.
+    ///       Let the BamWriter go out of scope to fully ensure flushing.
+    ///
+    /// \throws std::runtime_error if flush fails
+    ///
+    void TryFlush(void);
+
+    /// \brief Write a record to the output %BAM file.
+    ///
+    /// \param[in] record BamRecord object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    void Write(const BamRecord& record);
+
+    /// \brief Write a record to the output %BAM file.
+    ///
+    /// \param[in] record BamRecord object
+    /// \param[out] vOffset BGZF virtual offset to start of \p record
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    void Write(const BamRecord& record, int64_t* vOffset);
+
+    /// \brief Write a record to the output %BAM file.
+    ///
+    /// \param[in] recordImpl BamRecordImpl object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    void Write(const BamRecordImpl& recordImpl);
+
+    /// \}
+
+private:
+    std::unique_ptr<internal::BamWriterPrivate> d_;
+    DISABLE_MOVE_AND_COPY(BamWriter);
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // BAMWRITER_H
diff --git a/include/pbbam/BarcodeQuery.h b/include/pbbam/BarcodeQuery.h

new file mode 100644 (file)

index 0000000..9b55167
--- /dev/null
+++ b/include/pbbam/BarcodeQuery.h
@@ -0,0 +1,97 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BarcodeQuery.h
+/// \brief Defines the BarcodeQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef BARCODEQUERY_H
+#define BARCODEQUERY_H
+
+#include "pbbam/Config.h"
+#include "pbbam/internal/QueryBase.h"
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The BarcodeQuery class provides iterable access to a DataSet's %BAM
+///        records, limiting results to those matching a particular barcode.
+///
+/// Example:
+/// \include code/BarcodeQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT BarcodeQuery : public internal::IQuery
+{
+public:
+    /// \brief Creates a new BarcodeQuery, limiting record results to only those
+    ///        annotated with a particular barcode ID.
+    ///
+    /// \param[in] barcode  filtering criteria
+    /// \param[in] dataset  input data source(s)
+    ///
+    /// \sa BamRecord::Barcodes
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or PBI
+    ///         files.
+    ///
+    BarcodeQuery(const int16_t barcode, const DataSet& dataset);
+
+    ~BarcodeQuery(void);
+
+public:
+
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r);
+
+private:
+    struct BarcodeQueryPrivate;
+    std::unique_ptr<BarcodeQueryPrivate> d_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // BARCODEQUERY_H
diff --git a/include/pbbam/Cigar.h b/include/pbbam/Cigar.h

new file mode 100644 (file)

index 0000000..c391057
--- /dev/null
+++ b/include/pbbam/Cigar.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Cigar.h
+/// \brief Defines the Cigar class.
+//
+// Author: Derek Barnett
+
+#ifndef CIGAR_H
+#define CIGAR_H
+
+#include "pbbam/CigarOperation.h"
+#include "pbbam/Config.h"
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The Cigar class represents the CIGAR string used to report alignment
+///        charateristics in SAM/BAM.
+///
+/// \note Use of the 'M' operator is forbidden in PacBio BAMs. See
+///       CigarOperationType description for more information.
+///
+/// \sa https://samtools.github.io/hts-specs/SAMv1.pdf for more information on CIGAR in general.
+///
+class PBBAM_EXPORT Cigar : public std::vector<CigarOperation>
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a Cigar object from SAM/BAM string input
+    ///
+    /// \param [in] stdString   SAM/BAM formatted CIGAR data
+    /// \returns a Cigar object representing the input data
+    ///
+    /// \note This class may be removed from the public API in the future,
+    ///       as the constructor taking a std::string accomplishes the same end.
+    ///
+    static Cigar FromStdString(const std::string& stdString);
+
+    /// \brief Creates an empty Cigar.
+    Cigar(void);
+
+    /// \brief Creates a Cigar object from SAM/BAM string input
+    ///
+    /// \param [in] cigarString   SAM/BAM formatted CIGAR data
+    ///
+    Cigar(const std::string& cigarString);
+
+    Cigar(const Cigar& other);
+    Cigar(Cigar&& other);
+    Cigar& operator=(const Cigar& other);
+    Cigar& operator=(Cigar&& other);
+    ~Cigar(void);
+
+    /// \}
+
+public:
+    /// \name Conversion Methods
+    /// \{
+
+    /// Converts Cigar object data to SAM/BAM formatted string
+    ///
+    /// \returns SAM/BAM formatted std::string
+    ///
+    std::string ToStdString(void) const;
+
+    /// \}
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/Cigar.inl"
+
+#endif // CIGAR_H
diff --git a/include/pbbam/CigarOperation.h b/include/pbbam/CigarOperation.h

new file mode 100644 (file)

index 0000000..9b936ef
--- /dev/null
+++ b/include/pbbam/CigarOperation.h
@@ -0,0 +1,174 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file CigarOperation.h
+/// \brief Defines the CigarOperationType enum & CigarOperation class.
+//
+// Author: Derek Barnett
+
+#ifndef CIGAROPERATION_H
+#define CIGAROPERATION_H
+
+#include "pbbam/Config.h"
+#include <stdexcept>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief Describes a CIGAR operation.
+///
+/// Bracketed character is the corresponding SAM/BAM character code.
+///
+/// \warning ALIGNMENT_MATCH ('M') is included in this enum to maintain
+///          consistency with htslib. However, as of PacBio BAM spec version
+///          3.0b7, this CIGAR operation \b forbidden. Any attempt to read or
+///          write a record containing this operation will trigger a
+///          std::runtime_error. SEQUENCE_MATCH('=) or SEQUENCE_MISMATCH('X')
+///          should be used instead.
+///
+enum class CigarOperationType
+{
+    UNKNOWN_OP        = -1 ///< unknown/invalid CIGAR operator
+  , ALIGNMENT_MATCH   = 0  ///< alignment match (can be a sequence match or mismatch) [M]
+  , INSERTION              ///< insertion to the reference [I]
+  , DELETION               ///< deletion from the reference [D]
+  , REFERENCE_SKIP         ///< skipped region from the reference [N]
+  , SOFT_CLIP              ///< soft clipping (clipped sequences present in SEQ) [S]
+  , HARD_CLIP         = 5  ///< hard clipping (clipped sequences NOT present in SEQ) [H]
+  , PADDING                ///< padding (silent deletion from padded reference) [P]
+  , SEQUENCE_MATCH         ///< sequence match [=]
+  , SEQUENCE_MISMATCH      ///< sequence mismatch [X]
+};
+
+/// \brief The CigarOperation class represents a single CIGAR operation
+///        (consisting of a type & length).
+///
+class PBBAM_EXPORT CigarOperation
+{
+public:
+
+    /// \name Operation Type Conversion Methods
+    /// \{
+
+    /// Convert between CigarOperationType enum & SAM/BAM character code.
+    ///
+    /// \param[in] type CigarOperationType value
+    /// \returns SAM/BAM character code
+    static char TypeToChar(const CigarOperationType type);
+
+    /// Convert between CigarOperationType enum & SAM/BAM character code.
+    ///
+    /// \param[in] c SAM/BAM character code
+    /// \returns CigarOperationType value
+    static CigarOperationType CharToType(const char c);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    CigarOperation(void);
+    CigarOperation(char c, uint32_t length);
+    CigarOperation(CigarOperationType op, uint32_t length);
+    CigarOperation(const CigarOperation& other);
+    CigarOperation(CigarOperation&& other);
+    CigarOperation& operator=(const CigarOperation& other);
+    CigarOperation& operator=(CigarOperation&& other);
+    ~CigarOperation(void);
+
+    /// \}
+
+public:
+
+    /// \returns operation type as SAM/BAM char code
+    inline char Char(void) const;
+
+    /// \returns operation length
+    inline uint32_t Length(void) const;
+
+    /// \returns operation type as CigarOperationType enum value
+    inline CigarOperationType Type(void) const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// Sets this operation type.
+    ///
+    /// \param[in] opChar SAM/BAM character code
+    /// \returns reference to this operation
+    inline CigarOperation& Char(const char opChar);
+
+    /// Sets this operation length.
+    ///
+    /// \param[in] length
+    /// \returns reference to this operation
+    inline CigarOperation& Length(const uint32_t length);
+
+    /// Sets this operation type.
+    ///
+    /// \param[in] opType CigarOperationType value
+    /// \returns reference to this operation
+    inline CigarOperation& Type(const CigarOperationType opType);
+
+    /// \}
+
+public:
+    /// \name Comparison Operators
+    /// \{
+
+    /// \returns true if both CIGAR operation type & length match
+    inline bool operator==(const CigarOperation& other) const;
+
+    /// \returns true if either CIGAR operation type or length differ
+    inline bool operator!=(const CigarOperation& other) const;
+
+    /// \}
+
+private:
+    CigarOperationType type_;
+    uint32_t length_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/CigarOperation.inl"
+
+#endif // CIGAROPERATION_H
diff --git a/include/pbbam/ClipType.h b/include/pbbam/ClipType.h

new file mode 100644 (file)

index 0000000..eb97167
--- /dev/null
+++ b/include/pbbam/ClipType.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ClipType.h
+/// \brief Defines the ClipType enum.
+//
+// Author: Derek Barnett
+
+#ifndef CLIPTYPE_H
+#define CLIPTYPE_H
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the modes supported by BamRecord clipping
+///        operations.
+///
+/// Methods like BamRecord::Clip accept Position parameters - which may be in
+/// either polymerase or reference coorindates. Using this enum as a flag
+/// indicates how the positions should be interpreted.
+///
+enum class ClipType
+{
+    CLIP_NONE           ///< No clipping will be performed.
+  , CLIP_TO_QUERY       ///< Clipping positions are in polymerase coordinates.
+  , CLIP_TO_REFERENCE   ///< Clipping positions are in genomic coordinates.
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // CLIPTYPE_H
diff --git a/include/pbbam/Compare.h b/include/pbbam/Compare.h

new file mode 100644 (file)

index 0000000..8570e02
--- /dev/null
+++ b/include/pbbam/Compare.h
@@ -0,0 +1,430 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Compare.h
+/// \brief Defines the Compare class & a number of function objects for
+///       comparing BamRecords.
+//
+// Author: Derek Barnett
+
+#ifndef COMPARE_H
+#define COMPARE_H
+
+#include "pbbam/BamRecord.h"
+#include <functional>
+#include <string>
+#include <utility>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The Compare class provides utilities for sorting collections of
+///        BamRecords.
+///
+/// \note The functors provided here currently only support std::less<T>
+///       comparisons (i.e. sorting by ascending value).
+///
+/// \include code/Compare.txt
+///
+struct PBBAM_EXPORT Compare
+{
+public:
+
+    /// \name Comparison Type
+    /// \{
+
+    /// \brief This enum defines the supported comparison types
+    ///        { ==, !=, <, <=, >, >=, & (contains), ~ (not contains) }.
+    ///
+    enum Type {
+        EQUAL = 0
+      , NOT_EQUAL
+      , LESS_THAN
+      , LESS_THAN_EQUAL
+      , GREATER_THAN
+      , GREATER_THAN_EQUAL
+      , CONTAINS
+      , NOT_CONTAINS
+    };
+
+    /// \brief Convert operator string to Compare::Type.
+    ///
+    /// \include code/Compare_TypeFromOperator.txt
+    ///
+    /// \param[in] opString operator string. Can be C++-style operators
+    ///                     ("==", "!=", "<=", etc) or alpha equivalents
+    ///                     ("eq", "ne", "lte", etc).
+    ///
+    /// \returns comparison type from an operator string
+    /// \throws std::runtime_error if cannot convert opString to Compare::Type
+    /// \sa Compare::TypeToOperator
+    ///
+    static Compare::Type TypeFromOperator(const std::string& opString);
+
+    /// \brief Convert a Compare::Type to printable enum name.
+    ///
+    /// \include code/Compare_TypeToName.txt
+    ///
+    /// \param[in] type Compare::Type to convert
+    /// \returns the printable name for a Compare::Type enum value.are::Type
+    /// \throws std::runtime_error on unknown Compare::Type
+    ///
+    static std::string TypeToName(const Compare::Type& type);
+
+    /// \brief Convert a Compare::Type to printable operator.
+    ///
+    /// \param[in] type     Compare::Type to convert
+    /// \param[in] asAlpha  (optional) flag to print using alpha equivalents
+    ///                     e.g. "lte" rather than "<="
+    /// \returns the printable operator string
+    /// \throws std::runtime_error on unknown Compare::Type
+    ///
+    static std::string TypeToOperator(const Compare::Type& type,
+                                      bool asAlpha = false);
+
+    /// \}
+
+public:
+
+    /// \name Comparison Function Objects
+    /// \{
+
+    /// %Base class for all BamRecord compare functors.
+    ///
+    /// Mostly used for method signatures that can accept any comparator.
+    ///
+    /// Custom comparators may be used by inheriting from this class.
+    ///
+    struct Base : public std::function<bool(const BamRecord&, const BamRecord&)> { };
+
+private:
+    /// \internal
+    ///
+    /// Exists to provide the typedef we'll use in the actual
+    /// MemberFunctionBase, since we need to use it in the template signature.
+    /// This keeps that a lot easier to read.
+    ///
+    template<typename ValueType>
+    struct MemberFunctionBaseHelper : public Compare::Base
+    {
+        typedef ValueType (BamRecord::*MemberFnType)(void) const;
+    };
+
+public:
+    /// \brief %Base class for all BamRecord compare functors that take a
+    ///        BamRecord function pointer and compare on its return type.
+    ///
+    /// Derived comparators usually need only declare the return value &
+    /// function pointer in the template signature. This class implements the
+    /// basic method-calling machinery.
+    ///
+    /// Custom comparators will work for any BamRecord member function that does
+    /// not take any input parameters.
+    ///
+    template<typename ValueType,
+             typename MemberFunctionBaseHelper<ValueType>::MemberFnType fn,
+             typename CompareType = std::less<ValueType> >
+    struct MemberFunctionBase : public Compare::MemberFunctionBaseHelper<ValueType>
+    {
+        bool operator()(const BamRecord& lhs, const BamRecord& rhs) const;
+    };
+
+public:
+
+    /// \brief Compares on BamRecord::AlignedEnd.
+    ///
+    /// Example:
+    /// \include code/Compare_AlignedEnd.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct AlignedEnd : public MemberFunctionBase<Position, &BamRecord::AlignedEnd> { };
+
+    /// \brief Compares on BamRecord::AlignedStart.
+    ///
+    /// Example:
+    /// \include code/Compare_AlignedStart.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct AlignedStart : public MemberFunctionBase<Position, &BamRecord::AlignedStart> { };
+
+    /// \brief Compares on BamRecord::AlignedStrand
+    ///
+    /// Example:
+    /// \include code/Compare_AlignedStrand.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct AlignedStrand : public MemberFunctionBase<Strand, &BamRecord::AlignedStrand> { };
+
+    /// \brief Compares on BamRecord::BarcodeForward.
+    ///
+    /// Example:
+    /// \include code/Compare_BarcodeForward.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct BarcodeForward : public MemberFunctionBase<int16_t, &BamRecord::BarcodeForward> { };
+
+    /// \brief Compares on BamRecord::BarcodeQuality.
+    ///
+    /// Example:
+    /// \include code/Compare_BarcodeQuality.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct BarcodeQuality : public MemberFunctionBase<uint8_t, &BamRecord::BarcodeQuality> { };
+
+    /// \brief Compares on BamRecord::BarcodeReverse.
+    ///
+    /// Example:
+    /// \include code/Compare_BarcodeReverse.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct BarcodeReverse: public MemberFunctionBase<int16_t, &BamRecord::BarcodeReverse> { };
+
+    /// \brief Compares on BamRecord::FullName.
+    ///
+    /// Example:
+    /// \include code/Compare_FullName.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct FullName : public MemberFunctionBase<std::string, &BamRecord::FullName> { };
+
+    /// \brief Compares on BamRecord::LocalContextFlags.
+    ///
+    /// Example:
+    /// \include code/Compare_LocalContextFlag.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct LocalContextFlag : public MemberFunctionBase<LocalContextFlags, &BamRecord::LocalContextFlags> { };
+
+    /// \brief Compares on BamRecord::MapQuality.
+    ///
+    /// Example:
+    /// \include code/Compare_MapQuality.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct MapQuality : public MemberFunctionBase<uint8_t, &BamRecord::MapQuality> { };
+
+    /// \brief Compares on BamRecord::MovieName.
+    ///
+    /// Example:
+    /// \include code/Compare_MovieName.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct MovieName : public MemberFunctionBase<std::string, &BamRecord::MovieName> { };
+
+    /// \brief Provides an operator() is essentially a no-op for
+    ///        comparing/sorting.
+    ///
+    /// If used in a sorting operation, then no change will occur.
+    ///
+    struct None : public Compare::Base
+    {
+        bool operator()(const BamRecord&, const BamRecord&) const;
+    };
+
+    ///\brief Compares on BamRecord::NumDeletedBases.
+    ///
+    /// Example:
+    /// \include code/Compare_NumDeletedBases.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct NumDeletedBases : public MemberFunctionBase<size_t, &BamRecord::NumDeletedBases> { };
+
+    /// \brief Compares on BamRecord::NumInsertedBases.
+    ///
+    /// Example:
+    /// \include code/Compare_NumInsertedBases.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct NumInsertedBases : public MemberFunctionBase<size_t, &BamRecord::NumInsertedBases> { };
+
+    /// \brief Compares on BamRecord::NumMatches.
+    ///
+    /// Example:
+    /// \include code/Compare_NumMatches.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct NumMatches : public MemberFunctionBase<size_t, &BamRecord::NumMatches> { };
+
+    /// \brief Compares on BamRecord::NumMismatches.
+    ///
+    /// Example:
+    /// \include code/Compare_NumMismatches.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct NumMismatches : public MemberFunctionBase<size_t, &BamRecord::NumMismatches> { };
+
+    /// \brief Compares on BamRecord::QueryEnd.
+    ///
+    /// Example:
+    /// \include code/Compare_QueryEnd.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct QueryEnd : public MemberFunctionBase<Position, &BamRecord::QueryEnd> { };
+
+    /// \brief Compares on BamRecord::QueryStart.
+    ///
+    /// Example:
+    /// \include code/Compare_QueryStart.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct QueryStart : public MemberFunctionBase<Position, &BamRecord::QueryStart> { };
+
+    /// \brief Compares on BamRecord::ReadAccuracy.
+    ///
+    /// Example:
+    /// \include code/Compare_ReadAccuracy.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReadAccuracy : public MemberFunctionBase<Accuracy, &BamRecord::ReadAccuracy> { };
+
+    /// \brief Compares on BamRecord::ReadGroupId.
+    ///
+    /// \note Even though the ReadGroupId string contains hex values, it is
+    ///       still just a std::string. Comparisons will use lexical, not
+    ///       numeric ordering. If numeric ordering is desired, use
+    ///       Compare::ReadGroupNumericId instead.
+    ///
+    /// Example:
+    /// \include code/Compare_ReadGroupId.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReadGroupId : public MemberFunctionBase<std::string, &BamRecord::ReadGroupId> { };
+
+    /// \brief Compares on BamRecord::ReadGroupNumericId.
+    ///
+    /// Example:
+    /// \include code/Compare_ReadGroupNumericId.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReadGroupNumericId : public MemberFunctionBase<int32_t, &BamRecord::ReadGroupNumericId> { };
+
+    /// \brief Compares on BamRecord::ReferenceEnd.
+    ///
+    /// Example:
+    /// \include code/Compare_ReferenceEnd.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReferenceEnd : public MemberFunctionBase<Position, &BamRecord::ReferenceEnd> { };
+
+    /// \brief Compares on BamRecord::ReferenceId.
+    ///
+    /// Example:
+    /// \include code/Compare_ReferenceId.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReferenceId : public MemberFunctionBase<int32_t, &BamRecord::ReferenceId> { };
+
+    /// \brief Compares on BamRecord::ReferenceName.
+    ///
+    /// Example:
+    /// \include code/Compare_ReferenceName.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReferenceName : public MemberFunctionBase<std::string, &BamRecord::ReferenceName> { };
+
+    /// \brief Compares on BamRecord::ReferenceStart.
+    ///
+    /// Example:
+    /// \include code/Compare_ReferenceStart.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReferenceStart : public MemberFunctionBase<Position, &BamRecord::ReferenceStart> { };
+
+    /// \brief Compares on BamRecord::HoleNumber.
+    ///
+    /// Example:
+    /// \include code/Compare_Zmw.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct Zmw : public MemberFunctionBase<int32_t, &BamRecord::HoleNumber> { };
+
+    /// \}
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/Compare.inl"
+
+#endif // COMPARE_H
diff --git a/include/pbbam/CompositeBamReader.h b/include/pbbam/CompositeBamReader.h

new file mode 100644 (file)

index 0000000..f0de942
--- /dev/null
+++ b/include/pbbam/CompositeBamReader.h
@@ -0,0 +1,269 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file CompositeBamReader.h
+/// \brief Defines the composite BAM readers, for working with multiple input
+///       files.
+//
+// Author: Derek Barnett
+
+#ifndef COMPOSITEBAMREADER_H
+#define COMPOSITEBAMREADER_H
+
+#include "pbbam/BaiIndexedBamReader.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamReader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/Config.h"
+#include "pbbam/DataSet.h"
+#include "pbbam/GenomicInterval.h"
+#include "pbbam/PbiIndexedBamReader.h"
+#include <deque>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+
+/// \internal
+/// \brief The CompositeMergeItem class provides a helper struct for composite
+///        readers, containing a single-file reader and its "next" record.
+///
+struct CompositeMergeItem
+{
+public:
+    std::unique_ptr<BamReader> reader;
+    BamRecord record;
+
+public:
+    CompositeMergeItem(std::unique_ptr<BamReader>&& rdr);
+    CompositeMergeItem(std::unique_ptr<BamReader>&& rdr, BamRecord&& rec);
+    CompositeMergeItem(CompositeMergeItem&& other);
+    CompositeMergeItem& operator=(CompositeMergeItem&& other);
+    ~CompositeMergeItem(void);
+};
+
+/// \internal
+/// \brief The CompositeMergeItemSorter class provides a helper function object
+///        for ordering composite reader results.
+///
+/// Essentially just exracts a BamRecord from its parent CompositeMergeItem for
+/// further checks.
+///
+template<typename CompareType>
+struct CompositeMergeItemSorter : public std::function<bool(const CompositeMergeItem&,
+                                                            const CompositeMergeItem&)>
+{
+    bool operator()(const CompositeMergeItem& lhs,
+                    const CompositeMergeItem& rhs);
+};
+
+} // namespace internal
+
+/// \brief The GenomicIntervalCompositeBamReader class provides read access to
+///        multipe %BAM files, limiting results to a genomic region.
+///
+/// Requires a ".bai" file for each input %BAM file.
+///
+/// Results will be returned in order of genomic coordinate (first by reference
+/// ID, then by position).
+///
+class PBBAM_EXPORT GenomicIntervalCompositeBamReader
+{
+public:
+    /// \name Contstructors & Related Methods
+    /// \{
+
+    GenomicIntervalCompositeBamReader(const GenomicInterval& interval,
+                                      const std::vector<BamFile>& bamFiles);
+    GenomicIntervalCompositeBamReader(const GenomicInterval& interval,
+                                      std::vector<BamFile>&& bamFiles);
+    GenomicIntervalCompositeBamReader(const GenomicInterval& interval,
+                                      const DataSet& dataset);
+
+    /// \}
+
+public:
+    /// \name Data Access
+    /// \{
+
+    /// Fetches next BAM record in the interval specified, storing in \p record
+    ///
+    /// \param[out] record
+    /// \returns true on success, false if no more data available.
+    ///
+    bool GetNext(BamRecord& record);
+
+    /// Sets a new genomic interval of interest.
+    ///
+    /// \returns reference to this reader
+    ///
+    GenomicIntervalCompositeBamReader& Interval(const GenomicInterval& interval);
+
+    /// \returns the current specified interval
+    ///
+    const GenomicInterval& Interval(void) const;
+
+    /// \}
+
+private:
+    void UpdateSort(void);
+
+private:
+    GenomicInterval interval_;
+    std::deque<internal::CompositeMergeItem> mergeItems_;
+    std::vector<std::string> filenames_;
+};
+
+/// \brief Provides read access to multipe %BAM files, limiting results to those
+///        passing a PbiFilter.
+///
+/// Requires a ".pbi" file for each input %BAM file.
+///
+/// \note The template parameter OrderByType is not fully implemented at this
+///       time. Use of comparison functor (e.g. Compare::Zmw) for this will
+///       currently result in the proper "next" value <b> at each iteration
+///       step, independently, but not over the full data set. </b> If all
+///       files' "order-by" data values are accessible in increasing order
+///       within each file, then the expected ordering will be observed,
+///       However, if these data are not sorted within a file, the final results
+///       will appear unordered. \n
+///       \n
+///           Example:\n
+///           file 1: { 1, 5, 2, 6 } \n
+///           file 2: { 3, 8, 4, 7 } \n
+///           results: { 1, 3, 5, 2, 6, 8, 4, 7 } \n
+///       \n
+///       This a known issue and will be addressed in a future update. But in
+///       the meantime, use of Compare::None as the OrderByType is recommended,
+///       to explicitly indicate that no particular ordering is expected.
+///
+template<typename OrderByType>
+class PBBAM_EXPORT PbiFilterCompositeBamReader
+{
+public:
+    typedef internal::CompositeMergeItem                      value_type;
+    typedef internal::CompositeMergeItemSorter<OrderByType>   merge_sorter_type;
+    typedef std::deque<value_type>                            container_type;
+    typedef typename container_type::iterator                 iterator;
+    typedef typename container_type::const_iterator           const_iterator;
+
+public:
+    /// \name Contstructors & Related Methods
+    /// \{
+
+    PbiFilterCompositeBamReader(const PbiFilter& filter,
+                                const std::vector<BamFile>& bamFiles);
+    PbiFilterCompositeBamReader(const PbiFilter& filter,
+                                std::vector<BamFile>&& bamFiles);
+    PbiFilterCompositeBamReader(const PbiFilter& filter,
+                                const DataSet& dataset);
+
+    /// \}
+
+public:
+    /// \name Data Access
+    /// \{
+
+    /// Fetches next BAM record in the interval specified.
+    ///
+    /// \returns true on success, false if no more data available.
+    ///
+    bool GetNext(BamRecord& record);
+
+    /// Sets a new PBI filter
+    ///
+    /// \returns reference to this reader
+    ///
+    PbiFilterCompositeBamReader& Filter(const PbiFilter& filter);
+
+    /// \}
+
+private:
+    void UpdateSort(void);
+
+private:
+    container_type mergeQueue_;
+    std::vector<std::string> filenames_;
+};
+
+/// \brief The SequentialCompositeBamReader class provides read access to
+///        multiple %BAM files, reading through the entire contents of each
+///        file.
+///
+/// Input files will be accessed in the order provided to the constructor. Each
+/// file's contents will be exhausted before moving on to the next one (as
+/// opposed to a "round-robin" scheme).
+///
+class PBBAM_EXPORT SequentialCompositeBamReader
+{
+public:
+    /// \name Contstructors & Related Methods
+    /// \{
+
+    SequentialCompositeBamReader(const std::vector<BamFile>& bamFiles);
+    SequentialCompositeBamReader(std::vector<BamFile>&& bamFiles);
+    SequentialCompositeBamReader(const DataSet& dataset);
+
+    /// \}
+
+public:
+    /// \name Data Access
+    /// \{
+
+    /// Fetches next BAM record from the .
+    ///
+    /// \returns true on success, false if no more data available.
+    ///
+    bool GetNext(BamRecord& record);
+
+    /// \}
+
+private:
+    std::deque<std::unique_ptr<BamReader> > readers_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/CompositeBamReader.inl"
+
+#endif // COMPOSITEBAMREADER_H
diff --git a/include/pbbam/Config.h b/include/pbbam/Config.h

new file mode 100644 (file)

index 0000000..2521288
--- /dev/null
+++ b/include/pbbam/Config.h
@@ -0,0 +1,227 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Config.h
+/// \brief Defines library-wide macros & global variables.
+//
+// Author: Derek Barnett
+
+#ifndef PBBAM_CONFIG_H
+#define PBBAM_CONFIG_H
+
+#include <cstdint>
+
+#ifndef INT8_MAX
+#define INT8_MAX         127
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX        32767
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX        2147483647
+#endif
+#ifndef INT64_MAX
+#define INT64_MAX        9223372036854775807LL
+#endif
+#ifndef INT8_MIN
+#define INT8_MIN          -128
+#endif
+#ifndef INT16_MIN 
+#define INT16_MIN         -32768
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN        (-INT32_MAX-1)
+#endif
+#ifndef INT64_MIN
+#define INT64_MIN        (-INT64_MAX-1)
+#endif
+#ifndef UINT8_MAX 
+#define UINT8_MAX         255
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX        65535
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX        4294967295U
+#endif
+#ifndef UINT64_MAX
+#define UINT64_MAX        18446744073709551615ULL
+#endif
+
+/// \name Library Import/Export
+/// \{
+
+#ifndef PBBAM_LIBRARY_EXPORT
+#  if defined(WIN32)
+#    define PBBAM_LIBRARY_EXPORT __declspec(dllexport)
+#  else
+#    define PBBAM_LIBRARY_EXPORT __attribute__((visibility("default")))
+#  endif
+#endif
+
+#ifndef PBBAM_LIBRARY_IMPORT
+#  if defined(WIN32)
+#    define PBBAM_LIBRARY_IMPORT __declspec(dllimport)
+#  else
+#    define PBBAM_LIBRARY_IMPORT
+#  endif
+#endif
+
+#ifndef PBBAM_EXPORT
+#  if defined(PBBAM_LIBRARY)
+#    define PBBAM_EXPORT PBBAM_LIBRARY_EXPORT
+#  else
+#    define PBBAM_EXPORT PBBAM_LIBRARY_IMPORT
+#  endif
+#endif
+
+/// \}
+
+/// \name Shared Pointer Settings
+/// \{
+
+// uncomment this define, or pass via command-line (-DPBBAM_USE_BOOST_SHARED_PTR),
+// to use boost::shared_ptr<T> instead of std::shared_ptr<T>
+//
+//#define PBBAM_USE_BOOST_SHARED_PTR
+
+#ifdef PBBAM_USE_BOOST_SHARED_PTR
+#  include <boost/smart_ptr/shared_ptr.hpp>
+#  define PBBAM_SHARED_PTR boost::shared_ptr
+#else
+#  include <memory>
+#  define PBBAM_SHARED_PTR std::shared_ptr
+#endif
+
+/// \}
+
+/// \name Class Definition Helpers
+/// \{
+
+/// \brief Disables the use of copy constructors and assignment operators for a
+///        class.
+///
+/// To use, place the macro in a class's private section:
+/// \code{.cpp}
+/// struct Foo {
+/// private:
+///     DISABLE_COPY(Foo);
+/// };
+/// \endcode
+///
+#ifndef DISABLE_COPY
+#define DISABLE_COPY(Class) \
+    Class(const Class&); \
+    Class& operator=(const Class&)
+#endif
+
+/// \brief Disables the use of move constructors and assignment operators for a
+///        class.
+///
+/// To use, place the macro in a class's private section:
+/// \code{.cpp}
+/// struct Foo {
+/// private:
+///     DISABLE_MOVE(Foo);
+/// };
+/// \endcode
+///
+#ifndef DISABLE_MOVE
+#define DISABLE_MOVE(Class) \
+    Class(Class&&); \
+    Class& operator=(Class&&);
+#endif
+
+/// \brief Disables the use of copy & move constructors and assignment operators f
+///        or a class.
+///
+/// To use, place the macro in a class's private section:
+/// \code{.cpp}
+/// struct Foo {
+/// private:
+///     DISABLE_MOVE_AND_COPY(Foo);
+/// };
+/// \endcode
+///
+#ifndef DISABLE_MOVE_AND_COPY
+#define DISABLE_MOVE_AND_COPY(Class) \
+    DISABLE_MOVE(Class) \
+    DISABLE_COPY(Class)
+#endif
+
+/// \}
+
+// \brief Auto-validation
+//
+// To validate BAM components (header, records, etc.) you can either use the
+// Validator API provided, or enable auto-validation. To compile pbbam for
+// auto-validation, add the -DPacBioBAM_auto_validate=ON option to your cmake
+// invocation.
+//
+//
+#ifndef PBBAM_AUTOVALIDATE
+#  define PBBAM_AUTOVALIDATE 0
+#endif
+
+/// \}
+
+namespace PacBio {
+namespace BAM {
+
+/// \name Verbosity Settings
+/// \{
+
+/// \brief Sets the desired verbosity level of htslib warnings.
+///
+/// Change this value to allow debug/warning statements from htslib itself.
+/// The valid range seems to be [0-3], where 0 indicates OFF, and 3 is the
+/// most verbose.
+///
+/// By default, pbbam disables htslib statements to keep output channels clean.
+/// We rely on exceptions & their associated messages instead.
+///
+/// This global variable is obviously not thread-safe by any means. But as a
+/// debug flag, it is unlikely to cause any real issues. The worst case would be
+/// unexpected presence/absence of output statements.
+///
+extern int HtslibVerbosity;
+
+/// \}
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // PBBAM_CONFIG_H
diff --git a/include/pbbam/DataSet.h b/include/pbbam/DataSet.h

new file mode 100644 (file)

index 0000000..af1b14f
--- /dev/null
+++ b/include/pbbam/DataSet.h
@@ -0,0 +1,820 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file DataSet.h
+/// \brief Defines the DataSet class.
+//
+// Author: Derek Barnett
+
+#ifndef DATASET_H
+#define DATASET_H
+
+#include "pbbam/BamFile.h"
+#include "pbbam/Config.h"
+#include "pbbam/DataSetTypes.h"
+#include <chrono>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The DataSet class represents a %PacBio analyis dataset (e.g. from
+///        XML).
+///
+/// \nosubgrouping
+///
+/// It provides resource paths, filters, and metadata associated with a dataset
+/// under analysis.
+///
+class PBBAM_EXPORT DataSet
+{
+public:
+    /// \name DataSet Type
+    /// \{
+
+    /// \brief This enum defines the currently-supported DataSet types.
+    ///
+    enum TypeEnum {
+        GENERIC = 0
+      , ALIGNMENT
+      , BARCODE
+      , CONSENSUS_ALIGNMENT
+      , CONSENSUS_READ
+      , CONTIG
+      , HDF_SUBREAD
+      , REFERENCE
+      , SUBREAD
+    };
+
+    /// \brief Converts printable dataset type to type enum.
+    ///
+    /// \param[in] typeName printable dataset type
+    /// \returns dataset type enum
+    /// \throws std::runtime_error if \p typeName is unknown
+    ///
+    static DataSet::TypeEnum NameToType(const std::string& typeName);
+
+    /// \brief Converts dataset type enum to printable name.
+    ///
+    /// \param[in] type dataset type enum
+    /// \returns printable dataset type
+    /// \throws std::runtime_error if \p type is unknown
+    ///
+    static std::string TypeToName(const DataSet::TypeEnum& type);
+
+    /// \}
+
+public:
+
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Constructs an empty, generic DataSet.
+    ///
+    DataSet(void);
+
+    /// \brief Constructs an empty DataSet of the type specified.
+    ///
+    /// \param[in] type dataset type
+    /// \throws std::runtime_error if \p type is unknown
+    ///
+    DataSet(const DataSet::TypeEnum type);
+
+    /// \brief Constructs a DataSet from a %BAM file.
+    ///
+    /// This currently defaults to a SubreadSet, with an ExternalResource
+    /// pointing to BamFile::Filename.
+    ///
+    /// \param[in] bamFile  BamFile object
+    ///
+    DataSet(const BamFile& bamFile);
+
+    /// \brief Loads a DataSet from a file.
+    ///
+    /// \p filename may be one of three types, indicated by its extension:\n
+    ///  - %BAM ("*.bam") \n
+    ///  - FOFN ("*.fofn") \n
+    ///  - DataSetXML ("*.xml") \n
+    ///
+    /// \param[in] filename  input filename
+    /// \throws std::runtime_error if \p filename has an unsupported extension,
+    ///         or if a valid DataSet could not be created from its contents
+    ///
+    DataSet(const std::string& filename);
+
+    /// \brief Constructs a DataSet from a list of files.
+    ///
+    /// \param[in] filenames  input filenames
+    /// \throws std::runtime_error if DataSet could not be created from
+    ///         \p filenames
+    ///
+    DataSet(const std::vector<std::string>& filenames);
+
+    DataSet(const DataSet& other);
+    DataSet(DataSet&& other);
+    DataSet& operator=(const DataSet& other);
+    DataSet& operator=(DataSet&& other);
+    ~DataSet(void);
+
+    /// \brief Creates a DataSet from "raw" XML data.
+    ///
+    /// \param[in] xml DataSetXML text
+    ///
+    static DataSet FromXml(const std::string& xml);
+
+    /// \}
+
+public:
+    /// \name Operators
+    /// \{
+
+    /// \brief Merges DataSet contents.
+    ///
+    /// Adds contents of \p other to this dataset object
+    ///
+    /// \param[in] other  some other dataset to add to this one
+    /// \returns reference to this dataset object
+    ///
+    DataSet& operator+=(const DataSet& other);
+
+    /// \}
+
+public:
+    /// \name Serialization
+    /// \{
+
+    /// \brief Saves dataset XML to file.
+    ///
+    /// \param[in] outputFilename destination for XML contents
+    ///
+    /// \throws std::runtime_error if file could be opened or if DataSet
+    ///         elements could not be converted to XML
+    ///
+    void Save(const std::string& outputFilename);
+
+    /// \brief Saves dataset XML to output stream, e.g. std::cout,
+    ///        std::stringstream.
+    ///
+    /// \param[out] out destination for XML contents
+    ///
+    /// \throws std::runtime_error if DataSet elements could not be converted to
+    ///         XML
+    ///
+    void SaveToStream(std::ostream& out);
+
+    /// \}
+
+public:
+
+    /// \name Attributes
+    /// \{
+    ///
+
+    /// \brief Fetches the value of a DataSet root element's attribute.
+    ///
+    /// These are the attributes attached to the root dataset element: \n
+    /// \verbatim <SubreadSet foo="x" bar="y" /> \endverbatim
+    ///
+    /// Built-in accessors exist for the standard attributes (e.g. CreatedAt)
+    /// but additional attributes can be used as well via these generic
+    /// Attribute methods.
+    ///
+    /// \param[in] name root element's attribute name
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& Attribute(const std::string& name) const;
+
+    /// \brief Fetches the value of dataset's CreatedAt attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& CreatedAt(void) const;
+
+    /// \brief Fetches the value of dataset's Format attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& Format(void) const;
+
+    /// \brief Fetches the value of dataset's MetaType attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& MetaType(void) const;
+
+    /// \brief Fetches the value of dataset's ModifiedAt attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& ModifiedAt(void) const;
+
+    /// \brief Fetches the value of dataset's Name attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& Name(void) const;
+
+    /// \brief Fetches the value of dataset's ResourceId attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& ResourceId(void) const;
+
+    /// \brief Fetches the value of dataset's Tags attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& Tags(void) const;
+
+    /// \brief Fetches the value of dataset's TimeStampedName attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& TimeStampedName(void) const;
+
+    /// \brief Fetches the value of dataset's UniqueId attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& UniqueId(void) const;
+
+    /// \brief Fetches the value of dataset's Version attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& Version(void) const;
+
+    /// \}
+
+public:
+    /// \name DataSet Type
+    /// \{
+
+    /// \brief Fetches the dataset's type.
+    ///
+    /// \returns dataset type enum
+    ///
+    PacBio::BAM::DataSet::TypeEnum Type(void) const;
+
+    /// \brief Fetches the dataset's type.
+    ///
+    /// \returns printable dataset type
+    ///
+    std::string TypeName(void) const;
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Fetches the dataset's Extensions element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::Extensions& Extensions(void) const;
+
+    /// \brief Fetches the dataset's ExternalResources element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::ExternalResources& ExternalResources(void) const;
+
+    /// \brief Fetches the dataset's Filters element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::Filters& Filters(void) const;
+
+    /// \brief Fetches the dataset's DataSetMetadata element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::DataSetMetadata& Metadata(void) const;
+
+    /// \brief Fetches the dataset's DataSets element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::SubDataSets& SubDataSets(void) const;
+
+    /// \}
+
+public:
+    /// \name Resource Handling
+    /// \{
+
+    /// \brief Returns this dataset's primary %BAM resources, with relative
+    ///        filepaths already resolved.
+    ///
+    /// Primary resources are those listed as top-level %ExternalResources, not
+    /// associated files (indices, references, scraps %BAMs, etc.).
+    ///
+    /// \returns vector of BamFiles
+    ///
+    /// \sa DataSet::ResolvedResourceIds
+    ///
+    std::vector<BamFile> BamFiles(void) const;
+
+    /// \brief Returns all primary external resource filepaths, with relative
+    ///        paths resolved.
+    ///
+    /// Primary resources are those listed as top-level %ExternalResources, not
+    /// associated files (indices, references, scraps %BAMs, etc.).
+    ///
+    /// \sa ResolvePath
+    ///
+    /// \returns resourceIds
+    ///
+    std::vector<std::string> ResolvedResourceIds(void) const;
+
+    /// \brief Resolves a filepath (that may be relative to the dataset).
+    ///
+    /// A DataSet's resources may be described using absolute filepaths or with
+    /// relative paths. For absolute paths, nothing is changed from the input.
+    /// For relative paths, these are resolved using the DataSet's own path
+    /// as a starting point. A DataSet's own path will be one of:\n
+    ///  1 - the location of its XML or %BAM input file, e.g. created using
+    ///      DataSet("foo.xml") or DataSet("foo.bam")\n
+    ///  2 - application's current working directory for all other DataSet
+    ///      construction methods { DataSet(), DataSet(type),
+    ///      DataSet("foo.fofn") }\n
+    ///
+    /// \param[in] originalPath     input file path (absolute or relative)
+    /// \returns resolved path
+    ///
+    std::string ResolvePath(const std::string& originalPath) const;
+
+    /// \returns sequence chemistry info for all read groups in this dataset
+    ///
+    /// \sa ReadGroupInfo::SequencingChemistry
+    ///
+    std::set<std::string> SequencingChemistries(void) const;
+
+    /// \}
+
+public:
+    /// \name XML Namespace Handling
+    /// \{
+
+    /// \brief Access this dataset's namespace info.
+    ///
+    /// \returns const reference to dataset's NamespaceRegistry
+    ///
+    const NamespaceRegistry& Namespaces(void) const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \brief Fetches the value of a DataSet root element's attribute.
+    ///
+    /// These are the attributes attached to the root dataset element: \n
+    /// \verbatim <SubreadSet foo="x" bar="y" /> \endverbatim
+    ///
+    /// Built-in accessors exist for the standard attributes (e.g. CreatedAt)
+    /// but additional attributes can be used as well via these generic methods.
+    ///
+    /// A new attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] name root element's attribute name
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& Attribute(const std::string& name);
+
+    /// \brief Fetches the value of dataset's CreatedAt attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& CreatedAt(void);
+
+    /// \brief Fetches the value of dataset's Format attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& Format(void);
+
+    /// \brief Fetches the value of dataset's MetaType attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& MetaType(void);
+
+    /// \brief Fetches the value of dataset's ModifiedAt attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& ModifiedAt(void);
+
+    /// \brief Fetches the value of dataset's Name attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& Name(void);
+
+    /// \brief Fetches the value of dataset's ResourceId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& ResourceId(void);
+
+    /// \brief Fetches the value of dataset's Tags attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& Tags(void);
+
+    /// \brief Fetches the value of dataset's TimeStampedName attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& TimeStampedName(void);
+
+    /// \brief Fetches the value of dataset's UniqueId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& UniqueId(void);
+
+    /// \brief Fetches the value of dataset's Version attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& Version(void);
+    
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \brief Sets this dataset's XML attribute \p name, with \p value
+    ///
+    /// These are the attributes attached to the root dataset element: \n
+    /// \verbatim <SubreadSet foo="x" bar="y" /> \endverbatim
+    ///
+    /// Built-in accessors exist for the standard attributes (e.g. CreatedAt)
+    /// but additional attributes can be used as well via these generic methods.
+    ///
+    /// The attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] name   root element's attribute name
+    /// \param[in] value  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Attribute(const std::string& name, const std::string& value);
+
+    /// \brief Sets this dataset's CreatedAt attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] createdAt  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& CreatedAt(const std::string& createdAt);
+
+    /// \brief Sets this dataset's Format attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] format  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Format(const std::string& format);
+
+    /// \brief Sets this dataset's MetaType attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] metatype  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& MetaType(const std::string& metatype);
+
+    /// \brief Sets this dataset's ModifiedAt attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] modifiedAt  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& ModifiedAt(const std::string& modifiedAt);
+
+    /// \brief Sets this dataset's Name attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] name  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Name(const std::string& name);
+
+    /// \brief Sets this dataset's ResourceId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] resourceId  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& ResourceId(const std::string& resourceId);
+
+    /// \brief Sets this dataset's Tags attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] tags  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Tags(const std::string& tags);
+
+    /// \brief Sets this dataset's TimeStampedName attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] timeStampedName  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& TimeStampedName(const std::string& timeStampedName);
+
+    /// \brief Sets this dataset's UniqueId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] uuid  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& UniqueId(const std::string& uuid);
+
+    /// \brief Sets this dataset's Version attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] version  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Version(const std::string& version);
+
+    /// \}
+
+public:
+    /// \name DataSet Type
+    /// \{
+
+    /// \brief Edits dataset type.
+    ///
+    /// \param[in] type  new dataset type
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Type(const PacBio::BAM::DataSet::TypeEnum type);
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Fetches the dataset's Extensions element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::Extensions& Extensions(void);
+
+    /// \brief Fetches the dataset's ExternalResources element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::ExternalResources& ExternalResources(void);
+
+    /// \brief Fetches the dataset's Filters element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::Filters& Filters(void);
+
+    /// \brief Fetches the dataset's DataSetMetadata element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::DataSetMetadata& Metadata(void);
+
+    /// \brief Fetches the dataset's DataSets element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::SubDataSets& SubDataSets(void);
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Sets this dataset's Extensions element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] extensions  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Extensions(const PacBio::BAM::Extensions& extensions);
+
+    /// \brief Sets this dataset's ExternalResources element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] resources  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSet& ExternalResources(const PacBio::BAM::ExternalResources& resources);
+
+    /// \brief Sets this dataset's Filters element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] filters  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Filters(const PacBio::BAM::Filters& filters);
+
+    /// \brief Sets this dataset's DataSetMetadata element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] metadata  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Metadata(const PacBio::BAM::DataSetMetadata& metadata);
+
+    /// \brief Sets this dataset's DataSets element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] subdatasets  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSet& SubDataSets(const PacBio::BAM::SubDataSets& subdatasets);
+    
+    /// \}
+
+public:
+    /// \name XML Namespace Handling
+    /// \{
+
+    /// \brief Access this dataset's namespace info.
+    ///
+    /// \returns non-const reference to dataset's NamespaceRegistry
+    ///
+    NamespaceRegistry& Namespaces(void);
+
+    /// \}
+
+private:
+    std::unique_ptr<DataSetBase> d_;
+    std::string path_;
+};
+
+/// \name DataSet Timestamp Utilities
+/// \{
+
+/// \brief Fetches current time, in "DataSetXML format".
+///
+/// \returns DataSetXML formatted timestamp
+///
+/// \sa ToDataSetFormat
+///
+PBBAM_EXPORT std::string CurrentTimestamp(void);
+
+/// \brief Converts a time_point to "DataSetXML-formatted" timestamp.
+///
+/// This is the format used as a component of the DataSet::TimeStampedName
+/// (yymmdd_HHmmssttt>.
+///
+/// \returns "DataSetXML-formatted" timestamp
+///
+PBBAM_EXPORT std::string ToDataSetFormat(const std::chrono::system_clock::time_point& tp);
+
+/// \brief Converts a time_t to "DataSetXML-formatted" timestamp.
+///
+/// This is the format used as a component of the DataSet::TimeStampedName
+/// (yymmdd_HHmmssttt>.
+///
+/// \returns "DataSetXML-formatted" timestamp
+///
+PBBAM_EXPORT std::string ToDataSetFormat(const time_t& tp);
+
+/// \brief Converts a time_point to ISO-8601 formatted timestamp.
+///
+/// This is the format used in DataSet::CreatedAt and DataSet::ModifiedAt.
+///
+/// \returns ISO-8601 formatted timestamp
+///
+PBBAM_EXPORT std::string ToIso8601(const std::chrono::system_clock::time_point& tp);
+
+/// \brief Converts a time_t to ISO-8601 formatted timestamp.
+///
+/// This is the format used in DataSet::CreatedAt and DataSet::ModifiedAt.
+///
+/// \returns ISO-8601 formatted timestamp
+///
+PBBAM_EXPORT std::string ToIso8601(const time_t& t);
+
+/// \}
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/DataSet.inl"
+
+#endif // DATASET_H
diff --git a/include/pbbam/DataSetTypes.h b/include/pbbam/DataSetTypes.h

new file mode 100644 (file)

index 0000000..23df643
--- /dev/null
+++ b/include/pbbam/DataSetTypes.h
@@ -0,0 +1,904 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file DataSetTypes.h
+/// \brief Defines the public DataSet component classes.
+//
+// Author: Derek Barnett
+
+#ifndef DATASETTYPES_H
+#define DATASETTYPES_H
+
+#include "pbbam/BamFile.h"
+#include "pbbam/Config.h"
+#include "pbbam/DataSetXsd.h"
+#include "pbbam/internal/DataSetBaseTypes.h"
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The DataSetMetadata class represents the %DataSetMetadata child
+///        element in DataSetXML.
+///
+/// A few top-level elements are built-in, but as pbbam is not primarily a
+/// DataSetXML API, most of the metadata hierarchy needs to be manually managed.
+///
+class PBBAM_EXPORT DataSetMetadata : public internal::DataSetElement
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Constructs a DataSetMetadata with required fields.
+    DataSetMetadata(const std::string& numRecords,
+                    const std::string& totalLength);
+
+    /// \}
+
+public:
+    /// \name Operators
+    /// \{
+
+    /// \brief Merges DataSetMetadata contents.
+    ///
+    /// Adds contents of \p other to this metadata object
+    ///
+    /// \param[in] other  some other metadata to add to this one
+    /// \returns reference to this object
+    ///
+    DataSetMetadata& operator+=(const DataSetMetadata& other);
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Fetches the text of the NumRecords element.
+    ///
+    /// \returns const reference to element text (empty string if not present)
+    ///
+    const std::string& NumRecords(void) const;
+
+    /// \brief Fetches the text of the TotalLength element.
+    ///
+    /// \returns const reference to element text (empty string if not present)
+    ///
+    const std::string& TotalLength(void) const;
+
+    /// \brief Fetches the Provenance element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::Provenance& Provenance(void) const;
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Fetches the text of the NumRecords element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to element text
+    ///
+    std::string& NumRecords(void);
+
+    /// \brief Fetches the text of the TotalLength element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to element text
+    ///
+    std::string& TotalLength(void);
+
+    /// \brief Fetches Provenance element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::Provenance& Provenance(void);
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Sets the text of the NumRecords element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns reference to this metadata object
+    ///
+    DataSetMetadata& NumRecords(const std::string& numRecords);
+
+    /// \brief Sets the text of the TotalLength element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns reference to this metadata object
+    ///
+    DataSetMetadata& TotalLength(const std::string& totalLength);
+
+    /// \brief Sets the Provenance child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns reference to this metadata object
+    ///
+    DataSetMetadata& Provenance(const PacBio::BAM::Provenance& provenance);
+
+    /// \}
+};
+
+/// \brief The ExtensionElement class represents an %ExtensionElement element in
+///        DataSetXML.
+///
+class PBBAM_EXPORT ExtensionElement : public internal::DataSetElement  {
+public:
+    ExtensionElement(void);
+};
+
+/// \brief The Extensions class represents an %Extensions element in DataSetXML.
+///
+/// The Extensions element is essentially just a list of ExtensionElement
+/// objects.
+///
+class PBBAM_EXPORT Extensions : public internal::DataSetListElement<ExtensionElement>
+{
+public:
+    /// \brief Creates an empty extensions list.
+    Extensions(void);
+};
+
+class ExternalResources;
+
+/// \brief The ExternalResource class represents an %ExternalResource element in
+///        DataSetXML.
+///
+/// An ExternalResource can itself have a child element, ExternalResources, that
+/// lists related files (e.g. index files).
+///
+class PBBAM_EXPORT ExternalResource : public internal::IndexedDataType
+{
+public:
+    /// \brief Creates an ExternalResource from a BamFile object.
+    ///
+    /// The metatype & resourceId are automatically set.
+    ///
+    ExternalResource(const BamFile& bamFile);
+
+    /// \brief Creates an ExternalResource with provided \p metatype and
+    ///        \p filename as resource ID.
+    ///
+    ExternalResource(const std::string& metatype,
+                     const std::string& filename);
+
+public:
+    /// \brief Fetches the resource's ExternalResources child element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::ExternalResources& ExternalResources(void) const;
+
+public:
+    /// \brief Fetches the resource's ExternalResources child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::ExternalResources& ExternalResources(void);
+
+    /// \brief Sets this resource's ExternalResources child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] resources  new value for the element
+    /// \returns reference to this resource object
+    ///
+    ExternalResource& ExternalResources(const PacBio::BAM::ExternalResources& resources);
+
+public:
+    /// \brief Converts an ExternalResource to a BamFile object
+    ///
+    /// \returns corresponding BamFile object for this ExternalResource
+    /// \throws std::runtime_error if fails to open %BAM file (e.g. does not
+    ///         exist, not a %BAM file, etc.)
+    ///
+    /// \deprecated Use the results from DataSet::BamFiles instead. This method
+    ///             cannot resolve relative filepaths and will be removed in the
+    ///             near future.
+    ///
+    BamFile ToBamFile(void) const;
+};
+
+/// \brief The ExternalResources class represents an %ExternalResources element
+///        in DataSetXML.
+///
+/// The ExternalResources element is essentially just a list of ExternalResource
+/// elements.
+///
+class PBBAM_EXPORT ExternalResources : public internal::DataSetListElement<ExternalResource>
+{
+public:
+    /// \brief Creates an empty resource list.
+    ExternalResources(void);
+
+    /// \brief Merges \p other resource list with this one.
+    ExternalResources& operator+=(const ExternalResources& other);
+
+public:
+    /// \brief Adds an ExternalResource to this list.
+    void Add(const ExternalResource& ext);
+
+    /// \brief Removes an ExternalResource from this list.
+    void Remove(const ExternalResource& ext);
+
+public:
+    /// \brief Converts resource list to BamFile objects.
+    ///
+    /// \deprecated Use DataSet::BamFiles instead. This method cannot resolve
+    ///             relative filepaths and will be removed in the near future.
+    ///
+    std::vector<BamFile> BamFiles(void) const;
+};
+
+/// \brief The FileIndex class represents a %FileIndex element in DataSetXML.
+///
+/// A FileIndex is used as an auxiliary to an ExternalResource, providing
+/// information about a data file's index file (e.g. for %BAM files, *.bai or
+/// *.pbi).
+///
+class PBBAM_EXPORT FileIndex : public internal::InputOutputDataType
+{
+public:
+    /// \brief Creates a FileIndex with provided \p metatype and \p filename as
+    ///        resource ID.
+    ///
+    FileIndex(const std::string& metatype, 
+              const std::string& filename);
+};
+
+/// \brief The FileIndices class represents a %FileIndices element in DataSetXML.
+///
+/// The FileIndices element is essentially just a list of FileIndex elements,
+/// providing information about a data file's index files (e.g. for %BAM files
+/// this will usually be *.bai and/or *.pbi).
+///
+class PBBAM_EXPORT FileIndices : public internal::DataSetListElement<FileIndex>
+{
+public:
+    /// \brief Creates an empty index list.
+    FileIndices(void);
+
+public:
+    /// \brief Adds a FileIndex to this list.
+    void Add(const FileIndex& index);
+
+    /// \brief Removes a FileIndex from this list.
+    void Remove(const FileIndex& index);
+};
+
+/// \brief The Filter class represents a %Filter element in DataSetXML.
+///
+/// The Filter element allows analysis pipelines to describe filters on data
+/// that should be respected downstream, without needing to create filtered
+/// intermediate files.
+///
+/// A filter consists of a list of Property elements, each of which must be
+/// passed (logical AND) to pass the filter, e.g. property1 && property2 &&
+/// property3.
+///
+class PBBAM_EXPORT Filter : public internal::DataSetElement
+{
+public:
+    /// \brief Creates an empty filter.
+    Filter(void);
+
+public:
+    /// \brief Fetches the filter's property list element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::Properties& Properties(void) const;
+
+public:
+    /// \brief Fetches the filter's property list child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::Properties& Properties(void);
+
+    /// \brief Sets this filter's Properties child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] properties new value for the element
+    /// \returns reference to this filter object
+    ///
+    Filter& Properties(const PacBio::BAM::Properties& properties);
+};
+
+/// \brief The Filters class represents a %Filters list element in DataSetXML.
+///
+/// The Filters element is essentially a list of Filter elements. For analysis
+/// purpose, each filter is considered separately (logical OR) to consider which
+/// data passes, e.g. filter1 || filter2 || filter3.
+///
+class PBBAM_EXPORT Filters : public internal::DataSetListElement<Filter>
+{
+public:
+    /// \brief Creates an empty filter list.
+    Filters(void);
+
+    /// \brief Merges \p other filter list with this one.
+    Filters& operator+=(const Filters& other);
+
+public:
+    /// \brief Adds a filter to this list.
+    void Add(const Filter& filter);
+
+    /// \brief Removes a filter from this list.
+    void Remove(const Filter& filter);
+};
+
+/// \brief The ParentTool class represents a %ParentTool element in DataSetXML.
+///
+class PBBAM_EXPORT ParentTool : public internal::BaseEntityType {
+public:
+    /// \brief Creates an empty %ParentTool element.
+    ParentTool(void);
+};
+
+/// \brief The Property class represents a %Property element in DataSetXML.
+///
+/// A Property is the primary building block of %DataSetXML filtering. The
+/// %Property element describes a data record's property (or field), some value,
+/// and a comparison operator.
+///
+/// For example, one could filter all %BAM records with a read accuracy at or
+/// above 0.9. In C++ this could be constructed like:
+/// \code{.cpp}
+/// Property p("accuracy", "0.9", ">=");
+/// \endcode
+///
+class PBBAM_EXPORT Property : public internal::DataSetElement
+{
+public:
+    /// \brief Constructs a filter property.
+    Property(const std::string& name,
+             const std::string& value,
+             const std::string& op);
+
+public:
+
+    /// \brief Fetches the value of property's Name attribute.
+    ///
+    /// \returns const reference to attribute value
+    ///
+    const std::string& Name(void) const;
+
+    /// \brief Fetches the value of property's Operator attribute.
+    ///
+    /// \returns const reference to attribute value
+    ///
+    const std::string& Operator(void) const;
+
+    /// \brief Fetches the value of property's Value attribute.
+    ///
+    /// \returns const reference to attribute value
+    ///
+    const std::string& Value(void) const;
+
+public:
+
+    /// \brief Fetches the value of property's Name attribute.
+    ///
+    /// \returns non-const reference to attribute value
+    ///
+    std::string& Name(void);
+
+    /// \brief Fetches the value of property's Operator attribute.
+    ///
+    /// \returns non-const reference to attribute value
+    ///
+    std::string& Operator(void);
+
+    /// \brief Fetches the value of property's Value attribute.
+    ///
+    /// \returns nonconst reference to attribute value
+    ///
+    std::string& Value(void);
+
+public:
+    /// \brief Sets this property's Name attribute.
+    ///
+    /// \param[in] name  new value for the attribute
+    /// \returns reference to this property object
+    ///
+    Property& Name(const std::string& name);
+
+    /// \brief Sets this property's Operator attribute.
+    ///
+    /// \param[in] op  new value for the attribute
+    /// \returns reference to this property object
+    ///
+    Property& Operator(const std::string& op);
+
+    /// \brief Sets this property's Value attribute.
+    ///
+    /// \param[in] value  new value for the attribute
+    /// \returns reference to this property object
+    ///
+    Property& Value(const std::string& value);
+};
+
+/// \brief The Properties class represents a %Properties list element in
+///        DataSetXML.
+///
+/// The Properties element is essentially a list of Property elements.
+///
+class PBBAM_EXPORT Properties : public internal::DataSetListElement<Property>
+{
+public:
+    /// \brief Creates an empty property list.
+    Properties(void);
+
+public:
+    /// \brief Adds a property to this list.
+    void Add(const Property& property);
+
+    /// \brief Removes a property from this list.
+    void Remove(const Property& property);
+};
+
+/// \brief The Provenance class represents a %Provenance element in DataSetXML.
+///
+class PBBAM_EXPORT Provenance : public internal::DataSetElement
+{
+public:
+    /// \brief Creates a empty provenance element.
+    Provenance(void);
+
+public:
+    /// \brief Fetches the value of CreatedBy attribute.
+    ///
+    /// \returns const reference to attribute value (empty string if not
+    ///          present)
+    ///
+    const std::string& CreatedBy(void) const;
+
+    /// \brief Fetches the value of CommonServicesInstanceId attribute.
+    ///
+    /// \returns const reference to attribute value (empty string if not
+    ///          present)
+    ///
+    const std::string& CommonServicesInstanceId(void) const;
+
+    /// \brief Fetches the value of CreatorUserId attribute.
+    ///
+    /// \returns const reference to attribute value (empty string if not
+    ///          present)
+    ///
+    const std::string& CreatorUserId(void) const;
+
+    /// \brief Fetches the value of ParentJobId attribute.
+    ///
+    /// \returns const reference to attribute value (empty string if not
+    ///          present)
+    ///
+    const std::string& ParentJobId(void) const;
+
+    /// \brief Fetches the ParentTool child element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::ParentTool& ParentTool(void) const;
+
+public:
+
+    /// \brief Fetches the value of CreatedBy attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute value (empty string if this is
+    ///          a new attribute)
+    ///
+    std::string& CreatedBy(void);
+
+    /// \brief Fetches the value of CommonServicesInstanceId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute value (empty string if this is
+    ///          a new attribute)
+    ///
+    std::string& CommonServicesInstanceId(void);
+
+    /// \brief Fetches the value of CreatorUserId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute value (empty string if this is
+    ///          a new attribute)
+    ///
+    std::string& CreatorUserId(void);
+
+    /// \brief Fetches the value of ParentJobId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute value (empty string if this is
+    ///          a new attribute)
+    ///
+    std::string& ParentJobId(void);
+
+    /// \brief Fetches the ParentTool element element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::ParentTool& ParentTool(void);
+
+public:
+
+    /// \brief Sets the CreatedBy attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] createdBy  new value for the attribute
+    /// \returns reference to this object
+    ///
+    Provenance& CreatedBy(const std::string& createdBy);
+
+    /// \brief Sets the CommonServicesInstanceId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] id  new value for the attribute
+    /// \returns reference to this object
+    ///
+    Provenance& CommonServicesInstanceId(const std::string& id);
+
+    /// \brief Sets the CreatorUserId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] id  new value for the attribute
+    /// \returns reference to this object
+    ///
+    Provenance& CreatorUserId(const std::string& id);
+
+    /// \brief Sets the ParentJobId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] id  new value for the attribute
+    /// \returns reference to this object
+    ///
+    Provenance& ParentJobId(const std::string& id);
+
+    /// \brief Sets the ParentTool child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] tool  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    Provenance& ParentTool(const PacBio::BAM::ParentTool& tool);
+};
+
+class SubDataSets;
+
+/// \brief The DataSetBase class provides the attributes & child elements shared
+///        by all dataset types.
+///
+/// Client code should not need to use this class directly. It should be
+/// considered as more of an implementation detail and may in fact be removed
+/// from public API in the future. The top-level DataSet is the recommended
+/// entry point.
+///
+class PBBAM_EXPORT DataSetBase : public internal::StrictEntityType
+{
+public:
+
+    /// \brief Creates a DataSetBase object, or one of its subclasses, from an
+    ///        XML element name (e.g. SubreadSet)
+    ///
+    static std::shared_ptr<DataSetBase> Create(const std::string& typeName);
+
+public:
+    /// \brief Creates an empty, generic DataSetBase.
+    DataSetBase(void);
+
+protected:
+    /// \brief Creates a DataSetBase with key values initialized.
+    DataSetBase(const std::string& metatype, 
+                const std::string& label, 
+                const XsdType& xsd);
+
+    /// \brief Returns a new DataSetBase containing a deep copy of contents
+    DataSetBase* DeepCopy(void) const;
+
+public:
+    /// \brief Merges dataset contents.
+    ///
+    /// Adds contents of \p other to this dataset object
+    ///
+    /// \param[in] other  some other dataset to add to this one
+    /// \returns reference to this dataset object
+    ///
+    DataSetBase& operator+=(const DataSetBase& other);
+
+public:
+    /// \brief Fetches the dataset's ExternalResources element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::ExternalResources& ExternalResources(void) const;
+
+    /// \brief Fetches the dataset's Filters element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::Filters& Filters(void) const;
+
+    /// \brief Fetches the dataset's DataSetMetadata element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::DataSetMetadata& Metadata(void) const;
+
+    /// \brief Fetches the dataset's DataSets element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::SubDataSets& SubDataSets(void) const;
+
+public:
+    /// \brief Access this dataset's namespace info.
+    ///
+    /// \returns const reference to dataset's NamespaceRegistry
+    ///
+    const NamespaceRegistry& Namespaces(void) const;
+
+public:
+    /// \brief Fetches the dataset's ExternalResources element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::ExternalResources& ExternalResources(void);
+
+    /// \brief Fetches the dataset's Filters element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::Filters& Filters(void);
+
+    /// \brief Fetches the dataset's DataSetMetadata element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::DataSetMetadata& Metadata(void);
+
+    /// \brief Fetches the dataset's DataSets element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::SubDataSets& SubDataSets(void);
+
+public:
+    /// \brief Sets this dataset's ExternalResources element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] resources  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSetBase& ExternalResources(const PacBio::BAM::ExternalResources& resources);
+
+    /// \brief Sets this dataset's Filters element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] filters  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSetBase& Filters(const PacBio::BAM::Filters& filters);
+
+    /// \brief Sets this dataset's DataSetMetadata element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] metadata  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSetBase& Metadata(const PacBio::BAM::DataSetMetadata& metadata);
+
+    /// \brief Sets this dataset's DataSets element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] subdatasets  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSetBase& SubDataSets(const PacBio::BAM::SubDataSets& subdatasets);
+
+public:
+    /// \brief Access this dataset's namespace info.
+    ///
+    /// \returns non-const reference to dataset's NamespaceRegistry
+    ///
+    NamespaceRegistry& Namespaces(void);
+
+private:
+    NamespaceRegistry registry_;
+};
+
+/// \brief The AlignmentSet class represents an %AlignmentSet root element in
+///        DataSetXML.
+///
+class PBBAM_EXPORT AlignmentSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty AlignmentSet dataset.
+    AlignmentSet(void);
+};
+
+/// \brief The BarcodeSet class represents a %BarcodeSet root element in
+///        DataSetXML.
+///
+class PBBAM_EXPORT BarcodeSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty BarcodeSet dataset.
+    BarcodeSet(void);
+};
+
+/// \brief The ConsensusAlignmentSet class represents a %ConsensusAlignmentSet
+///        root element in DataSetXML.
+///
+class PBBAM_EXPORT ConsensusAlignmentSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty ConsensusAlignmentSet dataset.
+    ConsensusAlignmentSet(void);
+};
+
+/// \brief The ConsensusReadSet class represents a %ConsensusReadSet root
+///        element in DataSetXML.
+///
+class PBBAM_EXPORT ConsensusReadSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty ConsensusReadSet dataset.
+    ConsensusReadSet(void);
+};
+
+/// \brief The ContigSet class represents a %ContigSet root element in
+///        DataSetXML.
+///
+class PBBAM_EXPORT ContigSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty ContigSet dataset.
+    ContigSet(void);
+};
+
+/// \brief The HdfSubreadSet class represents a %HdfSubreadSet root element in
+///        DataSetXML.
+///
+class PBBAM_EXPORT HdfSubreadSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty HdfSubreadSet dataset.
+    HdfSubreadSet(void);
+};
+
+/// \brief The ReferenceSet class represents a %ReferenceSet root element in
+///        DataSetXML.
+///
+class PBBAM_EXPORT ReferenceSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty ReferenceSet dataset.
+    ReferenceSet(void);
+};
+
+/// \brief The SubDataSets class represents a %DataSets list element in
+///        DataSetXML.
+///
+/// The SubDataSets element is essentially a list of DataSets.
+///
+class PBBAM_EXPORT SubDataSets : public internal::DataSetListElement<DataSetBase>
+{
+public:
+    /// \brief Creates an empty list of sub-datasets.
+    SubDataSets(void);
+
+public:
+    /// \brief Adds \p other sub-dataset to this list.
+    SubDataSets& operator+=(const DataSetBase& other); // single
+
+    /// \brief Adds \p other sub-dataset list to this list.
+    SubDataSets& operator+=(const SubDataSets& other); // list
+
+public:
+    /// \brief Adds a sub-dataset to this list.
+    void Add(const DataSetBase& subdataset);
+
+    /// \brief Removes a sub-dataset from this list.
+    void Remove(const DataSetBase& subdataset);
+};
+
+/// \brief The SubreadSet class represents a %SubreadSet root element in
+///        DataSetXML.
+///
+class PBBAM_EXPORT SubreadSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty SubreadSet dataset.
+    SubreadSet(void);
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "internal/DataSetTypes.inl"
+
+#endif // DATASETTYPES_H
diff --git a/include/pbbam/DataSetXsd.h b/include/pbbam/DataSetXsd.h

new file mode 100644 (file)

index 0000000..8d0ec38
--- /dev/null
+++ b/include/pbbam/DataSetXsd.h
@@ -0,0 +1,157 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file DataSetXsd.h
+/// \brief Defines the XSD- and namespace-related classes for DataSetXML.
+//
+// Author: Derek Barnett
+
+#ifndef DATASETXSD_H
+#define DATASETXSD_H
+
+#include "pbbam/Config.h"
+#include <map>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The XsdType enum defines the supported XSD namespaces.
+///
+enum class XsdType
+{
+    NONE
+
+  , AUTOMATION_CONSTRAINTS
+  , BASE_DATA_MODEL
+  , COLLECTION_METADATA
+  , COMMON_MESSAGES
+  , DATA_MODEL
+  , DATA_STORE
+  , DATASETS
+  , DECL_DATA
+  , PART_NUMBERS
+  , PRIMARY_METRICS
+  , REAGENT_KIT
+  , RIGHTS_AND_ROLES
+  , SAMPLE_INFO
+  , SEEDING_DATA
+};
+
+/// \brief The NamespaceInfo class provides XML namespace info (prefix & URI).
+///
+class PBBAM_EXPORT NamespaceInfo
+{
+public:
+    /// \brief Creates an empty entry.
+    ///
+    /// This constructor only exists for STL container compatibility.
+    ///
+    NamespaceInfo(void);
+
+    /// \brief Creates a valid info entry.
+    NamespaceInfo(const std::string& name,
+                  const std::string& uri);
+
+public:
+    /// \brief Fetches namespace name (i.e. prefix)
+    const std::string& Name(void) const { return name_; }
+
+    /// \brief Fetches namespace URI.
+    const std::string& Uri(void) const { return uri_; }
+
+private:
+    std::string name_;
+    std::string uri_;
+};
+
+/// \brief The NamespaceRegistry class provides a per-dataset registry of XML
+///        namespace information.
+///
+/// This is used to format XML output - properly prefixing element labels with
+/// namespace as appropriate.
+///
+class PBBAM_EXPORT NamespaceRegistry
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    NamespaceRegistry(void);
+    NamespaceRegistry(const NamespaceRegistry& other);
+    NamespaceRegistry(NamespaceRegistry&& other);
+    NamespaceRegistry& operator=(const NamespaceRegistry& other);
+    NamespaceRegistry& operator=(NamespaceRegistry&& other);
+    ~NamespaceRegistry(void);
+
+    /// \}
+
+public:
+    /// \name Registry Access
+    /// \{
+
+    /// \brief Fetches namespace info for the dataset's default XSD type.
+    const NamespaceInfo& DefaultNamespace(void) const;
+
+    /// \brief Fetches dataset's default XSD type.
+    XsdType DefaultXsd(void) const;
+
+    /// \brief Fetches namespace info for the requested XSD type.
+    const NamespaceInfo& Namespace(const XsdType& xsd) const;
+
+    /// \brief Registers namespace info for a particular XSD type.
+    void Register(const XsdType& xsd, const NamespaceInfo& namespaceInfo);
+
+    /// \brief Updates dataset's default XSD type.
+    void SetDefaultXsd(const XsdType& xsd);
+
+    /// \brief Fetches the XSD type for \p elementLabel.
+    XsdType XsdForElement(const std::string& elementLabel) const;
+
+    /// \brief Fetches the XSD type for a particular URI.
+    XsdType XsdForUri(const std::string& uri) const;
+
+    /// \}
+
+private:
+    std::map<XsdType, NamespaceInfo> data_;
+    XsdType defaultXsdType_;
+};
+
+} // namespace PacBio
+} // namespace BAM
+
+#endif // DATASETXSD_H
diff --git a/include/pbbam/EntireFileQuery.h b/include/pbbam/EntireFileQuery.h

new file mode 100644 (file)

index 0000000..10c06ff
--- /dev/null
+++ b/include/pbbam/EntireFileQuery.h
@@ -0,0 +1,97 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file EntireFileQuery.h
+/// \brief Defines the EntireFileQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef ENTIREFILEQUERY_H
+#define ENTIREFILEQUERY_H
+
+#include "pbbam/internal/QueryBase.h"
+#include <memory>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The EntireFileQuery class provides iterable access to a DataSet's
+///        %BAM records, reading through the entire contents of each file.
+///
+/// Input files will be accessed in the order listed in the DataSet.
+///
+/// \include code/EntireFileQuery.txt
+///
+/// Iteration is not limited to only 'const' records. The files themselves will
+/// not be affected, but individual records may be modified if needed.
+///
+/// \include code/EntireFileQuery_NonConst.txt
+///
+/// \note DataSets can be implicitly constructed from %BAM filenames as well.
+///       Thus a single %BAM file can be read through using the following:
+///
+/// \include code/EntireFileQuery_BamFilename.txt
+///
+class PBBAM_EXPORT EntireFileQuery : public internal::IQuery
+{
+public:
+    /// \brief Creates a new EntireFileQuery, reading through the entire
+    ///        contents of a dataset.
+    ///
+    /// \param[in] dataset  input data source(s)
+    /// \throws std::runtime_error on failure to open/read underlying %BAM
+    ///         files.
+    ///
+    EntireFileQuery(const PacBio::BAM::DataSet& dataset);
+    ~EntireFileQuery(void);
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r);
+
+private:
+    struct EntireFileQueryPrivate;
+    std::unique_ptr<EntireFileQueryPrivate> d_;
+};
+
+} // namespace BAM
+} // namspace PacBio
+
+#endif // ENTIREFILEQUERY_H
diff --git a/include/pbbam/FastaReader.h b/include/pbbam/FastaReader.h

new file mode 100644 (file)

index 0000000..dc19e53
--- /dev/null
+++ b/include/pbbam/FastaReader.h
@@ -0,0 +1,113 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file FastaReader.h
+/// \brief Defines the FastaReader class.
+//
+// Author: Derek Barnett
+
+#ifndef FASTAREADER_H
+#define FASTAREADER_H
+
+#include "pbbam/FastaSequence.h"
+#include <memory>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal { struct FastaReaderPrivate; }
+
+///
+/// \brief The FastaReader provides sequential access to FASTA records.
+///
+class FastaReader
+{
+public:
+    ///
+    /// \brief Reads all FASTA sequences from a file
+    ///
+    /// \param fn   FASTA filename
+    /// \return vector of FastaSequence results
+    ///
+    static std::vector<FastaSequence> ReadAll(const std::string& fn);
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    explicit FastaReader(const std::string& fn);
+    FastaReader(FastaReader&& other);
+    FastaReader& operator=(FastaReader&& other);
+    ~FastaReader(void);
+
+    // copy is disabled
+    FastaReader(const FastaReader&) = delete;
+    FastaReader& operator=(const FastaReader&) = delete;
+
+    /// \}
+
+public:
+    /// \name Sequence Access
+    /// \{
+
+    ///
+    /// \brief GetNext
+    ///
+    /// \code{cpp}
+    ///
+    /// FastaReader reader{ fn };
+    /// FastaSequence f;
+    /// while (reader.GetNext(f)) {
+    ///     // do stuff with f
+    /// }
+    /// \endcode
+    ///
+    /// \param[out] record
+    /// \return success/failure
+    ///
+    bool GetNext(FastaSequence& record);
+
+    /// \}
+
+private:
+    std::unique_ptr<internal::FastaReaderPrivate> d_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // FASTAREADER_H
diff --git a/include/pbbam/FastaSequence.h b/include/pbbam/FastaSequence.h

new file mode 100644 (file)

index 0000000..7748506
--- /dev/null
+++ b/include/pbbam/FastaSequence.h
@@ -0,0 +1,103 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file FastaSequence.h
+/// \brief Defines the FastaSequence class.
+//
+// Author: Derek Barnett
+
+#ifndef FASTASEQUENCE_H
+#define FASTASEQUENCE_H
+
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+///
+/// \brief The FastaSequence class represents a FASTA record (name & bases)
+///
+class FastaSequence
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    ///
+    /// \brief FastaSequence
+    /// \param name
+    /// \param bases
+    ///
+    explicit FastaSequence(std::string name, std::string bases);
+
+    FastaSequence(void) = default;
+    FastaSequence(const FastaSequence&) = default;
+    FastaSequence(FastaSequence&&) = default;
+    FastaSequence& operator=(const FastaSequence&) = default;
+    FastaSequence& operator=(FastaSequence&&) = default;
+    ~FastaSequence(void) = default;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    ///
+    /// \brief Name
+    /// \return
+    ///
+    std::string Name(void) const;
+
+    ///
+    /// \brief Bases
+    /// \return
+    ///
+    std::string Bases(void) const;
+
+    /// \}
+
+private:
+    std::string name_;
+    std::string bases_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "internal/FastaSequence.inl"
+
+#endif // FASTASEQUENCE_H
diff --git a/include/pbbam/FrameEncodingType.h b/include/pbbam/FrameEncodingType.h

new file mode 100644 (file)

index 0000000..3b5a52b
--- /dev/null
+++ b/include/pbbam/FrameEncodingType.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file FrameEncodingType.h
+/// \brief Defines the FrameEncodingType enum.
+//
+// Author: Derek Barnett
+
+#ifndef FRAMEENCODINGTYPE_H
+#define FRAMEENCODINGTYPE_H
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the possible encoding modes used in Frames data
+/// (e.g. BamRecord::IPD or BamRecord::PulseWidth).
+///
+/// The LOSSY mode is the default in production output; LOSSLESS mode
+/// being used primarily for internal applications.
+///
+/// \sa https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/BAM.rst
+///     for more information on pulse frame encoding schemes.
+///
+enum class FrameEncodingType
+{
+    LOSSY       ///< 8-bit compression (using CodecV1) of frame data
+  , LOSSLESS    ///< 16-bit native frame data
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // FRAMEENCODINGTYPE_H
diff --git a/include/pbbam/Frames.h b/include/pbbam/Frames.h

new file mode 100644 (file)

index 0000000..326701b
--- /dev/null
+++ b/include/pbbam/Frames.h
@@ -0,0 +1,187 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Frames.h
+/// \brief Defines the Frames class.
+//
+// Author: Derek Barnett
+
+#ifndef FRAMES_H
+#define FRAMES_H
+
+#include "pbbam/Config.h"
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The Frames class represents pulse frame data.
+///
+/// Frame data may be stored in either their raw, 16-bit values or
+/// using a lossy, 8-bit compression scheme.
+///
+/// This class is used to store the data and convert between the 2 storage types.
+///
+class PBBAM_EXPORT Frames
+{
+public:
+    /// \name Conversion Methods
+    /// \{
+
+    /// \brief Constructs a Frames object from encoded (lossy, 8-bit) data.
+    ///
+    /// \note This method should probably not be needed often by client code
+    ///       working with frame data. It exists primarily for (internal)
+    ///       parsing & interpretation of the %BAM file contents. The method is
+    ///       available, though, should the conversion operation be needed.
+    ///
+    /// \param[in] codedData    encoded data
+    /// \returns Frames object
+    ///
+    static Frames Decode(const std::vector<uint8_t>& codedData);
+
+    /// \brief Creates encoded, compressed frame data from raw input data.
+    ///
+    /// \param[in] frames   raw frame data
+    /// \returns lossy, 8-bit encoded frame data
+    ///
+    static std::vector<uint8_t> Encode(const std::vector<uint16_t>& frames);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    Frames(void);
+    Frames(const std::vector<uint16_t>& frames);
+    Frames(std::vector<uint16_t>&& frames);
+    Frames(const Frames& other);
+    Frames(Frames&& other);
+    Frames& operator=(const Frames& other);
+    Frames& operator=(Frames&& other);
+    ~Frames(void);
+
+    /// \}
+
+public:
+    /// \name Access Data
+    /// \{
+
+    /// \returns Frame data in expanded (not encoded) form
+    std::vector<uint16_t>& DataRaw(void);
+    const std::vector<uint16_t>& Data(void) const;
+
+    /// \}
+
+public:
+    /// \name Conversion Methods
+    /// \{
+
+    /// \returns Frame data in (lossy, 8-bit) encoded form.
+    std::vector<uint8_t> Encode(void) const;
+
+    /// \}
+
+public:
+    /// \name Comparison Operators
+    /// \{
+
+    bool operator==(const Frames& other) const;
+    bool operator!=(const Frames& other) const;
+
+    /// \}
+
+public:
+    /// \name STL Compatbility
+    /// \{
+
+    /// \returns A const_iterator to the beginning of the sequence.
+    std::vector<uint16_t>::const_iterator cbegin(void) const;
+
+    /// \returns A const_iterator to the element past the end of the sequence.
+    std::vector<uint16_t>::const_iterator cend(void) const;
+
+    /// \returns A const_iterator to the beginning of the sequence.
+    std::vector<uint16_t>::const_iterator begin(void) const;
+
+    /// \returns A const_iterator to the element past the end of the sequence.
+    std::vector<uint16_t>::const_iterator end(void) const;
+
+    /// \returns An iterator to the beginning of the sequence.
+    std::vector<uint16_t>::iterator begin(void);
+
+    /// \returns An iterator to the element past the end of the sequence.
+    std::vector<uint16_t>::iterator end(void);
+
+    /// \returns The number of frame data points.
+    size_t size(void) const;
+
+    /// \returns True if the container is empty, false otherwise.
+    bool empty(void) const;
+
+    /// \} 
+
+public:
+    /// \name Access Data
+    /// \{
+
+    /// Sets this record's data.
+    ///
+    /// \param[in] frames data in expanded (not encoded) form
+    /// \returns reference to this object
+    ///
+    Frames& Data(const std::vector<uint16_t>& frames);
+
+    /// Sets this record's data.
+    ///
+    /// \param[in] frames data in expanded (not encoded) form
+    /// \returns reference to this object
+    ///
+    Frames& Data(std::vector<uint16_t>&& frames);
+
+    /// \}
+
+private:
+    std::vector<uint16_t> data_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/Frames.inl"
+
+#endif // FRAMES_H
diff --git a/include/pbbam/GenomicInterval.h b/include/pbbam/GenomicInterval.h

new file mode 100644 (file)

index 0000000..a7d4986
--- /dev/null
+++ b/include/pbbam/GenomicInterval.h
@@ -0,0 +1,188 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file GenomicInterval.h
+/// \brief Defines the GenomicInterval class.
+//
+// Author: Derek Barnett
+
+#ifndef GENOMICINTERVAL_H
+#define GENOMICINTERVAL_H
+
+#include "pbbam/Config.h"
+#include "pbbam/Interval.h"
+#include "pbbam/Position.h"
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The GenomicInterval class represents a genomic interval (reference
+///        name and 0-based coordinates).
+///
+class PBBAM_EXPORT GenomicInterval
+{
+public:
+    /// \name Constructors & Related Methods
+    ///  \{
+
+    /// \brief Creates an empty genomic interval
+    GenomicInterval(void);
+
+    /// \brief Creates a genomic interval on sequence with \p name, using range:
+    ///       [\p start, \p stop)
+    GenomicInterval(const std::string& name,
+                    const Position& start,
+                    const Position& stop);
+
+    /// \brief Creates a genomic interval, using REGION string
+    ///
+    /// "<ref>:<start>-<stop>" ("chr8:200-600")
+    ///
+    /// \note The htslib/samtools REGION string expects start positions to be
+    ///       1-based. However, throughout pbbam (including the rest of this
+    ///       class), we stick to 0-based start coordinates. Thus, while the
+    ///       syntax matches that of samtools, we are using a 0-based start
+    ///       coordinate here.
+    ///
+    GenomicInterval(const std::string& zeroBasedRegionString);
+
+    GenomicInterval(const GenomicInterval& other);
+    GenomicInterval& operator=(const GenomicInterval& other);
+
+    ~GenomicInterval(void);
+
+    /// \}
+
+public:
+    /// \name Comparison Operators
+    /// \{
+
+    /// \returns true if same id & underlying interval
+    bool operator==(const GenomicInterval& other) const;
+
+    /// \returns true if either ids or underlying intervals differ
+    bool operator!=(const GenomicInterval& other) const;
+
+    /// \}
+
+public:
+    /// \name Interval Operations
+    /// \{
+
+    /// \returns true if same id and underlying Interval::CoveredBy() other.
+    bool CoveredBy(const GenomicInterval& other) const;
+
+    /// \returns true if same id and underlying Interval::Covers() other.
+    bool Covers(const GenomicInterval& other) const;
+
+    /// \returns true if same id and underlying Interval::Intersects() other.
+    bool Intersects(const GenomicInterval& other) const;
+
+    /// \returns true if underlying Interval::IsValid(), and id/endpoints are
+    ///          non-negative.
+    ///
+    bool IsValid(void) const;
+
+    /// \returns length of underlying
+    size_t Length(void) const;
+
+    /// \}
+
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \returns interval reference name
+    std::string Name(void) const;
+
+    /// \returns underlying Interval object
+    PacBio::BAM::Interval<Position> Interval(void) const;
+
+    /// \returns interval start coordinate
+    Position Start(void) const;
+
+    /// \returns interval stop coordinate
+    Position Stop(void) const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// Sets this interval's reference name.
+    ///
+    /// \param[in] name
+    /// \returns reference to this interval
+    ///
+    GenomicInterval& Name(const std::string& name);
+
+    /// Sets this underlying Interval
+    ///
+    /// \param[in] interval
+    /// \returns reference to this interval
+    ///
+    GenomicInterval& Interval(const PacBio::BAM::Interval<Position>& interval);
+
+    /// Sets this interval's start coordinate.
+    ///
+    /// \param[in] start
+    /// \returns reference to this interval
+    ///
+    GenomicInterval& Start(const Position start);
+
+    /// Sets this interval's stop coordinate.
+    ///
+    /// \param[in] stop
+    /// \returns reference to this interval
+    ///
+    GenomicInterval& Stop(const Position stop);
+
+    /// \}
+
+private:
+    std::string name_;
+    PacBio::BAM::Interval<Position> interval_;
+};
+
+} // namespace BAM
+} // namspace PacBio
+
+#include "pbbam/internal/GenomicInterval.inl"
+
+#endif // GENOMICINTERVAL_H
diff --git a/include/pbbam/GenomicIntervalQuery.h b/include/pbbam/GenomicIntervalQuery.h

new file mode 100644 (file)

index 0000000..7df7721
--- /dev/null
+++ b/include/pbbam/GenomicIntervalQuery.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file GenomicIntervalQuery.h
+/// \brief Defines the GenomicIntervalQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef GENOMICINTERVALQUERY_H
+#define GENOMICINTERVALQUERY_H
+
+#include "pbbam/GenomicInterval.h"
+#include "pbbam/internal/QueryBase.h"
+#include <memory>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The GenomicIntervalQuery class provides iterable access to a
+///        DataSet's %BAM records, limiting results to those overlapping a
+///        GenomicInterval.
+///
+/// Example:
+/// \include code/GenomicIntervalQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".bai" index file.
+///       Use BamFile::EnsureStandardIndexExists before creating the query if
+///       one may not be present.
+///
+class PBBAM_EXPORT GenomicIntervalQuery : public internal::IQuery
+{
+public:
+
+    /// \brief Constructs a new GenomiIntervalQuery, limiting record results to
+    ///        only those overalpping a GenomicInterval.
+    ///
+    /// \param[in] interval genomic interval of interest
+    /// \param[in] dataset  input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or
+    ///         BAI files.
+    ///
+    GenomicIntervalQuery(const GenomicInterval& interval,
+                         const PacBio::BAM::DataSet& dataset);
+    ~GenomicIntervalQuery(void);
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r);
+
+public:
+    /// \brief Sets a new genomic interval.
+    ///
+    /// This allows the same dataset/query to be re-used over multiple regions of
+    /// interest:
+    ///
+    /// \include code/GenomicIntervalQuery_Reuse.txt
+    ///
+    /// \param[in] interval new genomic interval
+    /// \returns reference to this query
+    ///
+    GenomicIntervalQuery& Interval(const GenomicInterval& interval);
+
+    /// \returns Current genomic interval active on this query.
+    const GenomicInterval& Interval(void) const;
+
+private:
+    struct GenomicIntervalQueryPrivate;
+    std::unique_ptr<GenomicIntervalQueryPrivate> d_;
+};
+
+} // namespace BAM
+} // namspace PacBio
+
+#endif // GENOMICINTERVALQUERY_H
diff --git a/include/pbbam/IRecordWriter.h b/include/pbbam/IRecordWriter.h

new file mode 100644 (file)

index 0000000..9acf1db
--- /dev/null
+++ b/include/pbbam/IRecordWriter.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file IRecordWriter.h
+/// \brief Defines the IRecordWriter interface.
+//
+// Author: Derek Barnett
+
+#ifndef IRECORDWRITER_H
+#define IRECORDWRITER_H
+
+namespace PacBio {
+namespace BAM {
+
+class BamRecord;
+class BamRecordImpl;
+
+class IRecordWriter
+{
+public:
+    virtual ~IRecordWriter(void);
+
+public:
+
+    /// \brief Try to flush any buffered data to file.
+    ///
+    /// \note The underlying implementation may not necessarily flush buffered
+    ///       data immediately, especially in a multithreaded writer situation.
+    ///       Let the writer go out of scope to fully ensure flushing.
+    ///
+    /// \throws std::runtime_error if flush fails
+    ///
+    virtual void TryFlush(void) =0;
+
+
+    /// \brief Write a record to the output %BAM file.
+    ///
+    /// \param[in] record BamRecord object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    virtual void Write(const BamRecord& record) =0;
+
+    /// \brief Write a record to the output %BAM file.
+    ///
+    /// \param[in] recordImpl BamRecordImpl object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    virtual void Write(const BamRecordImpl& recordImpl) =0;
+
+protected:
+    IRecordWriter(void);
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // IRECORDWRITER_H
diff --git a/include/pbbam/IndexedFastaReader.h b/include/pbbam/IndexedFastaReader.h

new file mode 100644 (file)

index 0000000..b382d96
--- /dev/null
+++ b/include/pbbam/IndexedFastaReader.h
@@ -0,0 +1,170 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file IndexedFastaReader.h
+/// \brief Defines the IndexedFastaReader class.
+//
+// Author: David Alexander
+
+#ifndef INDEXEDFASTAREADER_H
+#define INDEXEDFASTAREADER_H
+
+#include "pbbam/Orientation.h"
+#include "pbbam/Position.h"
+#include <htslib/faidx.h>
+#include <string>
+#include <iostream>
+#include <stdexcept>
+
+namespace PacBio {
+namespace BAM {
+
+class GenomicInterval;
+class BamRecord;
+
+/// \brief The IndexedFastaReader class provides random-access to FASTA file
+///        data.
+///
+class IndexedFastaReader {
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    IndexedFastaReader(void) = delete;
+    IndexedFastaReader(const std::string& filename);
+    IndexedFastaReader(const IndexedFastaReader& src);
+    IndexedFastaReader& operator=(const IndexedFastaReader& rhs);
+    ~IndexedFastaReader(void);
+
+    /// \}
+
+public:
+    /// name Sequence Access
+    /// \{
+
+    /// \brief Fetches FASTA sequence for desired interval.
+    ///
+    /// \param[in] id       reference sequence name
+    /// \param[in] begin    start position
+    /// \param[in] end      end position
+    ///
+    /// \returns sequence string at desired interval
+    ///
+    /// \throws std::runtime_error on failure to fetch sequence
+    ///
+    std::string Subsequence(const std::string& id,
+                            Position begin,
+                            Position end) const;
+
+    /// \brief Fetches FASTA sequence for desired interval.
+    ///
+    /// \param[in] interval desired interval
+    ///
+    /// \returns sequence string at desired interval
+    ///
+    /// \throws std::runtime_error on failure to fetch sequence
+    ///
+    std::string Subsequence(const GenomicInterval& interval) const;
+
+    /// \brief Fetches FASTA sequence for desired interval.
+    ///
+    /// \param[in] htslibRegion htslib/samtools-formatted REGION string
+    ///                         representing the desired interval
+    ///
+    /// \returns sequence string at desired interval
+    ///
+    /// \throws std::runtime_error on failure to fetch sequence
+    ///
+    std::string Subsequence(const char* htslibRegion) const;
+
+    /// \brief Fetches FASTA sequence corresponding to a BamRecord, oriented and
+    ///        gapped as requested.
+    ///
+    /// For example, "native" orientation and "gapped" will return the reference
+    /// sequence with gaps inserted, as would align against the read in "native"
+    /// orientation.
+    ///
+    /// \param[in] bamRecord        input BamRecord to derive interval/CIGAR
+    ///                             data
+    /// \param[in] orientation      orientation of output
+    /// \param[in] gapped           if true, gaps/padding will be inserted, per
+    ///                             record's CIGAR info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns sequence string over the record's interval
+    ///
+    /// \throws std::runtime_error on failure to fetch sequence
+    ///
+    std::string ReferenceSubsequence(const BamRecord& bamRecord,
+                                     const Orientation orientation=Orientation::GENOMIC,
+                                     const bool gapped=false,
+                                     const bool exciseSoftClips=false) const;
+
+    /// \}
+
+public:
+    /// \name File Attributes
+    /// \{
+
+    /// \returns true if FASTA file contains a sequence matching \p name
+    bool HasSequence(const std::string& name) const;
+
+    /// \returns number of sequences stored in FASTA file
+    int NumSequences(void) const;
+
+    /// \returns length of FASTA sequence
+    ///
+    /// \throws std::runtime_error if length could not be determined
+    ///
+    int SequenceLength(const std::string& name) const;
+
+    /// \}
+
+private:
+    std::string filename_;
+    faidx_t* handle_;
+
+private:
+    void Close(void);
+    bool Open(const std::string& filename);
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // INDEXEDFASTAREADER_H
diff --git a/include/pbbam/Interval.h b/include/pbbam/Interval.h

new file mode 100644 (file)

index 0000000..3f5a40e
--- /dev/null
+++ b/include/pbbam/Interval.h
@@ -0,0 +1,151 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Interval.h
+/// \brief Defines the Interval class.
+//
+// Author: Derek Barnett
+
+#ifndef INTERVAL_H
+#define INTERVAL_H
+
+#include "pbbam/Config.h"
+#include <string>
+
+#define BOOST_ICL_USE_STATIC_BOUNDED_INTERVALS
+#include <boost/icl/discrete_interval.hpp>
+#include <boost/icl/interval_traits.hpp>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief Represents a half-open (right-open) interval [start, stop)
+///
+/// \note This class is agnostic whether the values are 0-based or 1-based.
+///       Client code should primarily work with GenomicInterval, which does
+///       enforce this distinction.
+///
+template<typename T>
+class Interval
+{
+public:
+    typedef boost::icl::discrete_interval<T> interval_type;
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty interval [0,0)
+    Interval(void);
+
+    /// \brief Creates a 'singleton' interval [val,val+1)
+    Interval(const T val);
+
+    /// brief Creates an interval from [start, stop) */
+    Interval(const T start, const T stop);
+
+    Interval(const Interval<T>& other);
+
+    /// \}
+
+public:
+    /// \name Comparison Operators
+    /// \{
+
+    /// \returns true if both intervals share the same endpoints
+    bool operator==(const Interval<T>& other) const;
+
+    /// \returns true if either interval's endpoints differ
+    bool operator!=(const Interval<T>& other) const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \returns interval's start coordinate
+    T Start(void) const;
+
+    /// Sets this interval's start coordinate.
+    ///
+    /// \param[in] start
+    /// \returns reference to this interval
+    ///
+    Interval<T>& Start(const T& start);
+
+    /// \returns interval's stop coordinate
+    T Stop(void) const;
+
+    /// Sets this interval's stop coordinate.
+    ///
+    /// \param[in] stop
+    /// \returns reference to this interval
+    ///
+    Interval<T>& Stop(const T& stop);
+
+    /// \}
+
+public:
+    /// \name Interval Operations
+
+    /// \returns true if this interval is fully covered by (or contained in) \p other
+    bool CoveredBy(const Interval<T>& other) const;
+
+    //// \returns true if this interval covers (or contains) \p other
+    bool Covers(const Interval<T>& other) const;
+
+    /// \returns true if intervals interset
+    bool Intersects(const Interval<T>& other) const;
+
+    /// \returns true if interval is valid (e.g. start < stop)
+    bool IsValid(void) const;
+
+    /// \returns interval length
+    size_t Length(void) const;
+
+    /// \}
+
+private:
+    interval_type data_;
+};
+
+} // namespace BAM
+} // namspace PacBio
+
+#include "pbbam/internal/Interval.inl"
+
+#endif // GENOMICINTERVAL_H
diff --git a/include/pbbam/LocalContextFlags.h b/include/pbbam/LocalContextFlags.h

new file mode 100644 (file)

index 0000000..0c59707
--- /dev/null
+++ b/include/pbbam/LocalContextFlags.h
@@ -0,0 +1,77 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file LocalContextFlags.h
+/// \brief Defines the LocalContextFlags enum & helper method(s).
+//
+// Author: Lance Hepler
+
+#ifndef LOCALCONTEXTFLAGS_H
+#define LOCALCONTEXTFLAGS_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The LocalContextFlags enum defines the flags that can be used
+///        to describe a subread's "local context", i.e. whether it is
+///        flanked by barcodes/adapters or its pass orientation.
+///
+enum LocalContextFlags : uint8_t
+{
+    NO_LOCAL_CONTEXT = 0,   ///< No context information available
+    ADAPTER_BEFORE   = 1,   ///< Adapter precedes subread
+    ADAPTER_AFTER    = 2,   ///< Adapter follows subread
+    BARCODE_BEFORE   = 4,   ///< Barcode precedes subread
+    BARCODE_AFTER    = 8,   ///< Barcode follows subread
+    FORWARD_PASS     = 16,  ///< Subread's orientation is 'forward pass'
+    REVERSE_PASS     = 32   ///< Subread's orientation is 'reverse pass'
+};
+
+
+/// \returns a LocalContextFlags value containing the result of the bitwise-OR
+///          operation of \p lhs and \p rhs.
+// constexpr is implicitly inline
+constexpr LocalContextFlags operator|(const LocalContextFlags lhs, const LocalContextFlags rhs)
+{
+    return static_cast<LocalContextFlags>(static_cast<int>(lhs) | static_cast<int>(rhs));
+}
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // LOCALCONTEXTFLAGS_H
diff --git a/include/pbbam/MD5.h b/include/pbbam/MD5.h

new file mode 100644 (file)

index 0000000..03a1979
--- /dev/null
+++ b/include/pbbam/MD5.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file MD5.h
+/// \brief Defines basic MD5 hash utilities
+//
+// Author: Brett Bowman
+
+#ifndef MD5_H
+#define MD5_H
+
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief MD5 hash of a string as a 32-digit hexadecimal string
+///
+std::string MD5Hash(const std::string& str);
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // MD5_H
diff --git a/include/pbbam/Orientation.h b/include/pbbam/Orientation.h

new file mode 100644 (file)

index 0000000..c354822
--- /dev/null
+++ b/include/pbbam/Orientation.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Orientation.h
+/// \brief Defines the Orientation enum.
+//
+// Author: Derek Barnett
+
+#ifndef ORIENTATION_H
+#define ORIENTATION_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the orientations recognized by BamRecord, for
+///        presenting "per-base" data.
+///
+/// Orientation::NATIVE indicates that data should be presented in the subread's
+/// original form.
+///
+/// Orientation::GENOMIC indicates that data should be presented relative to
+/// genomic forward strand. This means that data will be reversed (or
+/// reverse-complemented) if the subread was aligned to the reverse strand.
+///
+enum class Orientation
+{
+    NATIVE      ///< Present data in 'raw' original orientation, regardless of aligned Strand
+  , GENOMIC     ///< Present data in aligned orientation, always relative to Strand::FORWARD.
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // ORIENTATION_H
diff --git a/include/pbbam/PbiBasicTypes.h b/include/pbbam/PbiBasicTypes.h

new file mode 100644 (file)

index 0000000..4006ed4
--- /dev/null
+++ b/include/pbbam/PbiBasicTypes.h
@@ -0,0 +1,108 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiBasicTypes.h
+/// \brief Defines the basic data structures used in PBI lookups.
+//
+// Author: Derek Barnett
+
+#ifndef PBIBASICTYPES_H
+#define PBIBASICTYPES_H
+
+#include "pbbam/Compare.h"
+#include "pbbam/Config.h"
+#include <deque>
+#include <utility>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The IndexResultBlock class represents a contiguous group of records
+///        returned from a PBI lookup.
+///
+/// Contiguous reads that satisfy a PBI lookup query will be merged down into a
+/// single block. This helps to minimize the number of seeks in subsequent read
+/// operations.
+///
+/// An PBI-enabled reader or query can iterate over a list of IndexResultBlocks;
+/// for each block, seeking to the first record and then sequentially reading
+/// 'numReads' consecutive records before needing to seek again.
+///
+struct PBBAM_EXPORT IndexResultBlock
+{
+public:
+    IndexResultBlock(void);
+    IndexResultBlock(size_t idx, size_t numReads);
+
+public:
+    bool operator==(const IndexResultBlock& other) const;
+    bool operator!=(const IndexResultBlock& other) const;
+
+public:
+    size_t  firstIndex_;     ///< index of block's first record in BAM/PBI files (e.g. i-th record)
+    size_t  numReads_;       ///< number of reads in this block
+    int64_t virtualOffset_;  ///< virtual offset of first record in this block
+};
+
+/// \brief container of PBI result blocks
+///
+typedef std::deque<IndexResultBlock> IndexResultBlocks;
+
+/// \brief container of raw PBI indices
+///
+/// This is the primary result of PbiFilter -associated classes. This raw list
+/// can participate in set operations (union, intersect) for compound filters,
+/// and then be merged down into IndexResultBlocks for actual data file
+/// random-access.
+///
+typedef std::vector<size_t> IndexList;
+
+/// \brief pair representing a range of PBI indices: where interval
+///        is [first, second)
+///
+/// Used primarily by the PBI's CoordinateSortedData components.
+///
+/// \sa PbiReferenceEntry, PbiRawReferenceData, & ReferenceLookupData
+///
+typedef std::pair<size_t, size_t> IndexRange;
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/PbiBasicTypes.inl"
+
+#endif // PBIBASICTYPES_H
diff --git a/include/pbbam/PbiBuilder.h b/include/pbbam/PbiBuilder.h

new file mode 100644 (file)

index 0000000..d1d83bc
--- /dev/null
+++ b/include/pbbam/PbiBuilder.h
@@ -0,0 +1,210 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiBuilder.h
+/// \brief Defines the PbiBuilder class.
+//
+// Author: Derek Barnett
+
+#ifndef PBIBUILDER_H
+#define PBIBUILDER_H
+
+#include "pbbam/Config.h"
+#include <memory>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+class BamRecord;
+class PbiRawData;
+
+namespace internal { class PbiBuilderPrivate; }
+
+/// \brief The PbiBuilder class construct PBI index data from %BAM record data.
+///
+/// Records are added one-by-one. This allows for either whole-file indexing of
+/// existing %BAM files or for indexing "on-the-fly" alongside a %BAM file as it
+/// is generated.
+///
+/// For simple PBI creation from existing %BAM files, see PbiFile::CreateFrom.
+/// This is the recommended approach, unless finer control or additional
+/// processing is needed.
+///
+class PBBAM_EXPORT PbiBuilder
+{
+public:
+    /// \brief This enum allows you to control the compression level of the
+    ///        output PBI file.
+    ///
+    /// Values are equivalent to zlib compression levels. See its documentation
+    /// for more details: http://www.zlib.net/manual.html
+    ///
+    enum CompressionLevel
+    {
+        CompressionLevel_0 = 0
+      , CompressionLevel_1 = 1
+      , CompressionLevel_2 = 2
+      , CompressionLevel_3 = 3
+      , CompressionLevel_4 = 4
+      , CompressionLevel_5 = 5
+      , CompressionLevel_6 = 6
+      , CompressionLevel_7 = 7
+      , CompressionLevel_8 = 8
+      , CompressionLevel_9 = 9
+
+      , DefaultCompression = -1
+      , NoCompression      = CompressionLevel_0
+      , FastCompression    = CompressionLevel_1
+      , BestCompression    = CompressionLevel_9
+    };
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Initializes builder to write data to \p pbiFilename.
+    ///
+    /// \param[in] pbiFilename      output filename
+    /// \param[in] compressionLevel zlib compression level
+    /// \param[in] numThreads       number of threads for compression. If set to
+    ///                             0, PbiBuilder will attempt to determine a
+    ///                             reasonable estimate. If set to 1, this will
+    ///                             force single-threaded execution. No checks
+    ///                             are made against an upper limit.
+    ///
+    /// \throws std::runtime_error if PBI file cannot be opened for writing
+    ///
+    PbiBuilder(const std::string& pbiFilename,
+               const PbiBuilder::CompressionLevel compressionLevel = PbiBuilder::DefaultCompression,
+               const size_t numThreads = 4);
+
+    /// \brief Initializes builder to write data to \p pbiFilename.
+    ///
+    /// Reference data-tracking structures will be initialized to expect
+    /// \p numReferenceSequences. (This is useful so that we can mark any
+    /// references that lack observed data appropriately).
+    ///
+    /// \param[in] pbiFilename              output filename
+    /// \param[in] numReferenceSequences    number of possible reference
+    ///                                     sequences, e.g. BamHeader::NumSequences
+    /// \param[in] compressionLevel zlib compression level
+    /// \param[in] numThreads       number of threads for compression. If set to
+    ///                             0, PbiBuilder will attempt to determine a
+    ///                             reasonable estimate. If set to 1, this will
+    ///                             force single-threaded execution. No checks
+    ///                             are made against an upper limit.
+    ///
+    /// \throws std::runtime_error if PBI file cannot be opened for writing
+    ///
+    PbiBuilder(const std::string& pbiFilename,
+               const size_t numReferenceSequences,
+               const PbiBuilder::CompressionLevel compressionLevel = PbiBuilder::DefaultCompression,
+               const size_t numThreads = 4);
+
+    /// \brief Initializes builder to write data to \p pbiFilename.
+    ///
+    /// Reference data-tracking structures will be initialized to expect
+    /// \p numReferenceSequences, but only if \p isCoordinateSorted is true.
+    ///
+    /// \param[in] pbiFilename              output filename
+    /// \param[in] numReferenceSequences    number of possible reference
+    ///                                     sequences, e.g. BamHeader::NumSequences
+    /// \param[in] isCoordinateSorted       if false, disables reference
+    ///                                     sequence tracking
+    ///                                     (BamHeader::SortOrder != "coordinate")
+    /// \param[in] compressionLevel zlib compression level
+    /// \param[in] numThreads       number of threads for compression. If set to
+    ///                             0, PbiBuilder will attempt to determine a
+    ///                             reasonable estimate. If set to 1, this will
+    ///                             force single-threaded execution. No checks
+    ///                             are made against an upper limit.
+    ///
+    /// \throws std::runtime_error if PBI file cannot be opened for writing
+    ///
+    PbiBuilder(const std::string& pbiFilename,
+               const size_t numReferenceSequences,
+               const bool isCoordinateSorted,
+               const PbiBuilder::CompressionLevel compressionLevel = PbiBuilder::DefaultCompression,
+               const size_t numThreads = 4);
+
+    /// \brief Destroys builder, writing its data out to PBI file.
+    ///
+    /// On destruction, data summaries are calculated, raw data is written to
+    /// file, and file handle closed.
+    ///
+    ~PbiBuilder(void);
+
+    /// \}
+
+public:
+    /// \name Index Building
+    /// \{
+
+    /// \brief Adds \p record's data to underlying raw data structure.
+    ///
+    /// \note \p vOffset is a BGZF \b virtual offset into the %BAM file. To get
+    ///          this value, you should use one of the following: \n
+    ///        - while reading existing %BAM: BamReader::VirtualTell \n
+    ///        - while writing new %BAM:      BamWriter::Write(const BamRecord& record, int64_t* vOffset) \n
+    ///
+    ///
+    /// To build a PBI index while generating a %BAM file:
+    /// \include code/PbiBuilder_WithWriter.txt
+    ///
+    /// To build a PBI index from an existing %BAM file:
+    /// \include code/PbiBuilder_WithReader.txt
+    ///
+    /// \param[in] record   input BamRecord to pull index data from
+    /// \param[in] vOffset  \b virtual offset into %BAM file where record begins
+    ///
+    void AddRecord(const BamRecord& record, const int64_t vOffset);
+
+    /// \returns const reference to current raw index data. Mostly only used for
+    ///          testing; shouldn't be needed by most client code.
+    ///
+    const PbiRawData& Index(void) const;
+
+    /// \}
+
+private:
+    std::unique_ptr<internal::PbiBuilderPrivate> d_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // PBIBUILDER_H
diff --git a/include/pbbam/PbiFile.h b/include/pbbam/PbiFile.h

new file mode 100644 (file)

index 0000000..89bffa3
--- /dev/null
+++ b/include/pbbam/PbiFile.h
@@ -0,0 +1,96 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiFile.h
+/// \brief Defines the PbiFile enums, typedefs, and methods.
+//
+// Author: Derek Barnett
+
+#ifndef PBIFILE_H
+#define PBIFILE_H
+
+#include "pbbam/Config.h"
+#include "pbbam/PbiBuilder.h"
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+class BamFile;
+
+namespace PbiFile
+{
+    /// \brief This enum describes the PBI file sections
+    ///
+    enum Section
+    {
+         BASIC     = 0x0000  ///< BasicData     (required)
+       , MAPPED    = 0x0001  ///< MappedData    (always optional)
+       , REFERENCE = 0x0002  ///< ReferenceData (always optional)
+       , BARCODE   = 0x0004  ///< BarcodeData   (always optional)
+
+       , ALL  = BASIC | MAPPED | REFERENCE | BARCODE    ///< Synonym for 'all sections'
+    };
+
+    /// \brief Helper typedef for storing multiple Section flags.
+    ///
+    typedef uint16_t Sections;
+
+    /// \brief This enum describes the PBI file version.
+    enum VersionEnum
+    {
+        Version_3_0_0 = 0x030000        ///< v3.0.0
+      , Version_3_0_1 = 0x030001        ///< v3.0.1
+
+      , CurrentVersion = Version_3_0_1  ///< Synonym for the current PBI version.
+    };
+
+    /// \brief Builds PBI index data from the supplied %BAM file and writes a
+    ///        ".pbi" file.
+    ///
+    /// \param[in] bamFile source %BAM file
+    ///
+    /// \throws std::runtime_error if index file could not be created
+    ///
+    PBBAM_EXPORT void CreateFrom(const BamFile& bamFile,
+                                 const PbiBuilder::CompressionLevel compressionLevel = PbiBuilder::DefaultCompression,
+                                 const size_t numThreads = 4);
+
+} // namespace PbiFile
+} // namespace BAM
+} // namespace PacBio
+
+#endif // PBIFILE_H
diff --git a/include/pbbam/PbiFilter.h b/include/pbbam/PbiFilter.h

new file mode 100644 (file)

index 0000000..65ef7ef
--- /dev/null
+++ b/include/pbbam/PbiFilter.h
@@ -0,0 +1,343 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiFilter.h
+/// \brief Defines the PbiFilter class & helper 'concept'.
+//
+// Author: Derek Barnett
+
+#ifndef PBIFILTER_H
+#define PBIFILTER_H
+
+#include "pbbam/DataSet.h"
+#include "pbbam/PbiBasicTypes.h"
+#include "pbbam/PbiIndex.h"
+#include <boost/concept_check.hpp>
+#include <memory>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal { struct PbiFilterPrivate; }
+
+/// \brief The PbiFilterConcept class provides compile-time enforcement of the
+///        required interface for PbiFilter's child filters.
+///
+template<typename T>
+struct PbiFilterConcept
+{
+    BOOST_CONCEPT_USAGE(PbiFilterConcept)
+    {
+        // All PBI filters (built-in or client-define) need only provide this
+        // interface:
+        //
+        //    bool Accepts(const PbiRawData& index, const size_t row) const;
+        //
+        const PbiRawData index;
+        bool result = filter.Accepts(index, 0);
+        (void)result;
+    }
+
+private:
+    T filter;
+//    PbiRawData index;
+};
+
+/// \brief The PbiFilter class provides a mechanism for performing PBI-enabled
+///        lookups.
+///
+/// The PbiFilter API is designed to be flexible, both built-in and for
+/// client-side customization. Built-in filters are provided for common queries,
+/// and client code can define and use custom filters as well. More complex
+/// filtering rules can be constructed via composition of simpler child filters.
+///
+/// Filter objects used as children of PbiFilter need only provide a method that
+/// matches this signature:
+///
+/// \include code/PbiFilter_Interface.txt
+///
+/// This requirement is enforced internally, using the PbiFilterConcept to
+/// require a compatible interface without requiring inheritance. This approach
+/// allows composition of heterogeneous filter types without worrying about a
+/// class hierarchy, pointer ownership across library/client boundaries, etc.
+///
+/// Thus a client application can define a custom filter if the built-in filters
+/// do not quite meet requirements. This filter may then be used in further
+/// PbiFilter composition, or directly to PbiFilterQuery
+///
+/// \include code/PbiFilter_CustomFilter.txt
+///
+/// As mentioned above, complex filters can be built up using multiple "child"
+/// filters. These complex filters are constructed by using either
+/// PbiFilter::Union (logical-OR over all direct children) or
+/// PbiFilter::Intersection (logical-AND over direct children).
+///
+/// \include code/PbiFilter_Composition.txt
+///
+class PBBAM_EXPORT PbiFilter
+{
+public:
+    enum CompositionType
+    {
+        INTERSECT
+      , UNION
+    };
+
+public:
+    /// \name Set Operations
+    /// \{
+
+    /// \brief Creates a PbiFilter that acts as intersection of the input
+    ///        filters.
+    ///
+    /// A record must satisfy \b all of this filter's direct "child" filters.
+    ///
+    /// Equivalent to:
+    /// \include code/PbiFilter_Intersection_Copy.txt
+    ///
+    /// \param[in] filters  vector of child filters
+    /// \returns composite filter
+    ///
+    static PbiFilter Intersection(const std::vector<PbiFilter>& filters);
+
+    /// \brief Creates a PbiFilter that acts as an intersection of the input
+    ///        filters.
+    ///
+    /// A record must satisfy \b all of this filter's direct "child" filters.
+    ///
+    /// Equivalent to:
+    /// \include code/PbiFilter_Intersection_Move.txt
+    ///
+    /// \param[in] filters  vector of child filters
+    /// \returns composite filter
+    ///
+    static PbiFilter Intersection(std::vector<PbiFilter>&& filters);
+
+    /// \brief Creates a PbiFilter that acts as a union of the input filters.
+    ///
+    /// A record must satisfy \b any of this filter's direct "child" filters.
+    ///
+    /// Equivalent to:
+    /// \include code/PbiFilter_Union_Copy.txt
+    ///
+    /// \param[in] filters  vector of child filters
+    /// \returns composite filter
+    ///
+    static PbiFilter Union(const std::vector<PbiFilter>& filters);
+
+    /// \brief Creates a PbiFilter that acts as a union of the input filters.
+    ///
+    /// A record must satisfy \b any of this filter's direct "child" filters.
+    ///
+    /// Equivalent to:
+    /// \include code/PbiFilter_Union_Move.txt
+    ///
+    /// \param[in] filters  vector of child filters
+    /// \returns composite filter
+    ///
+    static PbiFilter Union(std::vector<PbiFilter>&& filters);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a PbiFilter from a %DataSet's described filters.
+    ///
+    /// A DataSet may contain a Filters element, itself a list of Filter
+    /// elements. Each Filter element will contain a Properties element, itself
+    /// a list of Property elements.
+    ///
+    /// The Filters hierarchy looks like this (in its XML output):
+    /// \verbinclude examples/plaintext/PbiFilter_DataSetXmlFilters.txt
+    ///
+    /// The resulting PbiFilter represents a union over all Filter elements,
+    /// with each Filter element requiring an intersection of all of its
+    /// Property criteria. These Property elements are mapped to built-in PBI
+    /// filter types. To use the labels in the example XML above, the filter
+    /// created here is equivalent to:
+    ///
+    /// (A && B) || (C && D)
+    ///
+    /// If a DataSet lacks any Filters, then an empty PbiFilter will be created
+    /// - corresponding to the dataset's entire contents.
+    ///
+    /// \param[in] dataset  maybe containing filters
+    /// \returns composite filter
+    ///
+    static PbiFilter FromDataSet(const DataSet& dataset);
+
+public:
+
+    /// \brief Creates an empty filter.
+    ///
+    /// \note An empty filter will result in all records being returned, e.g.
+    ///       for query iteration.
+    ///
+    /// \param[in] type composition type. Any additional child filters added to
+    ///                 this composite will be treated according to this type.
+    ///                 If INTERSECT, a record must match all child filters. If
+    ///                 UNION, a record must match any child filter.
+    ///
+    PbiFilter(const CompositionType type = INTERSECT);
+
+    /// \brief Creates a composite filter (of INTERSECT type) with an initial
+    ///        child filter.
+    ///
+    /// \note T must satisfy PbiFilterConcept
+    ///
+    /// \param[in] filter initial child filter
+    ///
+    template<typename T>
+    PbiFilter(const T& filter);
+
+    /// \brief Creates a composite filter (of INTERSECT type) with an initial
+    ///        child filter.
+    ///
+    /// \note T must satisfy PbiFilterConcept
+    ///
+    /// \param[in] filter initial child filter
+    ///
+    template<typename T>
+    PbiFilter(T&& filter);
+
+    /// \brief Creates a composite filter (of INTERSECT type) with a list of
+    ///        initial child filters.
+    ///
+    /// \param[in] filters initial child filters
+    ///
+    PbiFilter(const std::vector<PbiFilter>& filters);
+
+    /// \brief Creates composite filter (of INTERSECT type) with a list of
+    ///        initial child filters.
+    ///
+    /// \param[in] filters initial child filters
+    ///
+    PbiFilter(std::vector<PbiFilter>&& filters);
+
+    PbiFilter(const PbiFilter& other);
+    PbiFilter(PbiFilter&& other) noexcept;
+    PbiFilter& operator=(const PbiFilter& other);
+    PbiFilter& operator=(PbiFilter&& other) noexcept;
+    ~PbiFilter(void);
+
+    /// \}
+
+public:
+    /// \name Composition
+    /// \{
+
+    /// \brief Adds a new child filter of type T.
+    ///
+    /// \param[in] filter   additional child filter. Type T must satisfy
+    ///                     PbiFilterConcept.
+    /// \returns reference to this filter
+    ///
+    template<typename T>
+    PbiFilter& Add(const T& filter);
+
+    /// \brief Adds a new child filter of type T.
+    ///
+    /// \param[in] filter   additional child filter. Type T must satisfy
+    ///                     PbiFilterConcept.
+    /// \returns reference to this filter
+    ///
+    template<typename T>
+    PbiFilter& Add(T&& filter);
+
+    /// \brief Adds a new child filter.
+    ///
+    /// \param[in] filter   additional child filter
+    /// \returns reference to this filter
+    ///
+    PbiFilter& Add(const PbiFilter& filter);
+
+    /// \brief Adds a new child filter.
+    ///
+    /// \param[in] filter   additional child filter
+    /// \returns reference to this filter
+    ///
+    PbiFilter& Add(PbiFilter&& filter);
+
+    /// \brief Add child filters.
+    ///
+    /// \param[in] filters  additional child filters
+    /// \returns reference to this filter
+    ///
+    PbiFilter& Add(const std::vector<PbiFilter>& filters);
+
+    /// \brief Add child filters.
+    ///
+    /// \param[in] filters  additional child filters
+    /// \returns reference to this filter
+    ///
+    PbiFilter& Add(std::vector<PbiFilter>&& filters);
+
+    /// \returns true if this filter has no child filters.
+    bool IsEmpty(void) const;
+
+    /// \}
+
+public:
+    /// \name Lookup
+    /// \{
+
+    /// \brief Performs the PBI index lookup, combining child results a
+    ///        composite filter.
+    ///
+    /// \param[in] idx  PBI (raw) index object
+    /// \param[in] row  record number in %BAM/PBI files
+    ///
+    /// \returns true if record at \p row passes this filter criteria, 
+    ///          including children (if any)
+    ///
+    bool Accepts(const BAM::PbiRawData& idx, const size_t row) const;
+
+    /// \}
+
+private:
+    std::unique_ptr<internal::PbiFilterPrivate> d_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/PbiFilter.inl"
+#include "pbbam/PbiFilterTypes.h"
+
+#endif // PBIFILTER_H
diff --git a/include/pbbam/PbiFilterQuery.h b/include/pbbam/PbiFilterQuery.h

new file mode 100644 (file)

index 0000000..120a30d
--- /dev/null
+++ b/include/pbbam/PbiFilterQuery.h
@@ -0,0 +1,96 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiFilterQuery.h
+/// \brief Defines the PbiFilterQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef PBIFILTERQUERY_H
+#define PBIFILTERQUERY_H
+
+#include "pbbam/Config.h"
+#include "pbbam/PbiFilter.h"
+#include "pbbam/internal/QueryBase.h"
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The PbiFilter class provides iterable access to a DataSet's %BAM
+///        records, limiting results to those matching filter criteria.
+///
+/// Example:
+/// \include code/PbiFilterQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT PbiFilterQuery : public internal::IQuery
+{
+public:
+    /// \brief Creates a new PbiFilterQuery, limiting record results to only
+    ///        those matching filter criteria
+    ///
+    /// \param[in] filter   filtering criteria
+    /// \param[in] dataset  input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or
+    ///         PBI files.
+    ///
+    PbiFilterQuery(const PbiFilter& filter, const DataSet& dataset);
+
+    ~PbiFilterQuery(void);
+
+public:
+
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r);
+
+private:
+    struct PbiFilterQueryPrivate;
+    std::unique_ptr<PbiFilterQueryPrivate> d_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // PBIFILTERQUERY_H
diff --git a/include/pbbam/PbiFilterTypes.h b/include/pbbam/PbiFilterTypes.h

new file mode 100644 (file)

index 0000000..52524ce
--- /dev/null
+++ b/include/pbbam/PbiFilterTypes.h
@@ -0,0 +1,1023 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiFilterTypes.h
+/// \brief Defines the built-in PBI filters.
+//
+// Author: Derek Barnett
+
+#ifndef PBIFILTERTYPES_H
+#define PBIFILTERTYPES_H
+
+#include "pbbam/Compare.h"
+#include "pbbam/PbiFilter.h"
+#include "pbbam/PbiIndex.h"
+#include <boost/optional.hpp>
+#include <memory>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+
+/// \internal
+///
+/// Provides basic container for value/compare-type pair
+///
+template<typename T>
+struct FilterBase
+{
+public:
+    T value_;
+    boost::optional<std::vector<T> > multiValue_;
+    Compare::Type cmp_;
+protected:
+    FilterBase(const T& value, const Compare::Type cmp);
+    FilterBase(T&& value, const Compare::Type cmp);
+    FilterBase(const std::vector<T>& values);
+    FilterBase(std::vector<T>&& values);
+protected:
+    bool CompareHelper(const T& lhs) const;
+private:
+    bool CompareSingleHelper(const T& lhs) const;
+    bool CompareMultiHelper(const T& lhs) const;
+};
+
+/// \internal
+///
+/// Dispatches the lookup to BarcodeLookupData
+///
+template<typename T, BarcodeLookupData::Field field>
+struct BarcodeDataFilterBase : public FilterBase<T>
+{
+protected:
+    BarcodeDataFilterBase(const T& value, const Compare::Type cmp);
+    BarcodeDataFilterBase(T&& value, const Compare::Type cmp);
+    BarcodeDataFilterBase(const std::vector<T>& values);
+    BarcodeDataFilterBase(std::vector<T>&& values);
+public:
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+};
+
+/// \internal
+///
+/// Dispatches the lookup to BasicLookupData
+///
+template<typename T, BasicLookupData::Field field>
+struct BasicDataFilterBase : public FilterBase<T>
+{
+protected:
+    BasicDataFilterBase(const T& value, const Compare::Type cmp);
+    BasicDataFilterBase(T&& value, const Compare::Type cmp);
+    BasicDataFilterBase(const std::vector<T>& values);
+    BasicDataFilterBase(std::vector<T>&& values);
+public:
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+};
+
+/// \internal
+///
+/// Dispatches the lookup to MappedLookupData
+///
+template<typename T, MappedLookupData::Field field>
+struct MappedDataFilterBase : public FilterBase<T>
+{
+protected:
+    MappedDataFilterBase(const T& value, const Compare::Type cmp);
+    MappedDataFilterBase(T&& value, const Compare::Type cmp);
+    MappedDataFilterBase(const std::vector<T>& values);
+    MappedDataFilterBase(std::vector<T>&& values);
+public:
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+};
+
+} // namespace internal
+
+/// \brief The PbiAlignedEndFilter class provides a PbiFilter-compatible filter
+///        on aligned end.
+///
+/// Example: \include code/PbiAlignedEndFilter.txt
+///
+/// \sa BamRecord::AlignedEnd
+///
+struct PbiAlignedEndFilter
+    : public internal::MappedDataFilterBase<uint32_t, MappedLookupData::A_END>
+{
+public:
+    /// \brief Creates a filter on aligned end.
+    ///
+    /// \param[in] position value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiAlignedEndFilter(const uint32_t position,
+                        const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiAlignedLengthFilter class provides a PbiFilter-compatible
+///        filter on aligned length.
+///
+/// Example: \include code/PbiAlignedLengthFilter.txt
+///
+/// \sa BamRecord::AlignedEnd, BamRecord::AlignedStart
+///
+struct PbiAlignedLengthFilter : public internal::FilterBase<uint32_t>
+{
+public:
+    /// \brief Creates a filter on aligned length.
+    ///
+    /// \param[in] length value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiAlignedLengthFilter(const uint32_t length,
+                           const Compare::Type cmp = Compare::EQUAL);
+
+public:
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+};
+
+/// \brief The PbiAlignedStartFilter class provides a PbiFilter-compatible
+///        filter on aligned start.
+///
+/// Example: \include code/PbiAlignedStartFilter.txt
+///
+/// \sa BamRecord::AlignedStart
+///
+struct PbiAlignedStartFilter
+    : public internal::MappedDataFilterBase<uint32_t, MappedLookupData::A_START>
+{
+public:
+    /// \brief Creates a filter on aligned start.
+    ///
+    /// \param[in] position value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiAlignedStartFilter(const uint32_t position,
+                          const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiAlignedStrandFilter class provides a PbiFilter-compatible
+///        filter on aligned strand.
+///
+/// Example: \include code/PbiAlignedStrandFilter.txt
+///
+/// \sa BamRecord::AlignedStrand
+///
+struct PbiAlignedStrandFilter
+    : public internal::MappedDataFilterBase<Strand, MappedLookupData::STRAND>
+{
+public:
+    /// \brief Creates a strand filter.
+    ///
+    /// \param[in] strand  strand value to compare on
+    /// \param[in] cmp     compare type
+    ///
+    PbiAlignedStrandFilter(const Strand strand,
+                           const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiBarcodeFilter class provides a PbiFilter-compatible filter on
+///        barcode ID.
+///
+/// Any record with this barcode ID (forward or reverse) will pass this filter.
+///
+/// Example: \include code/PbiBarcodeFilter.txt
+///
+/// \sa BamRecord::BarcodeForward, BamRecord::BarcodeReverse
+///
+struct PbiBarcodeFilter
+{
+public:
+    /// \brief Creates a single-value barcode filter.
+    ///
+    /// \param[in] barcode  barcode ID to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiBarcodeFilter(const int16_t barcode,
+                     const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' barcode filter.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly, in either bc_forward or bc_reverse.
+    ///
+    /// \param[in] whitelist  barcode IDs to compare on
+    ///
+    PbiBarcodeFilter(const std::vector<int16_t>& whitelist);
+
+    /// \brief Creates a 'whitelisted' barcode filter.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly, in either bc_forward or bc_reverse.
+    ///
+    /// \param[in] whitelist  barcode IDs to compare on
+    ///
+    PbiBarcodeFilter(std::vector<int16_t>&& whitelist);
+
+public:
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+    PbiFilter compositeFilter_;
+};
+
+/// \brief The PbiBarcodeForwardFilter class provides a PbiFilter-compatible
+///        filter on forward barcode ID.
+///
+/// Example: \include code/PbiBarcodeForwardFilter.txt
+///
+/// \sa BamRecord::BarcodeForward
+///
+struct PbiBarcodeForwardFilter
+    : public internal::BarcodeDataFilterBase<int16_t, BarcodeLookupData::BC_FORWARD>
+{
+public:
+    /// \brief Creates a single-value forward barcode filter.
+    ///
+    /// \param[in] bcFwdId  (forward) barcode ID to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiBarcodeForwardFilter(const int16_t bcFwdId,
+                            const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' forward barcode filter.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly, in bc_forward.
+    ///
+    /// \param[in] whitelist  barcode IDs to compare on
+    ///
+    PbiBarcodeForwardFilter(const std::vector<int16_t>& whitelist);
+
+    /// \brief Creates a 'whitelisted' forward barcode filter.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly, in bc_forward.
+    ///
+    /// \param[in] whitelist  barcode IDs to compare on
+    ///
+    PbiBarcodeForwardFilter(std::vector<int16_t>&& whitelist);
+};
+
+/// \brief The PbiBarcodeQualityFilter class provides a PbiFilter-compatible
+///        filter on  barcode quality.
+///
+/// Example: \include code/PbiBarcodeQualityFilter.txt
+///
+/// \sa BamRecord::BarcodeQuality
+///
+struct PbiBarcodeQualityFilter
+    : public internal::BarcodeDataFilterBase<uint8_t, BarcodeLookupData::BC_QUALITY>
+{
+public:
+    /// \brief Creates a single-value barcode quality filter.
+    ///
+    /// \param[in] bcQuality    barcode quality to compare on
+    /// \param[in] cmp          compare type
+    ///
+    PbiBarcodeQualityFilter(const uint8_t bcQuality,
+                            const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiBarcodeReverseFilter class provides a PbiFilter-compatible
+///        filter on forward barcode ID.
+///
+/// Example: \include code/PbiBarcodeReverseFilter.txt
+///
+/// \sa BamRecord::BarcodeReverse
+///
+struct PbiBarcodeReverseFilter
+    : public internal::BarcodeDataFilterBase<int16_t, BarcodeLookupData::BC_REVERSE>
+{
+public:
+    /// \brief Creates a single-value reverse barcode filter.
+    ///
+    /// \param[in] bcRevId  (reverse) barcode ID to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiBarcodeReverseFilter(const int16_t bcRevId,
+                            const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' reverse barcode filter.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly, in bc_reverse.
+    ///
+    /// \param[in] whitelist  barcode IDs to compare on
+    ///
+    PbiBarcodeReverseFilter(const std::vector<int16_t>& whitelist);
+
+    /// \brief Creates a 'whitelisted' reverse barcode filter.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly, in bc_reverse.
+    ///
+    /// \param[in] whitelist  barcode IDs to compare on
+    ///
+    PbiBarcodeReverseFilter(std::vector<int16_t>&& whitelist);
+};
+
+/// \brief The PbiBarcodesFilter class provides a PbiFilter-compatible filter on
+///        both forward & reverse barcode IDs.
+///
+/// A record must match both IDs to pass the filter.
+///
+/// Example: \include code/PbiBarcodesFilter.txt
+///
+/// \sa BamRecord::Barcodes
+///
+struct PbiBarcodesFilter
+{
+public:
+    /// \brief Creates a barcodes filter from a std::pair of IDs.
+    ///
+    /// pair.first -> BarcodeForward\n
+    /// pair.second -> BarcodeReverse
+    ///
+    /// \param[in] barcodes barcode IDs to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiBarcodesFilter(const std::pair<int16_t, int16_t> barcodes,
+                      const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a barcodes filter from forward & reverse IDs.
+    ///
+    /// \param[in] bcForward    forward barcode ID to compare on
+    /// \param[in] bcReverse    reverse barcode ID to compare on
+    /// \param[in] cmp          compare type
+    ///
+    PbiBarcodesFilter(const int16_t bcForward,
+                      const int16_t bcReverse,
+                      const Compare::Type cmp = Compare::EQUAL);
+public:
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+    PbiFilter compositeFilter_;
+};
+
+/// \brief The PbiIdentityFilter class provides a PbiFilter-compatible filter on
+///        read identity (% aligned match).
+///
+/// Read identity is equivalent to: 1.0 - (nMM + nDel + nIns)/readLength.
+///
+/// Example: \include code/PbiIdentityFilter.txt
+///
+struct PbiIdentityFilter : public internal::FilterBase<float>
+{
+public:
+    /// \brief Creates a read identity filter.
+    ///
+    /// \param[in] identity value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiIdentityFilter(const float identity,
+                      const Compare::Type cmp = Compare::EQUAL);
+
+public:
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+};
+
+/// \brief The PbiLocalContextFilter class provides a PbiFilter-compatible
+///        filter on local context (adapter, barcode, etc.).
+///
+/// The primary Compare::Type operators intended for this filter are:
+/// Compare::EQUAL, Compare::NOT_EQUAL, Compare::CONTAINS, and
+/// Compare::NOT_CONTAINS.
+///
+/// Example: \include code/PbiLocalContextFilter.txt
+///
+struct PbiLocalContextFilter
+    : public internal::BasicDataFilterBase<LocalContextFlags,
+                                           BasicLookupData::CONTEXT_FLAG >
+{
+public:
+    PbiLocalContextFilter(const LocalContextFlags& flags,
+                          const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiMapQualityFilter class provides a PbiFilter-compatible filter on
+///        mapping quality.
+///
+/// Example: \include code/PbiMapQualityFilter.txt
+///
+/// \sa BamRecord::MapQuality
+///
+struct PbiMapQualityFilter
+    : public internal::MappedDataFilterBase<uint8_t, MappedLookupData::MAP_QUALITY>
+{
+public:
+    /// \brief Creates a map quality filter.
+    ///
+    /// \param[in] mapQual  value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiMapQualityFilter(const uint8_t mapQual,
+                        const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiMovieNameFilter class provides a PbiFilter-compatible filter
+///        on movie name.
+///
+/// Example: \include code/PbiMovieNameFilter.txt
+///
+/// \sa BamRecord::MovieName
+///
+struct PbiMovieNameFilter
+{
+public:
+    /// \brief Creates a single-value movie name filter.
+    ///
+    /// \param[in] movieName    movie name to compare on
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match movie name, exactly.
+    ///
+    PbiMovieNameFilter(const std::string& movieName);
+
+    /// \brief Creates a 'whitelisted' movie name filter.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly.
+    ///
+    /// \param[in] whitelist    movie names to compare on
+    ///
+    PbiMovieNameFilter(const std::vector<std::string>& whitelist);
+
+    /// \brief Creates a 'whitelisted' movie name filter.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly.
+    ///
+    /// \param[in] whitelist    movie names to compare on
+    ///
+    PbiMovieNameFilter(std::vector<std::string>&& whitelist);
+
+public:
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+   PbiFilter compositeFilter_;
+};
+
+/// \brief The PbiNumDeletedBasesFilter class provides a PbiFilter-compatible
+///        filter on the number of deleted bases.
+///
+/// Example: \include code/PbiNumDeletedBasesFilter.txt
+///
+/// \sa BamRecord::NumDeletedBases
+///
+struct PbiNumDeletedBasesFilter
+    : public internal::MappedDataFilterBase<size_t, MappedLookupData::N_DEL>
+{
+public:
+    /// \brief Creates a filter on the number of deleted bases.
+    ///
+    /// \param[in] numDeletions value to compare on
+    /// \param[in] cmp          compare type
+    ///
+    PbiNumDeletedBasesFilter(const size_t numDeletions,
+                             const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiNumInsertededBasesFilter class provides a PbiFilter-compatible
+///        filter on the number of inserted bases.
+///
+/// Example: \include code/PbiNumInsertedBasesFilter.txt
+///
+/// \sa BamRecord::NumInsertedBases
+///
+struct PbiNumInsertedBasesFilter
+    : public internal::MappedDataFilterBase<size_t, MappedLookupData::N_INS>
+{
+public:
+    /// \brief Creates a filter on the number of inserted bases.
+    ///
+    /// \param[in] numInsertions    value to compare on
+    /// \param[in] cmp              compare type
+    ///
+    PbiNumInsertedBasesFilter(const size_t numInsertions,
+                              const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiNumMatchesFilter class provides a PbiFilter-compatible filter
+///        on the number of matched bases.
+///
+/// Example: \include code/PbiNumMatchesFilter.txt
+///
+/// \sa BamRecord::NumMatches
+///
+struct PbiNumMatchesFilter
+    : public internal::MappedDataFilterBase<size_t, MappedLookupData::N_M>
+{
+public:
+    /// \brief Creates a filter on the number of matched bases.
+    ///
+    /// \param[in] numMatchedBases  value to compare on
+    /// \param[in] cmp              compare type
+    ///
+    PbiNumMatchesFilter(const size_t numMatchedBases,
+                        const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiNumMismatchesFilter class provides a PbiFilter-compatible
+///        filter on the number of mismatched bases.
+///
+/// Example: \include code/PbiNumMismatchesFilter.txt
+///
+/// \sa BamRecord::NumMismatches
+///
+struct PbiNumMismatchesFilter
+    : public internal::MappedDataFilterBase<size_t, MappedLookupData::N_MM>
+{
+public:
+    /// \brief Creates a filter on the number of mismatched bases.
+    ///
+    /// \param[in] numMismatchedBases   value to compare on
+    /// \param[in] cmp                  compare type
+    ///
+    PbiNumMismatchesFilter(const size_t numMismatchedBases,
+                           const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiQueryEndFilter class provides a PbiFilter-compatible filter
+///        on query end.
+///
+/// Example: \include code/PbiQueryEndFilter.txt
+///
+/// \sa BamRecord::QueryEnd
+///
+struct PbiQueryEndFilter
+    : public internal::BasicDataFilterBase<int32_t, BasicLookupData::Q_END>
+{
+public:
+    /// \brief Creates a filter on query end position.
+    ///
+    /// \param[in] position value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiQueryEndFilter(const int32_t position,
+                      const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiQueryLengthFilter class provides a PbiFilter-compatible filter
+///        on query length.
+///
+/// queryLength = (queryEnd - queryStart)
+///
+/// Example: \include code/PbiQueryLengthFilter.txt
+///
+/// \sa BamRecord::QueryEnd, BamRecord::QueryStart
+///
+struct PbiQueryLengthFilter : public internal::FilterBase<int32_t>
+{
+public:
+    /// \brief Creates a filter on query length
+    ///
+    /// \param[in] length   value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiQueryLengthFilter(const int32_t length,
+                         const Compare::Type cmp = Compare::EQUAL);
+
+public:
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+};
+
+/// \brief The PbiQueryNameFilter class provides a PbiFilter-compatible filter
+///        on name length.
+///
+/// Example: \include code/PbiQueryNameFilter.txt
+///
+/// \sa BamRecord::FullName
+///
+struct PbiQueryNameFilter
+{
+public:
+    /// \brief Creates a single-value query name filter.
+    ///
+    /// \param[in] qname    query name to compare on
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match query name, exactly.
+    ///
+    PbiQueryNameFilter(const std::string& qname);
+
+    /// \brief Creates a 'whitelisted' query name filter.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly.
+    ///
+    /// \param[in] whitelist    query names to compare on
+    ///
+    PbiQueryNameFilter(const std::vector<std::string>& whitelist);
+
+    PbiQueryNameFilter(const PbiQueryNameFilter& other);
+    ~PbiQueryNameFilter(void);
+
+public:
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+    struct PbiQueryNameFilterPrivate;
+    std::unique_ptr<PbiQueryNameFilterPrivate> d_;
+};
+
+/// \brief The PbiQueryStartFilter class provides a PbiFilter-compatible filter
+///        on query start.
+///
+/// Example: \include code/PbiQueryStartFilter.txt
+///
+/// \sa BamRecord::QueryStart
+///
+struct PbiQueryStartFilter
+    : public internal::BasicDataFilterBase<int32_t, BasicLookupData::Q_START>
+{
+public:
+    /// \brief Creates a filter on query start position.
+    ///
+    /// \param[in] position value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiQueryStartFilter(const int32_t position,
+                        const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiReadAccuracyFilter class provides a PbiFilter-compatible filter
+///        on read accuracy.
+///
+/// Example: \include code/PbiReadAccuracyFilter.txt
+///
+/// \sa BamRecord::ReadAccuracy
+///
+struct PbiReadAccuracyFilter
+    : public internal::BasicDataFilterBase<Accuracy, BasicLookupData::READ_QUALITY>
+{
+public:
+    /// \brief Creates a filter on read accuracy.
+    ///
+    /// \param[in] accuracy value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiReadAccuracyFilter(const Accuracy accuracy,
+                          const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiReadGroupFilter class provides a PbiFilter-compatible filter
+///        on read group.
+///
+/// Example: \include code/PbiReadGroupFilter.txt
+///
+/// \sa BamRecord::ReadGroup,
+///     BamRecord::ReadGroupId,
+///     BamRecord::ReadGroupNumericId
+///
+struct PbiReadGroupFilter
+    : public internal::BasicDataFilterBase<int32_t, BasicLookupData::RG_ID>
+{
+public:
+    /// \brief Creates a filter on read group (numeric) ID value
+    ///
+    /// \param[in] rgId     numeric read group ID
+    /// \param[in] cmp      compare type
+    ///
+    /// \sa BamRecord::ReadGroupNumericId
+    ///
+    PbiReadGroupFilter(const int32_t rgId,
+                       const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a filter on printable read group ID value
+    ///
+    /// \param[in] rgId     read group ID string
+    /// \param[in] cmp      compare type
+    ///
+    /// \sa BamRecord::ReadGroupId
+    ///
+    PbiReadGroupFilter(const std::string rgId,
+                       const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a filter on read group (object).
+    ///
+    /// \param[in] rg   read group object
+    /// \param[in] cmp  compare type
+    ///
+    /// \sa BamRecord::ReadGroup
+    ///
+    PbiReadGroupFilter(const ReadGroupInfo& rg,
+                       const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' filter on read group numeric IDs.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly.
+    ///
+    /// \param[in] whitelist    read group IDs to compare on
+    ///
+    PbiReadGroupFilter(const std::vector<int32_t>& whitelist);
+
+    /// \brief Creates a 'whitelisted' filter on read group numeric IDs.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly.
+    ///
+    /// \param[in] whitelist    read group IDs to compare on
+    ///
+    PbiReadGroupFilter(std::vector<int32_t>&& whitelist);
+
+    /// \brief Creates a 'whitelisted' filter on read group printable IDs.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly.
+    ///
+    /// \param[in] whitelist    read group ID strings to compare on
+    ///
+    PbiReadGroupFilter(const std::vector<std::string>& whitelist);
+
+    /// \brief Creates a 'whitelisted' filter on read group printable IDs.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly.
+    ///
+    /// \param[in] whitelist    read group ID strings to compare on
+    ///
+    PbiReadGroupFilter(std::vector<std::string>&& whitelist);
+
+    /// \brief Creates a 'whitelisted' filter using read group objects.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly.
+    ///
+    /// \param[in] whitelist    read group objects to compare on
+    ///
+    PbiReadGroupFilter(const std::vector<ReadGroupInfo>& whitelist);
+
+    /// \brief Creates a 'whitelisted' filter using read group objects.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly.
+    ///
+    /// \param[in] whitelist    read group objects to compare on
+    ///
+    PbiReadGroupFilter(std::vector<ReadGroupInfo>&& whitelist);
+};
+
+/// \brief The PbiReferenceEndFilter class provides a PbiFilter-compatible
+///        filter on reference end.
+///
+/// Example: \include code/PbiReferenceEndFilter.txt
+///
+/// \sa BamRecord::ReferenceEnd
+///
+struct PbiReferenceEndFilter
+    : public internal::MappedDataFilterBase<uint32_t, MappedLookupData::T_END>
+{
+public:
+    /// \brief Creates a filter on reference end.
+    ///
+    /// \param[in] tEnd     value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiReferenceEndFilter(const uint32_t tEnd,
+                          const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiReferenceIdFilter class provides a PbiFilter-compatible
+///        filter on reference ID.
+///
+/// Example: \include code/PbiReferenceIdFilter.txt
+///
+/// \sa BamRecord::ReferenceId
+///
+struct PbiReferenceIdFilter
+    : public internal::MappedDataFilterBase<int32_t, MappedLookupData::T_ID>
+{
+public:
+    /// \brief Creates a single-value reference ID filter.
+    ///
+    /// \param[in] tId  reference ID to compare on
+    /// \param[in] cmp  compare type
+    ///
+    PbiReferenceIdFilter(const int32_t tId,
+                         const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' reference ID filter.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly.
+    ///
+    /// \param[in] whitelist    reference IDs to compare on
+    ///
+    PbiReferenceIdFilter(const std::vector<int32_t>& whitelist);
+
+    /// \brief Creates a 'whitelisted' reference ID filter.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly.
+    ///
+    /// \param[in] whitelist    reference IDs to compare on
+    ///
+    PbiReferenceIdFilter(std::vector<int32_t>&& whitelist);
+};
+
+/// \brief The PbiReferenceNameFilter class provides a PbiFilter-compatible
+///        filter on reference name.
+///
+/// Example: \include code/PbiReferenceNameFilter.txt
+///
+/// \sa BamRecord::ReferenceName
+///
+struct PbiReferenceNameFilter
+{
+public:
+    /// \brief Creates a single-value reference name filter.
+    ///
+    /// \param[in] rname    reference ID to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiReferenceNameFilter(const std::string& rname,
+                           const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' reference name filter.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly.
+    ///
+    /// \param[in] whitelist    reference names to compare on
+    ///
+    PbiReferenceNameFilter(const std::vector<std::string>& whitelist);
+
+    /// \brief Creates a 'whitelisted' reference name filter.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly.
+    ///
+    /// \param[in] whitelist    reference names to compare on
+    ///
+    PbiReferenceNameFilter(std::vector<std::string>&& whitelist);
+
+public:
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+    mutable bool initialized_;
+    mutable PbiFilter subFilter_;
+    std::string rname_;
+    boost::optional<std::vector<std::string> > rnameWhitelist_;
+    Compare::Type cmp_;
+
+private:
+    // marked const so we can delay setup of filter in Accepts(), once we have
+    // access to PBI/BAM input. modified values marked mutable accordingly
+    void Initialize(const PbiRawData& idx) const;
+};
+
+/// \brief The PbiReferenceStartFilter class provides a PbiFilter-compatible
+///        filter on reference start.
+///
+/// Example: \include code/PbiReferenceStartFilter.txt
+///
+/// \sa BamRecord::ReferenceStart
+///
+struct PbiReferenceStartFilter
+    : public internal::MappedDataFilterBase<uint32_t, MappedLookupData::T_START>
+{
+public:
+    /// \brief Creates a filter on reference start.
+    ///
+    /// \param[in] tStart   value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiReferenceStartFilter(const uint32_t tStart,
+                            const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiZmwFilter class provides a PbiFilter-compatible filter on
+///        ZMW hole number.
+///
+/// Example: \include code/PbiZmwFilter.txt
+///
+/// \sa BamRecord::HoleNumber
+///
+struct PbiZmwFilter : public internal::BasicDataFilterBase<int32_t,
+                                                           BasicLookupData::ZMW>
+{
+public:
+    /// \brief Creates a single-value ZMW hole number filter.
+    ///
+    /// \param[in] zmw  value to compare on
+    /// \param[in] cmp  compare type
+    ///
+    PbiZmwFilter(const int32_t zmw,
+                 const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' ZMW hole number filter.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly.
+    ///
+    /// \param[in] whitelist    ZMW hole numbers to compare on
+    ///
+    PbiZmwFilter(const std::vector<int32_t>& whitelist);
+
+    /// \brief Creates a 'whitelisted' ZMW hole number filter.
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Records will match at least one value from the
+    ///       whitelist, exactly.
+    ///
+    /// \param[in] whitelist    ZMW hole numbers to compare on
+    ///
+    PbiZmwFilter(std::vector<int32_t>&& whitelist);
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/PbiFilterTypes.inl"
+
+#endif // PBIFILTERTYPES_H
diff --git a/include/pbbam/PbiIndex.h b/include/pbbam/PbiIndex.h

new file mode 100644 (file)

index 0000000..09b61b8
--- /dev/null
+++ b/include/pbbam/PbiIndex.h
@@ -0,0 +1,162 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiIndex.h
+/// \brief Defines the PbiIndex class.
+//
+// Author: Derek Barnett
+
+#ifndef PBIINDEX_H
+#define PBIINDEX_H
+
+#include "pbbam/Config.h"
+#include "pbbam/PbiFile.h"
+#include "pbbam/PbiLookupData.h"
+#include <memory>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal { class PbiIndexPrivate; }
+
+/// \brief The PbiIndex class provides an representation of PBI index data that
+///        is rearranged for quick lookups.
+///
+/// The PbiIndex class itself provides access to a few high-level attributes
+/// (e.g. version, number of records, etc.). The actual lookup data is stored
+/// in its member components:
+///     BasicLookupData,
+///     MappedLookupData,
+///     ReferenceLookupData, &
+///     BarcodeLookupData .
+///
+class PBBAM_EXPORT PbiIndex
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a PbiIndex lookup structure from a PBI file.
+    ///
+    /// \param[in] pbiFilename  filename
+    ///
+    /// \throws std::runtime_error if failed to load data from file
+    ///
+    PbiIndex(const std::string& pbiFilename);
+
+    PbiIndex(const PbiIndex& other);
+    PbiIndex(PbiIndex&& other);
+    PbiIndex& operator=(const PbiIndex& other);
+    PbiIndex& operator=(PbiIndex&& other);
+    ~PbiIndex(void);
+
+    /// \}
+
+public:
+    /// \name PBI General Attributes
+    /// \{
+
+    /// \returns true if index has BarcodeData section
+    bool HasBarcodeData(void) const;
+
+    /// \returns true if index has MappedData section
+    bool HasMappedData(void) const;
+
+    /// \returns true if index has ReferenceData section
+    bool HasReferenceData(void) const;
+
+    /// \returns true if index has \b section
+    /// \param[in] section PbiFile::Section identifier
+    ///
+    bool HasSection(const PbiFile::Section section) const;
+
+    /// \returns index filename ("*.pbi")
+    ///
+    /// \note Returns an empty string if the underlying data was generated, not
+    ///       loaded from file.
+    ///
+    std::string Filename(void) const;
+
+    /// \returns enum flags representing the file sections present
+    PbiFile::Sections FileSections(void) const;
+
+    /// \returns the number of records in the PBI (& associated %BAM)
+    uint32_t NumReads(void) const;
+
+    /// \returns the PBI file's version
+    PbiFile::VersionEnum Version(void) const;
+
+    /// \}
+
+public:
+    /// \name Lookup Data Components
+    /// \{
+
+    /// \returns const reference to BarcodeData lookup structure
+    ///
+    /// May be empty, check result of HasBarcodeData.
+    ///
+    const BarcodeLookupData& BarcodeData(void) const;
+
+    /// \returns const reference to BasicData lookup structure
+    const BasicLookupData& BasicData(void) const;
+
+    /// \returns const reference to MappedData lookup structure
+    ///
+    /// May be empty, check result of HasMappedData.
+    ///
+    const MappedLookupData& MappedData(void) const;
+
+    /// \returns const reference to reference data lookup structure
+    ///
+    /// May be empty, check result of HasReferenceData.
+    ///
+    const ReferenceLookupData& ReferenceData(void) const;
+
+    /// }
+
+private:
+    PbiIndex(void);
+    std::unique_ptr<internal::PbiIndexPrivate> d_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "internal/PbiIndex.inl"
+
+#endif // PBIINDEX_H
diff --git a/include/pbbam/PbiIndexedBamReader.h b/include/pbbam/PbiIndexedBamReader.h

new file mode 100644 (file)

index 0000000..17c46b5
--- /dev/null
+++ b/include/pbbam/PbiIndexedBamReader.h
@@ -0,0 +1,174 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiIndexedBamReader.h
+/// \brief Defines the PbiIndexedBamReader class.
+//
+// Author: Derek Barnett
+
+#ifndef PBIINDEXEDBAMREADER_H
+#define PBIINDEXEDBAMREADER_H
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamReader.h"
+#include "pbbam/PbiBasicTypes.h"
+#include "pbbam/PbiFilter.h"
+#include "pbbam/PbiIndex.h"
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal { struct PbiIndexedBamReaderPrivate; }
+
+/// \brief The PbiIndexedBamReader class provides read-only iteration over %BAM
+///        records, limited to some filtering criteria.
+///
+/// The PacBio BAM index (*.pbi) is used to allow random-access operations.
+///
+class PBBAM_EXPORT PbiIndexedBamReader : public BamReader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Constructs %BAM reader, with an initial filter.
+    ///
+    /// All reads that satisfy the filter will be available.
+    ///
+    /// \param[in] filter       PbiFilter or compatible object
+    /// \param[in] bamFilename  input %BAM filename
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be
+    ///         read
+    ///
+    PbiIndexedBamReader(const PbiFilter& filter, const std::string& bamFilename);
+
+    /// \brief Constructs %BAM reader, with an initial filter.
+    ///
+    /// All reads that satisfy the filter will be available.
+    ///
+    /// \param[in] filter       PbiFilter or compatible object
+    /// \param[in] bamFile      input BamFile object
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be
+    ///         read
+    ///
+    PbiIndexedBamReader(const PbiFilter& filter, const BamFile& bamFile);
+
+    /// \brief Constructs %BAM reader, with an initial filter.
+    ///
+    /// All reads that satisfy the filter will be available.
+    ///
+    /// \param[in] filter       PbiFilter or compatible object
+    /// \param[in] bamFile      input BamFile object
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be
+    ///         read
+    ///
+    PbiIndexedBamReader(const PbiFilter& filter, BamFile&& bamFile);
+
+    /// \brief Constructs %BAM reader, with no initial filter.
+    ///
+    /// Useful for delaying either specifying the filtering criteria or
+    /// performing the PBI lookups.
+    ///
+    /// \param[in] bamFilename  input %BAM filename
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be
+    ///         read
+    ///
+    PbiIndexedBamReader(const std::string& bamFilename);
+
+    /// \brief Constructs %BAM reader, with no initial filter.
+    ///
+    /// Useful for delaying either specifying the filtering criteria or
+    /// performing the PBI lookups.
+    ///
+    /// \param[in] bamFile      input BamFile object
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be
+    ///         read
+    ///
+    PbiIndexedBamReader(const BamFile& bamFile);
+
+    /// \brief Constructs %BAM reader, with no initial filter.
+    ///
+    /// Useful for delaying either specifying the filtering criteria or
+    /// performing the PBI lookups.
+    ///
+    /// \param[in] bamFile      input BamFile object
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be
+    ///         read
+    ///
+    PbiIndexedBamReader(BamFile&& bamFile);
+
+    ~PbiIndexedBamReader(void);
+
+    /// \}
+
+public:
+    /// \name Filtering & Index Data
+    /// \{
+
+    /// \returns the current filter active on this reader
+    const PbiFilter& Filter(void) const;
+
+//    /// \returns the reader's underlying index data
+//    const PbiIndex& Index(void) const;
+
+public:
+    /// \brief Sets a new filter on the reader.
+    ///
+    /// \param[in] filter
+    /// \returns reference to this reader
+    ///
+    PbiIndexedBamReader& Filter(const PbiFilter& filter);
+
+    /// \}
+
+protected:
+    int ReadRawData(BGZF* bgzf, bam1_t* b);
+
+private:
+    std::unique_ptr<internal::PbiIndexedBamReaderPrivate> d_;
+};
+
+} // namespace internal
+} // namespace BAM
+
+#endif // PBIINDEXEDBAMREADER_H
diff --git a/include/pbbam/PbiLookupData.h b/include/pbbam/PbiLookupData.h

new file mode 100644 (file)

index 0000000..398c349
--- /dev/null
+++ b/include/pbbam/PbiLookupData.h
@@ -0,0 +1,718 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiLookupData.h
+/// \brief Defines the classes used for PBI data lookup.
+//
+// Author: Derek Barnett
+
+#ifndef PBILOOKUPDATA_H
+#define PBILOOKUPDATA_H
+
+#include "pbbam/Config.h"
+#include "pbbam/Compare.h"
+#include "pbbam/PbiBasicTypes.h"
+#include <deque>
+#include <map>
+#include <unordered_map>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+class PbiRawBarcodeData;
+class PbiRawBasicData;
+class PbiRawMappedData;
+class PbiRawReferenceData;
+
+/// \brief The OrderedLookup class provides a quick lookup structure for
+///        PBI index data, where key values are sorted.
+///
+/// The main, underlying lookup structure is essentailly a std::map, where the
+/// key is some value (e.g. readAccuracy) and the value is the list of indices
+/// (i-th record) in the %BAM file.
+///
+/// This lookup class is one of the main building blocks for the PBI index
+/// lookup components.
+///
+/// \param T    type of key stored (Accuracy for readAccuracy, int32_t for ZMW,
+///             etc.)
+///
+template<typename T>
+class OrderedLookup
+{
+public:
+    typedef T                                       key_type;
+    typedef IndexList                               value_type;
+    typedef std::map<key_type, value_type>          container_type;
+    typedef typename container_type::iterator       iterator;
+    typedef typename container_type::const_iterator const_iterator;
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty OrderedLookup structure.
+    ///
+    OrderedLookup(void);
+
+    /// \brief Creates an OrderedLookup struture, from another's underlying
+    ///        lookup container.
+    ///
+    /// \param[in] data     lookup data container
+    ///
+    OrderedLookup(const container_type& data);
+
+    /// \brief Creates an OrderedLookup struture, from another's underlying
+    ///        lookup container.
+    ///
+    /// \param[in] data     lookup data container
+    ///
+    OrderedLookup(container_type&& data);
+
+    /// \brief Creates an OrderedLookup struture, from raw data.
+    ///
+    /// \param[in] rawData  raw data values, where i is the index into the %BAM
+    ///                     file, and rawData[i] is the key value
+    ///
+    OrderedLookup(const std::vector<T>& rawData);
+
+    /// \brief Creates an OrderedLookup struture, from raw data.
+    ///
+    /// \param[in] rawData  raw data values, where i is the index into the %BAM
+    ///                     file, and rawData[i] is the key value
+    ///
+    OrderedLookup(std::vector<T>&& rawData);
+
+    /// \}
+
+public:
+    /// \name Operators
+    /// \{
+
+    /// \returns true if this lookup is same as \p other
+    bool operator==(const OrderedLookup<T>& other) const;
+
+    /// \returns true if this lookup is not the same as \p other
+    bool operator!=(const OrderedLookup<T>& other) const;
+
+    /// \}
+
+public:
+    /// \name STL-Compatibility Methods
+    /// \{
+
+    /// \returns an iterator to the first element in the underlying container
+    iterator begin(void);
+
+    /// \returns a const iterator to the first element in the underlying
+    ///          container
+    const_iterator begin(void) const;
+
+    /// \returns a const iterator to the first element in the underlying
+    ///
+    const_iterator cbegin(void) const;
+
+    /// \returns an iterator after the last element in the underlying container
+    iterator end(void);
+
+    /// \returns a const iterator after the last element in the underlying
+    ///          container
+    const_iterator end(void) const;
+
+    /// \returns a const iterator after the last element in the underlying
+    ///          container
+    const_iterator cend(void) const;
+
+    /// \returns true if underlying container is empty
+    bool empty(void) const;
+
+    /// \returns number of keys in the container
+    size_t size(void) const;
+
+    /// \}
+
+public:
+    /// \name Lookup Data
+    /// \{
+
+    /// \brief Performs a lookup into the underlying data.
+    ///
+    /// \param[in] key      key value to lookup
+    /// \param[in] compare  compare type
+    ///
+    /// \returns sorted list of unique indices that satisfy the lookup key &
+    ///          compare type
+    ///
+    IndexList LookupIndices(const key_type& key,
+                            const Compare::Type& compare) const;
+
+    /// \brief Converts the lookup structure back into its raw data.
+    ///
+    /// \returns raw data values, where i is the index into the %BAM file, and
+    ///          rawData[i] is the key value
+    ///
+    std::vector<T> Unpack(void) const;
+
+    /// \}
+
+private:
+    IndexList LookupInclusiveRange(const const_iterator& begin,
+                                   const const_iterator& end) const;
+
+    IndexList LookupExclusiveRange(const const_iterator& begin,
+                                   const const_iterator& end,
+                                   const key_type& key) const;
+
+private:
+    container_type data_;
+};
+
+/// \brief The UnorderedLookup class provides a quick lookup structure for
+///        PBI index data, where key values are not sorted.
+///
+/// The main, underlying lookup structure is essentailly a std::unordered_map,
+/// where the key is some value (e.g. read group ID) and the value is the list
+/// of indices (i-th record) in the %BAM file.
+///
+/// This lookup class is one of the main building blocks for the PBI index
+/// lookup components.
+///
+/// \param T    type of key stored (Accuracy for readAccuracy, int32_t for ZMW,
+///             etc.)
+///
+template<typename T>
+class UnorderedLookup
+{
+public:
+    typedef T                                        key_type;
+    typedef IndexList                                value_type;
+    typedef std::unordered_map<key_type, value_type> container_type;
+    typedef typename container_type::iterator        iterator;
+    typedef typename container_type::const_iterator  const_iterator;
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty UnorderedLookup structure.
+    ///
+    UnorderedLookup(void);
+
+    /// \brief Creates an UnorderedLookup struture, from another's underlying
+    ///        lookup container.
+    ///
+    /// \param[in] data     lookup data container
+    ///
+    UnorderedLookup(const container_type& data);
+
+    /// \brief Creates an UnorderedLookup struture, from another's underlying
+    ///        lookup container.
+    ///
+    /// \param[in] data     lookup data container
+    ///
+    UnorderedLookup(container_type&& data);
+
+    /// \brief Creates an UnorderedLookup struture, from raw data.
+    ///
+    /// \param[in] rawData  raw data values, where i is the index into the %BAM
+    ///                     file, and rawData[i] is the key value
+    ///
+    UnorderedLookup(const std::vector<T>& rawData);
+
+    /// \brief Creates an UnorderedLookup struture, from raw data.
+    ///
+    /// \param[in] rawData  raw data values, where i is the index into the %BAM
+    ///                     file, and rawData[i] is the key value
+    ///
+    UnorderedLookup(std::vector<T>&& rawData);
+
+    /// \}
+
+public:
+    /// \name Operators
+    /// \{
+
+    /// \returns true if this lookup is same as \p other
+    bool operator==(const UnorderedLookup<T>& other) const;
+
+    /// \returns true if this lookup is not the same as \p other
+    bool operator!=(const UnorderedLookup<T>& other) const;
+
+    /// \}
+
+public:
+    /// \name STL-Compatibility Methods
+    /// \{
+
+    /// \returns an iterator to the first element in the underlying container
+    iterator begin(void);
+
+    /// \returns a const iterator to the first element in the underlying
+    ///          container
+    const_iterator begin(void) const;
+
+    /// \returns a const iterator to the first element in the underlying
+    ///
+    const_iterator cbegin(void) const;
+
+    /// \returns an iterator after the last element in the underlying container
+    iterator end(void);
+
+    /// \returns a const iterator after the last element in the underlying
+    ///          container
+    const_iterator end(void) const;
+
+    /// \returns a const iterator after the last element in the underlying
+    ///          container
+    const_iterator cend(void) const;
+
+    /// \returns true if underlying container is empty
+    bool empty(void) const;
+
+    /// \returns number of keys in the container
+    size_t size(void) const;
+
+    /// \}
+
+public:
+    /// \name Lookup Data
+    /// \{
+
+    /// \brief Performs a lookup into the underlying data.
+    ///
+    /// \param[in] key      key value to lookup
+    /// \param[in] compare  compare type
+    ///
+    /// \returns sorted list of unique indices that satisfy the lookup key &
+    ///          compare type
+    ///
+    IndexList LookupIndices(const key_type& key,
+                            const Compare::Type& compare) const;
+
+    /// \brief Converts the lookup structure back into its raw data.
+    ///
+    /// \returns raw data values, where i is the index into the %BAM file, and
+    ///          rawData[i] is the key value
+    ///
+    std::vector<T> Unpack(void) const;
+
+    /// \}
+
+private:
+    template<typename Compare>
+    IndexList LookupHelper(const key_type& key,
+                           const Compare& cmp) const;
+
+private:
+    container_type data_;
+};
+
+/// \brief The BasicLookupData class provides quick lookup access to the
+///        "BasicData" section of the PBI index.
+///
+class PBBAM_EXPORT BasicLookupData
+{
+public:
+    /// \brief This enum describes the component fields of the BasicData
+    ///        section.
+    enum Field
+    {
+        RG_ID
+      , Q_START
+      , Q_END
+      , ZMW
+      , READ_QUALITY
+      , CONTEXT_FLAG
+      , VIRTUAL_OFFSET
+    };
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty lookup data object.
+    BasicLookupData(void);
+
+    /// \brief Creates a lookup data object from the corresponding raw data.
+    ///
+    /// \param[in] rawData  raw data loaded from a PBI file
+    ///
+    BasicLookupData(const PbiRawBasicData& rawData);
+
+    /// \}
+
+public:
+    /// \name Lookup Data Methods
+    /// \{
+
+    /// \brief Adds \b virtual file offset data to the index lookup result
+    ///        blocks.
+    ///
+    /// A PBI lookup will result in a number of index lists, depending on the
+    /// complexity of the PbiFilter involved. These index lists are then merged
+    /// down into blocks of contiguous values, where each block describes a
+    /// particular record index and the number of subsequent, contiguous reads
+    /// that immediately follow it. In this manner, we need only perform seeks
+    /// to the first record of each block.
+    ///
+    /// This method takes such blocks and annotates them with the corresponding
+    /// \b virtual file offset. Subsequent %BAM readers can use this information
+    /// to control file seeks.
+    ///
+    /// \param[in,out] blocks
+    ///
+    /// \throws std::out_of_range if a block has an invalid index value
+    ///
+    void ApplyOffsets(IndexResultBlocks& blocks) const;
+
+    /// \brief This method dispatches a single-value lookup query to the proper
+    ///         data member.
+    ///
+    /// Client code, such as custom filters, should use this when possible, only
+    /// touching the raw fields for more complex operations (e.g. when unpacking
+    /// is necessary).
+    ///
+    /// \param[in] field            section field to lookup
+    /// \param[in] value            value to lookup
+    /// \param[in] compareType      compare type
+    ///
+    /// \returns sorted list of unique indices that satisfy the lookup
+    ///
+    template<typename T>
+    IndexList Indices(const BasicLookupData::Field& field,
+                      const T& value,
+                      const Compare::Type& compareType = Compare::EQUAL) const;
+
+    /// \brief This method dispatches a multi-value lookup query to the proper
+    ///        data member.
+    ///
+    /// Client code, such as custom filters, should use this when possible, only
+    /// touching the raw fields for more complex operations (e.g. when unpacking
+    /// is necessary).
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Results will correspond to an exact match on at
+    ///       least one value in the list.
+    ///
+    /// \param[in] field        section field to lookup
+    /// \param[in] values       values to lookup
+    ///
+    /// \returns sorted list of unique indices that satisfy the lookup
+    ///
+    template<typename T>
+    IndexList IndicesMulti(const BasicLookupData::Field& field,
+                           const std::vector<T>& values) const;
+
+    /// \returns the \b virtual file offsets for all records
+    ///
+    const std::vector<int64_t>& VirtualFileOffsets(void) const;
+
+    /// \}
+
+public:
+    /// \brief Lookup Data Members
+    /// \{
+
+    // map ordering doesn't make sense, optimize for direct lookup
+    UnorderedLookup<int32_t> rgId_;
+
+    // numeric comparisons make sense, keep key ordering preserved
+    OrderedLookup<int32_t>  qStart_;
+    OrderedLookup<int32_t>  qEnd_;
+    OrderedLookup<int32_t>  holeNumber_;
+    OrderedLookup<float>    readQual_;
+
+    // see if this works, or if can use unordered, 'direct' query
+    OrderedLookup<uint8_t> ctxtFlag_;
+
+    // offsets
+    std::vector<int64_t> fileOffset_;
+
+    /// \}
+};
+
+/// \brief The MappedLookupData class provides quick lookup access to the
+///        "MappedData" section of the PBI index.
+///
+class PBBAM_EXPORT MappedLookupData
+{
+public:
+    /// \brief This enum describes the component fields of the MappedData
+    ///        section.
+    enum Field
+    {
+        T_ID
+      , T_START
+      , T_END
+      , A_START
+      , A_END
+      , N_M
+      , N_MM
+      , N_INS
+      , N_DEL
+      , MAP_QUALITY
+      , STRAND
+    };
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty lookup data object.
+    MappedLookupData(void);
+
+    /// \brief Creates a lookup data object from the corresponding raw data.
+    ///
+    /// \param[in] rawData  raw data loaded from a PBI file
+    ///
+    MappedLookupData(const PbiRawMappedData& rawData);
+
+    /// \}
+
+public:
+    /// \name Lookup Data Methods
+    /// \{
+
+    /// \brief This method dispatches a single-value lookup query to the proper
+    ///         data member.
+    ///
+    /// Client code, such as custom filters, should use this when possible, only
+    /// touching the raw fields for more complex operations (e.g. when unpacking
+    /// is necessary).
+    ///
+    /// \param[in] field            section field to lookup
+    /// \param[in] value            value to lookup
+    /// \param[in] compareType      compare type
+    ///
+    /// \returns sorted list of unique indices that satisfy the lookup
+    ///
+    template<typename T>
+    IndexList Indices(const MappedLookupData::Field& field,
+                      const T& value,
+                      const Compare::Type& compareType = Compare::EQUAL) const;
+
+    /// \brief This method dispatches a multi-value lookup query to the proper
+    ///        data member.
+    ///
+    /// Client code, such as custom filters, should use this when possible, only
+    /// touching the raw fields for more complex operations (e.g. when unpacking
+    /// is necessary).
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Results will correspond to an exact match on at
+    ///       least one value in the list.
+    ///
+    /// \param[in] field        section field to lookup
+    /// \param[in] values       values to lookup
+    ///
+    /// \returns sorted list of unique indices that satisfy the lookup
+    ///
+    template<typename T>
+    IndexList IndicesMulti(const MappedLookupData::Field& field,
+                           const std::vector<T>& values) const;
+
+    /// \}
+
+public:
+    /// \name Lookup Data Members
+    /// \{
+
+    // numeric comparisons make sense, keep key ordering preserved
+    OrderedLookup<int32_t>  tId_;
+    OrderedLookup<uint32_t> tStart_;
+    OrderedLookup<uint32_t> tEnd_;
+    OrderedLookup<uint32_t> aStart_;
+    OrderedLookup<uint32_t> aEnd_;
+    OrderedLookup<uint32_t> nM_;
+    OrderedLookup<uint32_t> nMM_;
+    OrderedLookup<uint8_t>  mapQV_;
+
+    // generated values, not stored directly in PBI file
+    OrderedLookup<uint32_t> nIns_;
+    OrderedLookup<uint32_t> nDel_;
+
+    // no need for map overhead, just store direct indices
+    IndexList reverseStrand_;
+    IndexList forwardStrand_;
+
+    /// \}
+};
+
+/// \brief The ReferenceLookupData class provides quick lookup access to the
+///        "CoordinateSortedData" section of the PBI index.
+///
+class PBBAM_EXPORT ReferenceLookupData
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty lookup data object.
+    ///
+    ReferenceLookupData(void);
+
+    /// \brief Creates a lookup data object from the corresponding raw data.
+    ///
+    /// \param[in] rawData  raw data loaded from a PBI file
+    ///
+    ReferenceLookupData(const PbiRawReferenceData& rawData);
+
+    /// \}
+
+public:
+    /// \name Lookup Data Methods
+    /// \{
+
+    /// \brief Retrieves the index range for all records that map to a
+    ///        particular reference.
+    ///
+    /// Client code, such as custom filters, should use this when possible, only
+    /// touching the raw fields for more complex operations (e.g. when unpacking
+    /// is necessary).
+    ///
+    /// \param[in] tId      reference ID to lookup
+    ///
+    /// \returns resulting index range [begin, end). If \p tId is unknown,
+    ///          will return IndexRange(-1,-1) .
+    ///
+    IndexRange Indices(const int32_t tId) const;
+
+    /// \}
+
+public:
+    /// \name Lookup Data Members
+    /// \{
+
+    // references_[tId] = [begin, end) indices
+    std::unordered_map<int32_t, IndexRange> references_;
+
+    /// \}
+};
+
+/// \brief The BarcodeLookupData class provides quick lookup access to the
+///        "BarcodeData" section of the PBI index.
+///
+class PBBAM_EXPORT BarcodeLookupData
+{
+public:
+    /// \brief This enum describes the component fields of the BarcodeData
+    ///        section.
+    enum Field
+    {
+        BC_FORWARD
+      , BC_REVERSE
+      , BC_QUALITY
+    };
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty lookup data object.
+    ///
+    BarcodeLookupData(void);
+
+    /// \brief Creates a lookup data object from the corresponding raw data.
+    ///
+    /// \param[in] rawData  raw data loaded from a PBI file
+    ///
+    BarcodeLookupData(const PbiRawBarcodeData& rawData);
+
+    /// \}
+
+public:
+    /// \name Lookup Data Methods
+    /// \{
+
+    /// \brief This method dispatches a single-value lookup query to the proper
+    ///         data member.
+    ///
+    /// Client code, such as custom filters, should use this when possible, only
+    /// touching the raw fields for more complex operations (e.g. when unpacking
+    /// is necessary).
+    ///
+    /// \param[in] field            section field to lookup
+    /// \param[in] value            value to lookup
+    /// \param[in] compareType      compare type
+    ///
+    /// \returns sorted list of unique indices that satisfy the lookup
+    ///
+    template<typename T>
+    IndexList Indices(const BarcodeLookupData::Field& field,
+                      const T& value,
+                      const Compare::Type& compareType = Compare::EQUAL) const;
+
+    /// \brief This method dispatches a multi-value lookup query to the proper
+    ///        data member.
+    ///
+    /// Client code, such as custom filters, should use this when possible, only
+    /// touching the raw fields for more complex operations (e.g. when unpacking
+    /// is necessary).
+    ///
+    /// \note There is no compare type parameter here, it is always
+    ///       Compare::EQUAL. Results will correspond to an exact match on at
+    ///       least one value in the list.
+    ///
+    /// \param[in] field        section field to lookup
+    /// \param[in] values       values to lookup
+    ///
+    /// \returns sorted list of unique indices that satisfy the lookup
+    ///
+    template<typename T>
+    IndexList IndicesMulti(const BarcodeLookupData::Field& field,
+                           const std::vector<T>& values) const;
+
+    /// \}
+
+public:
+    /// \name Lookup Data Members
+    /// \{
+
+    // numeric comparisons make sense, keep key ordering preserved
+    OrderedLookup<int16_t> bcForward_;
+    OrderedLookup<int16_t> bcReverse_;
+    OrderedLookup<int8_t>  bcQual_;
+
+    /// \}
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "internal/PbiLookupData.inl"
+
+#endif // PBILOOKUPDATA_H
diff --git a/include/pbbam/PbiRawData.h b/include/pbbam/PbiRawData.h

new file mode 100644 (file)

index 0000000..6e1e974
--- /dev/null
+++ b/include/pbbam/PbiRawData.h
@@ -0,0 +1,531 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiRawData.h
+/// \brief Defines the classes used for working with raw PBI data.
+//
+// Author: Derek Barnett
+
+#ifndef PBIRAWDATA_H
+#define PBIRAWDATA_H
+
+#include "pbbam/Config.h"
+#include "pbbam/PbiFile.h"
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+class BamRecord;
+class DataSet;
+
+/// \brief The PbiRawBarcodeData class represents the raw data stored in the
+///        "BarcodeData" section of the PBI index.
+///
+class PBBAM_EXPORT PbiRawBarcodeData
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty data structure.
+    PbiRawBarcodeData(void);
+
+    /// \brief Creates an empty data structure, preallocating space for a known
+    ///        number of records.
+    PbiRawBarcodeData(uint32_t numReads);
+
+    PbiRawBarcodeData(const PbiRawBarcodeData& other);
+    PbiRawBarcodeData(PbiRawBarcodeData&& other);
+    PbiRawBarcodeData& operator=(const PbiRawBarcodeData& other);
+    PbiRawBarcodeData& operator=(PbiRawBarcodeData&& other);
+
+    /// \}
+
+public:
+    /// \name Index Construction
+    /// \{
+
+    /// \brief Adds a record's barcode data.
+    ///
+    /// \param[in] b    %BAM record
+    ///
+    void AddRecord(const BamRecord& b);
+
+    /// \}
+
+public:
+    /// \name Raw Data Containers
+    /// \{
+
+    std::vector<int16_t> bcForward_;
+    std::vector<int16_t> bcReverse_;
+    std::vector<int8_t>  bcQual_;
+
+    /// \}
+};
+
+/// \brief The PbiRawMappedData class represents the raw data stored in the
+///        "MappedData" section of the PBI index.
+///
+class PBBAM_EXPORT PbiRawMappedData
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty data structure.
+    PbiRawMappedData(void);
+
+    /// \brief Creates an empty data structure, preallocating space for a known
+    ///        number of records.
+    PbiRawMappedData(uint32_t numReads);
+
+    PbiRawMappedData(const PbiRawMappedData& other);
+    PbiRawMappedData(PbiRawMappedData&& other);
+    PbiRawMappedData& operator=(const PbiRawMappedData& other);
+    PbiRawMappedData& operator=(PbiRawMappedData&& other);
+
+    /// \}
+
+public:
+    /// \name Index Construction
+    /// \{
+
+    /// \brief Adds a record's mapping data.
+    ///
+    /// \param[in] b    %BAM record
+    ///
+    void AddRecord(const BamRecord& b);
+
+    /// \}
+
+public:
+    /// \name Index Data Query
+    /// \{
+
+    /// \brief Calculates the number of deleted bases for a particular record.
+    ///
+    /// Convenvience method. Equivalent to:
+    /// \code{.cpp}
+    /// NumDeletedAndInsertedBasesAt(i).first;
+    /// \endcode
+    ///
+    /// \param[in] recordIndex  i-th record
+    /// \returns number of deleted bases
+    ///
+    uint32_t NumDeletedBasesAt(size_t recordIndex) const;
+
+    /// \brief Calculates the number of inserted bases for a particular record.
+    ///
+    /// Convenvience method. Equivalent to:
+    /// \code{.cpp}
+    /// NumDeletedAndInsertedBasesAt(i).second;
+    /// \endcode
+    ///
+    /// \param[in] recordIndex  i-th record
+    /// \returns number of inserted bases
+    ///
+    uint32_t NumInsertedBasesAt(size_t recordIndex) const;
+
+    /// \brief Calculates the number of deleted & inserted bases for a
+    ///        particular record.
+    ///
+    /// \param[in] recordIndex  i-th record in the data set
+    /// \returns a pair consisting of (numDeletions,numInsertions)
+    ///
+    std::pair<uint32_t, uint32_t>
+    NumDeletedAndInsertedBasesAt(size_t recordIndex) const;
+
+    /// \}
+
+public:
+    /// \name Raw Data Containers
+    /// \{
+
+    std::vector<int32_t>  tId_;
+    std::vector<uint32_t> tStart_;
+    std::vector<uint32_t> tEnd_;
+    std::vector<uint32_t> aStart_;
+    std::vector<uint32_t> aEnd_;
+    std::vector<uint8_t>  revStrand_;
+    std::vector<uint32_t> nM_;
+    std::vector<uint32_t> nMM_;
+    std::vector<uint8_t>  mapQV_;
+
+    /// \}
+};
+
+/// \brief The PbiReferenceEntryClass represents a single reference in the PBI
+///        CoordinateSorted section.
+///
+/// A reference entry consists of an associated reference ID (tId), as well as
+/// start and end indices into the %BAM or PBI.
+///
+/// \note Rows are given in the interval [start, end).
+///
+class PBBAM_EXPORT PbiReferenceEntry
+{
+public:
+    typedef uint32_t ID;
+    typedef uint32_t Row;
+
+public:
+    static const ID  UNMAPPED_ID;
+    static const Row UNSET_ROW;
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a default entry.
+    ///
+    /// - default ID:   PbiReferenceEntry::UNMAPPED_ID \n
+    /// - default rows: PbiReferenceEntry::UNSET_ROW
+    ///
+    PbiReferenceEntry(void);
+
+    /// \brief Creates a reference entry, with no rows set.
+    ///
+    /// - default rows: PbiReferenceEntry::UNSET_ROW
+    ///
+    PbiReferenceEntry(ID id);
+
+    /// \brief Creates a reference entry, with rows set.
+    ///
+    PbiReferenceEntry(ID id, Row beginRow, Row endRow);
+
+    PbiReferenceEntry(const PbiReferenceEntry& other);
+    PbiReferenceEntry(PbiReferenceEntry&& other);
+    PbiReferenceEntry& operator=(const PbiReferenceEntry& other);
+    PbiReferenceEntry& operator=(PbiReferenceEntry&& other);
+
+    bool operator==(const PbiReferenceEntry& other) const;
+
+    /// \}
+
+public:
+    /// \name Reference Data Members
+    /// \{
+
+    ID  tId_;
+    Row beginRow_;
+    Row endRow_;
+
+    /// \}
+};
+
+/// \brief The PbiRawReferenceData class represents the raw data stored in the
+///        "CoordinateSortedData" section of the PBI index.
+///
+class PBBAM_EXPORT PbiRawReferenceData
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty data structure.
+    PbiRawReferenceData(void);
+
+    /// \brief Creates an empty data structure, preallocating space for a
+    ///        number of references.
+    ///
+    /// This constructor is recommended as this is the safest way to ensure that
+    /// references without observed mappings are included in the final output.
+    ///
+    PbiRawReferenceData(uint32_t numRefs);
+
+    PbiRawReferenceData(const PbiRawReferenceData& other);
+    PbiRawReferenceData(PbiRawReferenceData&& other);
+    PbiRawReferenceData& operator=(const PbiRawReferenceData& other);
+    PbiRawReferenceData& operator=(PbiRawReferenceData&& other);
+
+    /// \}
+
+public:
+    /// \name Raw Data Containers
+    /// \{
+
+    std::vector<PbiReferenceEntry> entries_;
+
+    /// \}
+};
+
+/// \brief The PbiRawBasicData class represents the raw data stored in the
+///        "BasicData" section of the PBI index.
+///
+class PBBAM_EXPORT PbiRawBasicData
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty data structure.
+    PbiRawBasicData(void);
+
+    /// \brief Creates an empty data structure, preallocating space for a known
+    ///        number of records.
+    PbiRawBasicData(uint32_t numReads);
+
+    PbiRawBasicData(const PbiRawBasicData& other);
+    PbiRawBasicData(PbiRawBasicData&& other);
+    PbiRawBasicData& operator=(const PbiRawBasicData& other);
+    PbiRawBasicData& operator=(PbiRawBasicData&& other);
+
+    /// \}
+
+public:
+    /// \name Index Construction
+    /// \{
+
+    /// \brief Adds a record's mapping data.
+    ///
+    /// \param[in] b        %BAM record
+    /// \param[in] offset   \b virtual file offset where record begins
+    ///
+    void AddRecord(const BamRecord& b, int64_t offset);
+
+    /// \}
+
+public:
+    /// \name Raw Data Containers
+    /// \{
+
+    std::vector<int32_t>  rgId_;
+    std::vector<int32_t>  qStart_;
+    std::vector<int32_t>  qEnd_;
+    std::vector<int32_t>  holeNumber_;
+    std::vector<float>    readQual_;
+    std::vector<uint8_t>  ctxtFlag_;
+    std::vector<int64_t>  fileOffset_;
+    std::vector<uint16_t> fileNumber_;
+
+    /// \}
+};
+
+/// \deprecated For legacy-code support only, and will be removed soon.
+///             Use PbiRawBasicData instead.
+///
+typedef PbiRawBasicData PbiRawSubreadData;
+
+/// \brief The PbiRawData class provides an representation of raw PBI index
+///        data, used mostly for construction or I/O.
+///
+/// The PbiRawData class itself provides access to a few high-level attributes
+/// (e.g. version, number of records, etc.). The actual index data is stored
+/// in its member components:
+///     PbiRawBasicData,
+///     PbiRawMappedData,
+///     PbiRawReferenceData, &
+///     PbiRawBarcodeData .
+///
+class PBBAM_EXPORT PbiRawData
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty raw data structure, ready for building.
+    ///
+    PbiRawData(void);
+
+    /// \brief Loads raw PBI data from a file.
+    ///
+    /// \param[in] pbiFilename      ".pbi" filename
+    ///
+    /// \throws std::runtime_error if file contents cannot be loaded properly
+    ///
+    PbiRawData(const std::string& pbiFilename);
+
+    /// \brief Loads a raw, aggregate PBI data from a dataset
+    ///
+    /// This constructor creates a raw index object that contains an aggregation
+    /// of index data across the dataset.
+    ///
+    /// \note ReferenceData (the per-reference table for coordinate-sorted data)
+    ///       is not currently available for the index aggregate. All other
+    ///       per-record data sections will be present.
+    ///
+    /// \param[in] dataset  DataSet object
+    ///
+    /// \throws std::runtime_error if file(s) contents cannot be loaded properly
+    ///
+    explicit PbiRawData(const DataSet& dataset);
+
+    PbiRawData(const PbiRawData& other);
+    PbiRawData(PbiRawData&& other);
+    PbiRawData& operator=(const PbiRawData& other);
+    PbiRawData& operator=(PbiRawData&& other);
+    ~PbiRawData(void);
+
+    /// \}
+
+public:
+    /// \name PBI General Attributes
+    /// \{
+
+    /// \returns true if index has BarcodeData section
+    bool HasBarcodeData(void) const;
+
+    /// \returns true if index has MappedData section
+    bool HasMappedData(void) const;
+
+    /// \returns true if index has ReferenceData section
+    bool HasReferenceData(void) const;
+
+    /// \returns true if index has \b section
+    /// \param[in] section PbiFile::Section identifier
+    ///
+    bool HasSection(const PbiFile::Section section) const;
+
+    /// \returns index filename ("*.pbi")
+    ///
+    /// \note Returns an empty string if the underlying data was calculated in
+    ///       code or aggregated from a DataSet, rather than loaded from a
+    ///       single PBI file.
+    ///
+    std::string Filename(void) const;
+
+    /// \returns enum flags representing the file sections present
+    PbiFile::Sections FileSections(void) const;
+
+    /// \returns the number of records in the PBI(s)
+    uint32_t NumReads(void) const;
+
+    /// \returns the PBI file's version
+    PbiFile::VersionEnum Version(void) const;
+
+    /// \}
+
+public:
+    /// \name Raw Data Components
+    /// \{
+
+    /// \returns const reference to BarcodeData lookup structure
+    ///
+    /// May be empty, check result of HasBarcodeData.
+    ///
+    const PbiRawBarcodeData& BarcodeData(void) const;
+
+    /// \returns const reference to BasicData lookup structure
+    const PbiRawBasicData& BasicData(void) const;
+
+    /// \returns const reference to MappedData lookup structure
+    ///
+    /// May be empty, check result of HasMappedData.
+    ///
+    const PbiRawMappedData& MappedData(void) const;
+
+    /// \returns const reference to reference data lookup structure
+    ///
+    /// May be empty, check result of HasReferenceData.
+    ///
+    const PbiRawReferenceData& ReferenceData(void) const;
+
+    /// \}
+
+public:
+    /// \name PBI General Attributes
+    /// \{
+
+    /// \brief Sets the file section flags.
+    ///
+    /// \param[in] sections     section flags
+    /// \returns reference to this index
+    ///
+    PbiRawData& FileSections(PbiFile::Sections sections);
+
+    /// \brief Sets the number of indexed records.
+    ///
+    /// \param[in] num  number of records
+    /// \returns reference to this index
+    ///
+    PbiRawData& NumReads(uint32_t num);
+
+    /// \brief Sets PBI file version.
+    ///
+    /// \param[in] version  file version
+    /// \returns reference to this index
+    ///
+    PbiRawData& Version(PbiFile::VersionEnum version);
+
+    /// \}
+
+public:
+    /// \name Raw Data Components
+    /// \{
+
+    /// \returns reference to BarcodeData lookup structure
+    ///
+    /// May be empty, check result of HasBarcodeData.
+    ///
+    PbiRawBarcodeData& BarcodeData(void);
+
+    /// \returns reference to BasicData lookup structure
+    PbiRawBasicData& BasicData(void);
+
+    /// \returns reference to MappedData lookup structure
+    ///
+    /// May be empty, check result of HasMappedData.
+    ///
+    PbiRawMappedData& MappedData(void);
+
+    /// \returns reference to reference data lookup structure
+    ///
+    /// May be empty, check result of HasReferenceData.
+    ///
+    PbiRawReferenceData& ReferenceData(void);
+
+    /// \}
+
+private:
+    std::string          filename_;
+    PbiFile::VersionEnum version_;
+    PbiFile::Sections    sections_;
+    uint32_t             numReads_;
+    PbiRawBarcodeData    barcodeData_;
+    PbiRawMappedData     mappedData_;
+    PbiRawReferenceData  referenceData_;
+    PbiRawBasicData      basicData_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/PbiRawData.inl"
+
+#endif // PBIRAWDATA_H
diff --git a/include/pbbam/Position.h b/include/pbbam/Position.h

new file mode 100644 (file)

index 0000000..aece8c2
--- /dev/null
+++ b/include/pbbam/Position.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Position.h
+/// \brief Defines the Position typedef.
+//
+// Author: Derek Barnett
+
+#ifndef POSITION_H
+#define POSITION_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This type is used to refer to genomic positions.
+/// \typedef typedef int32_t PacBio::BAM::Position
+///
+/// We use a signed integer because SAM/BAM uses the -1 value to indicate
+/// unknown or unmapped positions.
+///
+typedef int32_t Position;
+
+/// \brief This constant is widely used as a "missing" or "invalid" position
+///        marker.
+///
+static const Position UnmappedPosition = Position(-1);
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // POSITION_H
diff --git a/include/pbbam/ProgramInfo.h b/include/pbbam/ProgramInfo.h

new file mode 100644 (file)

index 0000000..e137707
--- /dev/null
+++ b/include/pbbam/ProgramInfo.h
@@ -0,0 +1,222 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ProgramInfo.h
+/// \brief Defines the ProgramInfo class.
+//
+// Author: Derek Barnett
+
+#ifndef PROGRAMINFO_H
+#define PROGRAMINFO_H
+
+#include "pbbam/Config.h"
+#include <map>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ProgramInfo class represents a program entry (\@PG) in the SAM
+///        header.
+///
+class PBBAM_EXPORT ProgramInfo
+{
+public:
+    /// \name Conversion & Validation
+    ///
+
+    /// \brief Creates a ProgramInfo object from SAM-formatted text.
+    ///
+    /// \param[in] sam  SAM-formatted text
+    /// \returns program info object
+    ///
+    static ProgramInfo FromSam(const std::string& sam);
+
+    /// \brief Converts a ProgramInfo object to its SAM-formatted text.
+    ///
+    /// \param[in] prog     input ProgramInfo object
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    static std::string ToSam(const ProgramInfo& prog);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty program info object.
+    ProgramInfo(void);
+
+    /// \brief Creates a program info object with an ID.
+    ///
+    /// \param[in] id       program ID (\@PG:ID)
+    ///
+    ProgramInfo(const std::string& id);
+
+
+    ProgramInfo(const ProgramInfo& other);
+    ProgramInfo(ProgramInfo&& other);
+    ProgramInfo& operator=(const ProgramInfo& other);
+    ProgramInfo& operator=(ProgramInfo&& other);
+    ~ProgramInfo(void);
+
+    /// \}
+
+public:
+    /// \name Conversion & Validation
+    ///
+
+    /// \returns true if program info is valid
+    ///
+    /// Currently this checks to see that ProgramInfo::Id does not contain an
+    /// empty string.
+    ///
+    bool IsValid(void) const;
+
+    /// \brief Converts this object to its SAM-formatted text.
+    ///
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    std::string ToSam(void) const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \returns string value of \@PG:CL
+    std::string CommandLine(void) const;
+
+    /// \returns any non-standard tags added to the \@PG entry
+    ///
+    /// Result map consists of {tagName => value}.
+    ///
+    std::map<std::string, std::string> CustomTags(void) const;
+
+    /// \returns string value of \@PG:DS
+    std::string Description(void) const;
+
+    /// \returns string value of \@PG:ID
+    std::string Id(void) const;
+
+    /// \returns string value of \@PG:PN
+    std::string Name(void) const;
+
+    /// \returns string value of \@PG:PP
+    std::string PreviousProgramId(void) const;
+
+    /// \returns string value of \@PG:VN
+    std::string Version(void) const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \brief Sets the value for \@PG:CL
+    ///
+    /// \param[in] cmd      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& CommandLine(const std::string& cmd);
+
+    /// \brief Sets a new collection of non-standard tags.
+    ///
+    /// Custom tag map entries should consist of {tagName => value}.
+    ///
+    /// \param[in] custom      new tags
+    /// \returns reference to this object
+    ///
+    ProgramInfo& CustomTags(const std::map<std::string, std::string>& custom);
+
+    /// \brief Sets the value for \@PG:DS
+    ///
+    /// \param[in] description      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& Description(const std::string& description);
+
+    /// \brief Sets the value for \@PG:ID
+    ///
+    /// \param[in] id      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& Id(const std::string& id);
+
+    /// \brief Sets the value for \@PG:PN
+    ///
+    /// \param[in] name      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& Name(const std::string& name);
+
+    /// \brief Sets the value for \@PG:PP
+    ///
+    /// \param[in] id      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& PreviousProgramId(const std::string& id);
+
+    /// \brief Sets the value for \@PG:VN
+    ///
+    /// \param[in] version      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& Version(const std::string& version);
+
+    /// \}
+
+private:
+    std::string commandLine_;       // CL:<CommandLine>
+    std::string description_;       // DS:<Description>
+    std::string id_;                // ID:<ID>  * must be unique for valid SAM *
+    std::string name_;              // PN:<Name>
+    std::string previousProgramId_; // PP:<PreviousProgramID>
+    std::string version_;           // VN:<Version>
+
+    // custom attributes
+    std::map<std::string, std::string> custom_;     // tag => value
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/ProgramInfo.inl"
+
+#endif // PROGRAMINFO_H
diff --git a/include/pbbam/PulseBehavior.h b/include/pbbam/PulseBehavior.h

new file mode 100644 (file)

index 0000000..79ec0da
--- /dev/null
+++ b/include/pbbam/PulseBehavior.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PulseBehavior.h
+/// \brief Defines the PulseBehavior enum.
+//
+// Author: Derek Barnett
+
+#ifndef PULSEBEHAVIOR_H
+#define PULSEBEHAVIOR_H
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the pulsecall modes supported by BamRecord tag
+///        accessors.
+///
+enum class PulseBehavior
+{
+    BASECALLS_ONLY  ///< "Squashed" pulses not included, only basecalls.
+  , ALL             ///< All pulses included.
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // PULSEBEHAVIOR_H
diff --git a/include/pbbam/QNameQuery.h b/include/pbbam/QNameQuery.h

new file mode 100644 (file)

index 0000000..ad93d03
--- /dev/null
+++ b/include/pbbam/QNameQuery.h
@@ -0,0 +1,94 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file QNameQuery.h
+/// \brief Defines the QNameQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef QNAMEQUERY_H
+#define QNAMEQUERY_H
+
+#include "pbbam/internal/QueryBase.h"
+#include <memory>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The QNameQuery class provides iterable access to a DataSet's records,
+///        with each iteration of the query returning a contiguous block of
+///        records that share a name.
+///
+/// There is no random-access here. It is simply a sequential read-through,
+/// grouping contiguous results that share a BamRecord::FullName.
+///
+/// \note The name is not ideal - but for legacy reasons, it will remain as-is
+///       for now. It will likely become something more explicit, like
+///       "SequentialQNameGroupQuery", so that the name "QNameQuery" will be
+///       available for a built-in query on a QNAME filter (or whitelist). This
+///       will make it more consistent with other queries (ReadAccuracyQuery,
+///       SubreadLengthQuery, ZmwQuery, etc).
+///
+class PBBAM_EXPORT QNameQuery : public internal::IGroupQuery
+{
+public:
+
+    /// \brief Creates a new QNameQuery.
+    ///
+    /// \param[in] dataset      input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM files
+    ///
+    QNameQuery(const DataSet& dataset);
+    ~QNameQuery(void);
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(std::vector<BamRecord>& records);
+
+private:
+    struct QNameQueryPrivate;
+    std::unique_ptr<QNameQueryPrivate> d_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // QNAMEQUERY_H
diff --git a/include/pbbam/QualityValue.h b/include/pbbam/QualityValue.h

new file mode 100644 (file)

index 0000000..ab108d0
--- /dev/null
+++ b/include/pbbam/QualityValue.h
@@ -0,0 +1,115 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file QualityValue.h
+/// \brief Defines the QualityValue class.
+//
+// Author: Derek Barnett
+
+#ifndef QUALITYVALUE_H
+#define QUALITYVALUE_H
+
+#include "pbbam/Config.h"
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The QualityValue class represents a FASTQ-compatible quality value.
+///
+/// Integers are clamped to [0, 93] (corresponding to ASCII printable chars
+/// [!-~]).
+///
+/// Use QualityValue::FromFastq for constructing entries from FASTQ encoding
+/// characters. Otherwise, the resulting QualityValue will be interpreted using
+/// the character's numeric value (ignoring the FASTQ offset of 33).
+///
+class PBBAM_EXPORT QualityValue
+{
+public:
+    static const uint8_t MAX;
+
+public:
+    /// \name Conversion Methods
+    /// \{
+
+    /// \brief Creates a QualityValue from a FASTQ-encoding character.
+    ///
+    /// \param[in] c    FASTQ character
+    /// \returns quality value representing (c - 33)
+    ///
+    static QualityValue FromFastq(const char c);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    ///  \{
+
+    /// \brief Creates a QualityValue with specified value.
+    ///
+    /// \param[in] value    quality value
+    ///
+    QualityValue(const uint8_t value = 0);
+
+    QualityValue(const QualityValue& other);
+    ~QualityValue(void);
+
+    /// \}
+
+public:
+    /// \name Conversion Methods
+    /// \{
+
+    /// \returns the FASTQ-encoding char for this QualityValue
+    char Fastq(void) const;
+
+    /// \returns the integer value of this QualityValue
+    operator uint8_t(void) const;
+
+    /// \}
+
+private:
+    uint8_t value_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/QualityValue.inl"
+
+#endif // QUALITYVALUE_H
diff --git a/include/pbbam/QualityValues.h b/include/pbbam/QualityValues.h

new file mode 100644 (file)

index 0000000..af054f6
--- /dev/null
+++ b/include/pbbam/QualityValues.h
@@ -0,0 +1,200 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file QualityValues.h
+/// \brief Defines the QualityValues class.
+//
+// Author: Derek Barnett
+
+#ifndef QUALITYVALUES_H
+#define QUALITYVALUES_H
+
+#include "pbbam/QualityValue.h"
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The QualityValues class represents a sequence of FASTQ-compatible
+///        quality values. See QualityValue documentation for more details.
+///
+class PBBAM_EXPORT QualityValues : public std::vector<QualityValue>
+{
+public:
+    /// \brief Creates a QualityValues object from a FASTQ-encoded string.
+    ///
+    /// \param[in] fastq    FASTQ-encoded string
+    /// \returns corresponding QualityValues object
+    ///
+    static QualityValues FromFastq(const std::string& fastq);
+
+public:
+    /// \name Constructors & Related Methods
+    ///  \{
+
+    /// \brief Default constructor - creates an empty QualityValues object.
+    QualityValues(void);
+
+    /// \brief Creates a QualityValues object from a FASTQ-encoded string.
+    ///
+    /// \param[in] fastqString  FASTQ-encoded string
+    ///
+    explicit QualityValues(const std::string& fastqString);
+
+    /// \brief Creates a QualityValues object from a vector of QualityValue
+    ///        elements.
+    ///
+    /// \param[in] quals    vector of QualityValue elements
+    ///
+    explicit QualityValues(const std::vector<QualityValue>& quals);
+
+    /// \brief Creates a QualityValues object from a vector of QualityValue
+    ///        elements.
+    ///
+    /// \param[in] quals    vector of QualityValue elements
+    ///
+    QualityValues(std::vector<QualityValue>&& quals);
+
+    /// \brief Creates a QualityValues object from a vector of (numeric) quality
+    ///        values.
+    ///
+    /// \param[in] quals    vector of quality value numbers
+    ///
+    explicit QualityValues(const std::vector<uint8_t>& quals);
+
+    /// \brief Creates a QualityValues object from the contents of the range:
+    ///        [first, last)
+    ///
+    /// \param[in] first    input iterator, whose element is a numeric quality
+    /// \param[in] last     input iterator, whose element is a numeric quality
+    ///
+    QualityValues(const std::vector<uint8_t>::const_iterator first,
+                  const std::vector<uint8_t>::const_iterator last);
+
+    /// \brief Creates a QualityValues object from the contents of the range:
+    ///        [first, last)
+    ///
+    /// \param[in] first    input iterator, whose element is a QualityValue
+    /// \param[in] last     input iterator, whose element is a QualityValue
+    ///
+    QualityValues(const QualityValues::const_iterator first,
+                  const QualityValues::const_iterator last);
+
+    /// \brief Copy constructor
+    QualityValues(const QualityValues& other);
+
+    /// \brief Move constructor
+    QualityValues(QualityValues&& other);
+
+    /// \brief Copy assignment operator
+    ///
+    /// \param[in] other    QualityValues object
+    ///
+    QualityValues& operator=(const QualityValues& other);
+
+    /// \brief Move assignment operator
+    ///
+    /// \param[in] other    QualityValues object
+    ///
+    QualityValues& operator=(QualityValues&& other);
+
+    /// \brief Copy assignment operator
+    ///
+    /// \param[in] quals    vector of QualityValue elements
+    ///
+    QualityValues& operator=(const std::vector<QualityValue>& quals);
+
+    /// \brief Move assignment operator
+    ///
+    /// \param[in] quals    vector of QualityValue elements
+    ///
+    QualityValues& operator=(std::vector<QualityValue>&& quals);
+
+    /// \brief Destructor
+    ~QualityValues(void);
+
+    /// \}
+
+public:
+    /// \name Comparison Operators
+    /// \{
+
+    bool operator==(const std::string& other) const;
+    bool operator!=(const std::string& other) const;
+
+    /// \}
+
+public:
+    /// \name Iterators
+    /// \{
+
+    /// \returns a const_iterator to the beginning of the sequence
+    std::vector<QualityValue>::const_iterator cbegin(void) const;
+
+    /// \returns a const_iterator to the element following the last element
+    std::vector<QualityValue>::const_iterator cend(void) const;
+
+    /// \returns a const_iterator to the beginning of the sequence
+    std::vector<QualityValue>::const_iterator begin(void) const;
+
+    /// \returns a const_iterator to the element following the last element
+    std::vector<QualityValue>::const_iterator end(void) const;
+
+    /// \returns an iterator to the beginning of the sequence
+    std::vector<QualityValue>::iterator begin(void);
+
+    /// \returns an iterator to the element following the last element
+    std::vector<QualityValue>::iterator end(void);
+
+    /// \}
+
+public:
+    /// \name Conversion Methods
+    /// \{
+
+    /// \returns the FASTQ-encoded string for this sequence of quality values
+    std::string Fastq(void) const;
+
+    /// \}
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/QualityValues.inl"
+
+#endif // QUALITYVALUES_H
diff --git a/include/pbbam/ReadAccuracyQuery.h b/include/pbbam/ReadAccuracyQuery.h

new file mode 100644 (file)

index 0000000..1eecb6c
--- /dev/null
+++ b/include/pbbam/ReadAccuracyQuery.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ReadAccuracyQuery.h
+/// \brief Defines the ReadAccuracyQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef READACCURACYQUERY_H
+#define READACCURACYQUERY_H
+
+#include "pbbam/Accuracy.h"
+#include "pbbam/Compare.h"
+#include "pbbam/Config.h"
+#include "pbbam/internal/QueryBase.h"
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ReadAccuracyQuery class provides iterable access to a DataSet's
+///        %BAM records, limiting results to those matching a read accuracy
+///        criterion.
+///
+/// Example:
+/// \include code/ReadAccuracyQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT ReadAccuracyQuery : public internal::IQuery
+{
+public:
+
+    /// \brief Creates a new ReadAccuracyQuery, limiting record results to only
+    ///        those matching a read accuracy criterion.
+    ///
+    /// \param[in] accuracy     read accuracy value
+    /// \param[in] compareType  compare operator
+    /// \param[in] dataset      input data source(s)
+    ///
+    /// \sa BamRecord::ReadAccuracy
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or PBI
+    ///         files.
+    ///
+    ReadAccuracyQuery(const Accuracy accuracy,
+                      const Compare::Type compareType,
+                      const DataSet& dataset);
+
+    ~ReadAccuracyQuery(void);
+
+public:
+
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r);
+
+private:
+    struct ReadAccuracyQueryPrivate;
+    std::unique_ptr<ReadAccuracyQueryPrivate> d_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // READACCURACYQUERY_H
diff --git a/include/pbbam/ReadGroupInfo.h b/include/pbbam/ReadGroupInfo.h

new file mode 100644 (file)

index 0000000..c6c8a5f
--- /dev/null
+++ b/include/pbbam/ReadGroupInfo.h
@@ -0,0 +1,636 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ReadGroupInfo.h
+/// \brief Defines the ReadGroupInfo class.
+//
+// Author: Derek Barnett
+
+#ifndef READGROUPINFO_H
+#define READGROUPINFO_H
+
+#include "pbbam/Config.h"
+#include "pbbam/exception/InvalidSequencingChemistryException.h"
+#include <map>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum describes the base features that may be present in a read
+///        group's records.
+///
+/// This information is stored in its description (\@RG:DS).
+///
+enum class BaseFeature
+{
+    DELETION_QV
+  , DELETION_TAG
+  , INSERTION_QV
+  , MERGE_QV
+  , SUBSTITUTION_QV
+  , SUBSTITUTION_TAG
+  , IPD
+  , PULSE_WIDTH
+  , PKMID
+  , PKMEAN
+  , PKMID2
+  , PKMEAN2
+  , LABEL
+  , LABEL_QV
+  , ALT_LABEL
+  , ALT_LABEL_QV
+  , PULSE_MERGE_QV
+  , PULSE_CALL
+  , PRE_PULSE_FRAMES
+  , PULSE_CALL_WIDTH
+  , START_FRAME
+};
+
+/// \brief This enum describes the encoding types used for frame data within a
+///        read group's records.
+///
+/// This information is stored in its description (\@RG:DS).
+///
+enum class FrameCodec
+{
+    RAW
+  , V1
+};
+
+/// \brief This enum describes the experimental design of the barcodes within a
+///        read group's records.
+///
+/// This information is stored in its description (\@RG:DS).
+///
+enum class BarcodeModeType
+{
+   NONE
+ , SYMMETRIC
+ , ASYMMETRIC
+};
+
+/// \brief This enum describes the type of value encoded by barcode quality,
+///        within a read group's records.
+///
+/// This information is stored in its description (\@RG:DS).
+///
+enum class BarcodeQualityType
+{
+    NONE
+  , SCORE
+  , PROBABILITY
+};
+
+/// \brief This enum describes the instrument type / platform model,
+///        within a read group's records.
+///
+/// This information is stored in its description (\@RG:PM).
+///
+enum class PlatformModelType
+{
+    ASTRO
+  , RS
+  , SEQUEL
+};
+
+/// \brief The ReadGroupInfo class represents a read group entry (\@RG) in the
+///        SAM header.
+///
+class PBBAM_EXPORT ReadGroupInfo
+{
+public:
+    /// \name Conversion & Validation
+    ///
+
+    /// \brief Creates a ReadGroupInfo object from SAM-formatted text.
+    ///
+    /// \param[in] sam  SAM-formatted text
+    /// \returns read group info object
+    ///
+    static ReadGroupInfo FromSam(const std::string& sam);
+
+    /// \brief Converts a ReadGroupInfo object to its SAM-formatted text.
+    ///
+    /// \param[in] rg     input ReadGroupInfo object
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    static std::string ToSam(const ReadGroupInfo& rg);
+
+    /// \brief Converts a read group ID (string) to its numeric value.
+    ///
+    /// \param[in] rgId     read group ID string
+    /// \returns numeric value of ID
+    ///
+    static int32_t IdToInt(const std::string& rgId);
+
+    /// \brief Converts a read group ID number to its string representation.
+    ///
+    /// \param[in] id     read group ID number
+    /// \returns hexadecimal string representation of ID
+    ///
+    static std::string IntToId(const int32_t id);
+
+    /// \returns sequencing chemistry from (bindingKig, sequencingKit,
+    ///          basecallerVersion)
+    ///
+    static std::string SequencingChemistryFromTriple(const std::string& bindingKit,
+                                                     const std::string& sequencingKit,
+                                                     const std::string& basecallerVersion);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty read group info object.
+    ReadGroupInfo(void);
+
+    /// \brief Creates a read group info object with an ID.
+    ///
+    /// \param[in] id   string representation of read group ID
+    ///
+    ReadGroupInfo(const std::string& id);
+
+    /// \brief Creates a read group info object from a movie name & read type.
+    ///
+    /// \param[in] movieName    sequencing movie name
+    /// \param[in] readType     string version of record type
+    ///
+    /// \sa RecordType
+    ///
+    ReadGroupInfo(const std::string& movieName,
+                  const std::string& readType);
+
+    /// \brief Creates a read group info object from a movie name, read type,
+    ///        and platform model.
+    ///
+    /// \param[in] movieName    sequencing movie name
+    /// \param[in] readType     string version of record type
+    /// \param[in] platform     platform model type
+    ///
+    /// \sa RecordType
+    ///
+    ReadGroupInfo(const std::string& movieName,
+                  const std::string& readType,
+                  const PlatformModelType platform);
+
+    ReadGroupInfo(const ReadGroupInfo& other);
+    ReadGroupInfo(ReadGroupInfo&& other);
+    ReadGroupInfo& operator=(const ReadGroupInfo& other);
+    ReadGroupInfo& operator=(ReadGroupInfo&& other);
+    ~ReadGroupInfo(void);
+
+    /// \}
+
+public:
+    /// \name Comparison Operators
+    /// \{
+
+    bool operator==(const ReadGroupInfo& other) const;
+
+    /// \}
+
+public:
+    /// \name Conversion & Validation
+    /// \{
+
+    /// \returns true if read group info is valid
+    ///
+    /// Currently this checks to see that ReadGroupInfo::Id does not contain an
+    /// empty string.
+    ///
+    bool IsValid(void) const;
+
+    /// \brief Converts this object to its SAM-formatted text.
+    ///
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    std::string ToSam(void) const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \returns the number of barcode sequences in BarcodeFile
+    ///
+    /// \throws std::runtime_error if barcode data not set.
+    ///         Check HasBarcodeData if this data may be absent.
+    ///
+    size_t BarcodeCount(void) const;
+
+    /// \returns name of FASTA file containing barcode sequences
+    ///
+    /// \throws std::runtime_error if barcode data not set.
+    ///         Check HasBarcodeData if this data may be absent.
+    ///
+    std::string  BarcodeFile(void) const;
+
+    /// \returns MD5 hash of the contents of BarcodeFile
+    ///
+    /// \throws std::runtime_error if barcode data not set.
+    ///         Check HasBarcodeData if this data may be absent.
+    ///
+    std::string BarcodeHash(void) const;
+
+    /// \returns experimental design type of barcodes
+    ///
+    /// \throws std::runtime_error if barcode data not set.
+    ///         Check HasBarcodeData if this data may be absent.
+    ///
+    BarcodeModeType BarcodeMode(void) const;
+
+    /// \returns type of value encoded in the 'bq' tag
+    ///
+    /// \throws std::runtime_error if barcode data is not set.
+    ///         Check HasBarcodeData if this data may be absent.
+    ///
+    BarcodeQualityType BarcodeQuality(void) const;
+
+    /// \returns basecaller version number (e.g. "2.1")
+    std::string BasecallerVersion(void) const;
+
+    /// \returns tag name in use for the specified for base feature
+    std::string BaseFeatureTag(const BaseFeature& feature) const;
+
+    /// \returns binding kit part number (e.g. "100236500")
+    std::string BindingKit(void) const;
+
+    /// \returns true if reads are classified as spike-in controls
+    bool Control(void) const;
+
+    /// \returns any non-standard tags added to the \@PG entry
+    ///
+    /// Result map consists of {tagName => value}.
+    ///
+    std::map<std::string, std::string> CustomTags(void) const;
+
+    /// \returns string value of \@RG:DT
+    std::string Date(void) const;
+
+    /// \returns string value of \@RG:FO
+    std::string FlowOrder(void) const;
+
+    /// \returns frame rate in Hz
+    std::string FrameRateHz(void) const;
+
+    /// \returns true if read group has barcode data
+    bool HasBarcodeData(void) const;
+
+    /// \returns true if read group has an entry for the specified base feature
+    bool HasBaseFeature(const BaseFeature& feature) const;
+
+    /// \returns string value of \@RG:ID
+    std::string Id(void) const;
+
+    /// \returns codec type in use for IPD
+    FrameCodec IpdCodec(void) const;
+
+    /// \returns string value of \@RG:KS
+    std::string KeySequence(void) const;
+
+    /// \returns string value of \@RG:LB
+    std::string Library(void) const;
+
+    /// \returns movie name (stored in \@RG:PU)
+    std::string MovieName(void) const;
+
+    /// \returns string value of \@RG:PL
+    std::string Platform(void) const;
+
+    /// \returns string value of \@RG:PM
+    PlatformModelType PlatformModel(void) const;
+
+    /// \returns string value of \@RG:PI
+    std::string PredictedInsertSize(void) const;
+
+    /// \returns string value of \@RG:PG
+    std::string Programs(void) const;
+
+    /// \returns codec type in use for PulseWidth
+    FrameCodec PulseWidthCodec(void) const;
+
+    /// \returns string value of read type
+    std::string ReadType(void) const;
+
+    /// \returns string value of \@RG:SM
+    std::string Sample(void) const;
+
+    /// \returns string value of \@RG:CN
+    std::string SequencingCenter(void) const;
+
+    /// \returns sequencing chemistry name
+    std::string SequencingChemistry(void) const;
+
+    /// \returns sequencing kit part number
+    std::string SequencingKit(void) const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \brief Sets read group's barcode data.
+    ///
+    /// Barcode fields are either absent or all must be present.
+    ///
+    /// \param[in] barcodeFile      barcode filename
+    /// \param[in] barcodeHash      MD5 hash of barcode file
+    /// \param[in] barcodeCount     number of records in barcode file
+    /// \param[in] barcodeMode      experimental design of barcodes
+    /// \param[in] barcodeQuality   type of barcode quality value
+    ///
+    /// \sa BarcodeFile \n
+    ///     BarcodeHash \n
+    ///     BarcodeCount \n
+    ///     BarcodeMode \n
+    ///     BarcodeQuality \n
+    ///     ReadGroupInfo::ClearBarcodeData
+    ///
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& BarcodeData(const std::string& barcodeFile,
+                               const std::string& barcodeHash,
+                               size_t barcodeCount,
+                               BarcodeModeType barcodeMode,
+                               BarcodeQualityType barcodeQuality);
+
+    /// \brief Sets the basecaller version number.
+    ///
+    /// \param[in] versionNumber   new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& BasecallerVersion(const std::string& versionNumber);
+
+    /// \brief Sets the tag to be used for a particular base feature.
+    ///
+    /// \param[in] feature      feature type begin updated
+    /// \param[in] tag          new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& BaseFeatureTag(const BaseFeature& feature,
+                                  const std::string& tag);
+
+    /// \brief Sets the binding kit part number.
+    ///
+    /// \param[in] kitNumber    new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& BindingKit(const std::string& kitNumber);
+
+    /// \brief Removes all barcode data from this read group.
+    ///
+    /// \returns reference to this read group
+    ///
+    ReadGroupInfo& ClearBarcodeData(void);
+
+    /// \brief Removes all base features from this read group.
+    ///
+    /// \returns reference to this read group
+    ///
+    ReadGroupInfo& ClearBaseFeatures(void);
+
+    /// \brief Sets whether read group's records are classifed as spike-in
+    ///        controls.
+    ///
+    /// \param[in] ctrl     true if records are spike-in controls
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Control(const bool ctrl);
+
+    /// \brief Sets a new collection of non-standard tags.
+    ///
+    /// Custom tag map entries should consist of {tagName => value}.
+    ///
+    /// \param[in] custom      new tags
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& CustomTags(const std::map<std::string, std::string>& custom);
+
+    /// \brief Sets the value for \@RG:DT
+    ///
+    /// \param[in] date      new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Date(const std::string& date);
+
+    /// \brief Sets the value for \@RG:FO
+    ///
+    /// \param[in] order     new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& FlowOrder(const std::string& order);
+
+    /// \brief Sets the frame rate.
+    ///
+    /// \param[in] frameRateHz     string value of frame rate in Hz
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& FrameRateHz(const std::string& frameRateHz);
+
+    /// \brief Sets the read group's ID.
+    ///
+    /// \param[in] id     string value of ID
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Id(const std::string& id);
+
+    /// \brief Sets the read group's ID, from movie name & read type
+    ///
+    /// \param[in] movieName    sequencing movie name
+    /// \param[in] readType     string version of read type
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Id(const std::string& movieName,
+                      const std::string& readType);
+
+    /// \brief Sets the codec type used for IPD
+    ///
+    /// \param[in] codec    codec type
+    /// \param[in] tag      IPD tag
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& IpdCodec(const FrameCodec& codec,
+                            const std::string& tag = std::string());
+
+    /// \brief Sets the value for \@RG:KS
+    ///
+    /// \param[in] sequence      new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& KeySequence(const std::string& sequence);
+
+    /// \brief Sets the value for \@RG:LB
+    ///
+    /// \param[in] library      new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Library(const std::string& library);
+
+    /// \brief Sets the value for movie name (stored in \@RG:PU).
+    ///
+    /// \param[in] movieName    new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& MovieName(const std::string& movieName);
+
+    /// \brief Sets the value for \@RG:PI
+    ///
+    /// \param[in] size         new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& PredictedInsertSize(const std::string& size);
+
+    /// \brief Sets the value for \@RG:PG
+    ///
+    /// \param[in] programs     new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Programs(const std::string& programs);
+
+    /// \brief Sets the value for \@RG:PM
+    ///
+    /// \param[in] platformModel new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& PlatformModel(const PlatformModelType& platform);
+
+    /// \brief Sets the codec type used for PulseWidth
+    ///
+    /// \param[in] codec    codec type
+    /// \param[in] tag      pulse width tag
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& PulseWidthCodec(const FrameCodec& codec,
+                                   const std::string& tag = std::string());
+
+    /// \brief Sets the read type.
+    ///
+    /// \param[in] type    new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& ReadType(const std::string& type);
+
+    /// \brief Removes a particular base feature from this read group.
+    ///
+    /// \param[in] feature      feature to remove
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& RemoveBaseFeature(const BaseFeature& feature);
+
+    /// \brief Sets the value for \@RG:SM
+    ///
+    /// \param[in] sample       new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Sample(const std::string& sample);
+
+    /// \brief Sets the value for \@RG:CN
+    ///
+    /// \param[in] center       new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& SequencingCenter(const std::string& center);
+
+    /// \brief Sets the sequencing kit part number.
+    ///
+    /// \param[in] kitNumber    new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& SequencingKit(const std::string& kitNumber);
+
+    /// \}
+
+private:
+    std::string id_;                    // ID * must be unique for valid SAM *
+    std::string sequencingCenter_;      // CN
+    std::string date_;                  // DT * (ISO-8601) *
+    std::string flowOrder_;             // FO
+    std::string keySequence_;           // KS
+    std::string library_;               // LB
+    std::string programs_;              // PG
+    std::string predictedInsertSize_;   // PI
+    std::string movieName_;             // PU
+    std::string sample_;                // SM
+
+    PlatformModelType platformModel_;   // PM
+
+    // DS:<Description> components
+    std::string readType_;
+    std::string bindingKit_;
+    std::string sequencingKit_;
+    std::string basecallerVersion_;
+    std::string frameRateHz_;
+    bool        control_ = false;
+    FrameCodec  ipdCodec_;
+    FrameCodec  pulseWidthCodec_;
+    bool        hasBarcodeData_ = false;
+    std::string barcodeFile_;
+    std::string barcodeHash_;
+    size_t      barcodeCount_ = 0;
+    BarcodeModeType barcodeMode_ = BarcodeModeType::NONE;
+    BarcodeQualityType barcodeQuality_ = BarcodeQualityType::NONE;
+    std::map<BaseFeature, std::string> features_;
+
+    // custom attributes
+    std::map<std::string, std::string> custom_; // tag => value
+
+private:
+    std::string EncodeSamDescription(void) const;
+    void DecodeSamDescription(const std::string& description);
+};
+
+/// \brief Creates a read group ID from a movie name & read type.
+///
+/// \param[in] movieName    sequencing movie name
+/// \param[in] readType     string version of read type
+///
+/// \returns hexadecimal string read group ID
+///
+PBBAM_EXPORT
+std::string MakeReadGroupId(const std::string& movieName,
+                            const std::string& readType);
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/ReadGroupInfo.inl"
+
+#endif // READGROUPINFO_H
diff --git a/include/pbbam/RecordType.h b/include/pbbam/RecordType.h

new file mode 100644 (file)

index 0000000..9688211
--- /dev/null
+++ b/include/pbbam/RecordType.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file RecordType.h
+/// \brief Defines the RecordType enum.
+//
+// Author: Derek Barnett
+
+#ifndef RECORDTYPE_H
+#define RECORDTYPE_H
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the possible PacBio BAM record types.
+///
+/// \sa ReadGroupInfo::ReadType
+///
+enum class RecordType
+{
+    ZMW         ///< Polymerase read
+  , HQREGION    ///< High-quality region
+  , SUBREAD     ///< Subread (
+  , CCS         ///< Circular consensus sequence
+  , SCRAP       ///< Additional sequence (barcodes, adapters, etc.)
+  , UNKNOWN     ///< Unknown read type
+
+  , POLYMERASE = ZMW ///< \deprecated as of PacBio BAM spec v 3.0.4 (use RecordType::ZMW instead)
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // RECORDTYPE_H
diff --git a/include/pbbam/SamTagCodec.h b/include/pbbam/SamTagCodec.h

new file mode 100644 (file)

index 0000000..cc4def4
--- /dev/null
+++ b/include/pbbam/SamTagCodec.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2014, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file SamTagCodec.h
+/// \brief Defines the SamTagCodec class.
+//
+// Author: Derek Barnett
+
+#ifndef SAMTAGCODEC_H
+#define SAMTAGCODEC_H
+
+#include "pbbam/Config.h"
+#include "pbbam/TagCollection.h"
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The SamTagCodec class provides text-based encoding/decoding of %BAM
+///        tag data.
+///
+/// \note SamTagCodec is mostly an implementation and/or testing detail, and may
+///       be removed from the public API.
+///
+class PBBAM_EXPORT SamTagCodec
+{
+public:
+    /// \name Tag Collection Methods
+    /// \{
+
+    /// \brief Creates a TagCollection from SAM-formatted tag data.
+    ///
+    /// \param[in] tagString    SAM-formmated string
+    /// \returns resulting tag collection
+    ///
+    static TagCollection Decode(const std::string& tagString);
+
+    /// \brief Creates SAM-formatted string from a TagCollection.
+    ///
+    /// \param[in] tags     TagCollection containing tag data
+    /// \returns SAM-formatted string
+    ///
+    static std::string Encode(const PacBio::BAM::TagCollection& tags);
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // SAMTAGCODEC_H
diff --git a/include/pbbam/SamWriter.h b/include/pbbam/SamWriter.h

new file mode 100644 (file)

index 0000000..b407d7e
--- /dev/null
+++ b/include/pbbam/SamWriter.h
@@ -0,0 +1,130 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file SamWriter.h
+/// \brief Defines the SamWriter class.
+//
+// Author: Derek Barnett
+
+#ifndef SAMWRITER_H
+#define SAMWRITER_H
+
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/IRecordWriter.h"
+#include <memory>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal { class SamWriterPrivate; }
+
+/// \brief The SamWriter class provides a writing interface for creating
+///        new SAM files.
+///
+/// \note The underlying buffered data may not be flushed to the file until the
+///       destructor is called. Trying to access the file (reading, stat-ing,
+///       indexing, etc.) before the SamWriter is destroyed yields undefined
+///       behavior. Enclose the SamWriter in some form of local scope (curly
+///       braces, a separate function, etc.) to ensure that its destructor is
+///       called before proceeding to read-based operations.
+///
+/// \code{.cpp}
+///  {
+///     SamWriter w(...);
+///     // write data
+///  }
+///  // now safe to access the new file
+/// \endcode
+///
+///
+class SamWriter : public IRecordWriter
+{
+public:
+    /// \brief Opens a SAM file for writing & writes the header information.
+    ///
+    /// \note Set \p filename to "-" for stdout.
+    ///
+    /// \param[in] filename     path to output SAM file
+    /// \param[in] header       BamHeader object
+    ///
+    /// \throws std::runtime_error if there was a problem opening the file for
+    ///         writing or if an error occurred while writing the header
+    ///
+    SamWriter(const std::string& filename, const BamHeader& header);
+
+    /// Fully flushes all buffered data & closes file.
+    ///
+    ~SamWriter(void);
+
+public:
+    
+    /// \brief Try to flush any buffered data to file.
+    ///
+    /// \note The underlying implementation may not necessarily flush buffered
+    ///       data immediately, especially in a multithreaded writer situation.
+    ///       Let the SamWriter go out of scope to fully ensure flushing.
+    ///
+    /// \throws std::runtime_error if flush fails
+    ///
+    void TryFlush(void) override;
+
+    /// \brief Write a record to the output SAM file.
+    ///
+    /// \param[in] record BamRecord object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    void Write(const BamRecord& record) override;
+
+    /// \brief Write a record to the output SAM file.
+    ///
+    /// \param[in] recordImpl BamRecordImpl object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    void Write(const BamRecordImpl& recordImpl) override;
+
+private:
+    std::unique_ptr<internal::SamWriterPrivate> d_;
+    DISABLE_MOVE_AND_COPY(SamWriter);
+};
+
+} // namesapce BAM
+} // namespace PacBio
+
+#endif // SAMWRITER_H
diff --git a/include/pbbam/SequenceInfo.h b/include/pbbam/SequenceInfo.h

new file mode 100644 (file)

index 0000000..88b8dd1
--- /dev/null
+++ b/include/pbbam/SequenceInfo.h
@@ -0,0 +1,232 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file SequenceInfo.h
+/// \brief Defines the SequenceInfo class.
+//
+// Author: Derek Barnett
+
+#ifndef SEQUENCEINFO_H
+#define SEQUENCEINFO_H
+
+#include "pbbam/Config.h"
+#include <map>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The SequenceInfo class represents a program entry (\@SQ) in the SAM
+///        header.
+///
+class PBBAM_EXPORT SequenceInfo
+{
+public:
+    /// \name Conversion & Validation
+    ///
+
+    /// \brief Creates a SequenceInfo object from SAM-formatted text.
+    ///
+    /// \param[in] sam  SAM-formatted text
+    /// \returns program info object
+    ///
+    static SequenceInfo FromSam(const std::string& sam);
+
+    /// \brief Converts a SequenceInfo object to its SAM-formatted text.
+    ///
+    /// \param[in] seq     input SequenceInfo object
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    static std::string ToSam(const SequenceInfo& seq);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty sequence info object.
+    SequenceInfo(void);
+
+    /// \brief Creates a sequence info object with name & (optional) length.
+    ///
+    /// \param[in] name       sequence name (\@SQ:SN)
+    /// \param[in] length     sequence length (\@SQ:LN)
+    ///
+    SequenceInfo(const std::string& name,
+                 const std::string& length = "0");
+
+    SequenceInfo(const SequenceInfo& other);
+    SequenceInfo(SequenceInfo&& other);
+    SequenceInfo& operator=(const SequenceInfo& other);
+    SequenceInfo& operator=(SequenceInfo&& other);
+    ~SequenceInfo(void);
+
+    /// \}
+
+public:
+    /// \name Operators
+    /// \{
+
+    bool operator==(const SequenceInfo& other) const;
+    bool operator!=(const SequenceInfo& other) const;
+
+    /// \}
+
+public:
+    /// \name Conversion & Validation
+    ///
+
+    /// \returns true if sequence info is valid
+    ///
+    /// Currently this checks to see that Name is non-empty and Length is within
+    /// the accepted range.
+    ///
+    bool IsValid(void) const;
+
+    /// \brief Converts this object to its SAM-formatted text.
+    ///
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    std::string ToSam(void) const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \returns string value of \@SQ:AS
+    std::string AssemblyId(void) const;
+
+    /// \returns string value of \@SQ:M5
+    std::string Checksum(void) const;
+
+    /// \returns any non-standard tags added to the \@PG entry
+    ///
+    /// Result map consists of {tagName => value}.
+    ///
+    std::map<std::string, std::string> CustomTags(void) const;
+
+    /// \returns string value of \@SQ:LN
+    std::string Length(void) const;
+
+    /// \returns string value of \@SQ:SN
+    std::string Name(void) const;
+
+    /// \returns string value of \@SQ:SP
+    std::string Species(void) const;
+
+    /// \returns string value of \@SQ:UR
+    std::string Uri(void) const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \brief Sets the value for \@SQ:AS
+    ///
+    /// \param[in] id      new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& AssemblyId(const std::string& id);
+
+    /// \brief Sets the value for \@SQ:M5
+    ///
+    /// \param[in] checksum      new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& Checksum(const std::string& checksum);
+
+    /// \brief Sets a new collection of non-standard tags.
+    ///
+    /// Custom tag map entries should consist of {tagName => value}.
+    ///
+    /// \param[in] custom      new tags
+    /// \returns reference to this object
+    ///
+    SequenceInfo& CustomTags(const std::map<std::string, std::string>& custom);
+
+    /// \brief Sets the value for \@SQ:LN
+    ///
+    /// \param[in] length      new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& Length(const std::string& length);
+
+    /// \brief Sets the value for \@SQ:SN
+    ///
+    /// \param[in] name      new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& Name(const std::string& name);
+
+    /// \brief Sets the value for \@SQ:SP
+    ///
+    /// \param[in] species     new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& Species(const std::string& species);
+
+    /// \brief Sets the value for \@SQ:UR
+    ///
+    /// \param[in] uri      new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& Uri(const std::string& uri);
+
+    /// \}
+
+private:
+    std::string name_;          // SN:<Name>    * must be unique for valid SAM *
+    std::string length_;        // LN:<Length>  * must be within [0 - 2^31-1] *
+    std::string assemblyId_;    // AS:<AssemblyId>
+    std::string checksum_;      // M5:<Checksum>
+    std::string species_;       // SP:<Species>
+    std::string uri_;           // UR:<URI>
+
+    // custom attributes
+    std::map<std::string, std::string> custom_; // tag => value
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/SequenceInfo.inl"
+
+#endif // SEQUENCEINFO_H
diff --git a/include/pbbam/Strand.h b/include/pbbam/Strand.h

new file mode 100644 (file)

index 0000000..6fa5043
--- /dev/null
+++ b/include/pbbam/Strand.h
@@ -0,0 +1,62 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Strand.h
+/// \brief Defines the Strand enum.
+//
+// Author: Derek Barnett
+
+#ifndef STRAND_H
+#define STRAND_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the strand orientations used for reporting
+///        alignment-related information.
+///
+enum class Strand
+{
+    FORWARD     ///< Forward strand
+  , REVERSE     ///< Reverse strand
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // STRAND_H
diff --git a/include/pbbam/SubreadLengthQuery.h b/include/pbbam/SubreadLengthQuery.h

new file mode 100644 (file)

index 0000000..e8839fe
--- /dev/null
+++ b/include/pbbam/SubreadLengthQuery.h
@@ -0,0 +1,99 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file SubreadLengthQuery.h
+/// \brief Defines the SubreadLengthQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef SUBREADLENGTHQUERY_H
+#define SUBREADLENGTHQUERY_H
+
+#include "pbbam/Compare.h"
+#include "pbbam/Config.h"
+#include "pbbam/internal/QueryBase.h"
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The SubreadLengthQuery class provides iterable access to a DataSet's
+///        %BAM records, limiting results to those matching a subread length
+///        criterion.
+///
+/// Example:
+/// \include code/SubreadLengthQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT SubreadLengthQuery : public internal::IQuery
+{
+public:
+    /// \brief Creates a new SubreadLengthQuery, limiting record results to only
+    ///        those matching a subread length criterion.
+    ///
+    /// \param[in] length       subread length value
+    /// \param[in] compareType  compare operator
+    /// \param[in] dataset      input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or PBI
+    ///         files.
+    ///
+    SubreadLengthQuery(const int32_t length,
+                       const Compare::Type compareType,
+                       const DataSet& dataset);
+
+    ~SubreadLengthQuery(void);
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r);
+
+private:
+    struct SubreadLengthQueryPrivate;
+    std::unique_ptr<SubreadLengthQueryPrivate> d_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // SUBREADLENGTHQUERY_H
diff --git a/include/pbbam/Tag.h b/include/pbbam/Tag.h

new file mode 100644 (file)

index 0000000..0520e38
--- /dev/null
+++ b/include/pbbam/Tag.h
@@ -0,0 +1,453 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Tag.h
+/// \brief Defines the Tag class.
+//
+// Author: Derek Barnett
+
+#ifndef TAG_H
+#define TAG_H
+
+#include "pbbam/Config.h"
+#include <boost/variant.hpp>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum is used to describe the exact (C++) data type held by a
+///        Tag.
+///
+enum class TagDataType
+{
+    INVALID      = 0     ///< boost::blank
+  , INT8                 ///< int8_t
+  , UINT8                ///< uint8_t
+  , INT16                ///< int16_t
+  , UINT16               ///< uint16_t
+  , INT32        = 5     ///< int32_t
+  , UINT32               ///< uint32_t
+  , FLOAT                ///< float
+  , STRING               ///< std::string
+  , INT8_ARRAY           ///< std::vector<int8_t>
+  , UINT8_ARRAY  = 10    ///< std::vector<uint8_t>
+  , INT16_ARRAY          ///< std::vector<int16_t>
+  , UINT16_ARRAY         ///< std::vector<uint16_t>
+  , INT32_ARRAY          ///< std::vector<int32_t>
+  , UINT32_ARRAY         ///< std::vector<uint32_t>
+  , FLOAT_ARRAY  = 15    ///< std::vector<float>
+};
+
+/// \brief This enum provides additional instructions on interpreting the tag's
+///        value.
+///
+/// Some C++ data types (e.g. std::string) may represent more than one BAM tag
+/// type ('H' vs 'Z'). Thus a TagModifier may be used to indicate how to
+/// properly distinguish between these shared data types.
+///
+enum class TagModifier
+{
+    /// \brief This value indicates that the tag has no modifiers set.
+    ///
+    NONE = 0,
+
+    /// \brief This modifier marks an integer as ASCII.
+    ///
+    /// SAM/BAM has the concept of an ASCII character that is distinct from an
+    /// 8-bit integer. However, there is no such pure separation in C++ - as
+    /// int8_t/uint8_t are likely implemented as typedefs around char/unsigned
+    /// char. Thus this modifier can be used to indicate a tag's value should be
+    /// interpreted as a printable, ASCII character.
+    ///
+    ASCII_CHAR,
+
+    /// \brief This modifier marks std::string data as "hex string", rather than
+    ///        a regular string.
+    ///
+    /// SAM/BAM has a distinction between regular strings and "Hex format"
+    /// strings. However, they are both manipulated in C++ via std::string. Thus
+    /// this modifier can be used to indicate that a tag's string data should be
+    /// interpreted as "Hex format" rather than a regular, literal string.
+    ///
+    HEX_STRING
+};
+
+/// \brief The Tag class represents a SAM/BAM record tag value.
+///
+/// SAM/BAM tags may store values from a variety of types: varying fixed-width
+/// integers, strings, arrays of data, etc.
+///
+/// The Tag class allow tags to be handled in a generic fashion, while
+/// maintaining a high level of type-safety. Only those types recognized by the
+/// SAM/BAM format are allowed, and extracting the value from a tag is subject
+/// to allowed conversion rules, as well.
+///
+// Inspired by (but greatly simplified & modified from) the boost::variant
+// wrapper approach taken by DynamicCpp (https://code.google.com/p/dynamic-cpp)
+//
+class PBBAM_EXPORT Tag
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty, null tag
+    Tag(void);
+
+    /// \brief Creates a Tag from a signed 8-bit integer or character.
+    ///
+    /// Without a TagModifier, the resulting Tag will be annotated as containing
+    /// an 8-bit integer, whether the input \p value was an integer or a char.
+    /// For ASCII tags, use one of these methods:
+    /// \include code/Tag_AsciiCtor.txt
+    ///
+    Tag(int8_t value);
+
+   /// \brief Creates a Tag from a signed 8-bit integer or character,
+   ///        applying the provided modifier.
+    ///
+    /// This method allows direct construction of an ASCII character, rather
+    /// than an 8-bit integer (e.g. Tag('A', TagModifier::ASCII_CHAR) ).
+    ///
+    /// \throws runtime_error if \p modifier is not valid for int8_t data
+    ///
+    Tag(int8_t value, const TagModifier mod);
+
+    /// \brief Creates a Tag from an unsigned 8-bit integer or character.
+    ///
+    /// Without a TagModifier, the resulting Tag will be annotated as containing
+    /// an 8-bit unsigned integer, whether the input \p value was an integer or
+    /// a char. For ASCII tags, use one of these methods:
+    /// \include code/Tag_AsciiCtor.txt
+    ///
+    Tag(uint8_t value);
+
+    /// \brief Creates a Tag from 16-bit integer.
+    Tag(int16_t value);
+
+    /// \brief Creates a Tag from 16-bit unsigned integer.
+    Tag(uint16_t value);
+
+    /// \brief Creates a Tag from 32-bit signed integer.
+    Tag(int32_t value);
+
+    /// \brief Creates a Tag from 32-bit unsigned integer.
+    Tag(uint32_t value);
+
+    /// \brief Creates a Tag from floating-point value.
+    Tag(float value);
+
+    /// \brief Creates a Tag from string data.
+    Tag(const std::string& value);
+
+    /// \brief Creates a Tag from string data, adding modifier.
+    ///
+    /// \throws runtime_error if \p modifier is not valid for string data
+    ///
+    Tag(const std::string& value, const TagModifier mod);
+
+    /// \brief Creates a Tag from a vector of 8-bit integers.
+    Tag(const std::vector<int8_t>& value);
+
+    /// \brief Creates a Tag from a vector of 8-bit unsigned integers.
+    Tag(const std::vector<uint8_t>& value);
+
+    /// \brief Creates a Tag from a vector of 16-bit integers.
+    Tag(const std::vector<int16_t>& value);
+
+    /// \brief Creates a Tag from a vector of 16-bit unsigned integers.
+    Tag(const std::vector<uint16_t>& value);
+
+    /// Constructs a Tag from a vector of 32-bit integers.
+    Tag(const std::vector<int32_t>& value);
+
+    /// \brief Creates a Tag from a vector of 32-bit unsigned integers.
+    Tag(const std::vector<uint32_t>& value);
+
+    /// \brief Creates a Tag from a vector of floating-point values.
+    Tag(const std::vector<float>& value);
+    
+    Tag(const Tag& other);
+    Tag(Tag&& other);
+    ~Tag(void);
+
+    Tag& operator=(boost::blank value);
+    Tag& operator=(int8_t value);
+    Tag& operator=(uint8_t value);
+    Tag& operator=(int16_t value);
+    Tag& operator=(uint16_t value);
+    Tag& operator=(int32_t value);
+    Tag& operator=(uint32_t value);
+    Tag& operator=(float value);
+    Tag& operator=(const std::string& value);
+    Tag& operator=(const std::vector<int8_t>& value);
+    Tag& operator=(const std::vector<uint8_t>& value);
+    Tag& operator=(const std::vector<int16_t>& value);
+    Tag& operator=(const std::vector<uint16_t>& value);
+    Tag& operator=(const std::vector<int32_t>& value);
+    Tag& operator=(const std::vector<uint32_t>& value);
+    Tag& operator=(const std::vector<float>& value);
+    Tag& operator=(const Tag& other);
+    Tag& operator=(Tag&& other);
+
+    bool operator== (const Tag& other) const;
+    bool operator!= (const Tag& other) const;
+
+    /// \}
+
+public:
+    /// \name Data Conversion & Validation
+    /// \{
+
+    /// \brief Converts the tag value to an ASCII character.
+    ///
+    /// Tag must hold an integral type, within the valid ASCII range [33-127].
+    ///
+    /// \returns ASCII character value
+    /// \throws std::runtime_error if not ASCII-compatible
+    ///
+    char ToAscii(void) const;
+
+    /// \returns tag data as signed 8-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    int8_t ToInt8(void) const;
+
+    /// \returns tag data as unsigned 8-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    uint8_t ToUInt8(void) const;
+
+    /// \returns tag data as signed 16-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    int16_t ToInt16(void) const;
+
+    /// \returns tag data as unsigned 16-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    uint16_t ToUInt16(void) const;
+
+    /// \returns tag data as signed 32-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    int32_t ToInt32(void) const;
+
+    /// \returns tag data as unsigned 32-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    uint32_t ToUInt32(void) const;
+
+    /// \returns tag data as float
+    /// \throws std::runtime_error if tag does not contain a value of
+    ///         explicit type: float
+    float ToFloat(void) const;
+
+    /// \returns tag data as std::string
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::string
+    std::string ToString(void) const;
+
+    /// \returns tag data as std::vector<int8_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<int8_t>
+    std::vector<int8_t> ToInt8Array(void) const;
+
+    /// \returns tag data as std::vector<uint8_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<uint8_t>
+    std::vector<uint8_t> ToUInt8Array(void) const;
+
+    /// \returns tag data as std::vector<int16_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<int16_t>
+    std::vector<int16_t> ToInt16Array(void) const;
+
+    /// \returns tag data as std::vector<uint16_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<uint16_t>
+    std::vector<uint16_t> ToUInt16Array(void) const;
+
+    /// \returns tag data as std::vector<int32_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<int32_t>
+    std::vector<int32_t> ToInt32Array(void) const;
+
+    /// \returns tag data as std::vector<uint32_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<uint32_t>
+    std::vector<uint32_t> ToUInt32Array(void) const;
+
+    /// \returns tag data as std::vector<float>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<float>
+    std::vector<float> ToFloatArray(void) const;
+
+    /// \}
+
+public:
+
+    /// \name Data Conversion & Validation
+    ///
+
+    /// \returns true if tag is null (e.g. default-constructed)
+    bool IsNull(void) const;
+
+    /// \returns true if tag contains a value of type: int8_t
+    bool IsInt8(void) const;
+
+    /// \returns true if tag contains a value of type: uint8_t
+    bool IsUInt8(void) const;
+
+    /// \returns true if tag contains a value of type: int16_t
+    bool IsInt16(void) const;
+
+    /// \returns true if tag contains a value of type: uint16_t
+    bool IsUInt16(void) const;
+
+    /// \returns true if tag contains a value of type: int32_t
+    bool IsInt32(void) const;
+
+    /// \returns true if tag contains a value of type: uint32_t
+    bool IsUInt32(void) const;
+
+    /// \returns true if tag contains a value of type: float
+    bool IsFloat(void) const;
+
+    /// \returns true if tag contains a value of type: std::string
+    bool IsString(void) const;
+
+    /// \returns true if tag contains a value of type: std::string \b AND has a
+    ///          TagModifier of TagModifier::HEX_STRING
+    bool IsHexString(void) const;
+
+    /// \returns true if tag contains a value of type: std::vector<int8_t>
+    bool IsInt8Array(void) const;
+
+    /// \returns true if tag contains a value of type: std::vector<uint8_t>
+    bool IsUInt8Array(void) const;
+
+    /// \returns true if tag contains a value of type: std::vector<int16_t>
+    bool IsInt16Array(void) const;
+
+    /// \returns true if tag contains a value of type: std::vector<uint16_t>
+    bool IsUInt16Array(void) const;
+
+    /// \returns true if tag contains a value of type: std::vector<int32_t>
+    bool IsInt32Array(void) const;
+
+    /// \returns true if tag contains a value of type: std::vector<uint32_t>
+    bool IsUInt32Array(void) const;
+
+    /// \returns true if tag contains a value of type: std::vector<float>
+    bool IsFloatArray(void) const;
+
+    /// \returns true if tag contains a value with any signed integer type
+    bool IsSignedInt(void) const;
+
+    /// \returns true if tag contains a value with any unsigned integer type
+    bool IsUnsignedInt(void) const;
+
+    /// \returns true if tag contains a value with any integer type
+    bool IsIntegral(void) const;
+
+    /// \returns true if tag contains a value with any integer or float type
+    bool IsNumeric(void) const;
+
+    /// \returns true if tag contains a vector containing signed integers
+    bool IsSignedArray(void) const;
+
+    /// \returns true if tag contains a vector containing unsigned integers
+    bool IsUnsignedArray(void) const;
+
+    /// \returns true if tag contains a vector containing integers
+    bool IsIntegralArray(void) const;
+
+    /// \returns true if tag contains a vector (integers or floats)
+    bool IsArray(void) const;
+
+    /// \}
+
+public:
+    /// \name Type & Modifier Attributes
+    /// \{
+
+    /// \returns enum value for current tag data
+    TagDataType Type(void) const;
+
+    /// \returns printable type name for current tag data
+    std::string Typename(void) const;
+
+    /// \returns true if tag data modifier \p m is set
+    bool HasModifier(const TagModifier m) const;
+
+    /// \returns current tag data modifier
+    TagModifier Modifier(void) const;
+
+    /// \brief Sets tag data modifier.
+    ///
+    /// \param[in] m    new modifier value
+    ///
+    /// \returns reference to this tag
+    Tag& Modifier(const TagModifier m);
+
+    /// \}
+
+private :
+    // NOTE - keep this synced with TagDataType enum ordering
+    typedef boost::variant<boost::blank, // <-- default constructor creates variant of this type
+                           int8_t,
+                           uint8_t,
+                           int16_t,
+                           uint16_t,
+                           int32_t,
+                           uint32_t,
+                           float,
+                           std::string,
+                           std::vector<int8_t>,
+                           std::vector<uint8_t>,
+                           std::vector<int16_t>,
+                           std::vector<uint16_t>,
+                           std::vector<int32_t>,
+                           std::vector<uint32_t>,
+                           std::vector<float> > var_t;
+
+    var_t data_;
+    TagModifier modifier_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/Tag.inl"
+
+#endif // TAG_H
diff --git a/include/pbbam/TagCollection.h b/include/pbbam/TagCollection.h

new file mode 100644 (file)

index 0000000..11c80ff
--- /dev/null
+++ b/include/pbbam/TagCollection.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file TagCollection.h
+/// \brief Defines the TagCollection class.
+//
+// Author: Derek Barnett
+
+#ifndef TAGCOLLECTION_H
+#define TAGCOLLECTION_H
+
+#include "pbbam/Config.h"
+#include "pbbam/Tag.h"
+#include <map>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The TagCollection class represents a collection (or "dictionary") of
+///        tags.
+///
+/// Tags are mapped to their tag name, a 2-character string.
+///
+class PBBAM_EXPORT TagCollection : public std::map<std::string, Tag>
+{
+public:
+    /// \returns true if the collection contains a tag with \p name
+    bool Contains(const std::string& name) const;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // TAGCOLLECTION_H
diff --git a/include/pbbam/UnmappedReadsQuery.h b/include/pbbam/UnmappedReadsQuery.h

new file mode 100644 (file)

index 0000000..1623dc9
--- /dev/null
+++ b/include/pbbam/UnmappedReadsQuery.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2014, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef UNMAPPEDREADSQUERY_H
+#define UNMAPPEDREADSQUERY_H
+
+// TODO: Not yet implemented.
+//       Can't get bam_itr_queryi(idx, HTS_IDX_NOCOOR, -1, -1) to work reliably at the moment.
+
+//#include "pbbam/QueryBase.h"
+//#include <string>
+
+//namespace PacBio {
+//namespace BAM {
+
+//class BamFile;
+
+//class PBBAM_EXPORT UnmappedReadsQuery : public QueryBase
+//{
+//public:
+//    UnmappedReadsQuery(const BamFile& file);
+
+//protected:
+//    bool GetNext(BamRecord& record);
+
+//private:
+//    std::shared_ptr<samFile>   file_;
+//    std::shared_ptr<bam_hdr_t> header_;
+//    std::shared_ptr<hts_idx_t> index_;
+//    std::shared_ptr<hts_itr_t> iterator_;
+//};
+
+//} // namespace BAM
+//} // namspace PacBio
+
+#endif // UNMAPPEDREADSQUERY_H
diff --git a/include/pbbam/Validator.h b/include/pbbam/Validator.h

new file mode 100644 (file)

index 0000000..03f6c6c
--- /dev/null
+++ b/include/pbbam/Validator.h
@@ -0,0 +1,192 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Validator.h
+/// \brief Defines the Validator class.
+//
+// Author: Derek Barnett
+
+#ifndef VALIDATOR_H
+#define VALIDATOR_H
+
+#include "pbbam/Config.h"
+#include "pbbam/exception/ValidationException.h"
+#include <limits>
+
+namespace PacBio {
+namespace BAM {
+
+class BamFile;
+class BamHeader;
+class BamRecord;
+class ReadGroupInfo;
+
+/// \brief The Validator class provides validation for %BAM data.
+///
+/// There are 2 ways to use this class. If you are only compared with a quick &
+/// dirty, yes/no validation, then you can use the IsValid() methods. This will
+/// swallow the specific cause of the failure, but you don't have to catch an
+/// exception and handle it in your client code. If you want to know,
+/// specifically, what failed, then you can use the Validate*() methods that
+/// will throw a ValidationException if the object is invalid. This exception
+/// will provide more details as to what failed and why.
+///
+/// See documentation for Config.h for details on building pbbam with
+/// auto-validation enabled.
+///
+class PBBAM_EXPORT Validator
+{
+public:
+    /// \brief Checks that a %BAM file conforms to the %PacBio specification.
+    ///
+    /// When \p entireFile is false, this method only checks file metadata. If
+    /// \p entireFile is true, all records are checked as well.
+    ///
+    /// \param[in] file         %BAM header to validate
+    /// \param[in] entireFile   check records in addition to metadata
+    /// \returns true if \p file passes validation checks
+    ///
+    /// \sa Validator::ValidateFileMetdata, Validator::ValidateEntireFile
+    ///
+    static bool IsValid(const BamFile& file, const bool entireFile);
+
+    /// \brief Checks that a %BAM header conforms to the %PacBio specification.
+    ///
+    /// \returns true if \p header passes validation checks
+    ///
+    /// \sa Validator::Validate(const BamHeader& header)
+    ///
+    static bool IsValid(const BamHeader& header);
+
+    /// \brief Checks that a %BAM read group conforms to the %PacBio
+    ///        specification.
+    ///
+    /// \returns true if \p rg passes validation checks
+    ///
+    /// \sa Validator::Validate(const ReadGroupInfo& rg)
+    ///
+    static bool IsValid(const ReadGroupInfo& rg);
+
+    /// \brief Checks that a %BAM record conforms to the %PacBio specification.
+    ///
+    /// \returns true if \p record passes validation checks
+    ///
+    /// \sa Validator::Validate(const BamRecord& record)
+    ///
+    static bool IsValid(const BamRecord& record);
+
+public:
+    /// \brief Checks that a %BAM file's header conforms to the
+    ///        %PacBio specification.
+    ///
+    /// This validation step checks the SAM/%BAM version number, sort order,
+    /// PacBioBAM version number, and calls Validate(readGroup) internally for
+    /// all read groups.
+    ///
+    /// \param[in] file         %BAM header to validate
+    /// \param[in] maxErrors    maximum number of errors to allow before throwing
+    ///
+    /// \throws ValidationException if \p header fails validation checks
+    ///
+    static void Validate(const BamHeader& header,
+                         const size_t maxErrors = std::numeric_limits<size_t>::max());
+
+    /// \brief Checks that a %BAM read group conforms to the %PacBio
+    ///        specification.
+    ///
+    /// \param[in] rg           %BAM read group to validate
+    /// \param[in] maxErrors    maximum number of errors to allow before throwing
+    ///
+    /// \throws ValidationException if \p rg fails validation checks
+    ///
+    static void Validate(const ReadGroupInfo& rg,
+                         const size_t maxErrors = std::numeric_limits<size_t>::max());
+
+    /// \brief Checks that a %BAM record conforms to the %PacBio specification.
+    ///
+    /// \param[in] record       %BAM record to validate
+    /// \param[in] maxErrors    maximum number of errors to allow before throwing
+    ///
+    /// \throws ValidationException if \p record fails validation checks
+    ///
+    static void Validate(const BamRecord& record,
+                         const size_t maxErrors = std::numeric_limits<size_t>::max());
+
+    /// \brief Checks that a %BAM file's (entire) contents conform to the
+    ///        %PacBio specification.
+    ///
+    /// This is equivalent to:
+    ///
+    /// \code
+    /// Validator::ValidateMetadata(file);
+    /// EntireFileQuery query(file);
+    /// for (const BamRecord& record : query)
+    ///     Validator::Validate(record);
+    /// \endcode
+    ///
+    /// \param[in] file         %BAM file to validate
+    /// \param[in] maxErrors    maximum number of errors to allow before throwing
+    ///
+    /// \throws ValidationException if \p file fails validation checks
+    ///
+    static void ValidateEntireFile(const BamFile& file,
+                                   const size_t maxErrors = std::numeric_limits<size_t>::max());
+
+    /// \brief Checks that a %BAM file's metadata conforms to the
+    ///        %PacBio specification.
+    ///
+    /// This validation step checks the filename, ensures EOF marker, and
+    /// presence of PBI. It also calls Validate(file.Header()) internally.
+    ///
+    /// \param[in] file         %BAM header to validate
+    /// \param[in] maxErrors    maximum number of errors to allow before throwing
+    ///
+    /// \throws ValidationException if \p header fails validation checks
+    ///
+    static void ValidateFileMetadata(const BamFile& file,
+                                     const size_t maxErrors = std::numeric_limits<size_t>::max());
+
+private:
+    // hidden constructor
+    Validator(void) = delete;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#include "internal/Validator.inl"
+
+#endif // VALIDATOR_H
diff --git a/include/pbbam/ZmwGroupQuery.h b/include/pbbam/ZmwGroupQuery.h

new file mode 100644 (file)

index 0000000..290d3ad
--- /dev/null
+++ b/include/pbbam/ZmwGroupQuery.h
@@ -0,0 +1,94 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ZmwGroupQuery.h
+/// \brief Defines the ZmwGroupQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef ZMWGROUPQUERY_H
+#define ZMWGROUPQUERY_H
+
+#include "pbbam/internal/QueryBase.h"
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ZmwGroupQuery class provides iterable access to a DataSet's
+///        %BAM records, limiting results to those matching a ZMW hole number
+///        whitelist, and grouping those results by hole number.
+///
+/// Example:
+/// \include code/ZmwGroupQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT ZmwGroupQuery : public internal::IGroupQuery
+{
+public:
+    /// \brief Creates a new ZmwGroupQuery, limiting record results to only
+    ///        those matching a ZMW hole number criterion.
+    ///
+    /// \param[in] zmwWhitelist     vector of allowed ZMW hole numbers
+    /// \param[in] dataset          input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or
+    ///         PBI files.
+    ///
+    ZmwGroupQuery(const std::vector<int32_t>& zmwWhitelist,
+                  const DataSet& dataset);
+    ~ZmwGroupQuery(void);
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(std::vector<BamRecord>& records);
+
+private:
+    struct ZmwGroupQueryPrivate;
+    std::unique_ptr<ZmwGroupQueryPrivate> d_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // ZMWGROUPQUERY_H
diff --git a/include/pbbam/ZmwQuery.h b/include/pbbam/ZmwQuery.h

new file mode 100644 (file)

index 0000000..0d6e166
--- /dev/null
+++ b/include/pbbam/ZmwQuery.h
@@ -0,0 +1,96 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ZmwQuery.h
+/// \brief Defines the ZmwQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef ZMWQUERY_H
+#define ZMWQUERY_H
+
+#include "pbbam/Config.h"
+#include "pbbam/internal/QueryBase.h"
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ZmwQuery class provides iterable access to a DataSet's
+///        %BAM records, limiting results to those matching a ZMW hole number
+///        whitelist.
+///
+/// Example:
+/// \include code/ZmwQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT ZmwQuery : public internal::IQuery
+{
+public:
+    /// \brief Creates a new ZmwQuery, limiting record results to only
+    ///        those matching a ZMW hole number criterion.
+    ///
+    /// \param[in] zmwWhitelist     vector of allowed ZMW hole numbers
+    /// \param[in] dataset          input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or
+    ///         PBI files.
+    ///
+    ZmwQuery(const std::vector<int32_t>& zmwWhitelist,
+             const DataSet& dataset);
+
+    ~ZmwQuery(void);
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r);
+
+private:
+    struct ZmwQueryPrivate;
+    std::unique_ptr<ZmwQueryPrivate> d_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // ZMWQUERY_H
diff --git a/include/pbbam/ZmwType.h b/include/pbbam/ZmwType.h

new file mode 100644 (file)

index 0000000..a93e295
--- /dev/null
+++ b/include/pbbam/ZmwType.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ZmwType.h
+/// \brief Defines the ZmwType enum.
+//
+// Author: Armin Töpfer
+
+#ifndef ZMWTYPE_H
+#define ZMWTYPE_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the different ZMW categories of scraps
+///
+enum class ZmwType : char
+{
+    CONTROL   = 'C',
+    MALFORMED = 'M',
+    NORMAL    = 'N',
+    SENTINEL  = 'S'
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // ZMWTYPE_H
diff --git a/include/pbbam/ZmwTypeMap.h b/include/pbbam/ZmwTypeMap.h

new file mode 100644 (file)

index 0000000..4dc781c
--- /dev/null
+++ b/include/pbbam/ZmwTypeMap.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ZmwTypeMap.h
+/// \brief Defines the ZmwTypeMap class.
+//
+// Author: Armin Töpfer
+
+#ifndef ZMWTYPEMAP_H
+#define ZMWTYPEMAP_H
+
+#include <map>
+
+#include "pbbam/Config.h"
+#include "pbbam/ZmwType.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ZmwTypeMap class provides mapping between char codes and
+///        ZmwType enum keys.
+///
+class ZmwTypeMap
+{
+public:
+       static std::map<char, ZmwType> ParseChar;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // ZMWTYPEMAP_H
diff --git a/include/pbbam/exception/InvalidSequencingChemistryException.h b/include/pbbam/exception/InvalidSequencingChemistryException.h

new file mode 100644 (file)

index 0000000..a670bc3
--- /dev/null
+++ b/include/pbbam/exception/InvalidSequencingChemistryException.h
@@ -0,0 +1,103 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file InvalidSequencingChemistryException.h
+/// \brief Defines the InvalidSequencingChemistryException class.
+//
+// Author: Derek Barnett
+
+#ifndef INVALIDSEQUENCINGCHEMISTRYEXCEPTION_H
+#define INVALIDSEQUENCINGCHEMISTRYEXCEPTION_H
+
+#include <exception>
+#include <sstream>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The InvalidSequencingChemistryException class represents an exception
+///        that will be thrown when an invalid sequencing chemistry combination
+///        is encountered.
+///
+class InvalidSequencingChemistryException : public std::exception
+{
+public:
+    InvalidSequencingChemistryException(const std::string& bindingKit,
+                                        const std::string& sequencingKit,
+                                        const std::string& basecallerVersion)
+        : bindingKit_(bindingKit)
+        , sequencingKit_(sequencingKit)
+        , basecallerVersion_(basecallerVersion)
+    {
+        std::stringstream s;
+        s << "unsupported sequencing chemistry combination: " << std::endl
+          << "    binding kit:        " << bindingKit_ << std::endl
+          << "    sequencing kit:     " << sequencingKit_ << std::endl
+          << "    basecaller version: " << basecallerVersion_ << std::endl;
+        what_ = s.str();
+    }
+
+    // This is a work around for the Intel PHI compiler (icpc)
+    ~InvalidSequencingChemistryException() throw()
+    {
+
+    }
+public:
+    const std::string& BindingKit(void) const
+    { return bindingKit_; }
+
+    const std::string& SequencingKit(void) const
+    { return sequencingKit_; }
+
+    const std::string& BasecallerVersion(void) const
+    { return basecallerVersion_; }
+
+public:
+    const char* what(void) const noexcept override
+    { return what_.c_str(); }
+
+protected:
+    std::string bindingKit_;
+    std::string sequencingKit_;
+    std::string basecallerVersion_;
+    std::string what_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // INVALIDSEQUENCINGCHEMISTRYEXCEPTION_H
diff --git a/include/pbbam/exception/ValidationException.h b/include/pbbam/exception/ValidationException.h

new file mode 100644 (file)

index 0000000..92d8f17
--- /dev/null
+++ b/include/pbbam/exception/ValidationException.h
@@ -0,0 +1,99 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ValidationException.h
+/// \brief Defines the ValidationException class.
+//
+// Author: Derek Barnett
+
+#ifndef VALIDATIONEXCEPTION_H
+#define VALIDATIONEXCEPTION_H
+
+#include <map>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ValidationExecption represents an exception that will be thrown
+///        when any error is encountered using the Validator API. In addition to
+///        a default display message, it provides programmatic access to all
+///        reported error messages.
+///
+/// \sa Validator::Validate(const BamRecord& record)
+///
+class ValidationException : public std::runtime_error
+{
+public:
+    typedef std::vector<std::string>         ErrorList;
+    typedef std::map<std::string, ErrorList> ErrorMap;
+
+public:
+    ValidationException(const ErrorMap& fileErrors,
+                        const ErrorMap& readGroupErrors,
+                        const ErrorMap& recordErrors);
+    ValidationException(ErrorMap&& fileErrors,
+                        ErrorMap&& readGroupErrors,
+                        ErrorMap&& recordErrors);
+    // This is a work around for the Intel PHI compiler (icpc)
+    ~ValidationException() throw()
+    {
+
+    }
+public:
+    const ErrorMap& FileErrors(void) const;
+    const ErrorMap& ReadGroupErrors(void) const;
+    const ErrorMap& RecordErrors(void) const;
+
+    const char* what(void) const noexcept override;
+
+private:
+    ErrorMap fileErrors_;
+    ErrorMap readGroupErrors_;
+    ErrorMap recordErrors_;
+    std::string msg_;
+
+private:
+    void FormatMessage(void);
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // VALIDATIONEXCEPTION_H
diff --git a/include/pbbam/internal/Accuracy.inl b/include/pbbam/internal/Accuracy.inl

new file mode 100644 (file)

index 0000000..f859662
--- /dev/null
+++ b/include/pbbam/internal/Accuracy.inl
@@ -0,0 +1,66 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Accuracy.inl
+/// \brief Inline implementations for the Accuracy class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Accuracy.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline Accuracy::Accuracy(float accuracy)
+{
+    if (accuracy < Accuracy::MIN)
+        accuracy = Accuracy::MIN;
+    else if (accuracy > Accuracy::MAX)
+        accuracy = Accuracy::MAX;
+    accuracy_ = accuracy;
+}
+
+inline Accuracy::Accuracy(const Accuracy &other)
+    : accuracy_(other.accuracy_)
+{ }
+
+inline Accuracy::~Accuracy(void) { }
+
+inline Accuracy::operator float(void) const
+{ return accuracy_; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/BamHeader.inl b/include/pbbam/internal/BamHeader.inl

new file mode 100644 (file)

index 0000000..2445a25
--- /dev/null
+++ b/include/pbbam/internal/BamHeader.inl
@@ -0,0 +1,154 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamHeader.inl
+/// \brief Inline implementations for the BamHeader class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamHeader.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+class BamHeaderPrivate
+{
+public:
+    std::string version_;
+    std::string pacbioBamVersion_;
+    std::string sortOrder_;
+    std::map<std::string, std::string> headerLineCustom_;
+
+    std::map<std::string, ReadGroupInfo> readGroups_; // id => read group info
+    std::map<std::string, ProgramInfo> programs_;     // id => program info
+    std::vector<std::string> comments_;
+
+    // we need to preserve insertion order, use lookup for access by name
+    std::vector<SequenceInfo> sequences_;
+    std::map<std::string, int32_t> sequenceIdLookup_;
+};
+
+} // namespace internal
+
+inline BamHeader::BamHeader(void)
+    : d_(new internal::BamHeaderPrivate)
+{ }
+
+inline BamHeader::BamHeader(const BamHeader& other)
+    : d_(other.d_)
+{ }
+
+inline BamHeader::BamHeader(BamHeader&& other)
+    : d_(std::move(other.d_))
+{ }
+
+inline BamHeader& BamHeader::operator=(const BamHeader& other)
+{ d_ = other.d_; return *this; }
+
+inline BamHeader& BamHeader::operator=(BamHeader&& other)
+{ d_ = std::move(other.d_); return *this; }
+
+inline BamHeader::~BamHeader(void) { }
+
+inline BamHeader BamHeader::operator+(const BamHeader& other) const
+{ return DeepCopy() += other; }
+
+inline BamHeader& BamHeader::AddComment(const std::string& comment)
+{ d_->comments_.push_back(comment); return *this; }
+
+inline BamHeader& BamHeader::AddProgram(const ProgramInfo& pg)
+{ d_->programs_[pg.Id()] = pg; return *this; }
+
+inline BamHeader& BamHeader::AddReadGroup(const ReadGroupInfo& readGroup)
+{ d_->readGroups_[readGroup.Id()] = readGroup; return *this; }
+
+inline BamHeader& BamHeader::ClearComments(void)
+{ d_->comments_.clear(); return* this; }
+
+inline BamHeader& BamHeader::ClearPrograms(void)
+{ d_->programs_.clear(); return *this; }
+
+inline BamHeader& BamHeader::ClearReadGroups(void)
+{ d_->readGroups_.clear(); return *this; }
+
+inline std::vector<std::string> BamHeader::Comments(void) const
+{ return d_->comments_; }
+
+inline BamHeader& BamHeader::Comments(const std::vector<std::string>& comments)
+{ d_->comments_ = comments; return *this; }
+
+inline bool BamHeader::HasProgram(const std::string& id) const
+{ return d_->programs_.find(id) != d_->programs_.cend(); }
+
+inline bool BamHeader::HasReadGroup(const std::string& id) const
+{ return d_->readGroups_.find(id) != d_->readGroups_.cend(); }
+
+inline bool BamHeader::HasSequence(const std::string& name) const
+{ return d_->sequenceIdLookup_.find(name) != d_->sequenceIdLookup_.cend(); }
+
+inline size_t BamHeader::NumSequences(void) const
+{ return d_->sequences_.size(); }
+
+inline std::string BamHeader::PacBioBamVersion(void) const
+{ return d_->pacbioBamVersion_; }
+
+inline SequenceInfo BamHeader::Sequence(const int32_t id) const
+{ return d_->sequences_.at(id); }
+
+inline std::string BamHeader::SequenceLength(const int32_t id) const
+{ return Sequence(id).Length(); }
+
+inline std::string BamHeader::SequenceName(const int32_t id) const
+{ return Sequence(id).Name(); }
+
+inline std::vector<SequenceInfo> BamHeader::Sequences(void) const
+{ return d_->sequences_; }
+
+inline std::string BamHeader::SortOrder(void) const
+{ return d_->sortOrder_; }
+
+inline BamHeader& BamHeader::SortOrder(const std::string& order)
+{ d_->sortOrder_ = order; return *this; }
+
+inline std::string BamHeader::Version(void) const
+{ return d_->version_; }
+
+inline BamHeader& BamHeader::Version(const std::string& version)
+{ d_->version_ = version; return *this; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/BamRecord.inl b/include/pbbam/internal/BamRecord.inl

new file mode 100644 (file)

index 0000000..d4a7cbd
--- /dev/null
+++ b/include/pbbam/internal/BamRecord.inl
@@ -0,0 +1,86 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamRecord.inl
+/// \brief Inline implementations for the BamRecord class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline BamRecord BamRecord::Clipped(const BamRecord& input,
+                                    const ClipType clipType,
+                                    const PacBio::BAM::Position start,
+                                    const PacBio::BAM::Position end)
+{
+    return input.Clipped(clipType, start, end);
+}
+
+inline BamRecord BamRecord::Clipped(const ClipType clipType,
+                                    const PacBio::BAM::Position start,
+                                    const PacBio::BAM::Position end) const
+{
+    BamRecord result(*this);
+    result.Clip(clipType, start, end);
+    return result;
+}
+
+inline BamRecord BamRecord::Mapped(const BamRecord& input,
+                                   const int32_t referenceId,
+                                   const Position refStart,
+                                   const Strand strand,
+                                   const Cigar& cigar,
+                                   const uint8_t mappingQuality)
+{
+    return input.Mapped(referenceId, refStart, strand, cigar, mappingQuality);
+}
+
+inline BamRecord BamRecord::Mapped(const int32_t referenceId,
+                                   const Position refStart,
+                                   const Strand strand,
+                                   const Cigar& cigar,
+                                   const uint8_t mappingQuality) const
+{
+    BamRecord result(*this);
+    result.Map(referenceId, refStart, strand, cigar, mappingQuality);
+    return result;
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/BamRecordBuilder.inl b/include/pbbam/internal/BamRecordBuilder.inl

new file mode 100644 (file)

index 0000000..212e831
--- /dev/null
+++ b/include/pbbam/internal/BamRecordBuilder.inl
@@ -0,0 +1,84 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamRecordBuilder.inl
+/// \brief Inline implementations for the BamRecordBuilder class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamRecordBuilder.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline BamRecordBuilder& BamRecordBuilder::Bin(const uint32_t bin)
+{ core_.bin = bin; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::Flag(const uint32_t flag)
+{ core_.flag = flag; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::InsertSize(const int32_t iSize)
+{ core_.isize = iSize; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::MapQuality(const uint8_t mapQual)
+{ core_.qual = mapQual; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::MatePosition(const int32_t pos)
+{ core_.mpos = pos; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::MateReferenceId(const int32_t id)
+{ core_.mtid = id; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::Position(const int32_t pos)
+{ core_.pos = pos; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::Qualities(const std::string& qualities)
+{ qualities_ = qualities; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::Qualities(std::string&& qualities)
+{ qualities_ = std::move(qualities); return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::ReferenceId(const int32_t id)
+{ core_.tid = id; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::Tags(const TagCollection& tags)
+{ tags_ = tags; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::Tags(TagCollection&& tags)
+{ tags_ = std::move(tags); return *this; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/BamRecordImpl.inl b/include/pbbam/internal/BamRecordImpl.inl

new file mode 100644 (file)

index 0000000..6c0ecef
--- /dev/null
+++ b/include/pbbam/internal/BamRecordImpl.inl
@@ -0,0 +1,216 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamRecordImpl.inl
+/// \brief Inline implementations for the BamRecordImpl class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamRecordImpl.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline uint32_t BamRecordImpl::Bin(void) const
+{ return d_->core.bin; }
+
+inline BamRecordImpl& BamRecordImpl::Bin(uint32_t bin)
+{ d_->core.bin = bin; return *this; }
+
+inline uint32_t BamRecordImpl::Flag(void) const
+{ return d_->core.flag; }
+
+inline BamRecordImpl& BamRecordImpl::Flag(uint32_t flag)
+{ d_->core.flag = flag; return *this; }
+
+inline int32_t BamRecordImpl::InsertSize(void) const
+{ return d_->core.isize; }
+
+inline BamRecordImpl& BamRecordImpl::InsertSize(int32_t iSize)
+{ d_->core.isize = iSize; return *this; }
+
+inline uint8_t BamRecordImpl::MapQuality(void) const
+{ return d_->core.qual; }
+
+inline BamRecordImpl& BamRecordImpl::MapQuality(uint8_t mapQual)
+{ d_->core.qual = mapQual; return *this; }
+
+inline PacBio::BAM::Position BamRecordImpl::MatePosition(void) const
+{ return d_->core.mpos; }
+
+inline BamRecordImpl& BamRecordImpl::MatePosition(PacBio::BAM::Position pos)
+{ d_->core.mpos = pos; return *this; }
+
+inline int32_t BamRecordImpl::MateReferenceId(void) const
+{ return d_->core.mtid; }
+
+inline BamRecordImpl& BamRecordImpl::MateReferenceId(int32_t id)
+{ d_->core.mtid = id; return *this; }
+
+inline PacBio::BAM::Position BamRecordImpl::Position(void) const
+{ return d_->core.pos; }
+
+inline BamRecordImpl& BamRecordImpl::Position(PacBio::BAM::Position pos)
+{ d_->core.pos = pos; return *this; }
+
+inline int32_t BamRecordImpl::ReferenceId(void) const
+{ return d_->core.tid; }
+
+inline BamRecordImpl& BamRecordImpl::ReferenceId(int32_t id)
+{ d_->core.tid = id; return *this; }
+
+inline bool BamRecordImpl::IsDuplicate(void) const
+{ return (d_->core.flag & BamRecordImpl::DUPLICATE) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetDuplicate(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::DUPLICATE;
+    else    d_->core.flag &= ~BamRecordImpl::DUPLICATE;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsFailedQC(void) const
+{ return (d_->core.flag & BamRecordImpl::FAILED_QC) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetFailedQC(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::FAILED_QC;
+    else    d_->core.flag &= ~BamRecordImpl::FAILED_QC;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsFirstMate(void) const
+{ return (d_->core.flag & BamRecordImpl::MATE_1) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetFirstMate(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::MATE_1;
+    else    d_->core.flag &= ~BamRecordImpl::MATE_1;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsMapped(void) const
+{ return (d_->core.flag & BamRecordImpl::UNMAPPED) == 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetMapped(bool ok)
+{
+    if (ok) d_->core.flag &= ~BamRecordImpl::UNMAPPED;
+    else    d_->core.flag |=  BamRecordImpl::UNMAPPED;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsMateMapped(void) const
+{ return (d_->core.flag & BamRecordImpl::MATE_UNMAPPED) == 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetMateMapped(bool ok)
+{
+    if (ok) d_->core.flag &= ~BamRecordImpl::MATE_UNMAPPED;
+    else    d_->core.flag |=  BamRecordImpl::MATE_UNMAPPED;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsMateReverseStrand(void) const
+{ return (d_->core.flag & BamRecordImpl::MATE_REVERSE_STRAND) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetMateReverseStrand(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::MATE_REVERSE_STRAND;
+    else    d_->core.flag &= ~BamRecordImpl::MATE_REVERSE_STRAND;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsPaired(void) const
+{ return (d_->core.flag & BamRecordImpl::PAIRED) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetPaired(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::PAIRED;
+    else    d_->core.flag &= ~BamRecordImpl::PAIRED;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsPrimaryAlignment(void) const
+{ return (d_->core.flag & BamRecordImpl::SECONDARY) == 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetPrimaryAlignment(bool ok)
+{
+    if (ok) d_->core.flag &= ~BamRecordImpl::SECONDARY;
+    else    d_->core.flag |=  BamRecordImpl::SECONDARY;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsProperPair(void) const
+{ return (d_->core.flag & BamRecordImpl::PROPER_PAIR) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetProperPair(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::PROPER_PAIR;
+    else    d_->core.flag &= ~BamRecordImpl::PROPER_PAIR;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsReverseStrand(void) const
+{ return (d_->core.flag & BamRecordImpl::REVERSE_STRAND) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetReverseStrand(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::REVERSE_STRAND;
+    else    d_->core.flag &= ~BamRecordImpl::REVERSE_STRAND;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsSecondMate(void) const
+{ return (d_->core.flag & BamRecordImpl::MATE_2) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetSecondMate(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::MATE_2;
+    else    d_->core.flag &= ~BamRecordImpl::MATE_2;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsSupplementaryAlignment(void) const
+{ return (d_->core.flag & BamRecordImpl::SUPPLEMENTARY) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetSupplementaryAlignment(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::SUPPLEMENTARY;
+    else    d_->core.flag &= ~BamRecordImpl::SUPPLEMENTARY;
+    return *this;
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/BamRecordView.inl b/include/pbbam/internal/BamRecordView.inl

new file mode 100644 (file)

index 0000000..35486be
--- /dev/null
+++ b/include/pbbam/internal/BamRecordView.inl
@@ -0,0 +1,129 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamRecordView.inl
+/// \brief Inline implementations for the BamRecordView class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamRecordView.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline BamRecordView::BamRecordView(const BamRecord& record,
+                                    const Orientation orientation,
+                                    const bool aligned,
+                                    const bool exciseSoftClips,
+                                    const PulseBehavior pulseBehavior)
+    : record_(record)
+    , orientation_(orientation)
+    , aligned_(aligned)
+    , exciseSoftClips_(exciseSoftClips)
+    , pulseBehavior_(pulseBehavior)
+{ }
+
+inline QualityValues BamRecordView::AltLabelQVs(void) const
+{ return record_.AltLabelQV(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline std::string BamRecordView::AltLabelTags(void) const
+{ return record_.AltLabelTag(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline QualityValues BamRecordView::DeletionQVs(void) const
+{ return record_.DeletionQV(orientation_, aligned_, exciseSoftClips_); }
+
+inline std::string BamRecordView::DeletionTags(void) const
+{ return record_.DeletionTag(orientation_, aligned_, exciseSoftClips_); }
+
+inline QualityValues BamRecordView::InsertionQVs(void) const
+{ return record_.InsertionQV(orientation_, aligned_, exciseSoftClips_); }
+
+inline Frames BamRecordView::IPD(void) const
+{ return record_.IPD(orientation_, aligned_, exciseSoftClips_); }
+
+inline Frames BamRecordView::PrebaseFrames(void) const
+{ return record_.IPD(orientation_, aligned_, exciseSoftClips_); }
+
+inline QualityValues BamRecordView::LabelQVs(void) const
+{ return record_.LabelQV(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline QualityValues BamRecordView::MergeQVs(void) const
+{ return record_.MergeQV(orientation_, aligned_, exciseSoftClips_); }
+
+inline QualityValues BamRecordView::PulseMergeQVs(void) const
+{ return record_.PulseMergeQV(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline std::vector<float> BamRecordView::Pkmean(void) const
+{ return record_.Pkmean(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline std::vector<float> BamRecordView::Pkmid(void) const
+{ return record_.Pkmid(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline std::vector<float> BamRecordView::Pkmean2(void) const
+{ return record_.Pkmean2(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline std::vector<float> BamRecordView::Pkmid2(void) const
+{ return record_.Pkmid2(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline Frames BamRecordView::PrePulseFrames(void) const
+{ return record_.PrePulseFrames(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline std::string BamRecordView::PulseCalls(void) const
+{ return record_.PulseCall(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline Frames BamRecordView::PulseCallWidth(void) const
+{ return record_.PulseCallWidth(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline Frames BamRecordView::PulseWidths(void) const
+{ return record_.PulseWidth(orientation_, aligned_, exciseSoftClips_); }
+
+inline QualityValues BamRecordView::Qualities(void) const
+{ return record_.Qualities(orientation_, aligned_, exciseSoftClips_); }
+
+inline std::string BamRecordView::Sequence(void) const
+{ return record_.Sequence(orientation_, aligned_, exciseSoftClips_); }
+
+inline std::vector<uint32_t> BamRecordView::StartFrames(void) const
+{ return record_.StartFrame(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline QualityValues BamRecordView::SubstitutionQVs(void) const
+{ return record_.SubstitutionQV(orientation_, aligned_, exciseSoftClips_); }
+
+inline std::string BamRecordView::SubstitutionTags(void) const
+{ return record_.SubstitutionTag(orientation_, aligned_, exciseSoftClips_); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/Cigar.inl b/include/pbbam/internal/Cigar.inl

new file mode 100644 (file)

index 0000000..4799a72
--- /dev/null
+++ b/include/pbbam/internal/Cigar.inl
@@ -0,0 +1,77 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Cigar.inl
+/// \brief Inline implemenations for the Cigar class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Cigar.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline Cigar::Cigar(void)
+    : std::vector<CigarOperation>()
+{ }
+
+inline Cigar::Cigar(const Cigar& other)
+    : std::vector<CigarOperation>(other)
+{ }
+
+inline Cigar::Cigar(Cigar&& other)
+    : std::vector<CigarOperation>(std::move(other))
+{ }
+
+inline Cigar& Cigar::operator=(const Cigar& other)
+{
+    std::vector<CigarOperation>::operator=(other);
+    return *this;
+}
+
+inline Cigar& Cigar::operator=(Cigar&& other)
+{
+    std::vector<CigarOperation>::operator=(std::move(other));
+    return *this;
+}
+
+inline Cigar::~Cigar(void) { }
+
+inline Cigar Cigar::FromStdString(const std::string& stdString)
+{ return Cigar(stdString); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/CigarOperation.inl b/include/pbbam/internal/CigarOperation.inl

new file mode 100644 (file)

index 0000000..167528c
--- /dev/null
+++ b/include/pbbam/internal/CigarOperation.inl
@@ -0,0 +1,111 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file CigarOperation.inl
+/// \brief Inline implemenations for the CigarOperation class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/CigarOperation.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline CigarOperation::CigarOperation(void)
+    : type_(CigarOperationType::UNKNOWN_OP)
+    , length_(0)
+{ }
+
+inline CigarOperation::CigarOperation(char c, uint32_t length)
+    : type_(CigarOperation::CharToType(c))
+    , length_(length)
+{
+    if (type_ == CigarOperationType::ALIGNMENT_MATCH)
+        throw std::runtime_error("CIGAR operation 'M' is not allowed in PacBio BAM files. Use 'X/=' instead.");
+}
+
+inline CigarOperation::CigarOperation(CigarOperationType op, uint32_t length)
+    : type_(op)
+    , length_(length)
+{
+    if (type_ == CigarOperationType::ALIGNMENT_MATCH)
+        throw std::runtime_error("CIGAR operation 'M' is not allowed in PacBio BAM files. Use 'X/=' instead.");
+}
+
+inline CigarOperation::CigarOperation(const CigarOperation& other)
+    : type_(other.type_)
+    , length_(other.length_)
+{ }
+
+inline CigarOperation::CigarOperation(CigarOperation&& other)
+    : type_(std::move(other.type_))
+    , length_(std::move(other.length_))
+{ }
+
+inline CigarOperation::~CigarOperation(void) { }
+
+inline uint32_t CigarOperation::Length(void) const
+{ return length_; }
+
+inline CigarOperation& CigarOperation::Length(const uint32_t length)
+{ length_ = length; return *this; }
+
+inline CigarOperationType CigarOperation::Type(void) const
+{ return type_; }
+
+inline CigarOperation &CigarOperation::Type(const CigarOperationType opType)
+{ type_ = opType; return *this; }
+
+inline char CigarOperation::Char(void) const
+{ return CigarOperation::TypeToChar(type_); }
+
+inline CigarOperation &CigarOperation::Char(const char opChar)
+{ type_ = CigarOperation::CharToType(opChar);return *this; }
+
+inline CigarOperation& CigarOperation::operator=(const CigarOperation& other)
+{ type_ = other.type_; length_ = other.length_; return *this; }
+
+inline CigarOperation& CigarOperation::operator=(CigarOperation&& other)
+{ type_ = std::move(other.type_); length_ = std::move(other.length_); return *this; }
+
+inline bool CigarOperation::operator==(const CigarOperation& other) const
+{ return type_ == other.type_ && length_ == other.length_; }
+
+inline bool CigarOperation::operator!=(const CigarOperation& other) const
+{ return !(*this == other); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/Compare.inl b/include/pbbam/internal/Compare.inl

new file mode 100644 (file)

index 0000000..4eb5ccf
--- /dev/null
+++ b/include/pbbam/internal/Compare.inl
@@ -0,0 +1,78 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Compare.inl
+/// \brief Inline implementations for the Compare class & inner classes.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Compare.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+template <typename T, T> struct MemberFnProxy;
+
+template<typename T, typename R, typename... Args, R (T::*fn)(Args...)const>
+struct MemberFnProxy<R (T::*)(Args...)const, fn>
+{
+    static R call(const T& obj, Args&&... args)
+    {
+        return (obj.*fn)(std::forward<Args>(args)...);
+    }
+};
+
+} // namespace internal
+
+template<typename ValueType,
+         typename Compare::MemberFunctionBaseHelper<ValueType>::MemberFnType fn,
+         typename CompareType>
+inline bool Compare::MemberFunctionBase<ValueType, fn, CompareType>::operator()(const BamRecord& lhs,
+                                                                                const BamRecord& rhs) const
+{
+    using MemberFnType = typename Compare::MemberFunctionBaseHelper<ValueType>::MemberFnType;
+    using Proxy = internal::MemberFnProxy<MemberFnType, fn>;
+
+    CompareType cmp;
+    return cmp(Proxy::call(lhs), Proxy::call(rhs));
+}
+
+inline bool Compare::None::operator()(const BamRecord&, const BamRecord&) const
+{ return false; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/CompositeBamReader.inl b/include/pbbam/internal/CompositeBamReader.inl

new file mode 100644 (file)

index 0000000..f8f301e
--- /dev/null
+++ b/include/pbbam/internal/CompositeBamReader.inl
@@ -0,0 +1,397 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.\r
+//\r
+// All rights reserved.\r
+//\r
+// Redistribution and use in source and binary forms, with or without\r
+// modification, are permitted (subject to the limitations in the\r
+// disclaimer below) provided that the following conditions are met:\r
+//\r
+//  * Redistributions of source code must retain the above copyright\r
+//    notice, this list of conditions and the following disclaimer.\r
+//\r
+//  * Redistributions in binary form must reproduce the above\r
+//    copyright notice, this list of conditions and the following\r
+//    disclaimer in the documentation and/or other materials provided\r
+//    with the distribution.\r
+//\r
+//  * Neither the name of Pacific Biosciences nor the names of its\r
+//    contributors may be used to endorse or promote products derived\r
+//    from this software without specific prior written permission.\r
+//\r
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE\r
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC\r
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED\r
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES\r
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\r
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS\r
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF\r
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND\r
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\r
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF\r
+// SUCH DAMAGE.\r
+//\r
+// File Description\r
+/// \file CompositeBamReader.inl\r
+/// \brief Inline implementations for the composite BAM readers, for\r
+///        working with multiple input files.\r
+//\r
+// Author: Derek Barnett\r
+\r
+#include "pbbam/CompositeBamReader.h"\r
+#include <algorithm>\r
+#include <set>\r
+#include <sstream>\r
+#include <stdexcept>\r
+\r
+namespace PacBio {\r
+namespace BAM {\r
+namespace internal {\r
+\r
+// -----------------------------------\r
+// Merging helpers\r
+// -----------------------------------\r
+\r
+inline CompositeMergeItem::CompositeMergeItem(std::unique_ptr<BamReader>&& rdr)\r
+    : reader(std::move(rdr))\r
+{ }\r
+\r
+inline CompositeMergeItem::CompositeMergeItem(std::unique_ptr<BamReader>&& rdr,\r
+                              BamRecord&& rec)\r
+    : reader(std::move(rdr))\r
+    , record(std::move(rec))\r
+{ }\r
+\r
+inline CompositeMergeItem::CompositeMergeItem(CompositeMergeItem&& other)\r
+    : reader(std::move(other.reader))\r
+    , record(std::move(other.record))\r
+{ }\r
+\r
+inline CompositeMergeItem& CompositeMergeItem::operator=(CompositeMergeItem&& other)\r
+{\r
+    reader = std::move(other.reader);\r
+    record = std::move(other.record);\r
+    return *this;\r
+}\r
+\r
+inline CompositeMergeItem::~CompositeMergeItem(void) { }\r
+\r
+template<typename CompareType>\r
+inline bool CompositeMergeItemSorter<CompareType>::operator()(const CompositeMergeItem& lhs,\r
+                                                              const CompositeMergeItem& rhs)\r
+{\r
+    const BamRecord& l = lhs.record;\r
+    const BamRecord& r = rhs.record;\r
+    return CompareType()(l, r);\r
+}\r
+\r
+} // namespace internal\r
+\r
+// -----------------------------------\r
+// GenomicIntervalCompositeBamReader\r
+// -----------------------------------\r
+\r
+inline GenomicIntervalCompositeBamReader::GenomicIntervalCompositeBamReader(const GenomicInterval& interval,\r
+                                                                            const std::vector<BamFile>& bamFiles)\r
+{\r
+    filenames_.reserve(bamFiles.size());\r
+    for(const auto& bamFile : bamFiles)\r
+        filenames_.push_back(bamFile.Filename());\r
+    Interval(interval);\r
+}\r
+\r
+inline GenomicIntervalCompositeBamReader::GenomicIntervalCompositeBamReader(const GenomicInterval& interval,\r
+                                                                            std::vector<BamFile>&& bamFiles)\r
+{\r
+    filenames_.reserve(bamFiles.size());\r
+    for(auto&& bamFile : bamFiles)\r
+        filenames_.push_back(bamFile.Filename());\r
+    Interval(interval);\r
+}\r
+\r
+inline GenomicIntervalCompositeBamReader::GenomicIntervalCompositeBamReader(const GenomicInterval& interval,\r
+                                                                            const DataSet& dataset)\r
+    : GenomicIntervalCompositeBamReader(interval, dataset.BamFiles())\r
+{ }\r
+\r
+inline bool GenomicIntervalCompositeBamReader::GetNext(BamRecord& record)\r
+{\r
+    // nothing left to read\r
+    if (mergeItems_.empty())\r
+        return false;\r
+\r
+    // non-destructive 'pop' of first item from queue\r
+    auto firstIter = mergeItems_.begin();\r
+    auto firstItem = internal::CompositeMergeItem{ std::move(firstIter->reader), std::move(firstIter->record) };\r
+    mergeItems_.pop_front();\r
+\r
+    // store its record in our output record\r
+    std::swap(record, firstItem.record);\r
+\r
+    // try fetch 'next' from first item's reader\r
+    // if successful, re-insert it into container & re-sort on our new values\r
+    // otherwise, this item will go out of scope & reader destroyed\r
+    if (firstItem.reader->GetNext(firstItem.record)) {\r
+        mergeItems_.push_front(std::move(firstItem));\r
+        UpdateSort();\r
+    }\r
+\r
+    // return success\r
+    return true;\r
+}\r
+\r
+inline const GenomicInterval& GenomicIntervalCompositeBamReader::Interval(void) const\r
+{ return interval_; }\r
+\r
+inline GenomicIntervalCompositeBamReader& GenomicIntervalCompositeBamReader::Interval(const GenomicInterval& interval)\r
+{\r
+    auto updatedMergeItems = std::deque<internal::CompositeMergeItem>{ };\r
+    auto filesToCreate = std::set<std::string>{ filenames_.cbegin(), filenames_.cend() };\r
+\r
+    // update existing readers\r
+    while (!mergeItems_.empty()) {\r
+\r
+        // non-destructive 'pop' of first item from queue\r
+        auto firstIter = mergeItems_.begin();\r
+        auto firstItem = internal::CompositeMergeItem{ std::move(firstIter->reader), std::move(firstIter->record) };\r
+        mergeItems_.pop_front();\r
+\r
+        // reset interval\r
+        BaiIndexedBamReader* baiReader = dynamic_cast<BaiIndexedBamReader*>(firstItem.reader.get());\r
+        assert(baiReader);\r
+        baiReader->Interval(interval);\r
+\r
+        // try fetch 'next' from first item's reader\r
+        // if successful, re-insert it into container & re-sort on our new values\r
+        // otherwise, this item will go out of scope & reader destroyed\r
+        if (firstItem.reader->GetNext(firstItem.record)) {\r
+            updatedMergeItems.push_front(std::move(firstItem));\r
+            filesToCreate.erase(firstItem.reader->Filename());\r
+        }\r
+    }\r
+\r
+    // create readers for files that were not 'active' for the previous\r
+    std::vector<std::string> missingBai;\r
+    for (auto&& fn : filesToCreate) {\r
+        auto bamFile = BamFile{ fn };\r
+        if (bamFile.StandardIndexExists()) {\r
+            auto item = internal::CompositeMergeItem{ std::unique_ptr<BamReader>{ new BaiIndexedBamReader{ interval, std::move(bamFile) } } };\r
+            if (item.reader->GetNext(item.record))\r
+                updatedMergeItems.push_back(std::move(item));\r
+            // else not an error, simply no data matching interval\r
+        }\r
+        else {\r
+            // maybe handle PBI-backed interval searches if BAI missing, but for now treat as error\r
+            missingBai.push_back(bamFile.Filename());\r
+        }\r
+    }\r
+\r
+    // throw if any files missing BAI\r
+    if (!missingBai.empty()) {\r
+        std::stringstream e;\r
+        e << "failed to open GenomicIntervalCompositeBamReader because the following files are missing a BAI file:" << std::endl;\r
+        for (const auto& fn : missingBai)\r
+            e << "  " << fn << std::endl;\r
+        throw std::runtime_error(e.str());\r
+    }\r
+\r
+    // update our actual container and return\r
+    mergeItems_ = std::move(updatedMergeItems);\r
+    UpdateSort();\r
+    return *this;\r
+}\r
+\r
+struct OrderByPosition\r
+{\r
+    static inline bool less_than(const BamRecord& lhs, const BamRecord& rhs)\r
+    {\r
+        const int32_t lhsId = lhs.ReferenceId();\r
+        const int32_t rhsId = rhs.ReferenceId();\r
+        if (lhsId == -1) return false;\r
+        if (rhsId == -1) return true;\r
+\r
+        if (lhsId == rhsId)\r
+            return lhs.ReferenceStart() < rhs.ReferenceStart();\r
+        else return lhsId < rhsId;\r
+    }\r
+\r
+    static inline bool equals(const BamRecord& lhs, const BamRecord& rhs)\r
+    {\r
+        return lhs.ReferenceId() == rhs.ReferenceId() &&\r
+               lhs.ReferenceStart() == rhs.ReferenceStart();\r
+    }\r
+};\r
+\r
+struct PositionSorter : std::binary_function<internal::CompositeMergeItem, internal::CompositeMergeItem, bool>\r
+{\r
+    bool operator()(const internal::CompositeMergeItem& lhs,\r
+                    const internal::CompositeMergeItem& rhs)\r
+    {\r
+        const BamRecord& l = lhs.record;\r
+        const BamRecord& r = rhs.record;\r
+        return OrderByPosition::less_than(l, r);\r
+    }\r
+};\r
+\r
+inline void GenomicIntervalCompositeBamReader::UpdateSort(void)\r
+{ std::sort(mergeItems_.begin(), mergeItems_.end(), PositionSorter{ }); }\r
+\r
+// ------------------------------\r
+// PbiRequestCompositeBamReader\r
+// ------------------------------\r
+\r
+template<typename OrderByType>\r
+inline PbiFilterCompositeBamReader<OrderByType>::PbiFilterCompositeBamReader(const PbiFilter& filter,\r
+                                                                             const std::vector<BamFile>& bamFiles)\r
+{\r
+    filenames_.reserve(bamFiles.size());\r
+    for(const auto& bamFile : bamFiles)\r
+        filenames_.push_back(bamFile.Filename());\r
+    Filter(filter);\r
+}\r
+\r
+template<typename OrderByType>\r
+inline PbiFilterCompositeBamReader<OrderByType>::PbiFilterCompositeBamReader(const PbiFilter& filter,\r
+                                                                             std::vector<BamFile>&& bamFiles)\r
+{\r
+    filenames_.reserve(bamFiles.size());\r
+    for(auto&& bamFile : bamFiles)\r
+        filenames_.push_back(bamFile.Filename());\r
+    Filter(filter);\r
+}\r
+\r
+template<typename OrderByType>\r
+inline PbiFilterCompositeBamReader<OrderByType>::PbiFilterCompositeBamReader(const PbiFilter& filter,\r
+                                                                             const DataSet& dataset)\r
+    : PbiFilterCompositeBamReader(filter, std::move(dataset.BamFiles()))\r
+{ }\r
+\r
+template<typename OrderByType>\r
+inline bool PbiFilterCompositeBamReader<OrderByType>::GetNext(BamRecord& record)\r
+{\r
+    // nothing left to read\r
+    if (mergeQueue_.empty())\r
+        return false;\r
+\r
+    // non-destructive 'pop' of first item from queue\r
+    auto firstIter = mergeQueue_.begin();\r
+    auto firstItem = value_type{ std::move(firstIter->reader), std::move(firstIter->record) };\r
+    mergeQueue_.pop_front();\r
+\r
+    // store its record in our output record\r
+    std::swap(record, firstItem.record);\r
+\r
+    // try fetch 'next' from first item's reader\r
+    // if successful, re-insert it into container & re-sort on our new values\r
+    // otherwise, this item will go out of scope & reader destroyed\r
+    if (firstItem.reader->GetNext(firstItem.record)) {\r
+        mergeQueue_.push_front(std::move(firstItem));\r
+        UpdateSort();\r
+    }\r
+\r
+    // return success\r
+    return true;\r
+}\r
+\r
+template<typename OrderByType>\r
+inline PbiFilterCompositeBamReader<OrderByType>&\r
+PbiFilterCompositeBamReader<OrderByType>::Filter(const PbiFilter& filter)\r
+{\r
+    auto updatedMergeItems = container_type{ };\r
+    auto filesToCreate = std::set<std::string>{ filenames_.cbegin(), filenames_.cend() };\r
+\r
+    // update existing readers\r
+    while (!mergeQueue_.empty()) {\r
+\r
+        // non-destructive 'pop' of first item from queue\r
+        auto firstIter = mergeQueue_.begin();\r
+        auto firstItem = internal::CompositeMergeItem{ std::move(firstIter->reader), std::move(firstIter->record) };\r
+        mergeQueue_.pop_front();\r
+\r
+        // reset request\r
+        PbiIndexedBamReader* pbiReader = dynamic_cast<PbiIndexedBamReader*>(firstItem.reader.get());\r
+        assert(pbiReader);\r
+        pbiReader->Filter(filter);\r
+\r
+        // try fetch 'next' from first item's reader\r
+        // if successful, re-insert it into container & re-sort on our new values\r
+        // otherwise, this item will go out of scope & reader destroyed\r
+        if (firstItem.reader->GetNext(firstItem.record)) {\r
+            updatedMergeItems.push_front(std::move(firstItem));\r
+            filesToCreate.erase(firstItem.reader->Filename());\r
+        }\r
+    }\r
+\r
+    // create readers for files that were not 'active' for the previous\r
+    std::vector<std::string> missingPbi;\r
+    for (auto&& fn : filesToCreate) {\r
+        auto bamFile = BamFile{ fn };\r
+        if (bamFile.PacBioIndexExists()) {\r
+            auto item = internal::CompositeMergeItem{ std::unique_ptr<BamReader>{ new PbiIndexedBamReader{ filter, std::move(bamFile) } } };\r
+            if (item.reader->GetNext(item.record))\r
+                updatedMergeItems.push_back(std::move(item));\r
+            // else not an error, simply no data matching filter\r
+        }\r
+        else\r
+            missingPbi.push_back(fn);\r
+    }\r
+\r
+    // throw if any files missing PBI\r
+    if (!missingPbi.empty()) {\r
+        std::stringstream e;\r
+        e << "failed to open PbiFilterCompositeBamReader because the following files are missing a PBI file:" << std::endl;\r
+        for (const auto& fn : missingPbi)\r
+            e << "  " << fn << std::endl;\r
+        throw std::runtime_error(e.str());\r
+    }\r
+\r
+    // update our actual container and return\r
+    mergeQueue_ = std::move(updatedMergeItems);\r
+    UpdateSort();\r
+    return *this;\r
+}\r
+\r
+template<typename OrderByType>\r
+inline void PbiFilterCompositeBamReader<OrderByType>::UpdateSort(void)\r
+{ std::stable_sort(mergeQueue_.begin(), mergeQueue_.end(), merge_sorter_type{}); }\r
+\r
+// ------------------------------\r
+// SequentialCompositeBamReader\r
+// ------------------------------\r
+\r
+inline SequentialCompositeBamReader::SequentialCompositeBamReader(const std::vector<BamFile>& bamFiles)\r
+{\r
+    for (auto&& bamFile : bamFiles)\r
+        readers_.emplace_back(new BamReader{ bamFile });\r
+}\r
+\r
+inline SequentialCompositeBamReader::SequentialCompositeBamReader(std::vector<BamFile>&& bamFiles)\r
+{\r
+    for (auto&& bamFile : bamFiles)\r
+        readers_.emplace_back(new BamReader{ std::move(bamFile) });\r
+}\r
+\r
+inline SequentialCompositeBamReader::SequentialCompositeBamReader(const DataSet& dataset)\r
+    : SequentialCompositeBamReader(dataset.BamFiles())\r
+{ }\r
+\r
+inline bool SequentialCompositeBamReader::GetNext(BamRecord& record)\r
+{\r
+    // try first reader, if successful return true\r
+    // else pop reader and try next, until all readers exhausted\r
+    while (!readers_.empty()) {\r
+        auto& reader = readers_.front();\r
+        if (reader->GetNext(record))\r
+            return true;\r
+        else\r
+            readers_.pop_front();\r
+    }\r
+\r
+    // no readers available\r
+    return false;\r
+}\r
+\r
+} // namespace BAM\r
+} // namespace PacBio\r
diff --git a/include/pbbam/internal/DataSet.inl b/include/pbbam/internal/DataSet.inl

new file mode 100644 (file)

index 0000000..6627ddf
--- /dev/null
+++ b/include/pbbam/internal/DataSet.inl
@@ -0,0 +1,201 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file DataSet.inl
+/// \brief Inline implementations for the DataSet class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/DataSet.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline const std::string& DataSet::Attribute(const std::string& name) const
+{ return d_->Attribute(name); }
+
+inline std::string& DataSet::Attribute(const std::string& name)
+{ return d_->Attribute(name); }
+
+inline DataSet& DataSet::Attribute(const std::string& name, const std::string& value)
+{ d_->Attribute(name, value); return *this; }
+
+inline const std::string& DataSet::CreatedAt(void) const
+{ return d_->CreatedAt(); }
+
+inline std::string& DataSet::CreatedAt(void)
+{ return d_->CreatedAt(); }
+
+inline DataSet& DataSet::CreatedAt(const std::string& createdAt)
+{ d_->CreatedAt(createdAt); return *this; }
+
+inline const PacBio::BAM::Extensions& DataSet::Extensions(void) const
+{ return d_->Extensions(); }
+
+inline PacBio::BAM::Extensions& DataSet::Extensions(void)
+{ return d_->Extensions(); }
+
+inline DataSet& DataSet::Extensions(const PacBio::BAM::Extensions& extensions)
+{ d_->Extensions(extensions); return *this; }
+
+inline const PacBio::BAM::ExternalResources& DataSet::ExternalResources(void) const
+{ return d_->ExternalResources(); }
+
+inline PacBio::BAM::ExternalResources& DataSet::ExternalResources(void)
+{ return d_->ExternalResources(); }
+
+inline DataSet& DataSet::ExternalResources(const PacBio::BAM::ExternalResources& resources)
+{ d_->ExternalResources(resources); return *this; }
+
+inline const PacBio::BAM::Filters& DataSet::Filters(void) const
+{ return d_->Filters(); }
+
+inline PacBio::BAM::Filters& DataSet::Filters(void)
+{ return d_->Filters(); }
+
+inline DataSet& DataSet::Filters(const PacBio::BAM::Filters& filters)
+{ d_->Filters(filters); return *this; }
+
+inline const std::string& DataSet::Format(void) const
+{ return d_->Format(); }
+
+inline std::string& DataSet::Format(void)
+{ return d_->Format(); }
+
+inline DataSet& DataSet::Format(const std::string& format)
+{ d_->Format(format); return *this; }
+
+inline const PacBio::BAM::DataSetMetadata& DataSet::Metadata(void) const
+{ return d_->Metadata(); }
+
+inline PacBio::BAM::DataSetMetadata& DataSet::Metadata(void)
+{ return d_->Metadata(); }
+
+inline DataSet& DataSet::Metadata(const PacBio::BAM::DataSetMetadata& metadata)
+{ d_->Metadata(metadata); return *this; }
+
+inline const std::string& DataSet::MetaType(void) const
+{ return d_->MetaType(); }
+
+inline std::string& DataSet::MetaType(void)
+{ return d_->MetaType(); }
+
+inline DataSet& DataSet::MetaType(const std::string& metatype)
+{ d_->MetaType(metatype); return *this; }
+
+inline const std::string& DataSet::ModifiedAt(void) const
+{ return d_->ModifiedAt(); }
+
+inline std::string& DataSet::ModifiedAt(void)
+{ return d_->ModifiedAt(); }
+
+inline DataSet& DataSet::ModifiedAt(const std::string& modifiedAt)
+{ d_->ModifiedAt(modifiedAt); return *this; }
+
+inline const std::string& DataSet::Name(void) const
+{ return d_->Name(); }
+
+inline std::string& DataSet::Name(void)
+{ return d_->Name(); }
+
+inline DataSet& DataSet::Name(const std::string& name)
+{ d_->Name(name); return *this; }
+
+inline const std::string& DataSet::ResourceId(void) const
+{ return d_->ResourceId(); }
+
+inline std::string& DataSet::ResourceId(void)
+{ return d_->ResourceId(); }
+
+inline DataSet& DataSet::ResourceId(const std::string& resourceId)
+{ d_->ResourceId(resourceId); return *this; }
+
+inline const PacBio::BAM::SubDataSets& DataSet::SubDataSets(void) const
+{ return d_->SubDataSets(); }
+
+inline PacBio::BAM::SubDataSets& DataSet::SubDataSets(void)
+{ return d_->SubDataSets(); }
+
+inline DataSet& DataSet::SubDataSets(const PacBio::BAM::SubDataSets& subdatasets)
+{ d_->SubDataSets(subdatasets); return *this; }
+
+inline const std::string& DataSet::Tags(void) const
+{ return d_->Tags(); }
+
+inline std::string& DataSet::Tags(void)
+{ return d_->Tags(); }
+
+inline DataSet& DataSet::Tags(const std::string& tags)
+{ d_->Tags(tags); return *this; }
+
+inline const std::string& DataSet::TimeStampedName(void) const
+{ return d_->TimeStampedName(); }
+
+inline std::string& DataSet::TimeStampedName(void)
+{ return d_->TimeStampedName(); }
+
+inline DataSet& DataSet::TimeStampedName(const std::string& timeStampedName)
+{ d_->TimeStampedName(timeStampedName); return *this; }
+
+inline PacBio::BAM::DataSet::TypeEnum DataSet::Type(void) const
+{ return DataSet::NameToType(TypeName()); }
+
+inline DataSet& DataSet::Type(const DataSet::TypeEnum type)
+{ d_->Label(DataSet::TypeToName(type)); return *this; }
+
+inline std::string DataSet::TypeName(void) const
+{ return d_->LocalNameLabel().to_string(); }
+
+inline const std::string& DataSet::UniqueId(void) const
+{ return d_->UniqueId(); }
+
+inline std::string& DataSet::UniqueId(void)
+{ return d_->UniqueId(); }
+
+inline DataSet& DataSet::UniqueId(const std::string& uuid)
+{ d_->UniqueId(uuid); return *this; }
+
+inline const std::string& DataSet::Version(void) const
+{ return d_->Version(); }
+
+inline std::string& DataSet::Version(void)
+{ return d_->Version(); }
+
+inline DataSet& DataSet::Version(const std::string& version)
+{ d_->Version(version); return *this; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/DataSetBaseTypes.h b/include/pbbam/internal/DataSetBaseTypes.h

new file mode 100644 (file)

index 0000000..917162a
--- /dev/null
+++ b/include/pbbam/internal/DataSetBaseTypes.h
@@ -0,0 +1,179 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef DATASETBASETYPES_H
+#define DATASETBASETYPES_H
+
+#include "pbbam/Config.h"
+#include "pbbam/internal/DataSetElement.h"
+#include "pbbam/internal/DataSetListElement.h"
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+class DataSetMetadata;
+class Extensions;
+class ExternalResources;
+class FileIndices;
+class Filters;
+class Properties;
+class Provenance;
+
+namespace internal {
+
+class BaseEntityType : public DataSetElement
+{
+protected:
+    BaseEntityType(const std::string& label,
+                   const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+
+public:
+    const std::string& CreatedAt(void) const;
+    const std::string& Description(void) const;
+    const PacBio::BAM::Extensions& Extensions(void) const;
+    const std::string& Format(void) const;
+    const std::string& ModifiedAt(void) const;
+    const std::string& Name(void) const;
+    const std::string& ResourceId(void) const;
+    const std::string& Tags(void) const;
+    const std::string& Version(void) const;
+
+    std::string& CreatedAt(void);
+    std::string& Description(void);
+    PacBio::BAM::Extensions& Extensions(void);
+    std::string& Format(void);
+    std::string& ModifiedAt(void);
+    std::string& Name(void);
+    std::string& ResourceId(void);
+    std::string& Tags(void);
+    std::string& Version(void);
+
+    BaseEntityType& CreatedAt(const std::string& createdAt);
+    BaseEntityType& Description(const std::string& description);
+    BaseEntityType& Extensions(const PacBio::BAM::Extensions& extensions);
+    BaseEntityType& Format(const std::string& format);    
+    BaseEntityType& ModifiedAt(const std::string& modifiedAt);
+    BaseEntityType& Name(const std::string& name);
+    BaseEntityType& ResourceId(const std::string& resourceId);
+    BaseEntityType& Tags(const std::string& tags);
+    BaseEntityType& Version(const std::string& version);
+};
+
+class DataEntityType : public BaseEntityType
+{
+protected:
+    DataEntityType(const std::string& label,
+                   const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+
+public:
+    const std::string& Checksum(void) const;
+    const std::string& EncodedValue(void) const;
+    const std::string& MetaType(void) const;
+    const std::string& SimpleValue(void) const;
+    const std::string& TimeStampedName(void) const;
+    const std::string& UniqueId(void) const;
+    const std::string& ValueDataType(void) const;
+
+    std::string& Checksum(void);
+    std::string& EncodedValue(void);
+    std::string& MetaType(void);
+    std::string& SimpleValue(void);
+    std::string& TimeStampedName(void);
+    std::string& UniqueId(void);
+    std::string& ValueDataType(void);
+
+    DataEntityType& Checksum(const std::string& checksum);
+    DataEntityType& EncodedValue(const std::string& encodedValue);
+    DataEntityType& MetaType(const std::string& metatype);
+    DataEntityType& SimpleValue(const std::string& simpleValue);
+    DataEntityType& TimeStampedName(const std::string& timeStampedName);
+    DataEntityType& UniqueId(const std::string& uuid);
+    DataEntityType& ValueDataType(const std::string& valueDataType);
+};
+
+class StrictEntityType : public BaseEntityType
+{
+protected:
+    StrictEntityType(const std::string& metatype,
+                     const std::string& label,
+                     const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+
+public:
+    const std::string& MetaType(void) const;
+    const std::string& TimeStampedName(void) const;
+    const std::string& UniqueId(void) const;
+
+    std::string& MetaType(void);
+    std::string& TimeStampedName(void);
+    std::string& UniqueId(void);
+
+    StrictEntityType& MetaType(const std::string& metatype);
+    StrictEntityType& TimeStampedName(const std::string& timeStampedName);
+    StrictEntityType& UniqueId(const std::string& uuid);
+};
+
+class InputOutputDataType : public StrictEntityType
+{
+protected:
+    InputOutputDataType(const std::string& metatype, 
+                        const std::string& filename,
+                        const std::string& label,
+                        const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+};
+
+class IndexedDataType : public InputOutputDataType
+{
+protected:
+    IndexedDataType(const std::string& metatype, 
+                    const std::string& filename,
+                    const std::string& label,
+                    const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+
+public:
+    const PacBio::BAM::FileIndices& FileIndices(void) const;
+    PacBio::BAM::FileIndices& FileIndices(void);
+    IndexedDataType& FileIndices(const PacBio::BAM::FileIndices& indices);
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/DataSetBaseTypes.inl"
+
+#endif // DATASETBASETYPES_H
diff --git a/include/pbbam/internal/DataSetBaseTypes.inl b/include/pbbam/internal/DataSetBaseTypes.inl

new file mode 100644 (file)

index 0000000..c1d2f43
--- /dev/null
+++ b/include/pbbam/internal/DataSetBaseTypes.inl
@@ -0,0 +1,220 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "pbbam/internal/DataSetBaseTypes.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// ----------------
+// BaseEntityType
+// ----------------
+
+inline const std::string& BaseEntityType::CreatedAt(void) const
+{ return Attribute("CreatedAt"); }
+
+inline std::string& BaseEntityType::CreatedAt(void)
+{ return Attribute("CreatedAt"); }
+
+inline BaseEntityType& BaseEntityType::CreatedAt(const std::string& createdAt)
+{ Attribute("CreatedAt", createdAt); return *this; }
+
+inline const std::string& BaseEntityType::Description(void) const
+{ return Attribute("Description"); }
+
+inline std::string& BaseEntityType::Description(void)
+{ return Attribute("Description"); }
+
+inline BaseEntityType& BaseEntityType::Description(const std::string& description)
+{ Attribute("Description", description); return *this; }
+
+inline const std::string& BaseEntityType::Format(void) const
+{ return Attribute("Format"); }
+
+inline std::string& BaseEntityType::Format(void)
+{ return Attribute("Format"); }
+
+inline BaseEntityType& BaseEntityType::Format(const std::string& format)
+{ Attribute("Format", format); return *this; }
+
+inline const std::string& BaseEntityType::ModifiedAt(void) const
+{ return Attribute("ModifiedAt"); }
+
+inline std::string& BaseEntityType::ModifiedAt(void)
+{ return Attribute("ModifiedAt"); }
+
+inline BaseEntityType& BaseEntityType::ModifiedAt(const std::string& modifiedAt)
+{ Attribute("ModifiedAt", modifiedAt); return *this; }
+
+inline const std::string& BaseEntityType::Name(void) const
+{ return Attribute("Name"); }
+
+inline std::string& BaseEntityType::Name(void)
+{ return Attribute("Name"); }
+
+inline BaseEntityType& BaseEntityType::Name(const std::string& name)
+{ Attribute("Name", name); return *this; }
+
+inline const std::string& BaseEntityType::ResourceId(void) const
+{ return Attribute("ResourceId"); }
+
+inline std::string& BaseEntityType::ResourceId(void)
+{ return Attribute("ResourceId"); }
+
+inline BaseEntityType& BaseEntityType::ResourceId(const std::string& resourceId)
+{ Attribute("ResourceId", resourceId); return *this; }
+
+inline const std::string& BaseEntityType::Tags(void) const
+{ return Attribute("Tags"); }
+
+inline std::string& BaseEntityType::Tags(void)
+{ return Attribute("Tags"); }
+
+inline BaseEntityType& BaseEntityType::Tags(const std::string& tags)
+{ Attribute("Tags", tags); return *this; }
+
+inline const std::string& BaseEntityType::Version(void) const
+{ return Attribute("Version"); }
+
+inline std::string& BaseEntityType::Version(void)
+{ return Attribute("Version"); }
+
+inline BaseEntityType& BaseEntityType::Version(const std::string& version)
+{ Attribute("Version", version); return *this; }
+
+// ----------------
+// DataEntityType
+// ----------------
+
+inline const std::string& DataEntityType::Checksum(void) const
+{ return ChildText("Checksum"); }
+
+inline std::string& DataEntityType::Checksum(void)
+{ return ChildText("Checksum"); }
+
+inline DataEntityType& DataEntityType::Checksum(const std::string& checksum)
+{ ChildText("Checksum", checksum); return *this; }
+
+inline const std::string& DataEntityType::EncodedValue(void) const
+{ return ChildText("EncodedValue"); }
+
+inline std::string& DataEntityType::EncodedValue(void)
+{ return ChildText("EncodedValue"); }
+
+inline DataEntityType& DataEntityType::EncodedValue(const std::string& encodedValue)
+{ ChildText("EncodedValue", encodedValue); return *this; }
+
+inline const std::string& DataEntityType::MetaType(void) const
+{ return Attribute("MetaType"); }
+
+inline std::string& DataEntityType::MetaType(void)
+{ return Attribute("MetaType"); }
+
+inline DataEntityType& DataEntityType::MetaType(const std::string& metatype)
+{ Attribute("MetaType", metatype); return *this; }
+
+inline const std::string& DataEntityType::SimpleValue(void) const
+{ return Attribute("SimpleValue"); }
+
+inline std::string& DataEntityType::SimpleValue(void)
+{ return Attribute("SimpleValue"); }
+
+inline DataEntityType& DataEntityType::SimpleValue(const std::string& simpleValue)
+{ Attribute("SimpleValue", simpleValue); return *this; }
+
+inline const std::string& DataEntityType::TimeStampedName(void) const
+{ return Attribute("TimeStampedName"); }
+
+inline std::string& DataEntityType::TimeStampedName(void)
+{ return Attribute("TimeStampedName"); }
+
+inline DataEntityType& DataEntityType::TimeStampedName(const std::string& timeStampedName)
+{ Attribute("TimeStampedName", timeStampedName); return *this; }
+
+inline const std::string& DataEntityType::UniqueId(void) const
+{ return Attribute("UniqueId"); }
+
+inline std::string& DataEntityType::UniqueId(void)
+{ return Attribute("UniqueId"); }
+
+inline DataEntityType& DataEntityType::UniqueId(const std::string& uuid)
+{ Attribute("UniqueId", uuid); return *this; }
+
+inline const std::string& DataEntityType::ValueDataType(void) const
+{ return Attribute("ValueDataType"); }
+
+inline std::string& DataEntityType::ValueDataType(void)
+{ return Attribute("ValueDataType"); }
+
+inline DataEntityType& DataEntityType::ValueDataType(const std::string& valueDataType)
+{ Attribute("ValueDataType", valueDataType); return *this; }
+
+// ----------------
+// StrictEntityType
+// ----------------
+
+inline const std::string& StrictEntityType::MetaType(void) const
+{ return Attribute("MetaType"); }
+
+inline std::string& StrictEntityType::MetaType(void)
+{ return Attribute("MetaType"); }
+
+inline StrictEntityType& StrictEntityType::MetaType(const std::string& metatype)
+{ Attribute("MetaType", metatype); return *this; }
+
+inline const std::string& StrictEntityType::TimeStampedName(void) const
+{ return Attribute("TimeStampedName"); }
+
+inline std::string& StrictEntityType::TimeStampedName(void)
+{ return Attribute("TimeStampedName"); }
+
+inline StrictEntityType& StrictEntityType::TimeStampedName(const std::string& timeStampedName)
+{ Attribute("TimeStampedName", timeStampedName); return *this; }
+
+inline const std::string& StrictEntityType::UniqueId(void) const
+{ return Attribute("UniqueId"); }
+
+inline std::string& StrictEntityType::UniqueId(void)
+{ return Attribute("UniqueId"); }
+
+inline StrictEntityType& StrictEntityType::UniqueId(const std::string& uuid)
+{ Attribute("UniqueId", uuid); return *this; }
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/DataSetElement.h b/include/pbbam/internal/DataSetElement.h

new file mode 100644 (file)

index 0000000..c7f7c8d
--- /dev/null
+++ b/include/pbbam/internal/DataSetElement.h
@@ -0,0 +1,192 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef DATASETELEMENT_H
+#define DATASETELEMENT_H
+
+#include "pbbam/DataSetXsd.h"
+
+#include <boost/utility/string_ref.hpp>
+#include <algorithm>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include <cassert>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+class XmlName
+{
+    //    qualified name
+    //       |
+    //  --------------
+    // <pbns:node_name >
+    //  ---- ---------
+    //   |        |
+    //  prefix    local name
+
+public:
+    XmlName(const std::string& fullName, bool verbatim = false);
+    XmlName(const std::string& localName, const std::string& prefix);
+    XmlName(const XmlName& other);
+    XmlName(XmlName&& other);
+    XmlName& operator=(const XmlName& other);
+    XmlName& operator=(XmlName&& other);
+    ~XmlName(void);
+
+public:
+    bool operator==(const XmlName& other) const;
+    bool operator!=(const XmlName& other) const;
+
+public:
+    const boost::string_ref LocalName(void) const;
+    const boost::string_ref Prefix(void) const;
+    const std::string& QualifiedName(void) const;
+    bool Verbatim(void) const;
+
+private:
+    std::string qualifiedName_;
+    size_t prefixSize_;
+    size_t localNameOffset_;
+    size_t localNameSize_;
+    bool verbatim_;
+};
+
+struct FromInputXml { };
+
+class DataSetElement
+{
+public:
+    DataSetElement(const std::string& label, const XsdType& xsd = XsdType::NONE);
+    DataSetElement(const std::string& label, const FromInputXml& fromInputXml, const XsdType& xsd = XsdType::NONE);
+    DataSetElement(const DataSetElement& other);
+    DataSetElement(DataSetElement&& other);
+    DataSetElement& operator=(const DataSetElement& other);
+    DataSetElement& operator=(DataSetElement&& other);
+    virtual ~DataSetElement(void);
+
+public:
+    bool operator==(const DataSetElement& other) const;
+    bool operator!=(const DataSetElement& other) const;
+
+public:
+    const std::string& Attribute(const std::string& name) const;
+    std::string& Attribute(const std::string& name);
+    const std::map<std::string, std::string>& Attributes(void) const;
+    std::map<std::string, std::string>& Attributes(void);
+    bool HasAttribute(const std::string& name) const;
+
+    const std::vector<DataSetElement>& Children(void) const;
+    std::vector<DataSetElement>& Children(void);
+    bool HasChild(const std::string& label) const;
+
+    const boost::string_ref LocalNameLabel(void) const;
+    const boost::string_ref PrefixLabel(void) const;
+    const std::string& QualifiedNameLabel(void) const;
+    bool IsVerbatimLabel(void) const;
+
+    const std::string& Text(void) const;
+    std::string& Text(void);
+
+    const XsdType& Xsd(void) const;
+
+public:
+    void Attribute(const std::string& name, const std::string& value);
+    void Label(const std::string& label);
+    void Text(const std::string& text);
+
+public:
+    size_t NumAttributes(void) const;
+    size_t NumChildren(void) const;
+
+public:
+    void AddChild(const DataSetElement& e);
+    void RemoveChild(const DataSetElement& e);
+
+    template<typename T>
+    const T& Child(size_t index) const;
+
+    template<typename T>
+    T& Child(size_t index);
+
+    template<typename T>
+    const T& Child(const std::string& label) const;
+
+    template<typename T>
+    T& Child(const std::string& label);
+
+    template<typename T>
+    const T& operator[](size_t index) const;
+
+    template<typename T>
+    T& operator[](size_t index);
+
+    template<typename T = DataSetElement>
+    const T& operator[](const std::string& label) const;
+
+    template<typename T = DataSetElement>
+    T& operator[](const std::string& label);
+
+protected:
+    static const std::string& SharedNullString(void);
+
+public:
+    const std::string& ChildText(const std::string& label) const;
+    std::string& ChildText(const std::string& label);
+    void ChildText(const std::string& label, const std::string& text);
+
+protected:
+    XsdType xsd_;
+    XmlName label_;
+    std::string text_;
+    std::map<std::string, std::string> attributes_;
+    std::vector<DataSetElement> children_;
+
+private:
+    int IndexOf(const std::string& label) const;
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/DataSetElement.inl"
+
+#endif // DATASETELEMENT_H
diff --git a/include/pbbam/internal/DataSetElement.inl b/include/pbbam/internal/DataSetElement.inl

new file mode 100644 (file)

index 0000000..37a673f
--- /dev/null
+++ b/include/pbbam/internal/DataSetElement.inl
@@ -0,0 +1,373 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "pbbam/internal/DataSetElement.h"
+
+#include <iostream>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// ----------------
+// DataSetElement
+// ----------------
+
+inline DataSetElement::DataSetElement(const std::string& label, const XsdType& xsd)
+    : xsd_(xsd)
+    , label_(label)
+{ }
+
+inline DataSetElement::DataSetElement(const std::string& label,
+                                      const FromInputXml&,
+                                      const XsdType& xsd)
+    : xsd_(xsd)
+    , label_(label, true)
+{ }
+
+inline DataSetElement::DataSetElement(const DataSetElement& other)
+    : xsd_(other.xsd_)
+    , label_(other.label_)
+    , text_(other.text_)
+    , attributes_(other.attributes_)
+    , children_(other.children_)
+{ }
+
+inline DataSetElement::DataSetElement(DataSetElement&& other)
+    : xsd_(std::move(other.xsd_))
+    , label_(std::move(other.label_))
+    , text_(std::move(other.text_))
+    , attributes_(std::move(other.attributes_))
+    , children_(std::move(other.children_))
+{ }
+
+inline DataSetElement& DataSetElement::operator=(const DataSetElement& other)
+{
+    xsd_ = other.xsd_;
+    label_ = other.label_;
+    text_ = other.text_;
+    attributes_ = other.attributes_;
+    children_ = other.children_;
+    return *this;
+}
+
+inline DataSetElement& DataSetElement::operator=(DataSetElement&& other)
+{
+    xsd_ = std::move(other.xsd_);
+    label_ = std::move(other.label_);
+    text_ = std::move(other.text_);
+    attributes_ = std::move(other.attributes_);
+    children_ = std::move(other.children_);
+    return *this;
+}
+
+inline DataSetElement::~DataSetElement(void) { }
+
+inline bool DataSetElement::operator==(const DataSetElement& other) const
+{
+    return xsd_   == other.xsd_   &&
+           label_ == other.label_ &&
+           text_  == other.text_  &&
+           attributes_ == other.attributes_ &&
+           children_   == other.children_;
+}
+
+inline bool DataSetElement::operator!=(const DataSetElement& other) const
+{ return !(*this == other); }
+
+template<typename T>
+const T& DataSetElement::operator[](size_t index) const
+{ return Child<T>(index); }
+
+template<typename T>
+T& DataSetElement::operator[](size_t index)
+{ return Child<T>(index); }
+
+template<typename T>
+const T& DataSetElement::operator[](const std::string& label) const
+{ return Child<T>(label); }
+
+template<typename T>
+T& DataSetElement::operator[](const std::string& label)
+{ return Child<T>(label); }
+
+inline void DataSetElement::AddChild(const DataSetElement& e)
+{ children_.push_back(e); }
+
+inline std::string& DataSetElement::Attribute(const std::string& name)
+{ return attributes_[name]; }
+
+inline const std::string& DataSetElement::Attribute(const std::string& name) const
+{
+    auto iter = attributes_.find(name);
+    if (iter == attributes_.cend())
+        return SharedNullString();
+    return iter->second;
+}
+
+inline void DataSetElement::Attribute(const std::string& name, const std::string& value)
+{ attributes_[name] = value; }
+
+inline const std::map<std::string, std::string>& DataSetElement::Attributes(void) const
+{ return attributes_; }
+
+inline std::map<std::string, std::string>& DataSetElement::Attributes(void)
+{ return attributes_; }
+
+template<typename T>
+inline const T& DataSetElement::Child(size_t index) const
+{ return static_cast<const T&>(children_.at(index)); }
+
+template<typename T>
+inline T& DataSetElement::Child(size_t index)
+{ return static_cast<T&>(children_.at(index)); }
+
+template<typename T>
+inline const T& DataSetElement::Child(const std::string& label) const
+{ return Child<T>(IndexOf(label)); }
+
+template<typename T>
+inline T& DataSetElement::Child(const std::string& label)
+{
+    const int i = IndexOf(label);
+    if (i >= 0) {
+        assert(static_cast<size_t>(i) < NumChildren());
+        return Child<T>(i);
+    } else {
+        AddChild(DataSetElement(label));
+        return Child<T>(NumChildren()-1);
+    }
+}
+
+inline const std::vector<DataSetElement>& DataSetElement::Children(void) const
+{ return children_; }
+
+inline std::vector<DataSetElement>& DataSetElement::Children(void)
+{ return children_; }
+
+inline const std::string& DataSetElement::ChildText(const std::string& label) const
+{
+    if (!HasChild(label))
+        return SharedNullString();
+    return Child<DataSetElement>(label).Text();
+}
+
+inline std::string& DataSetElement::ChildText(const std::string& label)
+{
+    if (!HasChild(label))
+        AddChild(DataSetElement(label));
+    return Child<DataSetElement>(label).Text();
+}
+
+inline bool DataSetElement::HasAttribute(const std::string& name) const
+{ return attributes_.find(name) != attributes_.cend(); }
+
+inline bool DataSetElement::HasChild(const std::string& label) const
+{ return IndexOf(label) != -1; }
+
+inline int DataSetElement::IndexOf(const std::string& label) const
+{
+    const size_t count = NumChildren();
+    for (size_t i = 0; i < count; ++i) {
+        const DataSetElement& child = children_.at(i);
+        if (child.LocalNameLabel() == label || child.label_ == label)
+            return i;
+    }
+    return -1;
+}
+
+inline const boost::string_ref DataSetElement::LocalNameLabel(void) const
+{ return label_.LocalName(); }
+
+inline const boost::string_ref DataSetElement::PrefixLabel(void) const
+{ return label_.Prefix(); }
+
+inline const std::string& DataSetElement::QualifiedNameLabel(void) const
+{ return label_.QualifiedName(); }
+
+//inline std::string& DataSetElement::Label(void)
+//{ return label_.QualifiedName(); }
+
+inline void DataSetElement::Label(const std::string& label)
+{ label_ = XmlName(label, true); }
+
+inline size_t DataSetElement::NumAttributes(void) const
+{ return attributes_.size(); }
+
+inline size_t DataSetElement::NumChildren(void) const
+{ return children_.size(); }
+
+inline void DataSetElement::RemoveChild(const DataSetElement& e)
+{
+    children_.erase(
+        std::remove(children_.begin(),
+                    children_.end(),
+                    e),
+        children_.end()
+    );
+}
+
+inline void DataSetElement::ChildText(const std::string& label,
+                                         const std::string& text)
+{
+    if (!HasChild(label)) {
+        DataSetElement e(label);
+        e.Text(text);
+        AddChild(e);
+    } else {
+        Child<DataSetElement>(label).Text(text);
+    }
+}
+
+inline bool DataSetElement::IsVerbatimLabel(void) const
+{ return label_.Verbatim(); }
+
+inline const std::string& DataSetElement::Text(void) const
+{ return text_; }
+
+inline std::string& DataSetElement::Text(void)
+{ return text_; }
+
+inline void DataSetElement::Text(const std::string& text)
+{ text_ = text; }
+
+inline const XsdType& DataSetElement::Xsd(void) const
+{ return xsd_; }
+
+// ----------------
+// XmlName
+// ----------------
+
+inline XmlName::XmlName(const std::string& fullName, bool verbatim)
+    : qualifiedName_(fullName)
+    , prefixSize_(0)
+    , localNameOffset_(0)
+    , localNameSize_(0)
+    , verbatim_(verbatim)
+{
+    const size_t colonFound = qualifiedName_.find(':');
+    if (colonFound == std::string::npos || colonFound == 0)
+        localNameSize_ = qualifiedName_.size();
+    else {
+        prefixSize_ = colonFound;
+        localNameSize_ = (qualifiedName_.size() - colonFound) - 1;
+    }
+
+    // adjust for colon if prefix present
+    localNameOffset_ = prefixSize_;
+    if (prefixSize_ != 0)
+        ++localNameOffset_;
+}
+
+inline XmlName::XmlName(const std::string& localName,
+                        const std::string& prefix)
+    : prefixSize_(prefix.size())
+    , localNameOffset_(prefixSize_)
+    , localNameSize_(localName.size())
+    , verbatim_(true)
+{
+    qualifiedName_.clear();
+    qualifiedName_.reserve(localNameSize_+ prefixSize_ + 1);
+    qualifiedName_.append(prefix);
+    if (!qualifiedName_.empty())
+        qualifiedName_.append(1, ':');
+    qualifiedName_.append(localName);
+
+    // adjust for colon if prefix present
+    if (prefixSize_ != 0)
+        ++localNameOffset_;
+}
+
+inline XmlName::XmlName(const XmlName& other)
+    : qualifiedName_(other.qualifiedName_)
+    , prefixSize_(other.prefixSize_)
+    , localNameOffset_(other.localNameOffset_)
+    , localNameSize_(other.localNameSize_)
+    , verbatim_(other.verbatim_)
+{ }
+
+inline XmlName::XmlName(XmlName&& other)
+    : qualifiedName_(std::move(other.qualifiedName_))
+    , prefixSize_(std::move(other.prefixSize_))
+    , localNameOffset_(std::move(other.localNameOffset_))
+    , localNameSize_(std::move(other.localNameSize_))
+    , verbatim_(std::move(other.verbatim_))
+{ }
+
+inline XmlName& XmlName::operator=(const XmlName& other)
+{
+    qualifiedName_   = other.qualifiedName_;
+    prefixSize_      = other.prefixSize_;
+    localNameOffset_ = other.localNameOffset_;
+    localNameSize_   = other.localNameSize_;
+    verbatim_        = other.verbatim_;
+    return *this;
+}
+
+inline XmlName& XmlName::operator=(XmlName&& other)
+{
+    qualifiedName_   = std::move(other.qualifiedName_);
+    prefixSize_      = std::move(other.prefixSize_);
+    localNameOffset_ = std::move(other.localNameOffset_);
+    localNameSize_   = std::move(other.localNameSize_);
+    verbatim_        = std::move(other.verbatim_);
+    return *this;
+}
+
+inline XmlName::~XmlName(void) { }
+
+inline bool XmlName::operator==(const XmlName& other) const
+{ return qualifiedName_ == other.qualifiedName_; }
+
+inline bool XmlName::operator!=(const XmlName& other) const
+{ return !(*this == other); }
+
+inline const boost::string_ref XmlName::LocalName(void) const
+{ return boost::string_ref(qualifiedName_.data() + localNameOffset_, localNameSize_); }
+
+inline const boost::string_ref XmlName::Prefix(void) const
+{ return boost::string_ref(qualifiedName_.data(), prefixSize_); }
+
+inline const std::string& XmlName::QualifiedName(void) const
+{ return qualifiedName_; }
+
+inline bool XmlName::Verbatim(void) const
+{ return verbatim_; }
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/DataSetListElement.h b/include/pbbam/internal/DataSetListElement.h

new file mode 100644 (file)

index 0000000..5c44d25
--- /dev/null
+++ b/include/pbbam/internal/DataSetListElement.h
@@ -0,0 +1,118 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef DATASETLISTELEMENT_H
+#define DATASETLISTELEMENT_H
+
+#include "pbbam/internal/DataSetElement.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+//
+// adds iterators for convenience
+//
+template<class T> class DataSetListElement;
+
+template<class T>
+class DataSetListIteratorBase
+{
+public:
+    bool operator==(const DataSetListIteratorBase<T>& other) const;
+    bool operator!=(const DataSetListIteratorBase<T>& other) const;
+
+protected:
+    DataSetListIteratorBase(const DataSetListElement<T>* parent, size_t i);
+    void ReadNext(void);
+
+protected:
+    const DataSetListElement<T>* parent_;
+    size_t index_;
+};
+
+template<class T>
+class DataSetListIterator : public DataSetListIteratorBase<T>
+{
+public:
+    DataSetListIterator(const DataSetListElement<T>* parent, size_t i);
+    T& operator*(void);
+    T* operator->(void);
+    DataSetListIterator<T>& operator++(void);
+    DataSetListIterator<T> operator++(int);
+};
+
+template<class T>
+class DataSetListConstIterator : public DataSetListIteratorBase<T>
+{
+public:
+    DataSetListConstIterator(const DataSetListElement<T>* parent, size_t i);
+    const T& operator*(void) const;
+    const T* operator->(void) const;
+    DataSetListConstIterator<T>& operator++(void);
+    DataSetListConstIterator<T> operator++(int);
+};
+
+template<class T>
+class DataSetListElement : public DataSetElement
+{
+public:
+    DataSetListElement(const std::string& label, const XsdType& xsd = XsdType::NONE);
+
+// child access through index
+public:
+    const T& operator[](size_t index) const;
+    T& operator[](size_t index);
+    size_t Size(void) const;
+
+// child access through iterators
+public:
+    DataSetListIterator<T> begin(void);
+    DataSetListConstIterator<T> begin(void) const;
+    DataSetListConstIterator<T> cbegin(void) const;
+    DataSetListIterator<T> end(void);
+    DataSetListConstIterator<T> end(void) const;
+    DataSetListConstIterator<T> cend(void) const;
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/DataSetListElement.inl"
+
+#endif // DATASETLISTELEMENT_H
diff --git a/include/pbbam/internal/DataSetListElement.inl b/include/pbbam/internal/DataSetListElement.inl

new file mode 100644 (file)

index 0000000..1479fa5
--- /dev/null
+++ b/include/pbbam/internal/DataSetListElement.inl
@@ -0,0 +1,181 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "pbbam/internal/DataSetListElement.h"
+#include <cassert>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// --------------------
+// DataSetListElement
+// --------------------
+
+template<class T>
+inline DataSetListElement<T>::DataSetListElement(const std::string& label,
+                                                 const XsdType& xsd)
+    : DataSetElement(label, xsd)
+{ }
+
+template<class T>
+inline const T& DataSetListElement<T>::operator[](size_t index) const
+{ return static_cast<const T&>(children_.at(index)); }
+
+template<class T>
+inline T& DataSetListElement<T>::operator[](size_t index)
+{ return static_cast<T&>(children_.at(index)); }
+
+template<class T>
+inline size_t DataSetListElement<T>::Size(void) const
+{ return NumChildren(); }
+
+template<class T>
+inline DataSetListIterator<T> DataSetListElement<T>::begin(void)
+{ return DataSetListIterator<T>(this, 0); }
+
+template<class T>
+inline DataSetListConstIterator<T> DataSetListElement<T>::begin(void) const
+{ return DataSetListConstIterator<T>(this, 0); }
+
+template<class T>
+inline DataSetListConstIterator<T> DataSetListElement<T>::cbegin(void) const
+{ return DataSetListConstIterator<T>(this, 0); }
+
+template<class T>
+inline DataSetListIterator<T> DataSetListElement<T>::end(void)
+{ return DataSetListIterator<T>(this, NumChildren()); }
+
+template<class T>
+inline DataSetListConstIterator<T> DataSetListElement<T>::end(void) const
+{ return DataSetListConstIterator<T>(this, NumChildren()); }
+
+template<class T>
+inline DataSetListConstIterator<T>DataSetListElement<T>::cend(void) const
+{ return DataSetListConstIterator<T>(this, NumChildren()); }
+
+// -------------------------
+// DataSetListIteratorBase
+// -------------------------
+
+template<class T>
+inline bool DataSetListIteratorBase<T>::operator==(const DataSetListIteratorBase<T>& other) const
+{ return parent_ == other.parent_ &&
+         index_ == other.index_;
+}
+
+template<class T>
+inline bool DataSetListIteratorBase<T>::operator!=(const DataSetListIteratorBase<T>& other) const
+{ return !(*this == other); }
+
+template<class T>
+inline DataSetListIteratorBase<T>::DataSetListIteratorBase(const DataSetListElement<T>* parent, size_t i)
+    : parent_(parent)
+    , index_(i)
+{ }
+
+template<class T>
+inline void DataSetListIteratorBase<T>::ReadNext(void)
+{
+    if (index_ >= parent_->NumChildren()) {
+        parent_ = nullptr;
+        return;
+    }
+    ++index_;
+}
+
+// ---------------------
+// DataSetListIterator
+// ---------------------
+
+template<class T>
+inline DataSetListIterator<T>::DataSetListIterator(const DataSetListElement<T>* parent, size_t i)
+    : DataSetListIteratorBase<T>(parent, i)
+{ }
+
+template<class T>
+inline T& DataSetListIterator<T>::operator*(void)
+{ return DataSetListIteratorBase<T>::parent_->template Child<T>(DataSetListIteratorBase<T>::index_); }
+
+template<class T>
+inline T* DataSetListIterator<T>::operator->(void)
+{ return &(operator*()); }
+
+template<class T>
+inline DataSetListIterator<T>& DataSetListIterator<T>::operator++(void)
+{ DataSetListIteratorBase<T>::ReadNext(); return *this; }
+
+template<class T>
+inline DataSetListIterator<T> DataSetListIterator<T>::operator++(int)
+{
+    DataSetListIterator<T> result(*this);
+    ++(*this);
+    return result;
+}
+
+// --------------------------
+// DataSetListConstIterator
+// --------------------------
+
+template<class T>
+inline DataSetListConstIterator<T>::DataSetListConstIterator(const DataSetListElement<T>* parent, size_t i)
+    : DataSetListIteratorBase<T>(parent, i)
+{ }
+
+template<class T>
+inline const T& DataSetListConstIterator<T>::operator*(void) const
+{ return DataSetListIteratorBase<T>::parent_->template Child<T>(DataSetListIteratorBase<T>::index_); }
+
+template<class T>
+inline const T* DataSetListConstIterator<T>::operator->(void) const
+{ return &(operator*()); }
+
+template<class T>
+inline DataSetListConstIterator<T>& DataSetListConstIterator<T>::operator++(void)
+{ DataSetListIteratorBase<T>::ReadNext(); return *this; }
+
+template<class T>
+inline DataSetListConstIterator<T> DataSetListConstIterator<T>::operator++(int)
+{
+    DataSetListConstIterator<T> result(*this);
+    ++(*this);
+    return result;
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/DataSetTypes.inl b/include/pbbam/internal/DataSetTypes.inl

new file mode 100644 (file)

index 0000000..dbcbd26
--- /dev/null
+++ b/include/pbbam/internal/DataSetTypes.inl
@@ -0,0 +1,154 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file DataSetTypes.inl
+/// \brief Inline implementations for the public DataSet component classes.
+//
+// Author: Derek Barnett
+
+#include "pbbam/DataSetTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+// -------------
+// DataSetBase
+// --------------
+
+inline const NamespaceRegistry& DataSetBase::Namespaces(void) const
+{ return registry_; }
+
+inline NamespaceRegistry& DataSetBase::Namespaces(void)
+{ return registry_; }
+
+// ---------------------
+// DataSetMetadata
+// ---------------------
+
+inline const std::string& DataSetMetadata::NumRecords(void) const
+{ return ChildText("NumRecords"); }
+
+inline std::string& DataSetMetadata::NumRecords(void)
+{ return ChildText("NumRecords"); }
+
+inline DataSetMetadata& DataSetMetadata::NumRecords(const std::string& numRecords)
+{ ChildText("NumRecords", numRecords); return *this; }
+
+inline const std::string& DataSetMetadata::TotalLength(void) const
+{ return ChildText("TotalLength"); }
+
+inline std::string& DataSetMetadata::TotalLength(void)
+{ return ChildText("TotalLength"); }
+
+inline DataSetMetadata& DataSetMetadata::TotalLength(const std::string& totalLength)
+{ ChildText("TotalLength", totalLength); return *this; }
+
+// ----------
+// Property
+// ----------
+
+inline const std::string& Property::Name(void) const
+{ return Attribute("Name"); }
+
+inline std::string& Property::Name(void)
+{ return Attribute("Name"); }
+
+inline Property& Property::Name(const std::string& name)
+{ Attribute("Name", name); return *this; }
+
+inline const std::string& Property::Operator(void) const
+{ return Attribute("Operator"); }
+
+inline std::string& Property::Operator(void)
+{ return Attribute("Operator"); }
+
+inline Property& Property::Operator(const std::string& op)
+{ Attribute("Operator", op); return *this; }
+
+inline const std::string& Property::Value(void) const
+{ return Attribute("Value"); }
+
+inline std::string& Property::Value(void)
+{ return Attribute("Value"); }
+
+inline Property& Property::Value(const std::string& value)
+{ Attribute("Value", value); return *this; }
+
+// ------------
+// Provenance
+// ------------
+
+inline const std::string& Provenance::CreatedBy(void) const
+{ return Attribute("CreatedBy"); }
+
+inline std::string& Provenance::CreatedBy(void)
+{ return Attribute("CreatedBy"); }
+
+inline Provenance& Provenance::CreatedBy(const std::string& createdBy)
+{ Attribute("CreatedBy", createdBy); return *this; }
+
+inline const std::string& Provenance::CommonServicesInstanceId(void) const
+{ return ChildText("CommonServicesInstanceId"); }
+
+inline std::string& Provenance::CommonServicesInstanceId(void)
+{ return ChildText("CommonServicesInstanceId"); }
+
+inline Provenance& Provenance::CommonServicesInstanceId(const std::string& id)
+{ ChildText("CommonServicesInstanceId", id); return *this; }
+
+inline const std::string& Provenance::CreatorUserId(void) const
+{ return ChildText("CreatorUserId"); }
+
+inline std::string& Provenance::CreatorUserId(void)
+{ return ChildText("CreatorUserId"); }
+
+inline Provenance& Provenance::CreatorUserId(const std::string& id)
+{ ChildText("CreatorUserId", id); return *this; }
+
+inline const std::string& Provenance::ParentJobId(void) const
+{ return ChildText("ParentJobId"); }
+
+inline std::string& Provenance::ParentJobId(void)
+{ return ChildText("ParentJobId"); }
+
+inline Provenance& Provenance::ParentJobId(const std::string& id)
+{ ChildText("ParentJobId", id); return *this; }
+
+inline Provenance& Provenance::ParentTool(const PacBio::BAM::ParentTool& tool)
+{ ParentTool() = tool; return *this; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/FastaSequence.inl b/include/pbbam/internal/FastaSequence.inl

new file mode 100644 (file)

index 0000000..fe28170
--- /dev/null
+++ b/include/pbbam/internal/FastaSequence.inl
@@ -0,0 +1,60 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file FastaSequence.inl
+/// \brief Inline implementations for the FastaSequence class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/FastaSequence.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline FastaSequence::FastaSequence(std::string name,
+                                    std::string bases)
+    : name_{std::move(name)}
+    , bases_{std::move(bases)}
+{ }
+
+inline std::string FastaSequence::Bases(void) const
+{ return bases_; }
+
+inline std::string FastaSequence::Name(void) const
+{ return name_; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/Frames.inl b/include/pbbam/internal/Frames.inl

new file mode 100644 (file)

index 0000000..37cb64b
--- /dev/null
+++ b/include/pbbam/internal/Frames.inl
@@ -0,0 +1,93 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Frames.inl
+/// \brief Inline implementations for the Frames class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Frames.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline const std::vector<uint16_t>& Frames::Data(void) const
+{ return data_; }
+
+inline std::vector<uint16_t>& Frames::DataRaw(void)
+{ return data_; }
+
+inline std::vector<uint8_t> Frames::Encode(void) const
+{ return Frames::Encode(data_); }
+
+inline Frames& Frames::Data(const std::vector<uint16_t>& frames)
+{ data_ = frames; return *this; }
+
+inline Frames& Frames::Data(std::vector<uint16_t>&& frames)
+{ data_ = std::move(frames); return *this; }
+
+inline std::vector<uint16_t>::const_iterator Frames::begin(void) const
+{ return data_.begin(); }
+
+inline std::vector<uint16_t>::iterator Frames::begin(void)
+{ return data_.begin(); }
+
+inline std::vector<uint16_t>::const_iterator Frames::cbegin(void) const
+{ return data_.cbegin(); }
+
+inline std::vector<uint16_t>::const_iterator Frames::cend(void) const
+{ return data_.cend(); }
+
+inline std::vector<uint16_t>::const_iterator Frames::end(void) const
+{ return data_.end(); }
+
+inline std::vector<uint16_t>::iterator Frames::end(void)
+{ return data_.end(); }
+
+inline size_t Frames::size(void) const
+{ return data_.size(); }
+
+inline bool Frames::empty(void) const
+{ return data_.empty(); }
+
+inline bool Frames::operator==(const Frames& other) const
+{ return data_ == other.data_; }
+
+inline bool Frames::operator!=(const Frames& other) const
+{ return !(*this == other); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/GenomicInterval.inl b/include/pbbam/internal/GenomicInterval.inl

new file mode 100644 (file)

index 0000000..07c18ef
--- /dev/null
+++ b/include/pbbam/internal/GenomicInterval.inl
@@ -0,0 +1,91 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file GenomicInterval.inl
+/// \brief Inline implementations for the GenomicInterval class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/GenomicInterval.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline GenomicInterval::~GenomicInterval(void) { }
+
+inline std::string GenomicInterval::Name(void) const
+{ return name_; }
+
+inline GenomicInterval& GenomicInterval::Name(const std::string& name)
+{ name_ = name; return *this; }
+
+inline PacBio::BAM::Interval<Position> GenomicInterval::Interval(void) const
+{ return interval_; }
+
+inline GenomicInterval& GenomicInterval::Interval(const PacBio::BAM::Interval<Position>& interval)
+{ interval_ = interval; return *this; }
+
+inline bool GenomicInterval::IsValid(void) const
+{
+    return !name_.empty() &&
+           interval_.Start() >= 0 &&
+           interval_.Stop()  >= 0 &&
+           interval_.IsValid();
+}
+
+inline size_t GenomicInterval::Length(void) const
+{ return interval_.Length(); }
+
+inline Position GenomicInterval::Start(void) const
+{ return interval_.Start(); }
+
+inline GenomicInterval& GenomicInterval::Start(const Position start)
+{ interval_.Start(start); return *this; }
+
+inline Position GenomicInterval::Stop(void) const
+{ return interval_.Stop(); }
+
+inline GenomicInterval& GenomicInterval::Stop(const Position stop)
+{ interval_.Stop(stop); return *this; }
+
+inline bool GenomicInterval::operator==(const GenomicInterval& other) const
+{ return name_ == other.name_ && interval_ == other.interval_; }
+
+inline bool GenomicInterval::operator!=(const GenomicInterval& other) const
+{ return !(*this == other); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/Interval.inl b/include/pbbam/internal/Interval.inl

new file mode 100644 (file)

index 0000000..e9c7edd
--- /dev/null
+++ b/include/pbbam/internal/Interval.inl
@@ -0,0 +1,118 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Interval.inl
+/// \brief Inline implementations for the Interval class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Interval.h"
+
+namespace PacBio {
+namespace BAM {
+
+template<typename T>
+inline Interval<T>::Interval(void)
+    : data_(boost::icl::discrete_interval<T>::right_open(0,0))
+{ }
+
+template<typename T>
+inline Interval<T>::Interval(const T val)
+    : data_(boost::icl::discrete_interval<T>::right_open(val,val+1))
+{ }
+
+template<typename T>
+inline Interval<T>::Interval(const T start, const T stop)
+    : data_(boost::icl::discrete_interval<T>::right_open(start,stop))
+{ }
+
+template<typename T>
+inline Interval<T>::Interval(const Interval<T>& other)
+    : data_(boost::icl::discrete_interval<T>::right_open(other.Start(), other.Stop()))
+{ }
+
+template<typename T>
+inline bool Interval<T>::operator==(const Interval<T>& other) const
+{ return data_ == other.data_; }
+
+template<typename T>
+inline bool Interval<T>::operator!=(const Interval<T>& other) const
+{ return !(data_ == other.data_); }
+
+template<typename T>
+inline bool Interval<T>::CoveredBy(const Interval<T>& other) const
+{ return boost::icl::within(data_, other.data_); }
+
+template<typename T>
+inline bool Interval<T>::Covers(const Interval<T>& other) const
+{ return boost::icl::contains(data_, other.data_); }
+
+template<typename T>
+inline bool Interval<T>::Intersects(const Interval<T>& other) const
+{ return boost::icl::intersects(data_, other.data_); }
+
+template<typename T>
+inline bool Interval<T>::IsValid(void) const
+{ return !boost::icl::is_empty(data_); }
+
+template<typename T>
+inline size_t Interval<T>::Length(void) const
+{ return boost::icl::length(data_); }
+
+template<typename T>
+inline T Interval<T>::Start(void) const
+{ return data_.lower(); }
+
+template<typename T>
+inline Interval<T>& Interval<T>::Start(const T& start)
+{
+    data_ = boost::icl::discrete_interval<T>::right_open(start, data_.upper());
+    return *this;
+}
+
+template<typename T>
+inline T Interval<T>::Stop(void) const
+{ return data_.upper(); }
+
+template<typename T>
+inline Interval<T>& Interval<T>::Stop(const T& stop)
+{
+    data_ = boost::icl::discrete_interval<T>::right_open(data_.lower(), stop);
+    return *this;
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/PbiBasicTypes.inl b/include/pbbam/internal/PbiBasicTypes.inl

new file mode 100644 (file)

index 0000000..229841e
--- /dev/null
+++ b/include/pbbam/internal/PbiBasicTypes.inl
@@ -0,0 +1,70 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiBasicTypes.inl
+/// \brief Inline implementations for the basic data structures used in PBI lookups.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiBasicTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline IndexResultBlock::IndexResultBlock(void)
+    : firstIndex_(0)
+    , numReads_(0)
+    , virtualOffset_(-1)
+{ }
+
+inline IndexResultBlock::IndexResultBlock(size_t idx, size_t numReads)
+    : firstIndex_(idx)
+    , numReads_(numReads)
+    , virtualOffset_(-1)
+{ }
+
+inline bool IndexResultBlock::operator==(const IndexResultBlock& other) const
+{
+    return firstIndex_ == other.firstIndex_ &&
+           numReads_ == other.numReads_ &&
+           virtualOffset_ == other.virtualOffset_;
+}
+
+inline bool IndexResultBlock::operator!=(const IndexResultBlock& other) const
+{ return !(*this == other); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/PbiFilter.inl b/include/pbbam/internal/PbiFilter.inl

new file mode 100644 (file)

index 0000000..18c26d0
--- /dev/null
+++ b/include/pbbam/internal/PbiFilter.inl
@@ -0,0 +1,312 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiFilter.inl
+/// \brief Inline implementations for the PbiFilter class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiFilter.h"
+#include <algorithm>
+#include <iostream>
+#include <map>
+#include <set>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// \internal
+///
+/// This class wraps a the basic PBI filter (whether property filter or some operator
+/// e.g. union, intersect, etc.). The wrapper allows PbiFilters to hold heterogeneous,
+/// recursive filter types - without exposing pointers & worrying about memory ownership
+/// issues between client & library.
+///
+/// Filters can be given by value from client code and we will wrap them for composition.
+///
+/// \code{.cpp}
+///    PbiFilter f1(PbiZmwFilter(42));
+///    PbiFilter f2;
+///    f2.Add(PbiQueryLengthFilter(3000, GREATER_THAN_EQUAL));
+///    f2.Add(MyApplicationCustomFilter("foo"));
+///    PbiFilter intersect = PbiFilter::Intersect(f1, f2);
+///    ...
+/// \endcode
+///
+struct FilterWrapper
+{
+public:
+    template<typename T> FilterWrapper(T x);
+
+    FilterWrapper(const FilterWrapper& other);
+    FilterWrapper(FilterWrapper&&) noexcept = default;
+    FilterWrapper& operator=(const FilterWrapper& other);
+    FilterWrapper& operator=(FilterWrapper&&) noexcept = default;
+    ~FilterWrapper(void);
+
+public:
+    bool Accepts(const PacBio::BAM::PbiRawData& idx, const size_t row) const;
+
+private:
+    struct WrapperInterface
+    {
+        virtual ~WrapperInterface(void) = default;
+        virtual WrapperInterface* Clone(void) const =0;
+        virtual bool Accepts(const PacBio::BAM::PbiRawData& idx,
+                             const size_t row) const =0;
+    };
+
+    template<typename T>
+    struct WrapperImpl : public WrapperInterface
+    {
+        WrapperImpl(T x);
+        WrapperImpl(const WrapperImpl& other);
+        WrapperInterface* Clone(void) const;
+        bool Accepts(const PacBio::BAM::PbiRawData& idx, const size_t row) const;
+        T data_;
+    };
+
+private:
+    std::unique_ptr<WrapperInterface> self_;
+};
+
+// ---------------
+// FilterWrapper
+// ---------------
+
+template<typename T>
+inline FilterWrapper::FilterWrapper(T x)
+    : self_(new WrapperImpl<T>(std::move(x)))
+{ }
+
+inline FilterWrapper::FilterWrapper(const FilterWrapper& other)
+    : self_(other.self_->Clone())
+{ }
+
+inline FilterWrapper& FilterWrapper::operator=(const FilterWrapper& other)
+{
+    self_.reset(other.self_->Clone());
+    return *this;
+}
+
+inline FilterWrapper::~FilterWrapper(void) { }
+
+inline bool FilterWrapper::Accepts(const PbiRawData& idx, const size_t row) const
+{ return self_->Accepts(idx, row); }
+
+// ----------------
+// WrapperImpl<T>
+// ----------------
+
+template<typename T>
+inline FilterWrapper::WrapperImpl<T>::WrapperImpl(T x)
+    : FilterWrapper::WrapperInterface()
+    , data_(std::move(x))
+{
+    BOOST_CONCEPT_ASSERT((PbiFilterConcept<T>));
+}
+
+template<typename T>
+inline FilterWrapper::WrapperImpl<T>::WrapperImpl(const WrapperImpl& other)
+    : FilterWrapper::WrapperInterface()
+    , data_(other.data_)
+{ }
+
+template<typename T>
+inline FilterWrapper::WrapperInterface* FilterWrapper::WrapperImpl<T>::Clone(void) const
+{ return new WrapperImpl(*this); }
+
+template<typename T>
+inline bool FilterWrapper::WrapperImpl<T>::Accepts(const PbiRawData& idx,
+                                                   const size_t row) const
+{ return data_.Accepts(idx, row); }
+
+struct PbiFilterPrivate
+{
+    PbiFilterPrivate(PbiFilter::CompositionType type)
+        : type_(type)
+    { }
+
+    template<typename T>
+    void Add(T&& filter)
+    {
+        filters_.emplace_back(std::move(filter));
+    }
+
+    std::unique_ptr<internal::PbiFilterPrivate> DeepCopy(void)
+    {
+        auto copy = std::unique_ptr<PbiFilterPrivate>{ new PbiFilterPrivate{type_} };
+        copy->filters_ = this->filters_;
+        return copy;
+    }
+
+    bool Accepts(const PbiRawData& idx, const size_t row) const
+    {
+        // no filter -> accepts every record
+        if (filters_.empty())
+            return true;
+
+        // intersection of child filters
+        if (type_ == PbiFilter::INTERSECT) {
+            for (const auto& filter : filters_) {
+                if (!filter.Accepts(idx, row))
+                    return false; // break early on failure
+            }
+            return true; // all passed
+        }
+
+        // union of child filters
+        else if (type_ == PbiFilter::UNION) {
+            for (const auto& filter : filters_) {
+                if (filter.Accepts(idx, row))
+                    return true; // break early on pass
+            }
+            return false; // none passed
+        }
+
+        else
+            //assert(false); // invalid composite filter type
+            throw std::runtime_error("invalid composite filter type in PbiFilterPrivate::Accepts");
+    }
+
+    PbiFilter::CompositionType type_;
+    std::vector<FilterWrapper> filters_;
+};
+
+} // namespace internal
+
+inline PbiFilter::PbiFilter(const CompositionType type)
+    : d_{ new internal::PbiFilterPrivate{ type } }
+{ }
+
+template<typename T> inline
+PbiFilter::PbiFilter(const T& filter)
+    : d_{ new internal::PbiFilterPrivate{ PbiFilter::INTERSECT } }
+{
+    Add(filter);
+}
+
+template<typename T> inline
+PbiFilter::PbiFilter(T&& filter)
+    : d_{ new internal::PbiFilterPrivate{ PbiFilter::INTERSECT } }
+{
+    Add(std::move(filter));
+}
+
+inline PbiFilter::PbiFilter(const std::vector<PbiFilter>& filters)
+    : d_{ new internal::PbiFilterPrivate{ PbiFilter::INTERSECT } }
+{
+    Add(filters);
+}
+
+inline PbiFilter::PbiFilter(std::vector<PbiFilter>&& filters)
+    : d_{ new internal::PbiFilterPrivate{ PbiFilter::INTERSECT} }
+{
+    Add(std::move(filters));
+}
+
+inline PbiFilter::PbiFilter(const PbiFilter& other)
+    : d_{ other.d_->DeepCopy() }
+{ }
+
+inline PbiFilter::PbiFilter(PbiFilter&& other) noexcept
+    : d_{ std::move(other.d_) }
+{ }
+
+inline PbiFilter& PbiFilter::operator=(const PbiFilter& other)
+{
+    d_ = other.d_->DeepCopy();
+    return *this;
+}
+
+inline PbiFilter& PbiFilter::operator=(PbiFilter&& other) noexcept
+{
+    d_ = std::move(other.d_);
+    return *this;
+}
+
+inline PbiFilter::~PbiFilter(void) { }
+
+inline bool PbiFilter::Accepts(const PacBio::BAM::PbiRawData& idx,
+                               const size_t row) const
+{ return d_->Accepts(idx, row); }
+
+template<typename T>
+inline PbiFilter& PbiFilter::Add(const T& filter)
+{
+    T copy = filter;
+    return Add(std::move(copy));
+}
+
+template<typename T>
+inline PbiFilter& PbiFilter::Add(T&& filter)
+{
+    d_->Add(std::move(filter));
+    return *this;
+}
+
+inline PbiFilter& PbiFilter::Add(const PbiFilter& filter)
+{
+    PbiFilter copy = filter;
+    return Add(std::move(copy));
+}
+
+inline PbiFilter& PbiFilter::Add(PbiFilter&& filter)
+{
+    d_->Add(std::move(filter));
+    return *this;
+}
+
+inline PbiFilter& PbiFilter::Add(const std::vector<PbiFilter>& filters)
+{
+    std::vector<PbiFilter> copy = filters;
+    return Add(std::move(copy));
+}
+
+inline PbiFilter& PbiFilter::Add(std::vector<PbiFilter>&& filters)
+{
+    for (auto&& filter : filters)
+        d_->Add(std::move(filter));
+    return *this;
+}
+
+inline bool PbiFilter::IsEmpty(void) const
+{ return d_->filters_.empty(); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/PbiFilterTypes.inl b/include/pbbam/internal/PbiFilterTypes.inl

new file mode 100644 (file)

index 0000000..a7a8787
--- /dev/null
+++ b/include/pbbam/internal/PbiFilterTypes.inl
@@ -0,0 +1,548 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiFilterTypes.inl
+/// \brief Inline implementations for the built-in PBI filters.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiFilterTypes.h"
+#include <cassert>
+#include <stdexcept>
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+
+template <typename T>
+inline FilterBase<T>::FilterBase(const T& value, const Compare::Type cmp)
+    : value_(value)
+    , cmp_(cmp)
+{ }
+
+template <typename T>
+inline FilterBase<T>::FilterBase(T&& value, const Compare::Type cmp)
+    : value_(std::move(value))
+    , cmp_(cmp)
+{ }
+
+template <typename T>
+inline FilterBase<T>::FilterBase(const std::vector<T>& values)
+    : multiValue_(values)
+{ }
+
+template <typename T>
+inline FilterBase<T>::FilterBase(std::vector<T>&& values)
+    : multiValue_(std::move(values))
+{ }
+
+template<typename T>
+inline bool FilterBase<T>::CompareHelper(const T& lhs) const
+{
+    if (multiValue_ == boost::none)
+        return CompareSingleHelper(lhs);
+    else
+        return CompareMultiHelper(lhs);
+}
+
+template<typename T>
+inline bool FilterBase<T>::CompareMultiHelper(const T& lhs) const
+{
+    // check provided value against all filter criteria,
+    // return true on any exact match
+    auto iter = multiValue_.get().cbegin();
+    const auto end  = multiValue_.get().cend();
+    for (; iter != end; ++iter) {
+        if (*iter == lhs)
+            return true;
+    }
+    return false; // no matches
+}
+
+template<typename T>
+inline bool FilterBase<T>::CompareSingleHelper(const T& lhs) const
+{
+    switch(cmp_) {
+        case Compare::EQUAL:              return lhs == value_;
+        case Compare::LESS_THAN:          return lhs < value_;
+        case Compare::LESS_THAN_EQUAL:    return lhs <= value_;
+        case Compare::GREATER_THAN:       return lhs > value_;
+        case Compare::GREATER_THAN_EQUAL: return lhs >= value_;
+        case Compare::NOT_EQUAL:          return lhs != value_;
+        default:
+            assert(false);
+            throw std::runtime_error("unsupported compare type requested");
+    }
+}
+
+template<>
+inline bool FilterBase<LocalContextFlags>::CompareSingleHelper(const LocalContextFlags& lhs) const
+{
+    switch(cmp_) {
+        case Compare::EQUAL:              return lhs == value_;
+        case Compare::LESS_THAN:          return lhs < value_;
+        case Compare::LESS_THAN_EQUAL:    return lhs <= value_;
+        case Compare::GREATER_THAN:       return lhs > value_;
+        case Compare::GREATER_THAN_EQUAL: return lhs >= value_;
+        case Compare::NOT_EQUAL:          return lhs != value_;
+        case Compare::CONTAINS:           return ((lhs & value_) != 0);
+        case Compare::NOT_CONTAINS:       return ((lhs & value_) == 0);
+
+        default:
+            assert(false);
+            throw std::runtime_error("unsupported compare type requested");
+    }
+}
+
+// BarcodeDataFilterBase
+
+template<typename T, BarcodeLookupData::Field field>
+inline BarcodeDataFilterBase<T, field>::BarcodeDataFilterBase(const T& value, const Compare::Type cmp)
+    : FilterBase<T>(value, cmp)
+{ }
+
+template<typename T, BarcodeLookupData::Field field>
+inline BarcodeDataFilterBase<T, field>::BarcodeDataFilterBase(T&& value, const Compare::Type cmp)
+    : FilterBase<T>(std::move(value), cmp)
+{ }
+
+template<typename T, BarcodeLookupData::Field field>
+inline BarcodeDataFilterBase<T, field>::BarcodeDataFilterBase(const std::vector<T>& values)
+    : FilterBase<T>(values)
+{ }
+
+template<typename T, BarcodeLookupData::Field field>
+inline BarcodeDataFilterBase<T, field>::BarcodeDataFilterBase(std::vector<T>&& values)
+    : FilterBase<T>(std::move(values))
+{ }
+
+template<typename T, BarcodeLookupData::Field field>
+inline bool BarcodeDataFilterBase<T, field>::BarcodeDataFilterBase::Accepts(const PbiRawData& idx,
+                                           const size_t row) const
+{
+    const PbiRawBarcodeData& barcodeData = idx.BarcodeData();
+    switch (field) {
+        case BarcodeLookupData::BC_FORWARD: return FilterBase<T>::CompareHelper(barcodeData.bcForward_.at(row));
+        case BarcodeLookupData::BC_REVERSE: return FilterBase<T>::CompareHelper(barcodeData.bcReverse_.at(row));
+        case BarcodeLookupData::BC_QUALITY: return FilterBase<T>::CompareHelper(barcodeData.bcQual_.at(row));
+        default:
+            assert(false);
+            throw std::runtime_error("unsupported BarcodeData field requested");
+    }
+}
+
+// BasicDataFilterBase
+
+template<typename T, BasicLookupData::Field field>
+inline BasicDataFilterBase<T, field>::BasicDataFilterBase(const T& value, const Compare::Type cmp)
+    : FilterBase<T>(value, cmp)
+{ }
+
+template<typename T, BasicLookupData::Field field>
+inline BasicDataFilterBase<T, field>::BasicDataFilterBase(T&& value, const Compare::Type cmp)
+    : FilterBase<T>(std::move(value), cmp)
+{ }
+
+template<typename T, BasicLookupData::Field field>
+inline BasicDataFilterBase<T, field>::BasicDataFilterBase(const std::vector<T>& values)
+    : FilterBase<T>(values)
+{ }
+
+template<typename T, BasicLookupData::Field field>
+inline BasicDataFilterBase<T, field>::BasicDataFilterBase(std::vector<T>&& values)
+    : FilterBase<T>(std::move(values))
+{ }
+
+template<typename T, BasicLookupData::Field field>
+inline bool BasicDataFilterBase<T, field>::BasicDataFilterBase::Accepts(const PbiRawData& idx,
+                                                                        const size_t row) const
+{
+    const PbiRawBasicData& basicData = idx.BasicData();
+    switch (field) {
+        case BasicLookupData::RG_ID:        return FilterBase<T>::CompareHelper(basicData.rgId_.at(row));
+        case BasicLookupData::Q_START:      return FilterBase<T>::CompareHelper(basicData.qStart_.at(row));
+        case BasicLookupData::Q_END:        return FilterBase<T>::CompareHelper(basicData.qEnd_.at(row));
+        case BasicLookupData::ZMW:          return FilterBase<T>::CompareHelper(basicData.holeNumber_.at(row));
+        case BasicLookupData::READ_QUALITY: return FilterBase<T>::CompareHelper(basicData.readQual_.at(row));
+        //   BasicLookupData::CONTEXT_FLAG has its own specialization
+        default:
+            assert(false);
+            throw std::runtime_error("unsupported BasicData field requested");
+    }
+}
+
+// this typedef exists purely so that the next method signature isn't 2 screen widths long
+typedef BasicDataFilterBase<LocalContextFlags, BasicLookupData::CONTEXT_FLAG> LocalContextFilter__;
+
+template<>
+inline bool LocalContextFilter__::BasicDataFilterBase::Accepts(const PbiRawData& idx,
+                                                               const size_t row) const
+{
+    const PbiRawBasicData& basicData = idx.BasicData();
+    const LocalContextFlags rowFlags = static_cast<LocalContextFlags>(basicData.ctxtFlag_.at(row));
+    return FilterBase<LocalContextFlags>::CompareHelper(rowFlags);
+}
+
+template<typename T, MappedLookupData::Field field>
+inline MappedDataFilterBase<T, field>::MappedDataFilterBase(const T& value, const Compare::Type cmp)
+    : FilterBase<T>(value, cmp)
+{ }
+
+template<typename T, MappedLookupData::Field field>
+inline MappedDataFilterBase<T, field>::MappedDataFilterBase(T&& value, const Compare::Type cmp)
+    : FilterBase<T>(std::move(value), cmp)
+{ }
+
+template<typename T, MappedLookupData::Field field>
+inline MappedDataFilterBase<T, field>::MappedDataFilterBase(const std::vector<T>& values)
+    : FilterBase<T>(values)
+{ }
+
+template<typename T, MappedLookupData::Field field>
+inline MappedDataFilterBase<T, field>::MappedDataFilterBase(std::vector<T>&& values)
+    : FilterBase<T>(std::move(values))
+{ }
+
+template<>
+inline bool MappedDataFilterBase<Strand, MappedLookupData::STRAND>::MappedDataFilterBase::Accepts(const PbiRawData& idx,
+                                                                                                  const size_t row) const
+{
+    const PbiRawMappedData& mappedData = idx.MappedData();
+    const Strand strand = (mappedData.revStrand_.at(row) == 1 ? Strand::REVERSE : Strand::FORWARD);
+    return FilterBase<Strand>::CompareHelper(strand);
+}
+
+template<typename T, MappedLookupData::Field field>
+inline bool MappedDataFilterBase<T, field>::MappedDataFilterBase::Accepts(const PbiRawData& idx,
+                                                                          const size_t row) const
+{
+    const PbiRawMappedData& mappedData = idx.MappedData();
+    switch (field) {
+        case MappedLookupData::T_ID:        return FilterBase<T>::CompareHelper(mappedData.tId_.at(row));
+        case MappedLookupData::T_START:     return FilterBase<T>::CompareHelper(mappedData.tStart_.at(row));
+        case MappedLookupData::T_END:       return FilterBase<T>::CompareHelper(mappedData.tEnd_.at(row));
+        case MappedLookupData::A_START:     return FilterBase<T>::CompareHelper(mappedData.aStart_.at(row));
+        case MappedLookupData::A_END:       return FilterBase<T>::CompareHelper(mappedData.aEnd_.at(row));
+        case MappedLookupData::N_M:         return FilterBase<T>::CompareHelper(mappedData.nM_.at(row));
+        case MappedLookupData::N_MM:        return FilterBase<T>::CompareHelper(mappedData.nMM_.at(row));
+        case MappedLookupData::N_DEL:       return FilterBase<T>::CompareHelper(mappedData.NumDeletedBasesAt(row));
+        case MappedLookupData::N_INS:       return FilterBase<T>::CompareHelper(mappedData.NumInsertedBasesAt(row));
+        case MappedLookupData::MAP_QUALITY: return FilterBase<T>::CompareHelper(mappedData.mapQV_.at(row));
+        default:
+            assert(false);
+            throw std::runtime_error("unsupported MappedData field requested");
+    }
+}
+
+} // namespace internal
+
+// PbiAlignedEndFilter
+
+inline PbiAlignedEndFilter::PbiAlignedEndFilter(const uint32_t position, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<uint32_t, MappedLookupData::A_END>(position, cmp)
+{ }
+
+// PbiAlignedLengthFilter
+
+inline PbiAlignedLengthFilter::PbiAlignedLengthFilter(const uint32_t length, const Compare::Type cmp)
+    : internal::FilterBase<uint32_t>(length, cmp)
+{ }
+
+// PbiAlignedStartFilter
+
+inline PbiAlignedStartFilter::PbiAlignedStartFilter(const uint32_t position, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<uint32_t, MappedLookupData::A_START>(position, cmp)
+{ }
+
+// PbiAlignedStrandFilter
+
+inline PbiAlignedStrandFilter::PbiAlignedStrandFilter(const Strand strand, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<Strand, MappedLookupData::STRAND>(strand, cmp)
+{
+    if (cmp != Compare::EQUAL && cmp != Compare::NOT_EQUAL) {
+        auto msg = std::string{ "Compare type: " };
+        msg += Compare::TypeToName(cmp);
+        msg += " not supported for PbiAlignedStrandFilter (use one of Compare::EQUAL or Compare::NOT_EQUAL).";
+        throw std::runtime_error(msg);
+    }
+}
+
+// PbiBarcodeFilter
+
+inline PbiBarcodeFilter::PbiBarcodeFilter(const int16_t barcode, const Compare::Type cmp)
+    : compositeFilter_{ PbiFilter::Union({ PbiBarcodeForwardFilter{barcode,cmp},
+                                           PbiBarcodeReverseFilter{barcode,cmp}
+                                         })
+                      }
+{ }
+
+inline PbiBarcodeFilter::PbiBarcodeFilter(const std::vector<int16_t>& whitelist)
+    : compositeFilter_{ PbiFilter::Union({ PbiBarcodeForwardFilter{whitelist},
+                                           PbiBarcodeReverseFilter{whitelist}
+                                         })
+                      }
+{ }
+
+inline PbiBarcodeFilter::PbiBarcodeFilter(std::vector<int16_t>&& whitelist)
+    : compositeFilter_{ PbiFilter::Union({ PbiBarcodeForwardFilter{std::move(whitelist)},
+                                           PbiBarcodeReverseFilter{std::move(whitelist)}
+                                         })
+                      }
+{ }
+
+inline bool PbiBarcodeFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{ return compositeFilter_.Accepts(idx, row); }
+
+// PbiBarcodeForwardFilter
+
+inline PbiBarcodeForwardFilter::PbiBarcodeForwardFilter(const int16_t bcFwdId, const Compare::Type cmp)
+    : internal::BarcodeDataFilterBase<int16_t, BarcodeLookupData::BC_FORWARD>(bcFwdId, cmp)
+{ }
+
+inline PbiBarcodeForwardFilter::PbiBarcodeForwardFilter(const std::vector<int16_t>& whitelist)
+    : internal::BarcodeDataFilterBase<int16_t, BarcodeLookupData::BC_FORWARD>(whitelist)
+{ }
+
+inline PbiBarcodeForwardFilter::PbiBarcodeForwardFilter(std::vector<int16_t>&& whitelist)
+    : internal::BarcodeDataFilterBase<int16_t, BarcodeLookupData::BC_FORWARD>(std::move(whitelist))
+{ }
+
+// PbiBarcodeQualityFilter
+
+inline PbiBarcodeQualityFilter::PbiBarcodeQualityFilter(const uint8_t bcQuality, const Compare::Type cmp)
+    : internal::BarcodeDataFilterBase<uint8_t, BarcodeLookupData::BC_QUALITY>(bcQuality, cmp)
+{ }
+
+// PbiBarcodeReverseFilter
+
+inline PbiBarcodeReverseFilter::PbiBarcodeReverseFilter(const int16_t bcRevId, const Compare::Type cmp)
+    : internal::BarcodeDataFilterBase<int16_t, BarcodeLookupData::BC_REVERSE>(bcRevId, cmp)
+{ }
+
+inline PbiBarcodeReverseFilter::PbiBarcodeReverseFilter(const std::vector<int16_t>& whitelist)
+    : internal::BarcodeDataFilterBase<int16_t, BarcodeLookupData::BC_REVERSE>(whitelist)
+{ }
+
+inline PbiBarcodeReverseFilter::PbiBarcodeReverseFilter(std::vector<int16_t>&& whitelist)
+    : internal::BarcodeDataFilterBase<int16_t, BarcodeLookupData::BC_REVERSE>(std::move(whitelist))
+{ }
+
+// PbiBarcodesFilter
+
+inline PbiBarcodesFilter::PbiBarcodesFilter(const std::pair<int16_t, int16_t> barcodes, const Compare::Type cmp)
+    : PbiBarcodesFilter(barcodes.first, barcodes.second, cmp)
+{ }
+
+inline PbiBarcodesFilter::PbiBarcodesFilter(const int16_t bcForward, const int16_t bcReverse, const Compare::Type cmp)
+    : compositeFilter_{ PbiFilter::Intersection({ PbiBarcodeForwardFilter{bcForward,cmp},
+                                                  PbiBarcodeReverseFilter{bcReverse,cmp}
+                                                })
+                      }
+{ }
+
+inline bool PbiBarcodesFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{ return compositeFilter_.Accepts(idx, row); }
+
+// PbiIdentityFilter
+
+inline PbiIdentityFilter::PbiIdentityFilter(const float identity,
+                                            const Compare::Type cmp)
+    : internal::FilterBase<float>(identity, cmp)
+{ }
+
+// PbiLocalContextFilter
+
+inline PbiLocalContextFilter::PbiLocalContextFilter(const LocalContextFlags& flags,
+                                                    const Compare::Type cmp)
+    : internal::BasicDataFilterBase<LocalContextFlags, BasicLookupData::CONTEXT_FLAG>(flags, cmp)
+{ }
+
+// PbiMapQualityFilter
+
+inline PbiMapQualityFilter::PbiMapQualityFilter(const uint8_t mapQual, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<uint8_t, MappedLookupData::MAP_QUALITY>(mapQual, cmp)
+{ }
+
+// PbiMovieNameFilter
+
+inline bool PbiMovieNameFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{ return compositeFilter_.Accepts(idx, row); }
+
+// PbiNumDeletedBasesFilter
+
+inline PbiNumDeletedBasesFilter::PbiNumDeletedBasesFilter(const size_t numDeletions, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<size_t, MappedLookupData::N_DEL>(numDeletions, cmp)
+{ }
+
+// PbiNumInsertedBasesFilter
+
+inline PbiNumInsertedBasesFilter::PbiNumInsertedBasesFilter(const size_t numInsertions, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<size_t, MappedLookupData::N_INS>(numInsertions, cmp)
+{ }
+
+// PbiNumMatchesFilter
+
+inline PbiNumMatchesFilter::PbiNumMatchesFilter(const size_t numMatchedBases, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<size_t, MappedLookupData::N_M>(numMatchedBases, cmp)
+{ }
+
+// PbiNumMismatchesFilter
+
+inline PbiNumMismatchesFilter::PbiNumMismatchesFilter(const size_t numMismatchedBases, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<size_t, MappedLookupData::N_MM>(numMismatchedBases, cmp)
+{ }
+
+// PbiQueryEndFilter
+
+inline PbiQueryEndFilter::PbiQueryEndFilter(const int32_t position, const Compare::Type cmp)
+    : internal::BasicDataFilterBase<int32_t, BasicLookupData::Q_END>(position, cmp)
+{ }
+
+// PbiQueryLengthFilter
+
+inline PbiQueryLengthFilter::PbiQueryLengthFilter(const int32_t length, const Compare::Type cmp)
+    : internal::FilterBase<int32_t>(length, cmp)
+{ }
+
+// PbiQueryStartFilter
+
+inline PbiQueryStartFilter::PbiQueryStartFilter(const int32_t position, const Compare::Type cmp)
+    : internal::BasicDataFilterBase<int32_t, BasicLookupData::Q_START>(position, cmp)
+{ }
+
+// PbiReadAccuracyFilter
+
+inline PbiReadAccuracyFilter::PbiReadAccuracyFilter(const Accuracy accuracy, const Compare::Type cmp)
+    : internal::BasicDataFilterBase<Accuracy, BasicLookupData::READ_QUALITY>(accuracy, cmp)
+{ }
+
+// PbiReadGroupFilter
+
+inline PbiReadGroupFilter::PbiReadGroupFilter(const int32_t rgId, const Compare::Type cmp)
+    : internal::BasicDataFilterBase<int32_t, BasicLookupData::RG_ID>(rgId, cmp)
+{ }
+
+inline PbiReadGroupFilter::PbiReadGroupFilter(const std::string rgId, const Compare::Type cmp)
+    : PbiReadGroupFilter(ReadGroupInfo::IdToInt(rgId), cmp)
+{ }
+
+inline PbiReadGroupFilter::PbiReadGroupFilter(const ReadGroupInfo& rg, const Compare::Type cmp)
+    : PbiReadGroupFilter(rg.Id(), cmp)
+{ }
+
+inline PbiReadGroupFilter::PbiReadGroupFilter(const std::vector<int32_t>& whitelist)
+    : internal::BasicDataFilterBase<int32_t, BasicLookupData::RG_ID>(whitelist)
+{ }
+
+inline PbiReadGroupFilter::PbiReadGroupFilter(std::vector<int32_t>&& whitelist)
+    : internal::BasicDataFilterBase<int32_t, BasicLookupData::RG_ID>(std::move(whitelist))
+{ }
+
+inline PbiReadGroupFilter::PbiReadGroupFilter(const std::vector<std::string>& whitelist)
+    : internal::BasicDataFilterBase<int32_t, BasicLookupData::RG_ID>(std::vector<int32_t>())
+{
+    multiValue_->reserve(whitelist.size());
+    for (const auto& rg : whitelist)
+        multiValue_->push_back(ReadGroupInfo::IdToInt(rg));
+}
+
+inline PbiReadGroupFilter::PbiReadGroupFilter(std::vector<std::string>&& whitelist)
+    : internal::BasicDataFilterBase<int32_t, BasicLookupData::RG_ID>(std::vector<int32_t>())
+{
+    multiValue_->reserve(whitelist.size());
+    for (auto&& rg : whitelist)
+        multiValue_->push_back(ReadGroupInfo::IdToInt(rg));
+}
+
+inline PbiReadGroupFilter::PbiReadGroupFilter(const std::vector<ReadGroupInfo>& whitelist)
+    : internal::BasicDataFilterBase<int32_t, BasicLookupData::RG_ID>(std::vector<int32_t>())
+{
+    multiValue_->reserve(whitelist.size());
+    for (const auto& rg : whitelist)
+        multiValue_->push_back(ReadGroupInfo::IdToInt(rg.Id()));
+}
+
+inline PbiReadGroupFilter::PbiReadGroupFilter(std::vector<ReadGroupInfo>&& whitelist)
+    : internal::BasicDataFilterBase<int32_t, BasicLookupData::RG_ID>(std::vector<int32_t>())
+{
+    multiValue_->reserve(whitelist.size());
+    for (auto&& rg : whitelist)
+        multiValue_->push_back(ReadGroupInfo::IdToInt(rg.Id()));
+}
+
+// PbiReferenceEndFilter
+
+inline PbiReferenceEndFilter::PbiReferenceEndFilter(const uint32_t tEnd, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<uint32_t, MappedLookupData::T_END>(tEnd, cmp)
+{ }
+
+// PbiReferenceIdFilter
+
+inline PbiReferenceIdFilter::PbiReferenceIdFilter(const int32_t tId, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<int32_t, MappedLookupData::T_ID>(tId, cmp)
+{ }
+
+inline PbiReferenceIdFilter::PbiReferenceIdFilter(const std::vector<int32_t>& whitelist)
+    : internal::MappedDataFilterBase<int32_t, MappedLookupData::T_ID>(whitelist)
+{ }
+
+inline PbiReferenceIdFilter::PbiReferenceIdFilter(std::vector<int32_t>&& whitelist)
+    : internal::MappedDataFilterBase<int32_t, MappedLookupData::T_ID>(std::move(whitelist))
+{ }
+
+// PbiReferenceStartFilter
+
+inline PbiReferenceStartFilter::PbiReferenceStartFilter(const uint32_t tStart, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<uint32_t, MappedLookupData::T_START>(tStart, cmp)
+{ }
+
+// PbiZmwFilter
+
+inline PbiZmwFilter::PbiZmwFilter(const int32_t zmw, const Compare::Type cmp)
+    : internal::BasicDataFilterBase<int32_t, BasicLookupData::ZMW>(zmw, cmp)
+{ }
+
+inline PbiZmwFilter::PbiZmwFilter(const std::vector<int32_t>& whitelist)
+    : internal::BasicDataFilterBase<int32_t, BasicLookupData::ZMW>(whitelist)
+{ }
+
+inline PbiZmwFilter::PbiZmwFilter(std::vector<int32_t>&& whitelist)
+    : internal::BasicDataFilterBase<int32_t, BasicLookupData::ZMW>(std::move(whitelist))
+{ }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/PbiIndex.inl b/include/pbbam/internal/PbiIndex.inl

new file mode 100644 (file)

index 0000000..ca4c4ce
--- /dev/null
+++ b/include/pbbam/internal/PbiIndex.inl
@@ -0,0 +1,165 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiIndex.inl
+/// \brief Inline implementations for the PbiIndex class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamRecord.h"
+#include "pbbam/PbiFile.h"
+#include "pbbam/PbiIndex.h"
+#include "pbbam/PbiRawData.h"
+
+#include <algorithm>
+#include <map>
+#include <memory>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include <cassert>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// --------------------------
+// Pbi Lookup Aggregate
+// --------------------------
+
+class PbiIndexPrivate
+{
+public:
+    PbiIndexPrivate(void);
+    PbiIndexPrivate(const PbiRawData& rawIndex);
+    PbiIndexPrivate(PbiRawData&& rawIndex);
+
+    std::unique_ptr<PbiIndexPrivate> DeepCopy(void) const;
+
+public:
+    bool HasSection(const PbiFile::Section flag) const;
+    void SetSection(const PbiFile::Section flag, bool ok = true);
+
+public:
+    IndexResultBlocks LookupReference(const int32_t tId) const;
+
+private:
+    IndexResultBlocks MergeBlocksWithOffsets(const IndexList& indices) const;
+
+public:
+    std::string filename_;
+    PbiFile::VersionEnum version_;
+    PbiFile::Sections sections_;
+    uint32_t numReads_;
+
+    // lookup structures
+    BasicLookupData     basicData_;
+    MappedLookupData    mappedData_;
+    ReferenceLookupData referenceData_;
+    BarcodeLookupData   barcodeData_;
+
+private:
+    // not-implemented - ensure no copy
+    PbiIndexPrivate(const PbiIndexPrivate& other);
+    PbiIndexPrivate& operator=(const PbiIndexPrivate& other);
+};
+
+inline bool PbiIndexPrivate::HasSection(const PbiFile::Section flag) const
+{ return (sections_ & flag) != 0; }
+
+inline void PbiIndexPrivate::SetSection(const PbiFile::Section flag, bool ok)
+{ if (ok) sections_ |= flag; else sections_ &= ~flag; }
+
+inline IndexResultBlocks
+PbiIndexPrivate::LookupReference(const int32_t tId) const
+{
+    if (!HasSection(PbiFile::REFERENCE))
+        return IndexResultBlocks{ };
+
+    const auto& indexRange = referenceData_.Indices(tId);
+    if (indexRange.first == nullIndex() && indexRange.second == nullIndex())
+        return IndexResultBlocks{ };
+    const auto numReads = indexRange.second - indexRange.first;
+    auto blocks = IndexResultBlocks{ IndexResultBlock(indexRange.first, numReads) };
+    basicData_.ApplyOffsets(blocks);
+    return blocks;
+}
+
+inline IndexResultBlocks
+PbiIndexPrivate::MergeBlocksWithOffsets(const IndexList& indices) const
+{
+    auto blocks = mergedIndexBlocks(indices);
+    basicData_.ApplyOffsets(blocks);
+    return blocks;
+}
+
+} // namespace internal
+
+inline PbiFile::Sections PbiIndex::FileSections(void) const
+{ return d_->sections_; }
+
+inline bool PbiIndex::HasBarcodeData(void) const
+{ return d_->HasSection(PbiFile::BARCODE); }
+
+inline bool PbiIndex::HasMappedData(void) const
+{ return d_->HasSection(PbiFile::MAPPED); }
+
+inline bool PbiIndex::HasReferenceData(void) const
+{ return d_->HasSection(PbiFile::REFERENCE); }
+
+inline bool PbiIndex::HasSection(const PbiFile::Section section) const
+{ return d_->HasSection(section); }
+
+inline uint32_t PbiIndex::NumReads(void) const
+{ return d_->numReads_; }
+
+inline PbiFile::VersionEnum PbiIndex::Version(void) const
+{ return d_->version_; }
+
+inline const BarcodeLookupData& PbiIndex::BarcodeData(void) const
+{ return d_->barcodeData_; }
+
+inline const BasicLookupData& PbiIndex::BasicData(void) const
+{ return d_->basicData_; }
+
+inline const MappedLookupData& PbiIndex::MappedData(void) const
+{ return d_->mappedData_; }
+
+inline const ReferenceLookupData& PbiIndex::ReferenceData(void) const
+{ return d_->referenceData_; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/PbiLookupData.inl b/include/pbbam/internal/PbiLookupData.inl

new file mode 100644 (file)

index 0000000..2ca38f3
--- /dev/null
+++ b/include/pbbam/internal/PbiLookupData.inl
@@ -0,0 +1,531 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiLookupData.inl
+/// \brief Inline implementations for the classes used for PBI data lookup.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiLookupData.h"
+#include "pbbam/PbiRawData.h"
+#include "pbbam/Strand.h"
+#include <algorithm>
+#include <unordered_set>
+#include <cassert>
+
+namespace PacBio {
+namespace BAM {
+
+// ----------------
+// helper methods
+// ----------------
+
+inline IndexResultBlocks mergedIndexBlocks(IndexList&& indices)
+{
+    if (indices.empty())
+        return IndexResultBlocks{ };
+
+    std::sort(indices.begin(), indices.end());
+    auto newEndIter = std::unique(indices.begin(), indices.end());
+    auto numIndices = std::distance(indices.begin(), newEndIter);
+    assert(!indices.empty());
+    auto result = IndexResultBlocks{ IndexResultBlock(indices.at(0), 1) };
+    for (auto i = 1; i < numIndices; ++i) {
+        if (indices.at(i) == indices.at(i-1)+1)
+            ++result.back().numReads_;
+        else
+            result.push_back(IndexResultBlock(indices.at(i), 1));
+    }
+    return result;
+}
+
+inline IndexResultBlocks mergedIndexBlocks(const IndexList& indices)
+{
+    auto copy = indices;
+    return mergedIndexBlocks(std::move(copy));
+}
+
+inline size_t nullIndex(void)
+{ return static_cast<size_t>(-1); }
+
+inline void pushBackIndices(IndexList& result,
+                            const IndexList& toAppend)
+{
+    result.reserve(result.size() + toAppend.size());
+    for (auto element : toAppend)
+        result.push_back(element);
+}
+
+// -----------------
+// OrderedLookup
+// -----------------
+
+template<typename T>
+inline OrderedLookup<T>::OrderedLookup(void) { }
+
+template<typename T>
+inline OrderedLookup<T>::OrderedLookup(const container_type& data)
+    : data_(data)
+{ }
+
+template<typename T>
+inline OrderedLookup<T>::OrderedLookup(container_type&& data)
+    : data_(std::move(data))
+{ }
+
+template<typename T>
+inline OrderedLookup<T>::OrderedLookup(const std::vector<T>& rawData)
+{
+    const auto numElements = rawData.size();
+    for (auto i = decltype(numElements){0}; i < numElements; ++i)
+        data_[rawData.at(i)].push_back(i);
+}
+
+template<typename T>
+inline OrderedLookup<T>::OrderedLookup(std::vector<T>&& rawData)
+{
+    const auto numElements = rawData.size();
+    for (auto i = decltype(numElements){0}; i < numElements; ++i)
+        data_[rawData.at(i)].push_back(i);
+}
+
+template<typename T>
+inline bool OrderedLookup<T>::operator==(const OrderedLookup<T>& other) const
+{ return data_ == other.data_; }
+
+template<typename T>
+inline bool OrderedLookup<T>::operator!=(const OrderedLookup<T>& other) const
+{ return !(*this == other); }
+
+template<typename T>
+inline typename OrderedLookup<T>::iterator OrderedLookup<T>::begin(void)
+{ return data_.begin(); }
+
+template<typename T>
+inline typename OrderedLookup<T>::const_iterator OrderedLookup<T>::begin(void) const
+{ return data_.cbegin(); }
+
+template<typename T>
+inline typename OrderedLookup<T>::const_iterator OrderedLookup<T>::cbegin(void) const
+{ return data_.cbegin(); }
+
+template<typename T>
+inline typename OrderedLookup<T>::iterator OrderedLookup<T>::end(void)
+{ return data_.end(); }
+
+template<typename T>
+inline typename OrderedLookup<T>::const_iterator OrderedLookup<T>::end(void) const
+{ return data_.cend(); }
+
+template<typename T>
+inline typename OrderedLookup<T>::const_iterator OrderedLookup<T>::cend(void) const
+{ return data_.cend(); }
+
+template<typename T>
+inline bool OrderedLookup<T>::empty(void) const
+{ return data_.empty(); }
+
+template<typename T>
+inline size_t OrderedLookup<T>::size(void) const
+{ return data_.size(); }
+
+template<typename T>
+inline IndexList
+OrderedLookup<T>::LookupInclusiveRange(const const_iterator &begin,
+                                       const const_iterator &end) const
+{
+    auto result = IndexList{ };
+    for (auto iter = begin; iter != end; ++iter)
+        pushBackIndices(result, iter->second);
+    std::sort(result.begin(), result.end());
+    return result;
+}
+
+template<typename T>
+inline IndexList
+OrderedLookup<T>::LookupExclusiveRange(const const_iterator& begin,
+                                       const const_iterator& end,
+                                       const key_type& key) const
+{
+    auto result = IndexList{ };
+    for (auto iter = begin; iter != end; ++iter) {
+        if (iter->first != key)
+            pushBackIndices(result, iter->second);
+    }
+    std::sort(result.begin(), result.end());
+    return result;
+}
+
+template<typename T>
+inline IndexList
+OrderedLookup<T>::LookupIndices(const OrderedLookup::key_type& key,
+                                const Compare::Type& compare) const
+{
+    auto begin = data_.cbegin();
+    auto end   = data_.cend();
+    switch(compare)
+    {
+        case Compare::EQUAL:
+        {
+            const auto found = data_.find(key);
+            if (found != end)
+                return found->second;
+            return IndexList();
+        }
+        case Compare::LESS_THAN:          return LookupExclusiveRange(begin, data_.upper_bound(key), key);
+        case Compare::LESS_THAN_EQUAL:    return LookupInclusiveRange(begin, data_.upper_bound(key));
+        case Compare::GREATER_THAN:       return LookupExclusiveRange(data_.lower_bound(key), end, key);
+        case Compare::GREATER_THAN_EQUAL: return LookupInclusiveRange(data_.lower_bound(key), end);
+        case Compare::NOT_EQUAL:          return LookupExclusiveRange(begin, end, key);
+        default:
+            assert(false);
+    }
+    return IndexList{ };
+}
+
+template<typename T>
+inline std::vector<T> OrderedLookup<T>::Unpack(void) const
+{
+    auto result = std::vector<T>{ };
+    auto iter = cbegin();
+    const auto end = cend();
+    for ( ; iter != end; ++iter ) {
+        const auto& indices = iter->second;
+        for (auto&& i : indices) {
+            if (result.size() <= i)
+                result.resize(i+1);
+            result[i] = iter->first;
+        }
+    }
+    return result;
+}
+
+// -----------------
+// UnorderedLookup
+// -----------------
+
+template<typename T>
+inline UnorderedLookup<T>::UnorderedLookup(void) { }
+
+template<typename T>
+inline UnorderedLookup<T>::UnorderedLookup(const container_type& data)
+    : data_(data)
+{ }
+
+template<typename T>
+inline UnorderedLookup<T>::UnorderedLookup(container_type&& data)
+    : data_(std::move(data))
+{ }
+
+template<typename T>
+inline UnorderedLookup<T>::UnorderedLookup(const std::vector<T>& rawData)
+{
+    const auto numElements = rawData.size();
+    for (auto i = decltype(numElements){0}; i < numElements; ++i)
+        data_[rawData.at(i)].push_back(i);
+}
+
+template<typename T>
+inline UnorderedLookup<T>::UnorderedLookup(std::vector<T>&& rawData)
+{
+    const auto numElements = rawData.size();
+    for (auto i = decltype(numElements){0}; i < numElements; ++i)
+        data_[rawData.at(i)].push_back(i);
+}
+
+template<typename T>
+inline bool UnorderedLookup<T>::operator==(const UnorderedLookup<T>& other) const
+{ return data_ == other.data_; }
+
+template<typename T>
+inline bool UnorderedLookup<T>::operator!=(const UnorderedLookup<T>& other) const
+{ return !(*this == other); }
+
+template<typename T>
+inline typename UnorderedLookup<T>::iterator UnorderedLookup<T>::begin(void)
+{ return data_.begin(); }
+
+template<typename T>
+inline typename UnorderedLookup<T>::const_iterator UnorderedLookup<T>::begin(void) const
+{ return data_.cbegin(); }
+
+template<typename T>
+inline typename UnorderedLookup<T>::const_iterator UnorderedLookup<T>::cbegin(void) const
+{ return data_.cbegin(); }
+
+template<typename T>
+inline typename UnorderedLookup<T>::iterator UnorderedLookup<T>::end(void)
+{ return data_.end(); }
+
+template<typename T>
+inline typename UnorderedLookup<T>::const_iterator UnorderedLookup<T>::end(void) const
+{ return data_.cend(); }
+
+template<typename T>
+inline typename UnorderedLookup<T>::const_iterator UnorderedLookup<T>::cend(void) const
+{ return data_.cend(); }
+
+template<typename T>
+inline bool UnorderedLookup<T>::empty(void) const
+{ return data_.empty(); }
+
+template<typename T>
+inline size_t UnorderedLookup<T>::size(void) const
+{ return data_.size(); }
+
+template<typename T>
+template<typename Compare>
+inline IndexList
+UnorderedLookup<T>::LookupHelper(const UnorderedLookup::key_type& key,
+                                 const Compare& cmp) const
+{
+    auto result = IndexList{ }; // init with some avg size ??
+    const auto end = data_.cend();
+    for (auto iter = data_.cbegin(); iter != end; ++iter) {
+        const auto e = (iter->first);
+        if (cmp(e, key))
+            pushBackIndices(result, iter->second);
+    }
+    std::sort(result.begin(), result.end());
+    return result;
+}
+
+template<typename T>
+inline IndexList
+UnorderedLookup<T>::LookupIndices(const UnorderedLookup::key_type& key,
+                                  const Compare::Type& compare) const
+{
+    switch (compare) {
+        case Compare::EQUAL:
+        {
+            const auto found = data_.find(key);
+            if (found != data_.cend())
+                return found->second;
+            else
+                return IndexList();
+        }
+        case Compare::LESS_THAN:          return LookupHelper(key, std::less<key_type>());
+        case Compare::LESS_THAN_EQUAL:    return LookupHelper(key, std::less_equal<key_type>());
+        case Compare::GREATER_THAN:       return LookupHelper(key, std::greater<key_type>());
+        case Compare::GREATER_THAN_EQUAL: return LookupHelper(key, std::greater_equal<key_type>());
+        case Compare::NOT_EQUAL:          return LookupHelper(key, std::not_equal_to<key_type>());
+        default:
+            assert(false);
+    }
+    return IndexList{ };
+}
+
+template<typename T>
+inline std::vector<T> UnorderedLookup<T>::Unpack(void) const
+{
+    auto result = std::vector<T>{ };
+    auto iter = cbegin();
+    const auto end = cend();
+    for ( ; iter != end; ++iter ) {
+        const auto& indices = iter->second;
+        for (auto&& i : indices) {
+            if (result.size() <= i)
+                result.resize(i+1);
+            result[i] = iter->first;
+        }
+    }
+    return result;
+}
+
+// -------------------
+// SubreadLookupData
+// -------------------
+
+inline
+void BasicLookupData::ApplyOffsets(IndexResultBlocks& blocks) const
+{
+    for (IndexResultBlock& block : blocks)
+        block.virtualOffset_ = fileOffset_.at(block.firstIndex_);
+}
+
+template<typename T>
+inline IndexList BasicLookupData::Indices(const BasicLookupData::Field& field,
+                                            const T& value,
+                                            const Compare::Type& compareType) const
+{
+    switch(field) {
+        case BasicLookupData::RG_ID:        return rgId_.LookupIndices(value, compareType);
+        case BasicLookupData::Q_START:      return qStart_.LookupIndices(value, compareType);
+        case BasicLookupData::Q_END:        return qEnd_.LookupIndices(value, compareType);
+        case BasicLookupData::ZMW:          return holeNumber_.LookupIndices(value, compareType);
+        case BasicLookupData::READ_QUALITY: return readQual_.LookupIndices(value, compareType);
+        case BasicLookupData::CONTEXT_FLAG: return ctxtFlag_.LookupIndices(value, compareType);
+
+        case BasicLookupData::VIRTUAL_OFFSET : // fall-through, not supported this way
+        default:
+            assert(false);
+    }
+    return IndexList{ };
+}
+
+template<typename T>
+inline IndexList BasicLookupData::IndicesMulti(const BasicLookupData::Field& field,
+                                                 const std::vector<T>& values) const
+{
+    auto result = IndexList{ };
+    for (auto value : values) {
+        const auto valueIndices = Indices(field, value, Compare::EQUAL);
+        result.reserve(result.size() + valueIndices.size());
+        for (auto i : valueIndices)
+            result.push_back(i);
+    }
+    return result;
+}
+
+inline const std::vector<int64_t>& BasicLookupData::VirtualFileOffsets(void) const
+{ return fileOffset_; }
+
+// -------------------
+// MappedLookupData
+// -------------------
+
+template<typename T>
+inline IndexList MappedLookupData::Indices(const MappedLookupData::Field& field,
+                                           const T& value,
+                                           const Compare::Type& compareType) const
+{
+    switch(field) {
+        case MappedLookupData::T_ID:        return tId_.LookupIndices(value, compareType);
+        case MappedLookupData::T_START:     return tStart_.LookupIndices(value, compareType);
+        case MappedLookupData::T_END:       return tEnd_.LookupIndices(value, compareType);
+        case MappedLookupData::A_START:     return aStart_.LookupIndices(value, compareType);
+        case MappedLookupData::A_END:       return aEnd_.LookupIndices(value, compareType);
+        case MappedLookupData::N_M:         return nM_.LookupIndices(value, compareType);
+        case MappedLookupData::N_MM:        return nMM_.LookupIndices(value, compareType);
+        case MappedLookupData::N_DEL:       return nDel_.LookupIndices(value, compareType);
+        case MappedLookupData::N_INS:       return nIns_.LookupIndices(value, compareType);
+        case MappedLookupData::MAP_QUALITY: return mapQV_.LookupIndices(value, compareType);
+
+        // MappedField::STRAND has its own specialization
+
+        default:
+            assert(false);
+    }
+    return IndexList{ };
+}
+
+template<>
+inline IndexList MappedLookupData::Indices(const MappedLookupData::Field& field,
+                                           const Strand& strand,
+                                           const Compare::Type& compareType) const
+{
+    assert(field == MappedLookupData::STRAND);
+    (void)field; // quash warnings building in release mode
+
+    if (compareType == Compare::EQUAL) {
+        if (strand == Strand::FORWARD)
+            return forwardStrand_;
+        else
+            return reverseStrand_;
+    } else if (compareType == Compare::NOT_EQUAL) {
+        if (strand == Strand::FORWARD)
+            return reverseStrand_;
+        else
+            return forwardStrand_;
+    }
+
+    // only EQUAL/NOT_EQUAL supported
+    assert(false);
+    return IndexList{ };
+}
+
+template<typename T>
+inline IndexList MappedLookupData::IndicesMulti(const MappedLookupData::Field& field,
+                                                const std::vector<T>& values) const
+{
+    auto result = IndexList{ };
+    for (auto value : values) {
+        auto valueIndices = Indices(field, value, Compare::EQUAL);
+        result.reserve(result.size() + valueIndices.size());
+        for (auto i : valueIndices)
+            result.push_back(i);
+    }
+    return result;
+}
+
+
+// ---------------------
+// ReferenceLookupData
+// ---------------------
+
+inline IndexRange ReferenceLookupData::Indices(const int32_t tId) const
+{
+    auto found = references_.find(tId);
+    if (found == references_.cend())
+        return IndexRange{ nullIndex(), nullIndex() };
+    return found->second;
+}
+
+// -------------------
+// BarcodeLookupData
+// -------------------
+
+template<typename T>
+inline IndexList BarcodeLookupData::Indices(const BarcodeLookupData::Field &field,
+                                            const T& value,
+                                            const Compare::Type &compareType) const
+{
+    switch(field) {
+        case BarcodeLookupData::BC_FORWARD:      return bcForward_.LookupIndices(value, compareType);
+        case BarcodeLookupData::BC_REVERSE:     return bcReverse_.LookupIndices(value, compareType);
+        case BarcodeLookupData::BC_QUALITY:   return bcQual_.LookupIndices(value, compareType);
+        default:
+            assert(false);
+    }
+    return IndexList{ };
+}
+
+template<typename T>
+inline IndexList BarcodeLookupData::IndicesMulti(const BarcodeLookupData::Field &field,
+                                                 const std::vector<T>& values) const
+{
+    IndexList result;
+    for (auto value : values) {
+        const IndexList& valueIndices = Indices(field, value, Compare::EQUAL);
+        result.reserve(result.size() + valueIndices.size());
+        for (auto i : valueIndices)
+            result.push_back(i);
+    }
+    return result;
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/PbiRawData.inl b/include/pbbam/internal/PbiRawData.inl

new file mode 100644 (file)

index 0000000..af24376
--- /dev/null
+++ b/include/pbbam/internal/PbiRawData.inl
@@ -0,0 +1,113 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiRawData.inl
+/// \brief Inline implementations for the classes used for working with raw PBI
+///        data.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiRawData.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline const PbiRawBarcodeData& PbiRawData::BarcodeData(void) const
+{ return barcodeData_; }
+
+inline PbiRawBarcodeData& PbiRawData::BarcodeData(void)
+{ return barcodeData_; }
+
+inline const PbiRawBasicData& PbiRawData::BasicData(void) const
+{ return basicData_; }
+
+inline PbiRawBasicData& PbiRawData::BasicData(void)
+{ return basicData_; }
+
+inline std::string PbiRawData::Filename(void) const
+{ return filename_; }
+
+inline PbiFile::Sections PbiRawData::FileSections(void) const
+{ return sections_; }
+
+inline PbiRawData& PbiRawData::FileSections(PbiFile::Sections sections)
+{ sections_ = sections; return *this; }
+
+inline bool PbiRawData::HasBarcodeData(void) const
+{ return HasSection(PbiFile::BARCODE); }
+
+inline bool PbiRawData::HasMappedData(void) const
+{ return HasSection(PbiFile::MAPPED); }
+
+inline bool PbiRawData::HasReferenceData(void) const
+{ return HasSection(PbiFile::REFERENCE); }
+
+inline bool PbiRawData::HasSection(const PbiFile::Section section) const
+{ return (sections_ & section) != 0; }
+
+inline uint32_t PbiRawData::NumReads(void) const
+{ return numReads_; }
+
+inline PbiRawData& PbiRawData::NumReads(uint32_t num)
+{ numReads_ = num; return *this; }
+
+inline const PbiRawMappedData& PbiRawData::MappedData(void) const
+{ return mappedData_; }
+
+inline PbiRawMappedData& PbiRawData::MappedData(void)
+{ return mappedData_; }
+
+inline const PbiRawReferenceData& PbiRawData::ReferenceData(void) const
+{ return referenceData_; }
+
+inline PbiRawReferenceData& PbiRawData::ReferenceData(void)
+{ return referenceData_; }
+
+inline PbiFile::VersionEnum PbiRawData::Version(void) const
+{ return version_; }
+
+inline PbiRawData& PbiRawData::Version(PbiFile::VersionEnum version)
+{ version_ = version; return *this; }
+
+inline bool PbiReferenceEntry::operator==(const PbiReferenceEntry& other) const
+{
+    return tId_      == other.tId_ &&
+           beginRow_ == other.beginRow_ &&
+           endRow_   == other.endRow_;
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/ProgramInfo.inl b/include/pbbam/internal/ProgramInfo.inl

new file mode 100644 (file)

index 0000000..2f0287f
--- /dev/null
+++ b/include/pbbam/internal/ProgramInfo.inl
@@ -0,0 +1,97 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ProgramInfo.inl
+/// \brief Inline implementations for the ProgramInfo class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/ProgramInfo.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline std::string ProgramInfo::CommandLine(void) const
+{ return commandLine_; }
+
+inline ProgramInfo& ProgramInfo::CommandLine(const std::string& cmd)
+{ commandLine_ = cmd; return *this; }
+
+inline std::map<std::string, std::string> ProgramInfo::CustomTags(void) const
+{ return custom_; }
+
+inline ProgramInfo& ProgramInfo::CustomTags(const std::map<std::string,
+                                            std::string>& custom)
+{ custom_ = custom; return *this; }
+
+inline std::string ProgramInfo::Description(void) const
+{ return description_; }
+
+inline ProgramInfo& ProgramInfo::Description(const std::string& description)
+{ description_ = description; return *this; }
+
+inline std::string ProgramInfo::Id(void) const
+{ return id_; }
+
+inline ProgramInfo& ProgramInfo::Id(const std::string& id)
+{ id_ = id; return *this; }
+
+inline bool ProgramInfo::IsValid(void) const
+{ return !id_.empty(); }
+
+inline std::string ProgramInfo::Name(void) const
+{ return name_; }
+
+inline ProgramInfo& ProgramInfo::Name(const std::string& name)
+{ name_ = name; return *this; }
+
+inline std::string ProgramInfo::PreviousProgramId(void) const
+{ return previousProgramId_; }
+
+inline ProgramInfo& ProgramInfo::PreviousProgramId(const std::string& id)
+{ previousProgramId_ = id; return *this; }
+
+inline std::string ProgramInfo::ToSam(const ProgramInfo& prog)
+{ return prog.ToSam(); }
+
+inline std::string ProgramInfo::Version(void) const
+{ return version_; }
+
+inline ProgramInfo& ProgramInfo::Version(const std::string& version)
+{ version_ = version; return *this; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/QualityValue.inl b/include/pbbam/internal/QualityValue.inl

new file mode 100644 (file)

index 0000000..07db35b
--- /dev/null
+++ b/include/pbbam/internal/QualityValue.inl
@@ -0,0 +1,71 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file QualityValue.inl
+/// \brief Inline implementations for the QualityValue class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/QualityValue.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline QualityValue::QualityValue(const uint8_t value)
+    : value_(value)
+{
+    // clamp QV
+    if (value_ > QualityValue::MAX)
+        value_ = QualityValue::MAX;
+}
+
+inline QualityValue::QualityValue(const QualityValue& other)
+    : value_(other.value_)
+{ }
+
+inline QualityValue::~QualityValue(void) { }
+
+inline char QualityValue::Fastq(void) const
+{ return static_cast<char>(value_ + 33); }
+
+inline QualityValue::operator uint8_t(void) const
+{ return value_; }
+
+inline QualityValue QualityValue::FromFastq(const char c)
+{ return QualityValue(static_cast<uint8_t>(c-33)); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/QualityValues.inl b/include/pbbam/internal/QualityValues.inl

new file mode 100644 (file)

index 0000000..0eabf49
--- /dev/null
+++ b/include/pbbam/internal/QualityValues.inl
@@ -0,0 +1,148 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file QualityValues.inl
+/// \brief Inline implementations for the QualityValues class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/QualityValues.h"
+#include <algorithm>
+
+namespace PacBio {
+namespace BAM {
+
+inline QualityValues::QualityValues(void)
+    : std::vector<QualityValue>()
+{ }
+
+inline QualityValues::QualityValues(const std::string& fastqString)
+    : std::vector<QualityValue>()
+{
+    resize(fastqString.size());
+    std::transform(fastqString.cbegin(), fastqString.cend(),
+                   begin(), QualityValue::FromFastq);
+}
+
+inline QualityValues::QualityValues(const std::vector<QualityValue>& quals)
+    : std::vector<QualityValue>(quals)
+{ }
+
+inline QualityValues::QualityValues(const std::vector<uint8_t>& quals)
+    : std::vector<QualityValue>()
+{
+    resize(quals.size());
+    std::copy(quals.cbegin(), quals.cend(), begin());
+}
+
+inline QualityValues::QualityValues(const std::vector<uint8_t>::const_iterator first,
+                                    const std::vector<uint8_t>::const_iterator last)
+    : std::vector<QualityValue>(first, last)
+{ }
+
+inline QualityValues::QualityValues(const QualityValues::const_iterator first,
+                                    const QualityValues::const_iterator last)
+    : std::vector<QualityValue>()
+{
+    assign(first, last);
+}
+
+inline QualityValues::QualityValues(const QualityValues& other)
+    : std::vector<QualityValue>(other)
+{ }
+
+inline QualityValues::QualityValues(std::vector<QualityValue>&& quals)
+    : std::vector<QualityValue>(std::move(quals))
+{ }
+
+inline QualityValues::QualityValues(QualityValues&& other)
+    : std::vector<QualityValue>(std::move(other))
+{ }
+
+inline QualityValues& QualityValues::operator=(const QualityValues& other)
+{ std::vector<QualityValue>::operator=(other); return *this; }
+
+inline QualityValues& QualityValues::operator=(const std::vector<QualityValue>& quals)
+{ std::vector<QualityValue>::operator=(quals); return *this; }
+
+inline QualityValues& QualityValues::operator=(QualityValues&& other)
+{ std::vector<QualityValue>::operator=(std::move(other)); return *this; }
+
+inline QualityValues& QualityValues::operator=(std::vector<QualityValue>&& quals)
+{ std::vector<QualityValue>::operator=(std::move(quals)); return *this; }
+
+inline QualityValues::~QualityValues(void) { }
+
+inline std::vector<QualityValue>::const_iterator QualityValues::cbegin(void) const
+{ return std::vector<QualityValue>::cbegin(); }
+
+inline std::vector<QualityValue>::const_iterator QualityValues::cend(void) const
+{ return std::vector<QualityValue>::cend(); }
+
+inline std::vector<QualityValue>::const_iterator QualityValues::begin(void) const
+{ return std::vector<QualityValue>::begin(); }
+
+inline std::vector<QualityValue>::const_iterator QualityValues::end(void) const
+{ return std::vector<QualityValue>::end(); }
+
+inline std::vector<QualityValue>::iterator QualityValues::begin(void)
+{ return std::vector<QualityValue>::begin(); }
+
+inline std::vector<QualityValue>::iterator QualityValues::end(void)
+{ return std::vector<QualityValue>::end(); }
+
+inline QualityValues QualityValues::FromFastq(const std::string& fastq)
+{ return QualityValues(fastq); }
+
+inline std::string QualityValues::Fastq(void) const
+{
+    std::string result;
+    result.reserve(size());
+    auto iter = cbegin();
+    const auto end = cend();
+    for (; iter != end; ++iter)
+        result.push_back((*iter).Fastq());
+    return result;
+}
+
+inline bool QualityValues::operator==(const std::string& fastq) const
+{ return *this == QualityValues(fastq); }
+
+inline bool QualityValues::operator!=(const std::string& fastq) const
+{ return *this != QualityValues(fastq); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/QueryBase.h b/include/pbbam/internal/QueryBase.h

new file mode 100644 (file)

index 0000000..e012f86
--- /dev/null
+++ b/include/pbbam/internal/QueryBase.h
@@ -0,0 +1,138 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef QUERYBASE_H
+#define QUERYBASE_H
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/DataSet.h"
+#include <memory>
+#include <vector>
+#include <cassert>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+template<typename T>
+class QueryBase;
+
+template<typename T>
+class QueryIteratorBase
+{
+public:
+    virtual ~QueryIteratorBase(void);
+
+    bool operator==(const QueryIteratorBase<T>& other) const;
+    bool operator!=(const QueryIteratorBase<T>& other) const;
+
+protected:
+    QueryIteratorBase(void);
+    QueryIteratorBase(QueryBase<T>& query);
+
+    void ReadNext(void);
+
+protected:
+    QueryBase<T>* query_;
+    T record_;
+};
+
+template<typename T>
+class QueryIterator : public QueryIteratorBase<T>
+{
+public:
+    QueryIterator(void);
+    QueryIterator(QueryBase<T>& query);
+
+    T& operator*(void);
+    T* operator->(void);
+
+    QueryIterator<T>& operator++(void);
+    QueryIterator<T> operator++(int);
+};
+
+template<typename T>
+class QueryConstIterator : public QueryIteratorBase<T>
+{
+public:
+    QueryConstIterator(void);
+    QueryConstIterator(const QueryBase<T>& query);
+
+    const T& operator*(void) const;
+    const T* operator->(void) const;
+
+    QueryConstIterator<T>& operator++(void);
+    QueryConstIterator<T> operator++(int);
+};
+
+template<typename T>
+class QueryBase {
+
+public:
+    typedef QueryIterator<T>      iterator;
+    typedef QueryConstIterator<T> const_iterator;
+
+public:
+    virtual ~QueryBase(void);
+
+public:
+    QueryConstIterator<T> begin(void) const;
+    QueryConstIterator<T> cbegin(void) const;
+    QueryIterator<T> begin(void);
+
+    QueryConstIterator<T> end(void) const;
+    QueryConstIterator<T> cend(void) const;
+    QueryIterator<T> end(void);
+
+public:
+    virtual bool GetNext(T& r) =0;
+
+protected:
+    QueryBase(void);
+};
+
+typedef QueryBase<BamRecord>               IQuery;
+typedef QueryBase<std::vector<BamRecord> > IGroupQuery;
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#include "pbbam/internal/QueryBase.inl"
+
+#endif // QUERYBASE_H
diff --git a/include/pbbam/internal/QueryBase.inl b/include/pbbam/internal/QueryBase.inl

new file mode 100644 (file)

index 0000000..7f2376f
--- /dev/null
+++ b/include/pbbam/internal/QueryBase.inl
@@ -0,0 +1,177 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// -------------------
+// QueryIteratorBase
+// -------------------
+
+template<typename T>
+inline QueryIteratorBase<T>::QueryIteratorBase(void)
+    : query_(nullptr)
+{ }
+
+template<typename T>
+inline QueryIteratorBase<T>::QueryIteratorBase(QueryBase<T>& query)
+    : query_(&query)
+{ ReadNext(); }
+
+template<typename T> inline
+QueryIteratorBase<T>::~QueryIteratorBase(void) { }
+
+template<typename T> inline
+bool QueryIteratorBase<T>::operator==(const QueryIteratorBase<T>& other) const
+{ return query_ == other.query_; }
+
+template<typename T> inline
+bool QueryIteratorBase<T>::operator!=(const QueryIteratorBase<T>& other) const
+{ return !(*this == other); }
+
+// -------------------
+// QueryIterator
+// -------------------
+
+template<typename T> inline
+QueryIterator<T>::QueryIterator(void) : QueryIteratorBase<T>() { }
+
+template<typename T> inline
+QueryIterator<T>::QueryIterator(QueryBase<T>& query)
+    : QueryIteratorBase<T>(query)
+{ }
+
+template<typename T> inline
+T& QueryIterator<T>::operator*(void)
+{ return QueryIteratorBase<T>::record_; }
+
+template<typename T> inline
+T* QueryIterator<T>::operator->(void)
+{ return &(operator*()); }
+
+template<typename T> inline
+QueryIterator<T>& QueryIterator<T>::operator++(void)
+{ QueryIteratorBase<T>::ReadNext(); return *this; }
+
+template<typename T> inline
+QueryIterator<T> QueryIterator<T>::operator++(int)
+{
+    QueryIterator<T> result(*this);
+    ++(*this);
+    return result;
+}
+
+// --------------------
+// QueryConstIterator
+// --------------------
+
+template<typename T> inline
+QueryConstIterator<T>::QueryConstIterator(void) : QueryIteratorBase<T>() { }
+
+template<typename T> inline
+QueryConstIterator<T>::QueryConstIterator(const QueryBase<T>& query)
+    : QueryIteratorBase<T>(const_cast<QueryBase<T>&>(query))
+{ }
+
+template<typename T> inline
+const T& QueryConstIterator<T>::operator*(void) const
+{ return QueryIteratorBase<T>::record_; }
+
+template<typename T> inline
+const T* QueryConstIterator<T>::operator->(void) const
+{ return &(operator*()); }
+
+template<typename T> inline
+QueryConstIterator<T>& QueryConstIterator<T>::operator++(void)
+{ QueryIteratorBase<T>::ReadNext(); return *this; }
+
+template<typename T> inline
+QueryConstIterator<T> QueryConstIterator<T>::operator++(int)
+{
+    QueryConstIterator<T> result(*this);
+    ++(*this);
+    return result;
+}
+
+// -----------
+// QueryBase
+// -----------
+
+template<typename T> inline
+QueryBase<T>::QueryBase(void) { }
+
+template<typename T> inline
+QueryBase<T>::~QueryBase(void) { }
+
+template<typename T> inline
+QueryConstIterator<T> QueryBase<T>::begin(void) const
+{ return QueryConstIterator<T>(*this); }
+
+template<typename T> inline
+QueryIterator<T> QueryBase<T>::begin(void)
+{ return QueryIterator<T>(*this); }
+
+template<typename T> inline
+QueryConstIterator<T> QueryBase<T>::cbegin(void) const
+{ return QueryConstIterator<T>(*this); }
+
+template<typename T> inline
+QueryConstIterator<T> QueryBase<T>::cend(void) const
+{ return QueryConstIterator<T>(); }
+
+template<typename T> inline
+QueryConstIterator<T> QueryBase<T>::end(void) const
+{ return QueryConstIterator<T>(); }
+
+template<typename T> inline
+QueryIterator<T> QueryBase<T>::end(void)
+{ return QueryIterator<T>(); }
+
+template<typename T>
+inline void QueryIteratorBase<T>::ReadNext(void)
+{
+    assert(query_);
+    if (!query_->GetNext(record_))
+        query_ = nullptr;
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/ReadGroupInfo.inl b/include/pbbam/internal/ReadGroupInfo.inl

new file mode 100644 (file)

index 0000000..b8a24e0
--- /dev/null
+++ b/include/pbbam/internal/ReadGroupInfo.inl
@@ -0,0 +1,279 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ReadGroupInfo.inl
+/// \brief Inline implementations for the ReadGroupInfo class.
+//
+// Author: Derek Barnett
+
+#include <stdexcept>
+#include "pbbam/ReadGroupInfo.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline size_t ReadGroupInfo::BarcodeCount(void) const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error("barcode count requested but barcode data is missing");
+    return barcodeCount_;
+}
+
+inline ReadGroupInfo& ReadGroupInfo::BarcodeData(const std::string& barcodeFile,
+                                                 const std::string& barcodeHash,
+                                                 size_t barcodeCount,
+                                                 BarcodeModeType barcodeMode,
+                                                 BarcodeQualityType barcodeQuality)
+{
+    barcodeFile_ = barcodeFile;
+    barcodeHash_ = barcodeHash;
+    barcodeCount_ = barcodeCount;
+    barcodeMode_ = barcodeMode;
+    barcodeQuality_ = barcodeQuality;
+    hasBarcodeData_ = true;
+    return *this;
+}
+
+inline std::string ReadGroupInfo::BarcodeFile(void) const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error("barcode file requested but barcode data is missing");
+    return barcodeFile_;
+}
+
+inline std::string ReadGroupInfo::BarcodeHash(void) const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error("barcode hash requested but barcode data is missing");
+    return barcodeHash_;
+}
+
+inline BarcodeModeType ReadGroupInfo::BarcodeMode(void) const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error("barcode mode requested but barcode data is missing");
+    return barcodeMode_;
+}
+
+inline BarcodeQualityType ReadGroupInfo::BarcodeQuality(void) const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error("barcode quality requested but barcode data is missing");
+    return barcodeQuality_;
+}
+
+inline std::string ReadGroupInfo::BasecallerVersion(void) const
+{ return basecallerVersion_; }
+
+inline ReadGroupInfo& ReadGroupInfo::BasecallerVersion(const std::string& versionNumber)
+{ basecallerVersion_ = versionNumber; return *this; }
+
+inline std::string ReadGroupInfo::BaseFeatureTag(const BaseFeature& feature) const
+{
+    const auto iter = features_.find(feature);
+    if (iter == features_.end())
+        return std::string();
+    return iter->second;
+}
+
+inline ReadGroupInfo& ReadGroupInfo::BaseFeatureTag(const BaseFeature& feature,
+                                                    const std::string& tag)
+{ features_[feature] = tag; return *this; }
+
+inline std::string ReadGroupInfo::BindingKit(void) const
+{ return bindingKit_; }
+
+inline ReadGroupInfo& ReadGroupInfo::BindingKit(const std::string& kitNumber)
+{ bindingKit_ = kitNumber; return *this; }
+
+inline ReadGroupInfo& ReadGroupInfo::ClearBarcodeData(void)
+{
+    barcodeFile_.clear();
+    barcodeHash_.clear();
+    hasBarcodeData_ = false;
+    return *this;
+}
+
+inline ReadGroupInfo& ReadGroupInfo::ClearBaseFeatures(void)
+{
+    features_.clear();
+    return *this;
+}
+
+inline bool ReadGroupInfo::Control(void) const
+{ return control_; }
+
+inline ReadGroupInfo& ReadGroupInfo::Control(const bool ctrl)
+{ control_ = ctrl; return *this; }
+
+inline std::map<std::string, std::string> ReadGroupInfo::CustomTags(void) const
+{ return custom_; }
+
+inline ReadGroupInfo& ReadGroupInfo::CustomTags(const std::map<std::string, std::string>& custom)
+{ custom_ = custom; return *this; }
+
+inline std::string ReadGroupInfo::Date(void) const
+{ return date_; }
+
+inline ReadGroupInfo& ReadGroupInfo::Date(const std::string& date)
+{ date_ = date; return *this; }
+
+inline std::string ReadGroupInfo::FlowOrder(void) const
+{ return flowOrder_; }
+
+inline ReadGroupInfo& ReadGroupInfo::FlowOrder(const std::string& order)
+{ flowOrder_ = order; return *this; }
+
+inline std::string ReadGroupInfo::FrameRateHz(void) const
+{ return frameRateHz_; }
+
+inline ReadGroupInfo& ReadGroupInfo::FrameRateHz(const std::string& frameRateHz)
+{ frameRateHz_ = frameRateHz; return *this; }
+
+inline bool ReadGroupInfo::HasBarcodeData(void) const
+{ return hasBarcodeData_; }
+
+inline bool ReadGroupInfo::HasBaseFeature(const BaseFeature& feature) const
+{ return features_.find(feature) != features_.end(); }
+
+inline std::string ReadGroupInfo::Id(void) const
+{ return id_; }
+
+inline ReadGroupInfo& ReadGroupInfo::Id(const std::string& id)
+{ id_ = id; return *this; }
+
+inline ReadGroupInfo& ReadGroupInfo::Id(const std::string& movieName,
+                                        const std::string& readType)
+{ id_ = MakeReadGroupId(movieName, readType); return *this; }
+
+inline int32_t ReadGroupInfo::IdToInt(const std::string& rgId)
+{
+    const uint32_t rawid = std::stoul(rgId, nullptr, 16);
+    return static_cast<int32_t>(rawid);
+}
+
+inline FrameCodec ReadGroupInfo::IpdCodec(void) const
+{ return ipdCodec_; }
+
+inline bool ReadGroupInfo::IsValid(void) const
+{ return !id_.empty(); }
+
+inline std::string ReadGroupInfo::KeySequence(void) const
+{ return keySequence_; }
+
+inline ReadGroupInfo& ReadGroupInfo::KeySequence(const std::string& sequence)
+{ keySequence_ = sequence; return *this; }
+
+inline std::string ReadGroupInfo::Library(void) const
+{ return library_; }
+
+inline ReadGroupInfo& ReadGroupInfo::Library(const std::string& library)
+{ library_ = library; return *this; }
+
+inline std::string ReadGroupInfo::MovieName(void) const
+{ return movieName_; }
+
+inline ReadGroupInfo& ReadGroupInfo::MovieName(const std::string& movieName)
+{ movieName_ = movieName; return *this; }
+
+inline std::string ReadGroupInfo::Platform(void) const
+{ return std::string("PACBIO"); }
+
+inline PlatformModelType ReadGroupInfo::PlatformModel(void) const
+{ return platformModel_; }
+
+inline ReadGroupInfo& ReadGroupInfo::PlatformModel(const PlatformModelType& platform)
+{ platformModel_ = platform; return *this; }
+
+inline std::string ReadGroupInfo::PredictedInsertSize(void) const
+{ return predictedInsertSize_; }
+
+inline ReadGroupInfo& ReadGroupInfo::PredictedInsertSize(const std::string& size)
+{ predictedInsertSize_ = size; return *this; }
+
+inline std::string ReadGroupInfo::Programs(void) const
+{ return programs_; }
+
+inline ReadGroupInfo& ReadGroupInfo::Programs(const std::string& programs)
+{ programs_ = programs; return *this; }
+
+inline FrameCodec ReadGroupInfo::PulseWidthCodec(void) const
+{ return pulseWidthCodec_; }
+
+inline std::string ReadGroupInfo::ReadType(void) const
+{ return readType_; }
+
+inline ReadGroupInfo& ReadGroupInfo::ReadType(const std::string& type)
+{ readType_ = type; return *this; }
+
+inline ReadGroupInfo& ReadGroupInfo::RemoveBaseFeature(const BaseFeature& feature)
+{
+    auto iter = features_.find(feature);
+    if (iter != features_.end())
+        features_.erase(iter);
+    return *this;
+}
+
+inline std::string ReadGroupInfo::Sample(void) const
+{ return sample_; }
+
+inline ReadGroupInfo& ReadGroupInfo::Sample(const std::string& sample)
+{ sample_ = sample; return *this; }
+
+inline std::string ReadGroupInfo::SequencingCenter(void) const
+{ return sequencingCenter_; }
+
+inline ReadGroupInfo& ReadGroupInfo::SequencingCenter(const std::string& center)
+{ sequencingCenter_ = center; return *this; }
+
+inline std::string ReadGroupInfo::SequencingChemistry(void) const
+{
+    return SequencingChemistryFromTriple(BindingKit(),
+                                         SequencingKit(),
+                                         BasecallerVersion());
+}
+
+inline std::string ReadGroupInfo::SequencingKit(void) const
+{ return sequencingKit_; }
+
+inline ReadGroupInfo& ReadGroupInfo::SequencingKit(const std::string& kitNumber)
+{ sequencingKit_ = kitNumber; return *this; }
+
+inline std::string ReadGroupInfo::ToSam(const ReadGroupInfo& rg)
+{ return rg.ToSam(); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/SequenceInfo.inl b/include/pbbam/internal/SequenceInfo.inl

new file mode 100644 (file)

index 0000000..93b653d
--- /dev/null
+++ b/include/pbbam/internal/SequenceInfo.inl
@@ -0,0 +1,107 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file SequenceInfo.inl
+/// \brief Inline implementations for the SequenceInfo class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/SequenceInfo.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline bool SequenceInfo::operator==(const SequenceInfo& other) const
+{
+    return assemblyId_ == other.assemblyId_ &&
+           checksum_   == other.checksum_   &&
+           length_     == other.length_     &&
+           name_       == other.name_       &&
+           species_    == other.species_    &&
+           uri_        == other.uri_        &&
+           custom_     == other.custom_;
+}
+
+inline bool SequenceInfo::operator!=(const SequenceInfo& other) const
+{ return !(*this == other); }
+
+inline std::string SequenceInfo::AssemblyId(void) const
+{ return assemblyId_; }
+
+inline SequenceInfo& SequenceInfo::AssemblyId(const std::string& id)
+{ assemblyId_ = id; return *this; }
+
+inline std::string SequenceInfo::Checksum(void) const
+{ return checksum_; }
+
+inline SequenceInfo& SequenceInfo::Checksum(const std::string& checksum)
+{ checksum_ = checksum; return *this; }
+
+inline std::map<std::string, std::string> SequenceInfo::CustomTags(void) const
+{ return custom_; }
+
+inline SequenceInfo& SequenceInfo::CustomTags(const std::map<std::string, std::string>& custom)
+{ custom_ = custom; return *this; }
+
+inline std::string SequenceInfo::Length(void) const
+{ return length_; }
+
+inline SequenceInfo& SequenceInfo::Length(const std::string& length)
+{ length_ = length; return *this; }
+
+inline std::string SequenceInfo::Name(void) const
+{ return name_; }
+
+inline SequenceInfo& SequenceInfo::Name(const std::string& name)
+{ name_ = name; return *this; }
+
+inline std::string SequenceInfo::Species(void) const
+{ return species_; }
+
+inline SequenceInfo& SequenceInfo::Species(const std::string& species)
+{ species_ = species; return *this; }
+
+inline std::string SequenceInfo::ToSam(const SequenceInfo& seq)
+{ return seq.ToSam(); }
+
+inline std::string SequenceInfo::Uri(void) const
+{ return uri_; }
+
+inline SequenceInfo& SequenceInfo::Uri(const std::string& uri)
+{ uri_ = uri; return *this; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/Tag.inl b/include/pbbam/internal/Tag.inl

new file mode 100644 (file)

index 0000000..f8d4af2
--- /dev/null
+++ b/include/pbbam/internal/Tag.inl
@@ -0,0 +1,323 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Tag.inl
+/// \brief Inline implementations for the Tag class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Tag.h"
+#include <boost/numeric/conversion/cast.hpp>
+#include <iostream>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+template<typename T>
+inline bool InAsciiRange(const T& x)
+{ return (x >=33 && x <= 127); }
+
+struct AsciiConvertVisitor : public boost::static_visitor<char>
+{
+    // only valid for numeric types - maybe even more restrictive?
+    char operator() (const int8_t& x) const   { return Helper(x); }
+    char operator() (const uint8_t& x) const  { return Helper(x); }
+    char operator() (const int16_t& x) const  { return Helper(x); }
+    char operator() (const uint16_t& x) const { return Helper(x); }
+    char operator() (const int32_t& x) const  { return Helper(x); }
+    char operator() (const uint32_t& x) const { return Helper(x); }
+
+    // anything else always throws
+    template<typename T>
+    char operator()(const T&) const
+    { throw std::runtime_error("conversion not supported"); return 0; }
+
+private:
+    template<typename T>
+    char Helper(const T& x) const
+    {
+        if (!InAsciiRange(x))
+            throw std::runtime_error("not valid ASCII");
+        return static_cast<char>(x);
+    }
+};
+
+template<typename DesiredType>
+struct NumericConvertVisitor : public boost::static_visitor<DesiredType>
+{
+    // only valid for integral types
+    DesiredType operator() (const int8_t& x) const   { return boost::numeric_cast<DesiredType>(x); }
+    DesiredType operator() (const uint8_t& x) const  { return boost::numeric_cast<DesiredType>(x); }
+    DesiredType operator() (const int16_t& x) const  { return boost::numeric_cast<DesiredType>(x); }
+    DesiredType operator() (const uint16_t& x) const { return boost::numeric_cast<DesiredType>(x); }
+    DesiredType operator() (const int32_t& x) const  { return boost::numeric_cast<DesiredType>(x); }
+    DesiredType operator() (const uint32_t& x) const { return boost::numeric_cast<DesiredType>(x); }
+
+    // anything else always throws
+    template<typename T> DesiredType operator()(const T& t) const
+    {
+        const std::string from = typeid(t).name();
+        const std::string to   = typeid(DesiredType).name();
+        const std::string msg  = std::string("conversion not supported: ") + from + " -> " + to;
+        throw std::runtime_error(msg);
+        return 0;
+    }
+};
+
+typedef NumericConvertVisitor<int8_t>   ToInt8ConvertVisitor;
+typedef NumericConvertVisitor<uint8_t>  ToUInt8ConvertVisitor;
+typedef NumericConvertVisitor<int16_t>  ToInt16ConvertVisitor;
+typedef NumericConvertVisitor<uint16_t> ToUInt16ConvertVisitor;
+typedef NumericConvertVisitor<int32_t>  ToInt32ConvertVisitor;
+typedef NumericConvertVisitor<uint32_t> ToUInt32ConvertVisitor;
+
+struct IsEqualVisitor : public boost::static_visitor<bool>
+{
+    template <typename T, typename U>
+    bool operator() (const T&, const U&) const
+    {
+        // maybe allow conversions down the road?
+        // but for now, just fail if types are different
+        return false;
+    }
+
+    bool operator() (const boost::blank&, const boost::blank&) const
+    { return true; }
+
+    template <typename T>
+    bool operator() (const T& lhs, const T& rhs) const
+    { return lhs == rhs; }
+};
+
+struct TypenameVisitor : public boost::static_visitor<std::string>
+{
+    std::string operator() (const boost::blank&) const     { return "none"; }
+    std::string operator() (const int8_t&) const           { return "int8_t"; }
+    std::string operator() (const uint8_t&) const          { return "uint8_t"; }
+    std::string operator() (const int16_t&) const          { return "int16_t"; }
+    std::string operator() (const uint16_t&) const         { return "uint16_t"; }
+    std::string operator() (const int32_t&) const          { return "int32_t"; }
+    std::string operator() (const uint32_t&) const         { return "uint32_t"; }
+    std::string operator() (const float&) const            { return "float"; }
+    std::string operator() (const std::string&) const      { return "string"; }
+    std::string operator() (const std::vector<int8_t>&) const   { return "vector<int8_t>"; }
+    std::string operator() (const std::vector<uint8_t>&) const  { return "vector<uint8_t>"; }
+    std::string operator() (const std::vector<int16_t>&) const  { return "vector<int16_t>"; }
+    std::string operator() (const std::vector<uint16_t>&) const { return "vector<uint16_t>"; }
+    std::string operator() (const std::vector<int32_t>&) const  { return "vector<int32_t>"; }
+    std::string operator() (const std::vector<uint32_t>&) const { return "vector<uint32_t>"; }
+    std::string operator() (const std::vector<float>&) const    { return "vector<float>"; }
+};
+
+} // namespace internal
+
+inline bool Tag::operator== (const Tag& other) const
+{
+    return boost::apply_visitor(internal::IsEqualVisitor(), data_, other.data_) &&
+           (modifier_ == other.modifier_) ;
+}
+
+inline bool Tag::operator!= (const Tag& other) const
+{ return !(*this == other); }
+
+inline bool Tag::HasModifier(const TagModifier m) const
+{
+    // we just allow one at a time (for now at least)
+    return modifier_ == m;
+}
+
+inline bool Tag::IsNull(void) const
+{ return Type() == TagDataType::INVALID; }
+
+inline bool Tag::IsInt8(void) const
+{ return Type() == TagDataType::INT8; }
+
+inline bool Tag::IsUInt8(void) const
+{ return Type() == TagDataType::UINT8; }
+
+inline bool Tag::IsInt16(void) const
+{ return Type() == TagDataType::INT16; }
+
+inline bool Tag::IsUInt16(void) const
+{ return Type() == TagDataType::UINT16; }
+
+inline bool Tag::IsInt32(void) const
+{ return Type() == TagDataType::INT32; }
+
+inline bool Tag::IsUInt32(void) const
+{ return Type() == TagDataType::UINT32; }
+
+inline bool Tag::IsFloat(void) const
+{ return Type() == TagDataType::FLOAT; }
+
+inline bool Tag::IsString(void) const
+{ return Type() == TagDataType::STRING; }
+
+inline bool Tag::IsHexString(void) const
+{ return IsString() && modifier_ == TagModifier::HEX_STRING; }
+
+inline bool Tag::IsInt8Array(void) const
+{ return Type() == TagDataType::INT8_ARRAY; }
+
+inline bool Tag::IsUInt8Array(void) const
+{ return Type() == TagDataType::UINT8_ARRAY; }
+
+inline bool Tag::IsInt16Array(void) const
+{ return Type() == TagDataType::INT16_ARRAY; }
+
+inline bool Tag::IsUInt16Array(void) const
+{ return Type() == TagDataType::UINT16_ARRAY; }
+
+inline bool Tag::IsInt32Array(void) const
+{ return Type() == TagDataType::INT32_ARRAY; }
+
+inline bool Tag::IsUInt32Array(void) const
+{ return Type() == TagDataType::UINT32_ARRAY; }
+
+inline bool Tag::IsFloatArray(void) const
+{ return Type() == TagDataType::FLOAT_ARRAY; }
+
+inline bool Tag::IsSignedInt(void) const
+{ return IsInt8() || IsInt16() || IsInt32(); }
+
+inline bool Tag::IsUnsignedInt(void) const
+{ return IsUInt8() || IsUInt16() || IsUInt32(); }
+
+inline bool Tag::IsIntegral(void) const
+{ return IsSignedInt() || IsUnsignedInt(); }
+
+inline bool Tag::IsNumeric(void) const
+{ return IsIntegral() || IsFloat(); }
+
+inline bool Tag::IsSignedArray(void) const
+{ return IsInt8Array() || IsInt16Array() || IsInt32Array(); }
+
+inline bool Tag::IsUnsignedArray(void) const
+{ return IsUInt8Array() || IsUInt16Array() || IsUInt32Array(); }
+
+inline bool Tag::IsIntegralArray(void) const
+{ return IsSignedArray() || IsUnsignedArray(); }
+
+inline bool Tag::IsArray(void) const
+{ return IsIntegralArray() || IsFloatArray(); }
+
+inline TagModifier Tag::Modifier(void) const
+{ return modifier_; }
+
+inline Tag& Tag::Modifier(const TagModifier m)
+{ modifier_ = m; return *this; }
+
+inline char Tag::ToAscii(void) const
+{ return boost::apply_visitor(internal::AsciiConvertVisitor(), data_); }
+
+inline int8_t Tag::ToInt8(void) const
+{
+    if (IsInt8())
+        return boost::get<int8_t>(data_);
+    return boost::apply_visitor(internal::ToInt8ConvertVisitor(), data_);
+}
+
+inline uint8_t Tag::ToUInt8(void) const
+{
+    if (IsUInt8())
+        return boost::get<uint8_t>(data_);
+    return boost::apply_visitor(internal::ToUInt8ConvertVisitor(), data_);
+}
+
+inline int16_t Tag::ToInt16(void) const
+{
+    if (IsInt16())
+        return boost::get<int16_t>(data_);
+    return boost::apply_visitor(internal::ToInt16ConvertVisitor(), data_);
+}
+
+inline uint16_t Tag::ToUInt16(void) const
+{
+    if (IsUInt16())
+        return boost::get<uint16_t>(data_);
+    return boost::apply_visitor(internal::ToUInt16ConvertVisitor(), data_);
+}
+
+inline int32_t Tag::ToInt32(void) const
+{
+    if (IsInt32())
+        return boost::get<int32_t>(data_);
+    return boost::apply_visitor(internal::ToInt32ConvertVisitor(), data_);
+}
+
+inline uint32_t Tag::ToUInt32(void) const
+{
+    if (IsUInt32())
+        return boost::get<uint32_t>(data_);
+    return boost::apply_visitor(internal::ToUInt32ConvertVisitor(), data_);
+}
+
+inline float Tag::ToFloat(void) const
+{ return boost::get<float>(data_); }
+
+inline std::string Tag::ToString(void) const
+{ return boost::get<std::string>(data_); }
+
+inline std::vector<int8_t> Tag::ToInt8Array(void) const
+{ return boost::get< std::vector<int8_t> >(data_); }
+
+inline std::vector<uint8_t> Tag::ToUInt8Array(void) const
+{ return boost::get< std::vector<uint8_t> >(data_); }
+
+inline std::vector<int16_t> Tag::ToInt16Array(void) const
+{ return boost::get< std::vector<int16_t> >(data_); }
+
+inline std::vector<uint16_t> Tag::ToUInt16Array(void) const
+{ return boost::get< std::vector<uint16_t> >(data_); }
+
+inline std::vector<int32_t> Tag::ToInt32Array(void) const
+{ return boost::get< std::vector<int32_t> >(data_); }
+
+inline std::vector<uint32_t> Tag::ToUInt32Array(void) const
+{ return boost::get< std::vector<uint32_t> >(data_); }
+
+inline std::vector<float> Tag::ToFloatArray(void) const
+{ return boost::get< std::vector<float> >(data_); }
+
+inline TagDataType Tag::Type(void) const
+{ return TagDataType(data_.which()  ); }
+
+inline std::string Tag::Typename(void) const
+{ return boost::apply_visitor(internal::TypenameVisitor(), data_); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/Validator.inl b/include/pbbam/internal/Validator.inl

new file mode 100644 (file)

index 0000000..123cfad
--- /dev/null
+++ b/include/pbbam/internal/Validator.inl
@@ -0,0 +1,92 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Validator.inl
+/// \brief Inline implementations for the Validator class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Validator.h"
+#include <stdexcept>
+
+namespace PacBio {
+namespace BAM {
+
+inline bool Validator::IsValid(const BamFile& file, const bool entireFile)
+{
+    try {
+        if (entireFile)
+            ValidateEntireFile(file, 1);
+        else
+            ValidateFileMetadata(file, 1);
+        return true;
+    } catch (std::exception&) {
+        return false;
+    }
+}
+
+inline bool Validator::IsValid(const BamHeader& header)
+{
+    try {
+        Validate(header, 1);
+        return true;
+    } catch (std::exception&) {
+        return false;
+    }
+}
+
+inline bool Validator::IsValid(const BamRecord& record)
+{
+    try {
+        Validate(record, 1);
+        return true;
+    } catch (std::exception&) {
+        return false;
+    }
+}
+
+inline bool Validator::IsValid(const ReadGroupInfo& rg)
+{
+    try {
+        Validate(rg, 1);
+        return true;
+    } catch (std::exception&) {
+        return false;
+    }
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/virtual/VirtualPolymeraseBamRecord.h b/include/pbbam/virtual/VirtualPolymeraseBamRecord.h

new file mode 100644 (file)

index 0000000..ee473b7
--- /dev/null
+++ b/include/pbbam/virtual/VirtualPolymeraseBamRecord.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file VirtualPolymeraseBamRecord.h
+/// \brief Defines the VirtualPolymeraseBamRecord class.
+//
+// Author: Armin Töpfer
+
+#ifndef VIRTUALPOLYMERASEBAMRECORD_H
+#define VIRTUALPOLYMERASEBAMRECORD_H
+
+#include "pbbam/virtual/VirtualZmwBamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \deprecated Use VirtualZmwBamRecord instead.
+typedef VirtualZmwBamRecord VirtualPolymeraseBamRecord;
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // VIRTUALPOLYMERASEBAMRECORD_H
diff --git a/include/pbbam/virtual/VirtualPolymeraseCompositeReader.h b/include/pbbam/virtual/VirtualPolymeraseCompositeReader.h

new file mode 100644 (file)

index 0000000..7d37240
--- /dev/null
+++ b/include/pbbam/virtual/VirtualPolymeraseCompositeReader.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file VirtualPolymeraseCompositeReader.h
+/// \brief Defines the VirtualPolymeraseCompositeReader class.
+//
+// Author: Derek Barnett
+
+#ifndef VIRTUALPOLYMERASECOMPOSITEREADER_H
+#define VIRTUALPOLYMERASECOMPOSITEREADER_H
+
+#include "pbbam/virtual/VirtualPolymeraseBamRecord.h"
+#include "pbbam/virtual/ZmwReadStitcher.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \deprecated Use ZmwReadStitcher instead.
+typedef ZmwReadStitcher VirtualPolymeraseCompositeReader;
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // VIRTUALPOLYMERASECOMPOSITEREADER_H
diff --git a/include/pbbam/virtual/VirtualPolymeraseReader.h b/include/pbbam/virtual/VirtualPolymeraseReader.h

new file mode 100644 (file)

index 0000000..5ccfa27
--- /dev/null
+++ b/include/pbbam/virtual/VirtualPolymeraseReader.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file VirtualPolymeraseReader.h
+/// \brief Defines the VirtualPolymeraseReader class.
+//
+// Author: Armin Töpfer
+
+#ifndef VIRTUALPOLYMERASEREADER_H
+#define VIRTUALPOLYMERASEREADER_H
+
+#include "pbbam/virtual/VirtualPolymeraseBamRecord.h"
+#include "pbbam/virtual/ZmwReadStitcher.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \deprecated Use ZmwReadStitcher instead.
+typedef ZmwReadStitcher VirtualPolymeraseReader;
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // VIRTUALPOLYMERASEREADER_H
diff --git a/include/pbbam/virtual/VirtualRegion.h b/include/pbbam/virtual/VirtualRegion.h

new file mode 100644 (file)

index 0000000..facce7d
--- /dev/null
+++ b/include/pbbam/virtual/VirtualRegion.h
@@ -0,0 +1,130 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file VirtualRegion.h
+/// \brief Defines the VirtualRegion class.
+//
+// Author: Armin Töpfer
+
+#ifndef VIRTUALREGION_H
+#define VIRTUALREGION_H
+
+#include "pbbam/Config.h"
+#include "pbbam/virtual/VirtualRegionType.h"
+#include "pbbam/LocalContextFlags.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The VirtualRegion represents an annotation of a polymerase region.
+///
+struct VirtualRegion
+{
+public:
+    VirtualRegionType type;
+    int beginPos;
+    int endPos;
+    LocalContextFlags cxTag = LocalContextFlags::NO_LOCAL_CONTEXT;
+    int barcodeLeft = -1;
+    int barcodeRight = -1;
+    int score = 0;
+
+public:
+    /// \brief Creates a virtual region with basic type & position info.
+    ///
+    VirtualRegion(const VirtualRegionType type, 
+                  const int beginPos,
+                  const int endPos,
+                  const int score = 0);
+
+    /// \brief Creates a virtual region with type/position info, as well as context & barcode.
+    ///
+    VirtualRegion(const VirtualRegionType type, 
+                  const int beginPos,
+                  const int endPos, 
+                  const LocalContextFlags cxTag, 
+                  const int barcodeLeft,
+                  const int barcodeRight,
+                  const int score = 0);
+
+    VirtualRegion(void) = default;
+    VirtualRegion(const VirtualRegion&) = default;
+    VirtualRegion(VirtualRegion&&) = default;
+    VirtualRegion& operator=(const VirtualRegion&) = default; // un-"delete"-ed for SWIG
+    VirtualRegion& operator=(VirtualRegion&&) = default;
+    ~VirtualRegion(void) = default;
+
+    bool operator==(const VirtualRegion &v1) const;
+
+};
+
+inline VirtualRegion::VirtualRegion(const VirtualRegionType type,
+                                    const int beginPos,
+                                    const int endPos,
+                                    const int score)
+    : type(type)
+    , beginPos(beginPos)
+    , endPos(endPos), cxTag()
+    , score(score)
+{}
+
+inline VirtualRegion::VirtualRegion(const VirtualRegionType type,
+                                    const int beginPos,
+                                    const int endPos,
+                                    const LocalContextFlags cxTag,
+                                    const int barcodeLeft,
+                                    const int barcodeRight,
+                                    const int score)
+    : type(type)
+    , beginPos(beginPos)
+    , endPos(endPos)
+    , cxTag(cxTag)
+    , barcodeLeft(barcodeLeft)
+    , barcodeRight(barcodeRight)
+    , score(score)
+{}
+
+inline bool VirtualRegion::operator==(const VirtualRegion& v1) const
+{
+    return (v1.type == this->type &&
+            v1.beginPos == this->beginPos &&
+            v1.endPos == this->endPos);
+}
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // VIRTUALREGION_H
diff --git a/include/pbbam/virtual/VirtualRegionType.h b/include/pbbam/virtual/VirtualRegionType.h

new file mode 100644 (file)

index 0000000..d359094
--- /dev/null
+++ b/include/pbbam/virtual/VirtualRegionType.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file VirtualRegionType.h
+/// \brief Defines the VirtualRegionType enum.
+//
+// Author: Derek Barnett
+
+#ifndef REGIONTYPE_H
+#define REGIONTYPE_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the types of annotated region.
+///
+enum class VirtualRegionType // : char
+{
+    ADAPTER        = 0x41,  ///< Adapter region ('A')
+    BARCODE        = 0x42,  ///< Barcode region ('B')
+    FILTERED       = 0x46,  ///< Filtered subread ('F')
+    SUBREAD        = 0x53,  ///< Subread ('S')
+    HQREGION       = 0x48,  ///< High-quality region ('H')
+    LQREGION       = 0x4C   ///< Low-quality region ('L'), i.e. outside the HQ region
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // REGIONTYPE_H
diff --git a/include/pbbam/virtual/VirtualRegionTypeMap.h b/include/pbbam/virtual/VirtualRegionTypeMap.h

new file mode 100644 (file)

index 0000000..200f12f
--- /dev/null
+++ b/include/pbbam/virtual/VirtualRegionTypeMap.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file VirtualRegionTypeMap.h
+/// \brief Defines the VirtualRegionTypeMap class.
+//
+// Author: Derek Barnett
+
+#ifndef VIRTUALREGIONTYPEMAP_H
+#define VIRTUALREGIONTYPEMAP_H
+
+#include <map>
+
+#include "pbbam/Config.h"
+#include "pbbam/virtual/VirtualRegionType.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The VirtualRegionTypeMap class provides mapping between char codes and
+///        VirtualRegionType enum keys.
+///
+class VirtualRegionTypeMap
+{
+public:
+       static std::map<char, VirtualRegionType> ParseChar;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // VIRTUALREGIONTYPEMAP_H
diff --git a/include/pbbam/virtual/VirtualZmwBamRecord.h b/include/pbbam/virtual/VirtualZmwBamRecord.h

new file mode 100644 (file)

index 0000000..32149bd
--- /dev/null
+++ b/include/pbbam/virtual/VirtualZmwBamRecord.h
@@ -0,0 +1,122 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file VirtualZmwBamRecord.h
+/// \brief Defines the VirtualZmwBamRecord class.
+//
+// Author: Armin Töpfer
+
+#ifndef VirtualZmwBAMRECORD_H
+#define VirtualZmwBAMRECORD_H
+
+#include <vector>
+#include <sstream>
+
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/Config.h"
+#include "pbbam/virtual/VirtualRegion.h"
+#include "pbbam/virtual/VirtualRegionType.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The VirtualZmwBamRecord class represents a ZMW read stitched
+///        on-the-fly from subreads|hqregion + scraps.
+///
+class VirtualZmwBamRecord : public BamRecord
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a "virtual" ZMW %BAM record, by re-stitching its
+    ///        constituent segments.
+    ///
+    /// \param[in] unorderedSources source data (subreads, scraps, etc.)
+    /// \param[in] header           %BAM header to associate with the new record
+    ///
+    /// \throws std::runtime_error on failure to stitch virtual record
+    ///
+    VirtualZmwBamRecord(std::vector<BamRecord>&& unorderedSources,
+                        const BamHeader& header);
+
+    VirtualZmwBamRecord(void) = delete;
+    VirtualZmwBamRecord(const VirtualZmwBamRecord&) = default;
+    VirtualZmwBamRecord(VirtualZmwBamRecord&&) = default;
+    VirtualZmwBamRecord& operator=(const VirtualZmwBamRecord&) = default;
+    VirtualZmwBamRecord& operator=(VirtualZmwBamRecord&&) = default;
+    virtual ~VirtualZmwBamRecord() = default;
+
+    /// \}
+
+public:
+    /// \name Virtual Record Attributes
+    ///
+
+    /// \returns true if requested VirtualRegionType has been annotated.
+    ///
+    bool HasVirtualRegionType(const VirtualRegionType regionType) const;
+
+    /// \returns IPD frame data
+    ///
+    Frames IPDV1Frames(Orientation orientation = Orientation::NATIVE) const;
+
+    /// \brief Provides all annotations of the polymerase read as a map (type => regions)
+    ///
+    std::map<VirtualRegionType, std::vector<VirtualRegion>> VirtualRegionsMap(void) const;
+
+    /// \brief Provides annotations of the polymerase read for a given VirtualRegionType.
+    ///
+    /// \param[in] regionType  requested region type
+    /// \returns regions that match the requested type (empty vector if none found).
+    ///
+    std::vector<VirtualRegion> VirtualRegionsTable(const VirtualRegionType regionType) const;
+
+    /// \}
+
+private:
+    std::vector<BamRecord> sources_;
+    std::map<VirtualRegionType, std::vector<VirtualRegion>> virtualRegionsMap_;
+
+private:
+    void StitchSources(void);
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // VirtualZmwBAMRECORD_H
diff --git a/include/pbbam/virtual/WhitelistedZmwReadStitcher.h b/include/pbbam/virtual/WhitelistedZmwReadStitcher.h

new file mode 100644 (file)

index 0000000..d6fa13a
--- /dev/null
+++ b/include/pbbam/virtual/WhitelistedZmwReadStitcher.h
@@ -0,0 +1,142 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file WhitelistedZmwReadStitcher.h
+/// \brief Defines the  ZmwReadStitcher class.
+//
+// Author: Derek Barnett
+
+#ifndef WHITELISTEDZMWREADSTITCHER_H
+#define WHITELISTEDZMWREADSTITCHER_H
+
+#include "pbbam/Config.h"
+#include "pbbam/virtual/VirtualZmwBamRecord.h"
+#include <memory>
+#include <vector>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+class DataSet;
+class PbiFilter;
+
+/// \brief The WhitelistedZmwReadStitcher class provides an interface for
+///        re-stitching "virtual" ZMW reads from their constituent parts,
+///        limiting results to only those reads originating from a 'whitelist'
+///         of ZMW hole numbers.
+///
+/// Whitelisted ZMWs that are not present in both primary and scraps BAMs
+/// will be "pre-removed." This ensures that, given client code like this:
+///
+/// \include code/WhitelistedZmwReadStitcher.txt
+///
+/// each iteration will always provide valid data - either a valid virtual
+/// record from Next() or a non-empty vector from NextRaw().
+///
+/// \note This reader requires that both input %BAM files also have associated
+///       PBI files available for query. See BamFile::EnsurePacBioIndexExists .
+///
+class PBBAM_EXPORT WhitelistedZmwReadStitcher
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a reader that will operate on a primary %BAM file (e.g. subread data)
+    ///        and a scraps file, using a ZMW whitelist to filter the input.
+    ///
+    /// \param[in] zmwWhitelist         list of ZMWs to restrict iteration over
+    /// \param[in] primaryBamFilePath   hqregion.bam or subreads.bam file path
+    /// \param[in] scrapsBamFilePath    scraps.bam file path
+    ///
+    /// \note This reader requires that both input %BAM files also have associated PBI
+    ///       files available for query. See BamFile::EnsurePacBioIndexExists .
+    ///
+    /// \throws std::runtime_error if any files (*.bam and/or *.pbi) were not available for reading, or
+    ///         if malformed data encountered
+    ///
+    WhitelistedZmwReadStitcher(const std::vector<int32_t>& zmwWhitelist,
+                              const std::string& primaryBamFilePath,
+                              const std::string& scrapsBamFilePath);
+
+    WhitelistedZmwReadStitcher(void) = delete;
+    WhitelistedZmwReadStitcher(const WhitelistedZmwReadStitcher&) = delete;
+    WhitelistedZmwReadStitcher(WhitelistedZmwReadStitcher&&)      = delete;
+    WhitelistedZmwReadStitcher& operator=(const WhitelistedZmwReadStitcher&) = delete;
+    WhitelistedZmwReadStitcher& operator=(WhitelistedZmwReadStitcher&&)      = delete;
+    ~WhitelistedZmwReadStitcher(void);
+
+    /// \}
+
+public:
+    /// \name Stitched Record Reading
+    /// \{
+
+    /// \returns true if more ZMWs are available for reading.
+    bool HasNext(void) const;
+
+    /// \returns the re-stitched polymerase read from the next ZMW in the whitelist
+    VirtualZmwBamRecord Next(void);
+
+    /// \returns the set of reads that belong to the next ZMW in the whitelist.
+    ///          This enables stitching records in a distinct thread.
+    ///
+    std::vector<BamRecord> NextRaw(void);
+
+    /// \}
+
+public:
+    /// \name File Headers
+    /// \{
+
+    /// \returns the BamHeader associated with this reader's "primary" %BAM file
+    BamHeader PrimaryHeader(void) const;
+
+    /// \returns the BamHeader associated with this reader's "scraps" %BAM file
+    BamHeader ScrapsHeader(void) const;
+
+    /// \}
+
+private:
+    struct WhitelistedZmwReadStitcherPrivate;
+    std::unique_ptr<WhitelistedZmwReadStitcherPrivate> d_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // WHITELISTEDZMWREADSTITCHER
diff --git a/include/pbbam/virtual/ZmwReadStitcher.h b/include/pbbam/virtual/ZmwReadStitcher.h

new file mode 100644 (file)

index 0000000..a2e020a
--- /dev/null
+++ b/include/pbbam/virtual/ZmwReadStitcher.h
@@ -0,0 +1,128 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ZmwReadStitcher.h
+/// \brief Defines the ZmwReadStitcher class.
+//
+// Author: Derek Barnett
+
+#ifndef ZMWREADSTITCHER_H
+#define ZMWREADSTITCHER_H
+
+#include "pbbam/BamRecord.h"
+#include "pbbam/Config.h"
+#include "pbbam/virtual/VirtualZmwBamRecord.h"
+#include <memory>
+#include <vector>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+class DataSet;
+class PbiFilter;
+
+/// \brief The ZmwReadStitcher class provides an interface for re-stitching
+///        "virtual" polymerase reads from their constituent parts.
+///
+/// \note This reader requires that any input %BAM files also have associated PBI
+///       files available for query. See BamFile::EnsurePacBioIndexExists .
+///
+class PBBAM_EXPORT ZmwReadStitcher
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// entire file, from BAM names
+    ZmwReadStitcher(const std::string& primaryBamFilePath,
+                    const std::string& scrapsBamFilePath);
+
+    /// filtered input from BAM names
+    ZmwReadStitcher(const std::string& primaryBamFilePath,
+                    const std::string& scrapsBamFilePath,
+                    const PbiFilter& filter);
+
+    /// maybe filtered, from DataSet input
+    ZmwReadStitcher(const DataSet& dataset);
+
+    ZmwReadStitcher(void) = delete;
+    ZmwReadStitcher(const ZmwReadStitcher&) = delete;
+    ZmwReadStitcher(ZmwReadStitcher&&) = delete;
+    ZmwReadStitcher& operator=(const ZmwReadStitcher&) = delete;
+    ZmwReadStitcher& operator=(ZmwReadStitcher&&) = delete;
+    ~ZmwReadStitcher(void);
+
+    /// \}
+
+public:
+    /// \name File Headers
+    /// \{
+
+    /// \returns the BamHeader associated with this reader's "primary" %BAM file
+    BamHeader PrimaryHeader(void) const;
+
+    /// \returns the BamHeader associated with this reader's "scraps" %BAM file
+    BamHeader ScrapsHeader(void) const;
+
+    /// \}
+
+public:
+    /// \name Stitched Record Reading
+    ///
+
+    /// \returns true if more ZMWs are available for reading.
+    bool HasNext(void);
+
+    /// \returns the next stitched polymerase read
+    VirtualZmwBamRecord Next(void);
+
+    /// \returns the next set of reads that belong to one ZMW.
+    ///          This enables stitching records in a distinct thread.
+    ///
+    std::vector<BamRecord> NextRaw(void);
+
+    /// \}
+
+private:
+    struct ZmwReadStitcherPrivate;
+    std::unique_ptr<ZmwReadStitcherPrivate> d_;
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // ZMWREADSTITCHER_H
diff --git a/include/pbbam/virtual/ZmwWhitelistVirtualReader.h b/include/pbbam/virtual/ZmwWhitelistVirtualReader.h

new file mode 100644 (file)

index 0000000..8b99e3c
--- /dev/null
+++ b/include/pbbam/virtual/ZmwWhitelistVirtualReader.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ZmwWhitelistVirtualReader.h
+/// \brief Defines the ZmwWhitelistVirtualReader class.
+//
+// Author: Derek Barnett
+
+#ifndef ZMWWHITELISTVIRTUALREADER_H
+#define ZMWWHITELISTVIRTUALREADER_H
+
+#include "pbbam/virtual/WhitelistedZmwReadStitcher.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \deprecated Use WhitelistedZmwReadStitcher instead.
+typedef WhitelistedZmwReadStitcher ZmwWhitelistVirtualReader;
+
+} // namespace BAM
+} // namespace PacBio
+
+#endif // ZMWWHITELISTVIRTUALREADER_H
diff --git a/src/Accuracy.cpp b/src/Accuracy.cpp

new file mode 100644 (file)

index 0000000..e335abf
--- /dev/null
+++ b/src/Accuracy.cpp
@@ -0,0 +1,47 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Accuracy.cpp
+/// \brief Implements the Accuracy class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Accuracy.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+const float Accuracy::MIN = 0.0f;
+const float Accuracy::MAX = 1.0f;
diff --git a/src/AlignmentPrinter.cpp b/src/AlignmentPrinter.cpp

new file mode 100644 (file)

index 0000000..5155859
--- /dev/null
+++ b/src/AlignmentPrinter.cpp
@@ -0,0 +1,155 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file AlignmentPrinter.cpp
+/// \brief Implements the AlignmentPrinter class.
+//
+// Author: Armin Töpfer
+
+#include "pbbam/AlignmentPrinter.h"
+
+#include <cmath>
+#include <iostream>
+#include <iomanip>  
+#include <stdexcept>
+#include <sstream>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+AlignmentPrinter::AlignmentPrinter(const IndexedFastaReader& ifr)
+    : ifr_(std::unique_ptr<IndexedFastaReader>(new IndexedFastaReader(ifr)))
+{ }
+
+std::string AlignmentPrinter::Print(const BamRecord& record,
+                                    const Orientation orientation)
+{
+       std::string seq = record.Sequence(orientation, true, true);
+       std::string ref = ifr_->ReferenceSubsequence(record, orientation, true, true);
+
+       if (seq.size() != ref.size())
+        throw std::runtime_error("Sequence and reference parts are of different size");
+
+       int seqLength = 0;
+       float matches = 0;
+       std::string pretty;
+    Position refCoord = record.ReferenceStart();
+    Position seqCoord = record.QueryStart();
+
+       for (size_t i = 0; i < seq.size();)
+       {
+               std::string refCoordStr = std::to_string(refCoord);
+               std::string seqCoordStr = std::to_string(seqCoord);
+
+               size_t maxCoordLength = std::max(refCoordStr.size(), seqCoordStr.size());
+               while (refCoordStr.size() < maxCoordLength)
+                       refCoordStr = " "+refCoordStr;
+               while (seqCoordStr.size() < maxCoordLength)
+                       seqCoordStr = " "+seqCoordStr;
+
+               std::string seqWrap = seqCoordStr + " : ";
+               std::string refWrap = refCoordStr + " : ";
+               std::string prettyWrap(maxCoordLength+3, ' ');
+               prettyWrap.reserve(seq.size());
+               for (int j = 0; i < seq.size() && j < 40; ++i, ++j)
+               {
+                       refWrap +=  ref[i];
+
+                       if (seq[i] == ref[i])
+                       {
+                               ++matches;
+                               if (refCoord == 0 || refCoord % 10)
+                                       prettyWrap += '|'; 
+                               else
+                               {
+                                       prettyWrap += "\033[1m\x1b[31m";
+                                       prettyWrap += '|'; 
+                                       prettyWrap += "\033[0m\x1b[39;49m";
+                               }
+                               seqWrap += seq[i];
+                       }
+                       else if (seq[i] == '-' || ref[i] == '-')
+                       {
+                               prettyWrap +=  ' ';
+                               seqWrap += seq[i];
+                       }
+                       else
+                       {
+                               prettyWrap +=  '.';
+                               seqWrap += "\033[1m\x1b[31m";
+                               seqWrap += seq[i];
+                               seqWrap += "\033[0m\x1b[39;49m";
+                       }
+                       if (seq[i] != '-')
+                       {
+                               ++seqLength;
+                               ++seqCoord;
+                       }
+                       if (ref[i] != '-')
+                       {
+                               ++refCoord;
+                       }
+               }
+
+               refCoordStr = std::to_string(refCoord);
+               seqCoordStr = std::to_string(seqCoord);
+
+               maxCoordLength = std::max(refCoordStr.size(), seqCoordStr.size());
+               while (refCoordStr.size() < maxCoordLength)
+                       refCoordStr = " "+refCoordStr;
+               while (seqCoordStr.size() < maxCoordLength)
+                       seqCoordStr = " "+seqCoordStr;
+
+               seqWrap += " : " + seqCoordStr;
+               refWrap += " : " + refCoordStr;
+
+               pretty += refWrap + '\n' + prettyWrap + '\n' + seqWrap + "\n\n";
+       }
+       float similarity = matches/seq.size();
+
+       std::stringstream output;
+
+       output << "Read        : " << record.FullName() << std::endl;
+       output << "Reference   : " << record.ReferenceName() << std::endl;
+       output << std::endl;
+       output << "Read-length : " << seqLength << std::endl;
+       output << "Concordance : " << std::setprecision(3) << (similarity);
+       output << std::endl;
+       output << std::endl;
+       output << pretty;
+
+    return output.str();
+}
diff --git a/src/AssertUtils.cpp b/src/AssertUtils.cpp

new file mode 100644 (file)

index 0000000..42f4ea8
--- /dev/null
+++ b/src/AssertUtils.cpp
@@ -0,0 +1,90 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "AssertUtils.h"
+#include <cstdarg>
+#include <cstdio>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+static inline
+void message_out(FILE* stream,
+                 const char* format,
+                 const char* msg)
+{
+    fprintf(stream, format, msg);
+    fprintf(stream, "\n");
+    fflush(stream);
+}
+
+void printInfo(const char* msg, ...) {
+
+    va_list ap;
+    va_start(ap, msg);
+
+    char buffer[256] = {'\0' };
+    buffer[255] = '\0';
+    if (msg)
+        vsnprintf(buffer, 255, msg, ap);
+    va_end(ap);
+
+    message_out(stdout, "%s", buffer);
+}
+
+void printError(const char* msg, ...) {
+
+    va_list ap;
+    va_start(ap, msg);
+
+    char buffer[256] = {'\0' };
+    buffer[255] = '\0';
+    if (msg)
+        vsnprintf(buffer, 255, msg, ap);
+    va_end(ap);
+
+    message_out(stderr, "%s", buffer);
+}
+
+void printFailedAssert(const char* msg) {
+    printError("ASSERT FAILED: %s", msg);
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
diff --git a/src/AssertUtils.h b/src/AssertUtils.h

new file mode 100644 (file)

index 0000000..d098964
--- /dev/null
+++ b/src/AssertUtils.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef ASSERTUTILS_H
+#define ASSERTUTILS_H
+
+// ---------------------------------------------------
+// This file contains dev/debugging helper utilities
+// ---------------------------------------------------
+
+#ifndef PBBAM_UNUSED
+#  define PBBAM_UNUSED(x) (void)x;
+#endif
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+inline void pbbam_noop(void) { }
+
+// a la fprintf(...). Auto-adds a newline
+void printError(const char* msg, ...);
+void printInfo(const char* msg, ...);
+void printFailedAssert(const char* msg);
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+//
+// This assert construct below allows us to report failures as well as take some
+// fallback action (return, break, continue, etc) so as not to crash at runtime.
+// In other words, it's basically a 'weak' assert with customized information &
+// failure response.
+//
+// PB_VERIFY(cond)           if condition fails, print message
+// PB_ASSERT(cond, action)   if condition fails, print message & perform action
+// PB_ASSERT_OR_BREAK        overload of ASSERT where action is 'break'
+// PB_ASSERT_OR_CONTINUE     overload of ASSERT where action is 'continue'
+// PB_ASSERT_OR_RETURN       overload of ASSERT where action is 'return'
+// PB_ASSERT_OR_RETURN_VALUE overload of ASSERT where action is 'return <value>'
+// PB_ASSERT_UNREACHABLE     overload of ASSERT(false) where action is a no-op. Used as a visual marker for
+//                        unreachable code-paths (e.g. invalid values in a switch statement)
+//
+#define PB_ASSERT_STRINGIFY2(x) #x
+#define PB_ASSERT_STRINGIFY(x) PB_ASSERT_STRINGIFY2(x)
+#define PB_ASSERT_STRING(cond) ::PacBio::BAM::internal::printFailedAssert( \
+    "\"" cond"\" in file " __FILE__ ", line " PB_ASSERT_STRINGIFY(__LINE__))
+
+#define PB_VERIFY(cond)             if (cond) {} else { PB_ASSERT_STRING(#cond);         } do {} while (0)
+#define PB_ASSERT(cond, action)     if (cond) {} else { PB_ASSERT_STRING(#cond); action; } do {} while (0)
+#define PB_ASSERT_OR_BREAK(cond)    PB_ASSERT(cond, break)
+#define PB_ASSERT_OR_CONTINUE(cond) PB_ASSERT(cond, continue)
+#define PB_ASSERT_OR_RETURN(cond)   PB_ASSERT(cond, return)
+#define PB_ASSERT_OR_RETURN_VALUE(cond, value) PB_ASSERT(cond, return value)
+
+#define PB_ASSERT_UNREACHABLE PB_ASSERT(false, ::PacBio::BAM::internal::pbbam_noop())
+
+#endif // ASSERTUTILS_H
diff --git a/src/BaiIndexedBamReader.cpp b/src/BaiIndexedBamReader.cpp

new file mode 100644 (file)

index 0000000..3f9d538
--- /dev/null
+++ b/src/BaiIndexedBamReader.cpp
@@ -0,0 +1,141 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BaiIndexedBamReader.cpp
+/// \brief Implements the BaiIndexedBamReader class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BaiIndexedBamReader.h"
+#include "MemoryUtils.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+struct BaiIndexedBamReaderPrivate
+{
+public:
+    BaiIndexedBamReaderPrivate(const BamFile& file,
+                               const GenomicInterval& interval)
+        : htsIndex_(nullptr)
+        , htsIterator_(nullptr)
+    {
+        LoadIndex(file.Filename());
+        Interval(file.Header(), interval);
+    }
+
+    void Interval(const BamHeader& header,
+                  const GenomicInterval& interval)
+    {
+        htsIterator_.reset(nullptr);
+
+        if (header.HasSequence(interval.Name())) {
+            auto id = header.SequenceId(interval.Name());
+            if (id >= 0 && static_cast<size_t>(id) < header.NumSequences()) {
+                htsIterator_.reset(bam_itr_queryi(htsIndex_.get(),
+                                                  id,
+                                                  interval.Start(),
+                                                  interval.Stop()));
+            }
+        }
+
+        if (!htsIterator_)
+            throw std::runtime_error("could not create iterator for requested region");
+    }
+
+    void LoadIndex(const string& fn)
+    {
+        htsIndex_.reset(bam_index_load(fn.c_str()));
+        if (!htsIndex_)
+            throw std::runtime_error("could not load BAI index data");
+    }
+
+    int ReadRawData(BGZF* bgzf, bam1_t* b)
+    {
+        assert(htsIterator_.get());
+        return hts_itr_next(bgzf, htsIterator_.get(), b, nullptr);
+    }
+
+public:
+    GenomicInterval interval_;
+    std::unique_ptr<hts_idx_t, internal::HtslibIndexDeleter>    htsIndex_;
+    std::unique_ptr<hts_itr_t, internal::HtslibIteratorDeleter> htsIterator_;
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+BaiIndexedBamReader::BaiIndexedBamReader(const GenomicInterval& interval,
+                                         const std::string& filename)
+    : BaiIndexedBamReader(interval, BamFile(filename))
+{ }
+
+BaiIndexedBamReader::BaiIndexedBamReader(const GenomicInterval& interval,
+                                         const BamFile& bamFile)
+    : BamReader(bamFile)
+    , d_(new BaiIndexedBamReaderPrivate(File(), interval))
+{ }
+
+BaiIndexedBamReader::BaiIndexedBamReader(const GenomicInterval& interval,
+                                         BamFile&& bamFile)
+    : BamReader(std::move(bamFile))
+    , d_(new BaiIndexedBamReaderPrivate(File(), interval))
+{ }
+
+const GenomicInterval& BaiIndexedBamReader::Interval(void) const
+{
+    assert(d_);
+    return d_->interval_;
+}
+
+int BaiIndexedBamReader::ReadRawData(BGZF* bgzf, bam1_t* b)
+{
+    assert(d_);
+    return d_->ReadRawData(bgzf, b);
+}
+
+BaiIndexedBamReader& BaiIndexedBamReader::Interval(const GenomicInterval& interval)
+{
+    assert(d_);
+    d_->Interval(Header(), interval);
+    return *this;
+}
diff --git a/src/BamFile.cpp b/src/BamFile.cpp

new file mode 100644 (file)

index 0000000..ed942b9
--- /dev/null
+++ b/src/BamFile.cpp
@@ -0,0 +1,245 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamFile.cpp
+/// \brief Implements the BamFile class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamFile.h"
+#include "pbbam/PbiFile.h"
+#include "FileUtils.h"
+#include "MemoryUtils.h"
+#include <htslib/sam.h>
+#include <memory>
+#include <sstream>
+#include <cassert>
+#include <sys/stat.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+class BamFilePrivate
+{
+public:
+    BamFilePrivate(const string& fn)
+        : filename_(fn)
+        , firstAlignmentOffset_(-1)
+    {
+        // ensure we've updated htslib verbosity with requested verbosity here
+        hts_verbose = ( PacBio::BAM::HtslibVerbosity == -1 ? 0 : PacBio::BAM::HtslibVerbosity);
+
+        // attempt open
+        auto f = RawOpen();
+
+#if !defined (PBBAM_NO_CHECK_EOF) || PBBAM_AUTOVALIDATE
+        // sanity check on file
+        const int eofCheck = bgzf_check_EOF(f->fp.bgzf);
+        if (eofCheck <= 0 ) {
+            // 1:  EOF present & correct
+            // 2:  not seekable (e.g. reading from stdin)
+            // 0:  EOF absent
+            // -1: some other error
+            stringstream e;
+            if (eofCheck == 0)
+                e << fn << " : is missing EOF block" << endl;
+            else
+                e << fn << " : unknown error while checking EOF block" << endl;
+            throw std::runtime_error(e.str());
+        }
+#endif
+
+        // attempt fetch header
+        std::unique_ptr<bam_hdr_t, internal::HtslibHeaderDeleter> hdr(sam_hdr_read(f.get()));
+        header_ = internal::BamHeaderMemory::FromRawData(hdr.get());
+
+        // cache first alignment offset
+        firstAlignmentOffset_ = bgzf_tell(f->fp.bgzf);
+    }
+
+    unique_ptr<BamFilePrivate> DeepCopy(void)
+    {
+        return unique_ptr<BamFilePrivate>(new BamFilePrivate(filename_));
+    }
+
+    bool HasEOF(void) const
+    {
+        // streamed input is unknown, since it's not random-accessible
+        if (filename_ == "-")
+            return false;
+
+        // attempt open
+        auto f = RawOpen();
+        return RawEOFCheck(f) == 1;
+    }
+
+    int RawEOFCheck(const std::unique_ptr<samFile, internal::HtslibFileDeleter>& f) const
+    {
+        assert(f);
+        assert(f->fp.bgzf);
+        return bgzf_check_EOF(f->fp.bgzf);
+    }
+
+    std::unique_ptr<samFile, internal::HtslibFileDeleter> RawOpen(void) const
+    {
+        std::unique_ptr<samFile, internal::HtslibFileDeleter> f(sam_open(filename_.c_str(), "rb"));
+        if (!f || !f->fp.bgzf)
+            throw std::runtime_error(string("could not open BAM file: ") + filename_);
+        if (f->format.format != bam)
+            throw std::runtime_error("expected BAM, unknown format");
+        return f;
+    }
+
+public:
+    std::string filename_;
+    BamHeader header_;
+    int64_t firstAlignmentOffset_;
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+// ------------------------
+// BamFile implementation
+// ------------------------
+
+BamFile::BamFile(const std::string& filename)
+    : d_(new internal::BamFilePrivate(filename))
+{ }
+
+BamFile::BamFile(const BamFile& other)
+    : d_(other.d_->DeepCopy())
+{ }
+
+BamFile::BamFile(BamFile&& other)
+    : d_(std::move(other.d_))
+{ }
+
+BamFile& BamFile::operator=(const BamFile& other)
+{
+    d_ = other.d_->DeepCopy();
+    return *this;
+}
+
+BamFile& BamFile::operator=(BamFile&& other)
+{ d_ = std::move(other.d_); return *this; }
+
+BamFile::~BamFile(void) { }
+
+void BamFile::CreatePacBioIndex(void) const
+{
+    PbiFile::CreateFrom(*this);
+}
+
+void BamFile::CreateStandardIndex(void) const
+{
+    if (bam_index_build(d_->filename_.c_str(), 0) != 0)
+        throw std::runtime_error("could not build BAI index");
+}
+
+void BamFile::EnsurePacBioIndexExists(void) const
+{
+    if (!PacBioIndexExists())
+        CreatePacBioIndex();
+}
+
+void BamFile::EnsureStandardIndexExists(void) const
+{
+    if (!StandardIndexExists())
+        CreateStandardIndex();
+}
+
+std::string BamFile::Filename(void) const
+{ return d_->filename_; }
+
+int64_t BamFile::FirstAlignmentOffset(void) const
+{ return d_->firstAlignmentOffset_; }
+
+bool BamFile::HasEOF(void) const
+{ return d_->HasEOF(); }
+
+bool BamFile::HasReference(const std::string& name) const
+{ return d_->header_.HasSequence(name); }
+
+const BamHeader& BamFile::Header(void) const
+{ return d_->header_; }
+
+bool BamFile::IsPacBioBAM(void) const
+{ return !d_->header_.PacBioBamVersion().empty(); }
+
+bool BamFile::PacBioIndexExists(void) const
+{ return internal::FileUtils::Exists(PacBioIndexFilename()); }
+
+std::string BamFile::PacBioIndexFilename(void) const
+{ return d_->filename_ + ".pbi"; }
+
+bool BamFile::PacBioIndexIsNewer(void) const
+{
+    const auto bamTimestamp = internal::FileUtils::LastModified(Filename());
+    const auto pbiTimestamp = internal::FileUtils::LastModified(PacBioIndexFilename());
+    return bamTimestamp <= pbiTimestamp;
+}
+
+int BamFile::ReferenceId(const std::string& name) const
+{ return d_->header_.SequenceId(name); }
+
+uint32_t BamFile::ReferenceLength(const std::string& name) const
+{ return ReferenceLength(ReferenceId(name)); }
+
+uint32_t BamFile::ReferenceLength(const int id) const
+{ return std::stoul(d_->header_.SequenceLength(id)); }
+
+std::string BamFile::ReferenceName(const int id) const
+{ return d_->header_.SequenceName(id); }
+
+bool BamFile::StandardIndexExists(void) const
+{ return internal::FileUtils::Exists(StandardIndexFilename()); }
+
+std::string BamFile::StandardIndexFilename(void) const
+{ return d_->filename_ + ".bai"; }
+
+bool BamFile::StandardIndexIsNewer(void) const 
+{ 
+    const auto bamTimestamp  = internal::FileUtils::LastModified(Filename());
+    const auto baiTimestamp = internal::FileUtils::LastModified(StandardIndexFilename());
+    return bamTimestamp <= baiTimestamp;
+}
+
diff --git a/src/BamHeader.cpp b/src/BamHeader.cpp

new file mode 100644 (file)

index 0000000..b69f4a3
--- /dev/null
+++ b/src/BamHeader.cpp
@@ -0,0 +1,386 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamHeader.cpp
+/// \brief Implements the BamHeader class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamHeader.h"
+#include "StringUtils.h"
+#include "Version.h"
+#include <htslib/hts.h>
+#include <sstream>
+#include <set>
+#include <cassert>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+static const string prefix_HD = string("@HD");
+static const string prefix_SQ = string("@SQ");
+static const string prefix_RG = string("@RG");
+static const string prefix_PG = string("@PG");
+static const string prefix_CO = string("@CO");
+
+static const string token_VN = string("VN");
+static const string token_SO = string("SO");
+static const string token_pb = string("pb");
+
+static inline 
+bool CheckSortOrder(const string& lhs, const string& rhs)
+{ return lhs == rhs; } 
+
+static inline
+bool CheckPbVersion(const string& lhs, const string& rhs)
+{ 
+    return ( Version{ lhs } >= Version::Minimum && 
+             Version{ rhs } >= Version::Minimum);
+}
+
+static inline
+bool CheckSequences(const string& sortOrder, 
+                    const vector<SequenceInfo>& lhs,
+                    const vector<SequenceInfo>& rhs)
+{
+    return ( (sortOrder == "coordinate") ? lhs == rhs : true);
+} 
+
+static
+void EnsureCanMerge(const BamHeader& lhs, const BamHeader& rhs)
+{
+    // check compatibility
+    const bool sortOrderOk  = CheckSortOrder(lhs.SortOrder(), rhs.SortOrder());
+    const bool pbVersionOk  = CheckPbVersion(lhs.PacBioBamVersion(), rhs.PacBioBamVersion());
+    const bool sequencesOk  = CheckSequences(lhs.SortOrder(), lhs.Sequences(), rhs.Sequences());
+    if (sortOrderOk && pbVersionOk && sequencesOk)
+        return;
+
+    // if any checks failed, format error message & throw
+    stringstream e;
+    e << "could not merge BAM headers:" << endl;
+
+    if (!sortOrderOk) {
+        e << "  mismatched sort orders (@HD:SO) : ("
+          << lhs.SortOrder() << ", " << rhs.SortOrder()
+          << ")" << endl;
+    }
+
+    if (!pbVersionOk) {
+        e << "  incompatible PacBio BAM versions (@HD:pb) : ("
+          << lhs.PacBioBamVersion() << ", " << rhs.PacBioBamVersion()
+          << ")" << endl;
+    }
+
+    if (!sequencesOk)
+        e << "  mismatched sequence lists (@SQ entries)" << endl;
+
+    throw std::runtime_error(e.str());
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+BamHeader::BamHeader(const string& samHeaderText)
+    : d_(new internal::BamHeaderPrivate)
+{
+    istringstream s(samHeaderText);
+    string line("");
+    string firstToken;
+    while (getline(s, line)) {
+
+        // skip if line is not long enough to contain true values
+        if (line.length() < 5)
+            continue;
+
+        // determine token at beginning of line
+        firstToken = line.substr(0,3);
+
+        if (firstToken == internal::prefix_HD) {
+
+            // pop off '@HD\t', then split HD lines into tokens
+            const vector<string>& tokens = internal::Split(line.substr(4), '\t');
+            for (const string& token : tokens) {
+                const string& tokenTag   = token.substr(0,2);
+                const string& tokenValue = token.substr(3);
+
+                // set header contents
+                if      (tokenTag == internal::token_VN) Version(tokenValue);
+                else if (tokenTag == internal::token_SO) SortOrder(tokenValue);
+                else if (tokenTag == internal::token_pb) PacBioBamVersion(tokenValue);
+            }
+
+            // check for required tags
+            if (Version().empty())
+                Version(string(hts_version()));
+        }
+
+        else if (firstToken == internal::prefix_SQ)
+            AddSequence(SequenceInfo::FromSam(line));
+
+        else if (firstToken == internal::prefix_RG)
+            AddReadGroup(ReadGroupInfo::FromSam(line));
+
+        else if (firstToken == internal::prefix_PG)
+            AddProgram(ProgramInfo::FromSam(line));
+
+        else if (firstToken == internal::prefix_CO)
+            AddComment(line.substr(4));
+    }
+}
+
+BamHeader& BamHeader::operator+=(const BamHeader& other)
+{
+    internal::EnsureCanMerge(*this, other);
+
+    // merge read groups
+    for (const auto& rg : other.ReadGroups()) {
+        if (!HasReadGroup(rg.Id()))
+            AddReadGroup(rg);
+    }
+
+    // merge programs
+    for (const auto& pg : other.Programs()) {
+        if (!HasProgram(pg.Id()))
+            AddProgram(pg);
+    }
+
+    // merge comments
+    for (const auto& comment : other.Comments())
+        AddComment(comment);
+
+    return *this;
+}
+
+BamHeader& BamHeader::AddSequence(const SequenceInfo& sequence)
+{
+    d_->sequences_.push_back(sequence);
+    d_->sequenceIdLookup_[sequence.Name()] = d_->sequences_.size() - 1;
+    return *this;
+}
+
+BamHeader& BamHeader::ClearSequences(void)
+{
+    d_->sequenceIdLookup_.clear();
+    d_->sequences_.clear();
+    return *this;
+}
+
+BamHeader BamHeader::DeepCopy(void) const
+{
+    BamHeader result;
+    result.d_->version_ = d_->version_;
+    result.d_->pacbioBamVersion_ = d_->pacbioBamVersion_;
+    result.d_->sortOrder_ = d_->sortOrder_;
+    result.d_->headerLineCustom_ = d_->headerLineCustom_;
+    result.d_->readGroups_ = d_->readGroups_;
+    result.d_->programs_ = d_->programs_;
+    result.d_->comments_ = d_->comments_;
+    result.d_->sequences_ = d_->sequences_;
+    result.d_->sequenceIdLookup_ = d_->sequenceIdLookup_;
+    return result;
+}
+
+BamHeader& BamHeader::PacBioBamVersion(const std::string& version)
+{
+    d_->pacbioBamVersion_ = version;
+    const auto fileVersion = internal::Version{ version };
+    if (fileVersion < internal::Version::Minimum) {
+        auto msg  = string{ "invalid PacBio BAM version number" };
+             msg += ( "(" + fileVersion.ToString() + ")");
+             msg += string{ "is older than the minimum supported version" };
+             msg += ( "(" + internal::Version::Minimum.ToString() + ")");
+        throw std::runtime_error(msg);
+    }
+    return *this;
+}
+
+ProgramInfo BamHeader::Program(const std::string& id) const
+{
+    const auto iter = d_->programs_.find(id);
+    if (iter == d_->programs_.cend())
+        throw std::runtime_error("program ID not found");
+    return iter->second;
+}
+
+vector<string> BamHeader::ProgramIds(void) const
+{
+    vector<string> result;
+    result.reserve(d_->programs_.size());
+    const auto end = d_->programs_.cend();
+    auto iter = d_->programs_.cbegin();
+    for ( ; iter != end; ++iter )
+        result.push_back(iter->first);
+    return result;
+}
+
+vector<ProgramInfo> BamHeader::Programs(void) const
+{
+    vector<ProgramInfo> result;
+    result.reserve(d_->programs_.size());
+    const auto end = d_->programs_.cend();
+    auto iter = d_->programs_.cbegin();
+    for ( ; iter != end; ++iter )
+        result.push_back(iter->second);
+    return result;
+}
+
+BamHeader& BamHeader::Programs(const vector<ProgramInfo>& programs)
+{
+    d_->programs_.clear();
+    for (const ProgramInfo& pg : programs)
+        d_->programs_[pg.Id()] = pg;
+    return *this;
+}
+
+ReadGroupInfo BamHeader::ReadGroup(const std::string& id) const
+{
+    const auto iter = d_->readGroups_.find(id);
+    if (iter == d_->readGroups_.cend())
+        throw std::runtime_error("read group ID not found");
+    return iter->second;
+}
+
+vector<string> BamHeader::ReadGroupIds(void) const
+{
+    vector<string> result;
+    result.reserve(d_->readGroups_.size());
+    const auto end = d_->readGroups_.cend();
+    auto iter = d_->readGroups_.cbegin();
+    for ( ; iter != end; ++iter )
+        result.push_back(iter->first);
+    return result;
+}
+
+vector<ReadGroupInfo> BamHeader::ReadGroups(void) const
+{
+    vector<ReadGroupInfo> result;
+    result.reserve(d_->readGroups_.size());
+    const auto end = d_->readGroups_.cend();
+    auto iter = d_->readGroups_.cbegin();
+    for ( ; iter != end; ++iter )
+        result.push_back(iter->second);
+    return result;
+}
+
+BamHeader& BamHeader::ReadGroups(const vector<ReadGroupInfo>& readGroups)
+{
+    d_->readGroups_.clear();
+    for (const ReadGroupInfo& rg : readGroups)
+        d_->readGroups_[rg.Id()] = rg;
+    return *this;
+}
+
+SequenceInfo BamHeader::Sequence(const std::string& name) const
+{
+    // TODO: SequenceId(name) throws if not found, should we do so here as well?
+
+    const auto iter = d_->sequenceIdLookup_.find(name);
+    if (iter == d_->sequenceIdLookup_.cend())
+        return SequenceInfo();
+    const int index = iter->second;
+    assert(index >= 0 && (size_t)index < d_->sequences_.size());
+    return d_->sequences_.at(index);
+}
+
+int32_t BamHeader::SequenceId(const std::string& name) const
+{
+    const auto iter = d_->sequenceIdLookup_.find(name);
+    if (iter == d_->sequenceIdLookup_.cend())
+        throw std::runtime_error("sequence not found");
+    return iter->second;
+}
+
+vector<string> BamHeader::SequenceNames(void) const
+{
+    vector<string> result;
+    result.reserve(d_->sequences_.size());
+    const auto end = d_->sequences_.cend();
+    auto iter = d_->sequences_.cbegin();
+    for ( ; iter != end; ++iter )
+        result.push_back(iter->Name());
+    return result;
+}
+
+BamHeader& BamHeader::Sequences(const vector<SequenceInfo>& sequences)
+{
+    d_->sequences_.clear();
+    for (const SequenceInfo& seq : sequences)
+        AddSequence(seq);
+    return *this;
+}
+
+string BamHeader::ToSam(void) const
+{
+    // init stream
+    stringstream out("");
+
+    // @HD
+    const string& outputVersion   = (d_->version_.empty() ? string(hts_version()) : d_->version_);
+    const string& outputSortOrder = (d_->sortOrder_.empty() ? string("unknown") : d_->sortOrder_);
+    const string& outputPbBamVersion = (d_->pacbioBamVersion_.empty() ? internal::Version::Current.ToString()
+                                                                      : d_->pacbioBamVersion_);
+
+    out << internal::prefix_HD
+        << internal::MakeSamTag(internal::token_VN, outputVersion)
+        << internal::MakeSamTag(internal::token_SO, outputSortOrder)
+        << internal::MakeSamTag(internal::token_pb, outputPbBamVersion)
+        << endl;
+
+    // @SQ
+    for (const SequenceInfo& seq : d_->sequences_)
+        out << seq.ToSam() << endl;
+
+    // @RG
+    for (const auto& rgIter : d_->readGroups_)
+        out << rgIter.second.ToSam() << endl;
+
+    // @PG
+    for (const auto& progIter : d_->programs_)
+        out << progIter.second.ToSam() << endl;
+
+    // @CO
+    for (const string& comment : d_->comments_)
+        out << internal::prefix_CO << '\t' << comment << endl;
+
+    // return result
+    return out.str();
+}
diff --git a/src/BamReader.cpp b/src/BamReader.cpp

new file mode 100644 (file)

index 0000000..e10eeba
--- /dev/null
+++ b/src/BamReader.cpp
@@ -0,0 +1,195 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamReader.cpp
+/// \brief Implements the BamReader class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamReader.h"
+#include "pbbam/Validator.h"
+#include "MemoryUtils.h"
+#include <htslib/bgzf.h>
+#include <htslib/hfile.h>
+#include <htslib/hts.h>
+#include <cassert>
+#include <cstdio>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+struct BamReaderPrivate
+{
+public:
+    BamReaderPrivate(const BamFile& bamFile)
+        : htsFile_(nullptr)
+        , bamFile_(bamFile)
+    {
+        DoOpen();
+    }
+
+    BamReaderPrivate(BamFile&& bamFile)
+        : htsFile_(nullptr)
+        , bamFile_(std::move(bamFile))
+    {
+        DoOpen();
+    }
+
+    void DoOpen(void) {
+
+        // fetch file pointer
+        htsFile_.reset(sam_open(bamFile_.Filename().c_str(), "rb"));
+        if (!htsFile_)
+            throw std::runtime_error("could not open BAM file for reading");
+    }
+
+public:
+    std::unique_ptr<samFile, internal::HtslibFileDeleter> htsFile_;
+    BamFile bamFile_;
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+BamReader::BamReader(const string& fn)
+    : BamReader(BamFile(fn))
+{ }
+
+BamReader::BamReader(const BamFile& bamFile)
+    : d_(new internal::BamReaderPrivate(bamFile))
+{
+    // skip header
+    VirtualSeek(d_->bamFile_.FirstAlignmentOffset());
+}
+
+BamReader::BamReader(BamFile&& bamFile)
+    : d_(new internal::BamReaderPrivate(std::move(bamFile)))
+{
+    // skip header
+    VirtualSeek(d_->bamFile_.FirstAlignmentOffset());
+}
+
+BamReader::~BamReader(void) { }
+
+BGZF* BamReader::Bgzf(void) const
+{
+    assert(d_);
+    assert(d_->htsFile_);
+    assert(d_->htsFile_->fp.bgzf);
+    return d_->htsFile_->fp.bgzf;
+}
+
+const BamFile& BamReader::File(void) const
+{
+    assert(d_);
+    return d_->bamFile_;
+}
+
+std::string BamReader::Filename(void) const
+{
+    assert(d_);
+    return d_->bamFile_.Filename();
+}
+
+const BamHeader& BamReader::Header(void) const
+{
+    assert(d_);
+    return d_->bamFile_.Header();
+}
+
+bool BamReader::GetNext(BamRecord& record)
+{
+    assert(Bgzf());
+    assert(internal::BamRecordMemory::GetRawData(record).get());
+
+    auto result = ReadRawData(Bgzf(), internal::BamRecordMemory::GetRawData(record).get());
+
+    // success
+    if (result >= 0) {
+        internal::BamRecordMemory::UpdateRecordTags(record);
+        record.header_ = Header();
+        record.ResetCachedPositions();
+
+#if PBBAM_AUTOVALIDATE
+        Validator::Validate(record);
+#endif
+        return true;
+    }
+
+    // EOF or end-of-data range (not an error)
+    else if (result == -1)
+        return false;
+
+    // error corrupted file
+    else {
+        auto errorMsg = string{"corrupted BAM file: "};
+        if (result == -2)
+            errorMsg += "probably truncated";
+        else if (result == -3)
+            errorMsg += "could not read BAM record's' core data";
+        else if (result == -4)
+            errorMsg += "could not read BAM record's' variable-length data";
+        else
+            errorMsg += "unknown reason " + to_string(result);
+        errorMsg += string{" ("};
+        errorMsg += Filename();
+        errorMsg += string{")"};
+        throw std::runtime_error{errorMsg};
+    }
+}
+
+int BamReader::ReadRawData(BGZF* bgzf, bam1_t* b)
+{
+    return bam_read1(bgzf, b);
+}
+
+void BamReader::VirtualSeek(int64_t virtualOffset)
+{
+    auto result = bgzf_seek(Bgzf(), virtualOffset, SEEK_SET);
+    if (result != 0)
+        throw std::runtime_error("Failed to seek in BAM file");
+}
+
+int64_t BamReader::VirtualTell(void) const
+{
+    return bgzf_tell(Bgzf());
+}
diff --git a/src/BamRecord.cpp b/src/BamRecord.cpp

new file mode 100644 (file)

index 0000000..ae5253e
--- /dev/null
+++ b/src/BamRecord.cpp
@@ -0,0 +1,2457 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamRecord.cpp
+/// \brief Implements the BamRecord class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamRecord.h"
+#include "pbbam/virtual/VirtualRegionTypeMap.h"
+#include "pbbam/ZmwTypeMap.h"
+#include "AssertUtils.h"
+#include "BamRecordTags.h"
+#include "MemoryUtils.h"
+#include "Pulse2BaseCache.h"
+#include "SequenceUtils.h"
+#include <boost/numeric/conversion/cast.hpp>
+#include <htslib/sam.h>
+#include <iostream>
+#include <stdexcept>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// record type names
+static const string recordTypeName_ZMW        = "ZMW";
+static const string recordTypeName_Polymerase = "POLYMERASE";
+static const string recordTypeName_HqRegion   = "HQREGION";
+static const string recordTypeName_Subread    = "SUBREAD";
+static const string recordTypeName_CCS        = "CCS";
+static const string recordTypeName_Scrap      = "SCRAP";
+static const string recordTypeName_Unknown    = "UNKNOWN";
+
+static
+int32_t HoleNumberFromName(const string& fullName)
+{
+    const auto mainTokens = Split(fullName, '/');
+    if (mainTokens.size() != 3)
+        throw std::runtime_error("malformed record name");
+    return stoi(mainTokens.at(1));
+}
+
+static
+Position QueryEndFromName(const string& fullName)
+{
+    const auto mainTokens = Split(fullName, '/');
+    if (mainTokens.size() != 3)
+        throw std::runtime_error("malformed record name");
+    const auto queryTokens = Split(mainTokens.at(2), '_');
+    if (queryTokens.size() != 2)
+        throw std::runtime_error("malformed record name");
+    return stoi(queryTokens.at(1));
+}
+
+static
+Position QueryStartFromName(const string& fullName)
+{
+    const auto mainTokens = Split(fullName, '/');
+    if (mainTokens.size() != 3)
+        throw std::runtime_error("malformed record name");
+    const auto queryTokens = Split(mainTokens.at(2), '_');
+    if (queryTokens.size() != 2)
+        throw std::runtime_error("malformed record name");
+    return stoi(queryTokens.at(0));
+}
+
+static inline
+string Label(const BamRecordTag tag)
+{
+    return BamRecordTags::LabelFor(tag);
+}
+
+static
+BamRecordImpl* CreateOrEdit(const BamRecordTag tag,
+                            const Tag& value,
+                            BamRecordImpl* impl)
+{
+    if (impl->HasTag(tag))
+        impl->EditTag(tag, value);
+    else
+        impl->AddTag(tag, value);
+    return impl;
+}
+
+static
+pair<int32_t, int32_t> AlignedOffsets(const BamRecord& record,
+                                      const int seqLength)
+{
+    int32_t startOffset = 0;
+    int32_t endOffset = seqLength;
+
+    PBBAM_SHARED_PTR<bam1_t> b = internal::BamRecordMemory::GetRawData(record);
+    uint32_t* cigarData = bam_get_cigar(b.get());
+    const size_t numCigarOps = b->core.n_cigar;
+    if (numCigarOps > 0) {
+
+        // start offset
+        for (size_t i = 0; i < numCigarOps; ++i) {
+            const CigarOperationType type = static_cast<CigarOperationType>(bam_cigar_op(cigarData[i]));
+            if (type == CigarOperationType::HARD_CLIP) {
+                if (startOffset != 0 && startOffset != seqLength) {
+                    startOffset = -1;
+                    break;
+                }
+            }
+            else if (type == CigarOperationType::SOFT_CLIP)
+                startOffset += bam_cigar_oplen(cigarData[i]);
+            else
+                break;
+        }
+
+        // end offset
+        for (int i = numCigarOps-1; i >= 0; --i) {
+            const CigarOperationType type = static_cast<CigarOperationType>(bam_cigar_op(cigarData[i]));
+            if (type == CigarOperationType::HARD_CLIP) {
+                if (endOffset != 0 && endOffset != seqLength) {
+                    endOffset = -1;
+                    break;
+                }
+            }
+            else if (type == CigarOperationType::SOFT_CLIP)
+                endOffset -= bam_cigar_oplen(cigarData[i]);
+            else
+                break;
+
+        }
+
+        if (endOffset == 0)
+            endOffset = seqLength;
+    }
+    return std::make_pair(startOffset, endOffset);
+}
+
+template<typename T>
+T Clip(const T& input, const size_t pos, const size_t len)
+{
+    if (input.empty())
+        return T();
+    return T{ input.cbegin() + pos,
+              input.cbegin() + pos + len };
+}
+
+template<typename T>
+T ClipPulse(const T& input,
+            internal::Pulse2BaseCache* p2bCache,
+            const size_t pos,
+            const size_t len)
+{
+    assert(p2bCache);
+    if (input.empty())
+        return T();
+
+    // find start
+    size_t start = p2bCache->FindFirst();
+    size_t basesSeen = 0;
+    while (basesSeen < pos) {
+        start = p2bCache->FindNext(start);
+        ++basesSeen;
+    }
+
+    // find end
+    size_t end = start;
+    size_t seen = 1;
+    while (seen < len) {
+        end = p2bCache->FindNext(end);
+        ++seen;
+    }
+
+    // return clipped
+    return T{ input.cbegin() + start,
+              input.cbegin() + end + 1 };
+}
+
+template< class InputIt, class Size, class OutputIt>
+OutputIt Move_N(InputIt first, Size count, OutputIt result)
+{
+    return std::move(first, first+count, result);
+}
+
+template <typename F, typename N>
+static void ClipAndGapify(const BamRecordImpl& impl,
+                          const bool aligned,
+                          const bool exciseSoftClips,
+                          F* seq,
+                          N paddingNullValue,
+                          N deletionNullValue)
+{
+    assert(seq);
+
+    const bool clipOrGapRequested = aligned || exciseSoftClips;
+    if (impl.IsMapped() && clipOrGapRequested)
+    {
+        // determine final container length
+        auto incrementsOutputLength = [](const CigarOperationType type,
+                                         const bool aligned,
+                                         const bool exciseSoftClips)
+        {
+            if (type == CigarOperationType::HARD_CLIP ||
+                type == CigarOperationType::REFERENCE_SKIP)
+            {
+                return false;
+            }
+            else if (type == CigarOperationType::SOFT_CLIP && exciseSoftClips)
+            {
+                return false;
+            }
+            else if (!aligned && (type == CigarOperationType::DELETION ||
+                                  type == CigarOperationType::PADDING))
+            {
+                return false;
+            }
+            else
+                return true;
+        };
+
+        size_t outputLength = 0;
+        const auto cigar = impl.CigarData();
+        for (const CigarOperation& op : cigar) {
+            if (incrementsOutputLength(op.Type(), aligned, exciseSoftClips))
+                outputLength += op.Length();
+        }
+
+        // move original data to temp, prep output container size
+        F originalSeq = std::move(*seq);
+        seq->resize(outputLength);
+
+        // apply CIGAR ops
+        size_t srcIndex = 0;
+        size_t dstIndex = 0;
+        for (const CigarOperation& op : cigar) {
+            const auto opType = op.Type();
+            const auto opLength = op.Length();
+
+            // nothing to do for hard-clipped & ref-skipped positions
+            if (opType == CigarOperationType::HARD_CLIP ||
+                opType == CigarOperationType::REFERENCE_SKIP)
+            {
+                continue;
+            }
+
+            // maybe skip soft-clipped positions
+            else if (opType == CigarOperationType::SOFT_CLIP) {
+                if (exciseSoftClips)
+                    srcIndex += opLength;
+                else {
+                    Move_N(originalSeq.begin() + srcIndex,
+                           opLength,
+                           seq->begin() + dstIndex);
+                    srcIndex += opLength;
+                    dstIndex += opLength;
+                }
+            }
+
+            // maybe add deletion/padding values
+            else if (aligned && opType == CigarOperationType::DELETION) {
+                for (size_t i = 0; i < opLength; ++i)
+                    (*seq)[dstIndex++] = deletionNullValue;
+            }
+            else if (aligned && opType == CigarOperationType::PADDING) {
+                for (size_t i = 0; i < opLength; ++i)
+                    (*seq)[dstIndex++] = paddingNullValue;
+            }
+
+            // all other CIGAR ops
+            else {
+                Move_N(originalSeq.begin() + srcIndex,
+                       opLength,
+                       seq->begin() + dstIndex);
+                srcIndex += opLength;
+                dstIndex += opLength;
+            }
+        }
+    }
+}
+
+static inline
+void ClipAndGapifyBases(const BamRecordImpl& impl,
+                        const bool aligned,
+                        const bool exciseSoftClips,
+                        string* seq)
+{
+    ClipAndGapify<string, char>(impl, aligned, exciseSoftClips,
+                                seq, char('*'), char('-'));
+}
+
+static inline
+void ClipAndGapifyFrames(const BamRecordImpl& impl,
+                         const bool aligned,
+                         const bool exciseSoftClips,
+                         Frames* frames)
+{
+    assert(frames);
+    std::vector<uint16_t> data = std::move(frames->Data());
+    ClipAndGapify<std::vector<uint16_t>, uint16_t>(impl, aligned, exciseSoftClips,
+                                                   &data, 0, 0);
+    frames->Data(data);
+}
+
+static inline
+void ClipAndGapifyPhotons(const BamRecordImpl& impl,
+                          const bool aligned,
+                          const bool exciseSoftClips,
+                          std::vector<float>* data)
+{
+    ClipAndGapify<std::vector<float>, float>(impl, aligned, exciseSoftClips,
+                                             data, 0.0, 0.0);
+}
+
+static inline
+void ClipAndGapifyQualities(const BamRecordImpl& impl,
+                            const bool aligned,
+                            const bool exciseSoftClips,
+                            QualityValues* quals)
+{
+    ClipAndGapify<QualityValues, QualityValue>(impl, aligned, exciseSoftClips,
+                                               quals, QualityValue(0), QualityValue(0));
+}
+
+static inline
+void ClipAndGapifyUInts(const BamRecordImpl& impl,
+                        const bool aligned,
+                        const bool exciseSoftClips,
+                        std::vector<uint32_t>* data)
+{
+    ClipAndGapify<std::vector<uint32_t>, uint32_t>(impl, aligned, exciseSoftClips,
+                                                   data, 0, 0);
+}
+
+static
+RecordType NameToType(const string& name)
+{
+    if (name == recordTypeName_Subread)
+        return RecordType::SUBREAD;
+    if (name == recordTypeName_ZMW || name == recordTypeName_Polymerase)
+        return RecordType::ZMW;
+    if (name == recordTypeName_HqRegion)
+        return RecordType::HQREGION;
+    if (name == recordTypeName_CCS)
+        return RecordType::CCS;
+    if (name == recordTypeName_Scrap)
+        return RecordType::SCRAP;
+    return RecordType::UNKNOWN;
+}
+
+static
+void OrientBasesAsRequested(string* bases,
+                            Orientation current,
+                            Orientation requested,
+                            bool isReverseStrand,
+                            bool isPulse)
+{
+    assert(bases);
+    if (current != requested && isReverseStrand) {
+        if (isPulse)
+            internal::ReverseComplementCaseSens(*bases);
+        else
+            internal::ReverseComplement(*bases);
+    }
+}
+
+template<typename Container> inline
+void OrientTagDataAsRequested(Container* data,
+                              Orientation current,
+                              Orientation requested,
+                              bool isReverseStrand)
+{
+    assert(data);
+    if (current != requested && isReverseStrand)
+        std::reverse(data->begin(), data->end());
+}
+
+static inline
+bool ConsumesQuery(const CigarOperationType type)
+{ return (bam_cigar_type(static_cast<int>(type)) & 0x1) != 0; }
+
+static inline
+bool ConsumesReference(const CigarOperationType type)
+{ return (bam_cigar_type(static_cast<int>(type)) & 0x2) != 0; }
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+const float BamRecord::photonFactor = 10.0;
+
+BamRecord::BamRecord(void)
+    : alignedStart_(PacBio::BAM::UnmappedPosition)
+    , alignedEnd_(PacBio::BAM::UnmappedPosition)
+    , p2bCache_(nullptr)
+{ }
+
+BamRecord::BamRecord(const BamHeader& header)
+    : header_(header)
+    , alignedStart_(PacBio::BAM::UnmappedPosition)
+    , alignedEnd_(PacBio::BAM::UnmappedPosition)
+    , p2bCache_(nullptr)
+{ }
+
+BamRecord::BamRecord(const BamRecordImpl& impl)
+    : impl_(impl)
+    , alignedStart_(PacBio::BAM::UnmappedPosition)
+    , alignedEnd_(PacBio::BAM::UnmappedPosition)
+    , p2bCache_(nullptr)
+{ }
+
+BamRecord::BamRecord(BamRecordImpl&& impl)
+    : impl_(std::move(impl))
+    , alignedStart_(PacBio::BAM::UnmappedPosition)
+    , alignedEnd_(PacBio::BAM::UnmappedPosition)
+    , p2bCache_(nullptr)
+{ }
+
+BamRecord::BamRecord(const BamRecord& other)
+    : impl_(other.impl_)
+    , header_(other.header_)
+    , alignedStart_(other.alignedStart_)
+    , alignedEnd_(other.alignedEnd_)
+    , p2bCache_(nullptr) // just reset, for now at least
+{ }
+
+BamRecord::BamRecord(BamRecord&& other)
+    : impl_(std::move(other.impl_))
+    , header_(std::move(other.header_))
+    , alignedStart_(std::move(other.alignedStart_))
+    , alignedEnd_(std::move(other.alignedEnd_))
+    , p2bCache_(std::move(other.p2bCache_))
+{ }
+
+BamRecord& BamRecord::operator=(const BamRecord& other)
+{
+    impl_ = other.impl_;
+    header_ = other.header_;
+    alignedStart_ = other.alignedStart_;
+    alignedEnd_ = other.alignedEnd_;
+    p2bCache_.reset(nullptr); // just reset, for now at least
+    return *this;
+}
+
+BamRecord& BamRecord::operator=(BamRecord&& other)
+{
+    impl_ = std::move(other.impl_);
+    header_ = std::move(other.header_);
+    alignedStart_ = std::move(other.alignedStart_);
+    alignedEnd_ = std::move(other.alignedEnd_);
+    p2bCache_ = std::move(other.p2bCache_);
+    return *this;
+}
+
+BamRecord::~BamRecord(void) { }
+
+Position BamRecord::AlignedEnd(void) const
+{
+    if (alignedEnd_ == PacBio::BAM::UnmappedPosition)
+        CalculateAlignedPositions();
+    return alignedEnd_;
+}
+
+Position BamRecord::AlignedStart(void) const
+{
+    if (alignedStart_ == PacBio::BAM::UnmappedPosition)
+        CalculateAlignedPositions();
+    return alignedStart_;
+}
+
+Strand BamRecord::AlignedStrand(void) const
+{ return impl_.IsReverseStrand() ? Strand::REVERSE : Strand::FORWARD; }
+
+QualityValues BamRecord::AltLabelQV(Orientation orientation,
+                                    bool aligned,
+                                    bool exciseSoftClips,
+                                    PulseBehavior pulseBehavior) const
+{
+    return FetchQualities(BamRecordTag::ALT_LABEL_QV,
+                          orientation,
+                          aligned,
+                          exciseSoftClips,
+                          pulseBehavior);
+}
+
+BamRecord& BamRecord::AltLabelQV(const QualityValues& altLabelQVs)
+{
+    internal::CreateOrEdit(BamRecordTag::ALT_LABEL_QV,
+                           altLabelQVs.Fastq(),
+                           &impl_);
+    return *this;
+}
+
+string BamRecord::AltLabelTag(Orientation orientation,
+                                   bool aligned,
+                                   bool exciseSoftClips,
+                                   PulseBehavior pulseBehavior) const
+{
+    return FetchBases(BamRecordTag::ALT_LABEL_TAG,
+                      orientation,
+                      aligned,
+                      exciseSoftClips,
+                      pulseBehavior);
+}
+
+BamRecord& BamRecord::AltLabelTag(const string& tags)
+{
+    internal::CreateOrEdit(BamRecordTag::ALT_LABEL_TAG,
+                           tags,
+                           &impl_);
+    return *this;
+}
+
+int16_t BamRecord::BarcodeForward(void) const
+{ return Barcodes().first; }
+
+int16_t BamRecord::BarcodeReverse(void) const
+{ return Barcodes().second; }
+
+uint8_t BamRecord::BarcodeQuality(void) const
+{
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::BARCODE_QUALITY);
+    const auto bq = impl_.TagValue(tagName);
+    if (bq.IsNull())
+        return 0; // ?? "missing" value for tags ?? should we consider boost::optional<T> for these kind of guys ??
+    return bq.ToUInt8();
+}
+
+BamRecord& BamRecord::BarcodeQuality(const uint8_t quality)
+{
+    internal::CreateOrEdit(BamRecordTag::BARCODE_QUALITY,
+                           quality,
+                           &impl_);
+    return *this;
+}
+
+std::pair<int16_t,int16_t> BamRecord::Barcodes(void) const
+{
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::BARCODES);
+    const Tag& bc = impl_.TagValue(tagName);
+    if (bc.IsNull())
+        throw std::runtime_error("barcode tag (bc) was requested but is missing");
+
+    // NOTE: barcodes are still stored, per the spec, as uint16, even though
+    // we're now using them as int16_t in the API (bug 31511)
+    //
+    if (!bc.IsUInt16Array())
+        throw std::runtime_error("barcode tag (bc) is malformed: should be a uint16_t array of size==2.");
+    const auto bcArray = bc.ToUInt16Array();
+    if (bcArray.size() != 2)
+        throw std::runtime_error("barcode tag (bc) is malformed: should be a uint16_t array of size==2.");
+
+    return std::make_pair(boost::numeric_cast<int16_t>(bcArray[0]),
+                          boost::numeric_cast<int16_t>(bcArray[1]));
+}
+
+BamRecord& BamRecord::Barcodes(const std::pair<int16_t,int16_t>& barcodeIds)
+{
+    const vector<uint16_t> data =
+    {
+        boost::numeric_cast<uint16_t>(barcodeIds.first),
+        boost::numeric_cast<uint16_t>(barcodeIds.second)
+    };
+    internal::CreateOrEdit(BamRecordTag::BARCODES,
+                           data,
+                           &impl_);
+    return *this;
+}
+
+void BamRecord::CalculateAlignedPositions(void) const
+{
+    // reset
+    ResetCachedPositions();
+
+    // skip if unmapped, or has no queryStart/End
+    if (!impl_.IsMapped())
+        return;
+
+    // get the query start/end
+    const size_t seqLength = impl_.SequenceLength();
+    const RecordType type  = Type();
+    const Position qStart  = (type == RecordType::CCS) ? Position(0) : QueryStart();
+    const Position qEnd    = (type == RecordType::CCS) ? Position(seqLength) : QueryEnd();
+    
+    if (qStart == PacBio::BAM::UnmappedPosition || qEnd == PacBio::BAM::UnmappedPosition)
+        return;
+
+    // determine clipped end ranges
+    const std::pair<int32_t, int32_t> alignedOffsets = internal::AlignedOffsets(*this, seqLength);
+    const int32_t startOffset = alignedOffsets.first;
+    const int32_t endOffset = alignedOffsets.second;
+    if (endOffset == -1 || startOffset == -1)
+        return; // TODO: handle error more??
+
+    // store aligned positions (polymerase read coordinates)
+    if (impl_.IsReverseStrand()) {
+        alignedStart_ = qStart + (seqLength - endOffset);
+        alignedEnd_   = qEnd - startOffset;
+    }
+    else {
+        alignedStart_ = qStart + startOffset;
+        alignedEnd_   = qEnd - (seqLength - endOffset);
+    }
+}
+
+void BamRecord::CalculatePulse2BaseCache(void) const
+{
+    // skip already calculated
+    if (p2bCache_)
+        return;
+
+    // else try to calculate p2b cache.
+    if (!HasPulseCall())
+        throw std::runtime_error("BamRecord cannot calculate pulse2base mapping without 'pc' tag.");
+    const auto pulseCalls = FetchBases(BamRecordTag::PULSE_CALL,
+                                       Orientation::NATIVE,
+                                       false,
+                                       false,
+                                       PulseBehavior::ALL);
+    p2bCache_.reset(new internal::Pulse2BaseCache{ pulseCalls });
+}
+
+Cigar BamRecord::CigarData(bool exciseAllClips) const
+{
+    auto isClippingOp = [](const CigarOperation& op)
+    {
+        const auto type = op.Type();
+        return type == CigarOperationType::SOFT_CLIP ||
+               type == CigarOperationType::HARD_CLIP;
+    };
+
+    auto cigar = impl_.CigarData();
+    if (exciseAllClips) {
+        cigar.erase(std::remove_if(cigar.begin(),
+                                   cigar.end(),
+                                   isClippingOp),
+                    cigar.end());
+    }
+    return cigar;
+}
+
+BamRecord& BamRecord::Clip(const ClipType clipType,
+                           const Position start,
+                           const Position end)
+{
+    switch (clipType)
+    {
+        case ClipType::CLIP_NONE         : return *this;
+        case ClipType::CLIP_TO_QUERY     : return ClipToQuery(start, end);
+        case ClipType::CLIP_TO_REFERENCE : return ClipToReference(start, end);
+        default:
+            throw std::runtime_error("unsupported clip type requested");
+    }
+}
+
+void BamRecord::ClipFields(const size_t clipFrom,
+                           const size_t clipLength)
+{
+    const bool isForwardStrand = (AlignedStrand() == Strand::FORWARD);
+
+    // clip seq, quals
+    string sequence = internal::Clip(Sequence(Orientation::NATIVE), clipFrom, clipLength);
+    QualityValues qualities = internal::Clip(Qualities(Orientation::NATIVE), clipFrom, clipLength);
+    if (!isForwardStrand) {
+        internal::ReverseComplement(sequence);
+        internal::Reverse(qualities);
+    }
+    impl_.SetSequenceAndQualities(sequence, qualities.Fastq());
+
+    // update BAM tags
+    TagCollection tags = impl_.Tags();
+    if (HasDeletionQV())
+        tags[internal::Label(BamRecordTag::DELETION_QV)]      = internal::Clip(DeletionQV(Orientation::NATIVE), clipFrom, clipLength).Fastq();
+    if (HasInsertionQV())
+        tags[internal::Label(BamRecordTag::INSERTION_QV)]     = internal::Clip(InsertionQV(Orientation::NATIVE), clipFrom, clipLength).Fastq();
+    if (HasMergeQV())
+        tags[internal::Label(BamRecordTag::MERGE_QV)]         = internal::Clip(MergeQV(Orientation::NATIVE), clipFrom, clipLength).Fastq();
+    if (HasSubstitutionQV())
+        tags[internal::Label(BamRecordTag::SUBSTITUTION_QV)]  = internal::Clip(SubstitutionQV(Orientation::NATIVE), clipFrom, clipLength).Fastq();
+    if (HasIPD())
+        tags[internal::Label(BamRecordTag::IPD)]              = internal::Clip(IPD(Orientation::NATIVE).Data(), clipFrom, clipLength);
+    if (HasPulseWidth())
+        tags[internal::Label(BamRecordTag::PULSE_WIDTH)]      = internal::Clip(PulseWidth(Orientation::NATIVE).Data(), clipFrom, clipLength);
+    if (HasDeletionTag())
+        tags[internal::Label(BamRecordTag::DELETION_TAG)]     = internal::Clip(DeletionTag(Orientation::NATIVE), clipFrom, clipLength);
+    if (HasSubstitutionTag())
+        tags[internal::Label(BamRecordTag::SUBSTITUTION_TAG)] = internal::Clip(SubstitutionTag(Orientation::NATIVE), clipFrom, clipLength);
+
+    // internal BAM tags
+    if (HasPulseCall()) {
+
+        // ensure p2bCache initialized
+        CalculatePulse2BaseCache();
+        internal::Pulse2BaseCache* p2bCache = p2bCache_.get();
+
+        if (HasAltLabelQV())
+            tags[internal::Label(BamRecordTag::ALT_LABEL_QV)]     = internal::ClipPulse(AltLabelQV(Orientation::NATIVE), p2bCache, clipFrom, clipLength).Fastq();
+        if (HasLabelQV())
+            tags[internal::Label(BamRecordTag::LABEL_QV)]         = internal::ClipPulse(LabelQV(Orientation::NATIVE), p2bCache, clipFrom, clipLength).Fastq();
+        if (HasPulseMergeQV())
+            tags[internal::Label(BamRecordTag::PULSE_MERGE_QV)]   = internal::ClipPulse(PulseMergeQV(Orientation::NATIVE), p2bCache, clipFrom, clipLength).Fastq();
+        if (HasAltLabelTag())
+            tags[internal::Label(BamRecordTag::ALT_LABEL_TAG)]    = internal::ClipPulse(AltLabelTag(Orientation::NATIVE), p2bCache, clipFrom, clipLength);
+        if (HasPulseCall())
+            tags[internal::Label(BamRecordTag::PULSE_CALL)]       = internal::ClipPulse(PulseCall(Orientation::NATIVE), p2bCache, clipFrom, clipLength);
+        if (HasPkmean())
+            tags[internal::Label(BamRecordTag::PKMEAN)]           = EncodePhotons(internal::ClipPulse(Pkmean(Orientation::NATIVE), p2bCache, clipFrom, clipLength));
+        if (HasPkmid())
+            tags[internal::Label(BamRecordTag::PKMID)]            = EncodePhotons(internal::ClipPulse(Pkmid(Orientation::NATIVE), p2bCache, clipFrom, clipLength));
+        if (HasPkmean2())
+            tags[internal::Label(BamRecordTag::PKMEAN_2)]         = EncodePhotons(internal::ClipPulse(Pkmean2(Orientation::NATIVE), p2bCache, clipFrom, clipLength));
+        if (HasPkmid2())
+            tags[internal::Label(BamRecordTag::PKMID_2)]          = EncodePhotons(internal::ClipPulse(Pkmid2(Orientation::NATIVE), p2bCache, clipFrom, clipLength));
+        if (HasPrePulseFrames())
+            tags[internal::Label(BamRecordTag::PRE_PULSE_FRAMES)] = internal::ClipPulse(PrePulseFrames(Orientation::NATIVE).Data(), p2bCache, clipFrom, clipLength);
+        if (HasPulseCallWidth())
+            tags[internal::Label(BamRecordTag::PULSE_CALL_WIDTH)] = internal::ClipPulse(PulseCallWidth(Orientation::NATIVE).Data(), p2bCache, clipFrom, clipLength);
+        if (HasStartFrame())
+            tags[internal::Label(BamRecordTag::START_FRAME)]      = internal::ClipPulse(StartFrame(Orientation::NATIVE), p2bCache, clipFrom, clipLength);
+
+    }
+
+    impl_.Tags(tags);
+}
+
+BamRecord& BamRecord::ClipToQuery(const Position start,
+                                  const Position end)
+{
+    // cache original coords, skip out if clip not needed
+    const size_t seqLength = impl_.SequenceLength();
+    const RecordType type  = Type();
+    const Position origQStart = (type == RecordType::CCS) ? Position(0) : QueryStart();
+    const Position origQEnd   = (type == RecordType::CCS) ? Position(seqLength) : QueryEnd();
+    if (start <= origQStart && end >= origQEnd)
+        return *this;
+
+    // determine new offsets into data
+    const size_t startOffset = start - origQStart;
+    const size_t endOffset   = origQEnd - end;
+
+    // maybe update CIGAR & aligned position
+    if (IsMapped()) {
+
+        // fetch a 'working copy' of CIGAR data
+        Cigar cigar = impl_.CigarData();
+
+        // clip leading CIGAR ops
+        size_t referencePositionOffset = 0;
+        size_t remaining = startOffset;
+        while (remaining > 0 && !cigar.empty()) {
+            CigarOperation& firstOp = cigar.front();
+            const size_t firstOpLength = firstOp.Length();
+            const bool consumesQuery = internal::ConsumesQuery(firstOp.Type());
+            const bool consumesRef = internal::ConsumesReference(firstOp.Type());
+
+            // if (!consumesQuery)
+            //    just pop (e.g. deletion) ?
+            // else {
+            //    check bounds, like clip to reference ?
+            // }
+
+            // CIGAR op ends at or before clip
+            if (firstOpLength <= remaining) {
+                cigar.erase(cigar.begin());
+                if (consumesQuery)
+                    remaining -= firstOpLength;
+                if (consumesRef)
+                    referencePositionOffset += firstOpLength;
+            }
+
+            // CIGAR op straddles clip
+            else {
+                firstOp.Length(firstOpLength - remaining);
+                if (consumesRef)
+                    referencePositionOffset += remaining;
+                remaining = 0;
+            }
+        }
+
+        // clip trailing CIGAR ops
+        remaining = endOffset;
+        while (remaining > 0 && !cigar.empty()) {
+            CigarOperation& lastOp = cigar.back();
+            const size_t lastOpLength = lastOp.Length();
+            const bool consumesQuery = internal::ConsumesQuery(lastOp.Type());
+
+            // CIGAR op ends at or after clip
+            if (lastOpLength <= remaining) {
+                cigar.pop_back();
+                if (consumesQuery)
+                    remaining -= lastOpLength;
+            }
+
+            // CIGAR op straddles clip
+            else {
+                lastOp.Length(lastOpLength - remaining);
+                remaining = 0;
+            }
+        }
+
+        // update CIGAR & position
+        impl_.CigarData(cigar);
+        const Position origPosition = impl_.Position();
+        impl_.Position(origPosition + referencePositionOffset);
+    }
+
+    // clip SEQ, QUAL, & tags
+    const size_t clipFrom   = startOffset;
+    const size_t clipLength = (end - start);
+    ClipFields(clipFrom, clipLength);
+
+    // update query start/end
+    // TODO: update name to reflect new QS/QE ???
+    internal::CreateOrEdit(BamRecordTag::QUERY_START, start, &impl_);
+    internal::CreateOrEdit(BamRecordTag::QUERY_END,   end,   &impl_);
+//    UpdateName();
+
+    // reset any cached aligned start/end
+    ResetCachedPositions();
+    return *this;
+}
+
+BamRecord& BamRecord::ClipToReference(const Position start,
+                                      const Position end)
+{
+    // skip if not mapped, clipping to reference doesn't make sense
+    // or should we even consider throwing here?
+    if (!IsMapped())
+        return *this;
+
+    const bool isForwardStrand = (AlignedStrand() == Strand::FORWARD);
+    return (isForwardStrand ? ClipToReferenceForward(start, end)
+                            : ClipToReferenceReverse(start, end));
+}
+
+BamRecord& BamRecord::ClipToReferenceForward(const PacBio::BAM::Position start,
+                                             const PacBio::BAM::Position end)
+{
+    assert(IsMapped());
+    assert(AlignedStrand() == Strand::FORWARD);
+
+    // cache original coords
+    const size_t seqLength = impl_.SequenceLength();
+    const RecordType type  = Type();
+    const Position origQStart = (type == RecordType::CCS) ? Position(0) : QueryStart();
+    const Position origQEnd   = (type == RecordType::CCS) ? Position(seqLength) : QueryEnd();
+    const Position origTStart = ReferenceStart();
+    const Position origTEnd   = ReferenceEnd();
+    assert(AlignedStart() >= origQStart);
+    assert(AlignedEnd()   <= origQEnd);
+
+    // skip if already within requested clip range
+    if (start <= origTStart && end >= origTEnd)
+        return *this;
+
+    const Position newTStart = std::max(origTStart, start);
+    const Position newTEnd   = std::min(origTEnd, end);
+
+    // fetch a 'working copy' of CIGAR data
+    Cigar cigar = impl_.CigarData();
+
+    // we're going to skip query sequence outside aligned region
+    size_t queryPosRemovedFront = 0;
+    size_t queryPosRemovedBack  = 0;
+
+    // ------------------------
+    // clip leading CIGAR ops
+    // ------------------------
+
+    size_t remaining = newTStart - origTStart;
+    while (remaining > 0 && !cigar.empty()) {
+        CigarOperation& firstOp = cigar.front();
+        const size_t firstOpLength = firstOp.Length();
+        const bool consumesQuery = internal::ConsumesQuery(firstOp.Type());
+        const bool consumesRef = internal::ConsumesReference(firstOp.Type());
+
+        if (!consumesRef) {
+
+            // e.g. softclip - just pop it completely
+            cigar.erase(cigar.begin());
+            if (consumesQuery)
+                queryPosRemovedFront += firstOpLength;
+
+        } else {
+            assert(consumesRef);
+
+            // CIGAR ends at or before clip
+            if (firstOpLength <= remaining) {
+                cigar.erase(cigar.begin());
+                if (consumesQuery)
+                    queryPosRemovedFront += firstOpLength;
+                if (consumesRef)
+                    remaining -= firstOpLength;
+            }
+
+            // CIGAR straddles clip
+            else {
+                assert(firstOpLength > remaining);
+                firstOp.Length(firstOpLength - remaining);
+                if (consumesQuery)
+                    queryPosRemovedFront += remaining;
+                remaining = 0;
+            }
+        }
+    }
+
+    // -------------------------
+    // clip trailing CIGAR ops
+    // -------------------------
+
+    remaining = origTEnd - newTEnd;
+    while (remaining > 0 && !cigar.empty()) {
+        CigarOperation& lastOp = cigar.back();
+        const size_t lastOpLength = lastOp.Length();
+        const bool consumesQuery = internal::ConsumesQuery(lastOp.Type());
+        const bool consumesRef = internal::ConsumesReference(lastOp.Type());
+
+        if (!consumesRef) {
+
+            // e.g. softclip - just pop it completely
+            cigar.pop_back();
+            if (consumesQuery)
+                queryPosRemovedBack += lastOpLength;
+
+        } else {
+            assert(consumesRef);
+
+            // CIGAR ends at or after clip
+            if (lastOpLength <= remaining) {
+                cigar.pop_back();
+                if (consumesQuery)
+                    queryPosRemovedBack += lastOpLength;
+                if (consumesRef)
+                    remaining -= lastOpLength;
+            }
+
+            // CIGAR straddles clip
+            else {
+                assert(lastOpLength > remaining);
+                lastOp.Length(lastOpLength - remaining);
+                if (consumesQuery)
+                    queryPosRemovedBack += remaining;
+                remaining = 0;
+            }
+        }
+    }
+
+    // update CIGAR and position
+    impl_.CigarData(cigar);
+    impl_.Position(newTStart);
+
+    // clip SEQ, QUAL, tags
+    const Position qStart = origQStart + queryPosRemovedFront;
+    const Position qEnd   = origQEnd   - queryPosRemovedBack;
+    const size_t clipFrom = queryPosRemovedFront;
+    const size_t clipLength = qEnd - qStart;
+    ClipFields(clipFrom, clipLength);
+
+    // update query start/end
+    internal::CreateOrEdit(BamRecordTag::QUERY_START, qStart, &impl_);
+    internal::CreateOrEdit(BamRecordTag::QUERY_END,   qEnd,   &impl_);
+//    UpdateName();
+
+    // reset any cached aligned start/end
+    ResetCachedPositions();
+    return *this;
+}
+
+BamRecord& BamRecord::ClipToReferenceReverse(const PacBio::BAM::Position start,
+                                             const PacBio::BAM::Position end)
+{
+    assert(IsMapped());
+    assert(AlignedStrand() == Strand::REVERSE);
+
+    // cache original coords
+    const size_t seqLength = impl_.SequenceLength();
+    const RecordType type  = Type();
+    const Position origQStart = (type == RecordType::CCS) ? Position(0) : QueryStart();
+    const Position origQEnd   = (type == RecordType::CCS) ? Position(seqLength) : QueryEnd();
+    const Position origTStart = ReferenceStart();
+    const Position origTEnd   = ReferenceEnd();
+
+    // skip if already within requested clip range
+    if (start <= origTStart && end >= origTEnd)
+        return *this;
+    assert(AlignedStart() >= origQStart);
+    assert(AlignedEnd()   <= origQEnd);
+
+    const Position newTStart = std::max(origTStart, start);
+    const Position newTEnd   = std::min(origTEnd, end);
+
+    Cigar cigar = impl_.CigarData();
+
+    size_t queryPosRemovedFront = 0;
+    size_t queryPosRemovedBack  = 0;
+
+    // update CIGAR - clip front ops, then clip back ops
+    size_t remaining = newTStart - origTStart;
+    while (remaining > 0 && !cigar.empty()) {
+        CigarOperation& firstOp = cigar.front();
+        const CigarOperationType firstOpType = firstOp.Type();
+        const size_t firstOpLength = firstOp.Length();
+        const bool consumesQuery = internal::ConsumesQuery(firstOpType);
+        const bool consumesRef = internal::ConsumesReference(firstOpType);
+
+        if (!consumesRef) {
+
+            // e.g. softclip - just pop it completely
+            cigar.erase(cigar.begin());
+            if (consumesQuery)
+                queryPosRemovedBack += firstOpLength;
+
+        } else {
+            assert(consumesRef);
+
+            // CIGAR ends at or before clip
+            if (firstOpLength <= remaining) {
+                cigar.erase(cigar.begin());
+                if (consumesQuery)
+                    queryPosRemovedBack += firstOpLength;
+                if (consumesRef)
+                    remaining -= firstOpLength;
+            }
+
+            // CIGAR straddles clip
+            else {
+                assert(firstOpLength > remaining);
+                firstOp.Length(firstOpLength - remaining);
+                if (consumesQuery)
+                    queryPosRemovedBack += remaining;
+                remaining = 0;
+            }
+        }
+    }
+
+    remaining = origTEnd - newTEnd;
+    while (remaining > 0 && !cigar.empty()) {
+        CigarOperation& lastOp = cigar.back();
+        const CigarOperationType lastOpType = lastOp.Type();
+        const size_t lastOpLength = lastOp.Length();
+        const bool consumesQuery = internal::ConsumesQuery(lastOpType);
+        const bool consumesRef = internal::ConsumesReference(lastOpType);
+
+        if (!consumesRef) {
+
+            // e.g. softclip - just pop it completely
+            cigar.pop_back();
+            if (consumesQuery)
+                queryPosRemovedFront += lastOpLength;
+
+        } else {
+            assert(consumesRef);
+
+            // CIGAR ends at or before clip
+            if (lastOpLength <= remaining) {
+                cigar.pop_back();
+                if (consumesQuery)
+                    queryPosRemovedFront += lastOpLength;
+                if (consumesRef)
+                    remaining -= lastOpLength;
+            }
+
+            // CIGAR straddles clip
+            else {
+                assert(lastOpLength > remaining);
+                lastOp.Length(lastOpLength - remaining);
+                if (consumesQuery)
+                    queryPosRemovedFront += remaining;
+                remaining = 0;
+            }
+        }
+    }
+    impl_.CigarData(cigar);
+
+    // update aligned reference position
+    impl_.Position(newTStart);
+
+    // clip SEQ, QUAL, tags
+    const Position qStart = origQStart + queryPosRemovedFront;
+    const Position qEnd   = origQEnd   - queryPosRemovedBack;
+    const size_t clipFrom = queryPosRemovedFront;
+    const size_t clipLength = qEnd - qStart;
+    ClipFields(clipFrom, clipLength);
+
+    // update query start/end
+    internal::CreateOrEdit(BamRecordTag::QUERY_START, qStart, &impl_);
+    internal::CreateOrEdit(BamRecordTag::QUERY_END,   qEnd,   &impl_);
+//    UpdateName();
+
+    // reset any cached aligned start/end
+    ResetCachedPositions();
+    return *this;
+}
+
+QualityValues BamRecord::DeletionQV(Orientation orientation,
+                                    bool aligned,
+                                    bool exciseSoftClips) const
+{
+    return FetchQualities(BamRecordTag::DELETION_QV,
+                          orientation,
+                          aligned,
+                          exciseSoftClips);
+}
+
+BamRecord& BamRecord::DeletionQV(const QualityValues& deletionQVs)
+{
+    internal::CreateOrEdit(BamRecordTag::DELETION_QV,
+                           deletionQVs.Fastq(),
+                           &impl_);
+    return *this;
+}
+
+
+string BamRecord::DeletionTag(Orientation orientation,
+                              bool aligned,
+                              bool exciseSoftClips) const
+{
+    return FetchBases(BamRecordTag::DELETION_TAG,
+                      orientation,
+                      aligned,
+                      exciseSoftClips);
+}
+
+BamRecord& BamRecord::DeletionTag(const string& tags)
+{
+    internal::CreateOrEdit(BamRecordTag::DELETION_TAG,
+                           tags,
+                           &impl_);
+    return *this;
+}
+
+vector<uint16_t>
+BamRecord::EncodePhotons(const vector<float>& data)
+{
+    vector<uint16_t> encoded;
+    encoded.reserve(data.size());
+    for (const auto& d : data)
+        encoded.emplace_back(d * photonFactor);
+    return encoded;
+}
+
+string BamRecord::FetchBasesRaw(const BamRecordTag tag) const
+{
+    const Tag& seqTag = impl_.TagValue(tag);
+    return seqTag.ToString();
+}
+
+string BamRecord::FetchBases(const BamRecordTag tag,
+                             const Orientation orientation,
+                             const bool aligned,
+                             const bool exciseSoftClips,
+                             const PulseBehavior pulseBehavior) const
+{
+    const bool isBamSeq = (tag == BamRecordTag::SEQ);
+    const bool isPulse = internal::BamRecordTags::IsPulse(tag);
+
+    // fetch raw
+    string bases;
+    Orientation current;
+    if (isBamSeq) { // SEQ stored in genomic orientation
+        bases = impl_.Sequence();
+        current = Orientation::GENOMIC;
+    } else { // all tags stored in native orientation
+        bases = FetchBasesRaw(tag);
+        current = Orientation::NATIVE;
+    }
+
+    // maybe strip 'squashed' pulse loci
+    if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) {
+        CalculatePulse2BaseCache();
+        bases = p2bCache_->RemoveSquashedPulses(bases);
+    }
+
+    // if we need to touch CIGAR
+    if (aligned || exciseSoftClips) {
+
+        if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY)
+            throw std::runtime_error("Cannot return data at all pulses when gapping and/or soft-clipping are requested. "
+                                     "Use PulseBehavior::BASECALLS_ONLY instead.");
+
+        // force into genomic orientation
+        internal::OrientBasesAsRequested(&bases,
+                                         current,
+                                         Orientation::GENOMIC,
+                                         impl_.IsReverseStrand(),
+                                         isPulse);
+        current = Orientation::GENOMIC;
+
+        // clip & gapify as requested
+        internal::ClipAndGapifyBases(impl_,
+                                     aligned,
+                                     exciseSoftClips,
+                                     &bases);
+    }
+
+    // return in the orientation requested
+    internal::OrientBasesAsRequested(&bases,
+                                     current,
+                                     orientation,
+                                     impl_.IsReverseStrand(),
+                                     isPulse);
+    return bases;
+}
+
+Frames BamRecord::FetchFramesRaw(const BamRecordTag tag) const
+{
+    Frames frames;
+    const Tag& frameTag = impl_.TagValue(tag);
+    if (frameTag.IsNull())
+        return frames;  // throw ?
+
+    // lossy frame codes
+    if (frameTag.IsUInt8Array()) {
+        const vector<uint8_t> codes = frameTag.ToUInt8Array();
+        frames = Frames::Decode(codes);
+    }
+
+    // lossless frame data
+    else {
+        assert(frameTag.IsUInt16Array());
+        frames.Data(frameTag.ToUInt16Array());
+    }
+
+    return frames;
+}
+
+Frames BamRecord::FetchFrames(const BamRecordTag tag,
+                              const Orientation orientation,
+                              const bool aligned,
+                              const bool exciseSoftClips,
+                              const PulseBehavior pulseBehavior) const
+{
+    const bool isPulse = internal::BamRecordTags::IsPulse(tag);
+
+    // fetch raw
+    Frames frames = FetchFramesRaw(tag);
+    Orientation current = Orientation::NATIVE;
+
+    // maybe strip 'squashed' pulse loci
+    if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) {
+        CalculatePulse2BaseCache();
+        frames.DataRaw() = p2bCache_->RemoveSquashedPulses(frames.Data());
+    }
+
+    // if we need to touch the CIGAR
+    if (aligned || exciseSoftClips) {
+
+        if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY)
+            throw std::runtime_error("Cannot return data at all pulses when gapping and/or soft-clipping are requested. "
+                                     "Use PulseBehavior::BASECALLS_ONLY instead.");
+
+        // force into genomic orientation
+        internal::OrientTagDataAsRequested(&frames,
+                                           current,
+                                           Orientation::GENOMIC,
+                                           impl_.IsReverseStrand());
+        current = Orientation::GENOMIC;
+
+        // clip & gapify as requested
+        internal::ClipAndGapifyFrames(impl_,
+                                      aligned,
+                                      exciseSoftClips,
+                                      &frames);
+    }
+
+    // return in the orientation requested
+    internal::OrientTagDataAsRequested(&frames,
+                                       current,
+                                       orientation,
+                                       impl_.IsReverseStrand());
+    return frames;
+
+}
+
+vector<float> BamRecord::FetchPhotonsRaw(const BamRecordTag tag) const
+{
+    const Tag& frameTag = impl_.TagValue(tag);
+    if (frameTag.IsNull())
+        return vector<float>();
+    if(!frameTag.IsUInt16Array())
+        throw std::runtime_error("Photons are not a uint16_t array, tag " +
+                                 internal::BamRecordTags::LabelFor(tag));
+    const vector<uint16_t> data = frameTag.ToUInt16Array();
+
+    vector<float> photons;
+    photons.reserve(data.size());
+    for (const auto& d : data)
+        photons.emplace_back(d / photonFactor);
+    return photons;
+}
+
+vector<float> BamRecord::FetchPhotons(const BamRecordTag tag,
+                                      const Orientation orientation,
+                                      const bool aligned,
+                                      const bool exciseSoftClips,
+                                      const PulseBehavior pulseBehavior) const
+{
+    const bool isPulse = internal::BamRecordTags::IsPulse(tag);
+
+    // fetch raw
+    auto data = FetchPhotonsRaw(tag);
+    Orientation current = Orientation::NATIVE;
+
+    if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) {
+        // strip 'squashed' pulse loci
+        CalculatePulse2BaseCache();
+        data = p2bCache_->RemoveSquashedPulses(data);
+    }
+
+    if (aligned || exciseSoftClips) {
+
+        if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY)
+            throw std::runtime_error("Cannot return data at all pulses when gapping and/or soft-clipping are requested. "
+                                     "Use PulseBehavior::BASECALLS_ONLY instead.");
+
+        // force into genomic orientation
+        internal::OrientTagDataAsRequested(&data,
+                                           current,
+                                           Orientation::GENOMIC,
+                                           impl_.IsReverseStrand());
+        current = Orientation::GENOMIC;
+
+        // clip & gapify as requested
+        internal::ClipAndGapifyPhotons(impl_,
+                                       aligned,
+                                       exciseSoftClips,
+                                       &data);
+    }
+
+    // return in the orientation requested
+    internal::OrientTagDataAsRequested(&data,
+                                       current,
+                                       orientation,
+                                       impl_.IsReverseStrand());
+    return data;
+}
+
+QualityValues BamRecord::FetchQualitiesRaw(const BamRecordTag tag) const
+{
+    const Tag& qvsTag = impl_.TagValue(tag);
+    return QualityValues::FromFastq(qvsTag.ToString());
+}
+
+QualityValues BamRecord::FetchQualities(const BamRecordTag tag,
+                                        const Orientation orientation,
+                                        const bool aligned,
+                                        const bool exciseSoftClips,
+                                        const PulseBehavior pulseBehavior) const
+{
+    // requested data info
+    const bool isBamQual = (tag == BamRecordTag::QUAL);
+    const bool isPulse = internal::BamRecordTags::IsPulse(tag);
+
+    // fetch raw
+    QualityValues quals;
+    Orientation current;
+    if (isBamQual) { // QUAL stored in genomic orientation
+        quals = impl_.Qualities();
+        current = Orientation::GENOMIC;
+    } else {        // all tags stored in native orientation
+        quals = FetchQualitiesRaw(tag);
+        current = Orientation::NATIVE;
+    }
+
+    if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) {
+        // strip 'squashed' pulse loci
+        CalculatePulse2BaseCache();
+        quals = p2bCache_->RemoveSquashedPulses(quals);
+    }
+
+    // if we need to touch CIGAR
+    if (aligned || exciseSoftClips) {
+
+        if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY)
+            throw std::runtime_error("Cannot return data at all pulses when gapping and/or soft-clipping are requested. "
+                                     "Use PulseBehavior::BASECALLS_ONLY instead.");
+
+        // force into genomic orientation
+        internal::OrientTagDataAsRequested(&quals,
+                                          current,
+                                          Orientation::GENOMIC,
+                                          impl_.IsReverseStrand());
+        current = Orientation::GENOMIC;
+
+        // clip & gapify as requested
+        internal::ClipAndGapifyQualities(impl_,
+                                         aligned,
+                                         exciseSoftClips,
+                                         &quals);
+    }
+
+    // return in the orientation requested
+    internal::OrientTagDataAsRequested(&quals,
+                                       current,
+                                       orientation,
+                                       impl_.IsReverseStrand());
+    return quals;
+}
+
+vector<uint32_t> BamRecord::FetchUIntsRaw(const BamRecordTag tag) const
+{
+    // fetch tag data
+    const Tag& frameTag = impl_.TagValue(tag);
+    if (frameTag.IsNull())
+        return std::vector<uint32_t>();
+    if(!frameTag.IsUInt32Array())
+        throw std::runtime_error("Tag data are not a uint32_t array, tag " +
+                                 internal::BamRecordTags::LabelFor(tag));
+    return frameTag.ToUInt32Array();
+}
+
+vector<uint32_t> BamRecord::FetchUInts(const BamRecordTag tag,
+                                       const Orientation orientation,
+                                       const bool aligned,
+                                       const bool exciseSoftClips,
+                                       const PulseBehavior pulseBehavior) const
+{
+    const bool isPulse = internal::BamRecordTags::IsPulse(tag);
+
+    // fetch raw
+    auto  arr = FetchUIntsRaw(tag);
+    Orientation current = Orientation::NATIVE;
+
+    if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) {
+        // strip 'squashed' pulse loci
+        CalculatePulse2BaseCache();
+        arr = p2bCache_->RemoveSquashedPulses(arr);
+    }
+
+    if (aligned || exciseSoftClips) {
+
+        if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY)
+            throw std::runtime_error("Cannot return data at all pulses when gapping and/or soft-clipping are requested. "
+                                     "Use PulseBehavior::BASECALLS_ONLY instead.");
+
+        // force into genomic orientation
+        internal::OrientTagDataAsRequested(&arr,
+                                           current,
+                                           Orientation::GENOMIC,
+                                           impl_.IsReverseStrand());
+        current = Orientation::GENOMIC;
+
+        // clip & gapify as requested
+        internal::ClipAndGapifyUInts(impl_,
+                                     aligned,
+                                     exciseSoftClips,
+                                     &arr);
+    }
+
+    // return in the orientation requested
+    internal::OrientTagDataAsRequested(&arr,
+                                     current,
+                                     orientation,
+                                     impl_.IsReverseStrand());
+    return arr;
+}
+
+string BamRecord::FullName(void) const
+{ return impl_.Name(); }
+
+bool BamRecord::HasAltLabelQV(void) const
+{ return impl_.HasTag(BamRecordTag::ALT_LABEL_QV); }
+
+bool BamRecord::HasAltLabelTag(void) const
+{ return impl_.HasTag(BamRecordTag::ALT_LABEL_TAG); }
+
+bool BamRecord::HasBarcodes(void) const
+{ return impl_.HasTag(BamRecordTag::BARCODES); }
+
+bool BamRecord::HasBarcodeQuality(void) const
+{ return impl_.HasTag(BamRecordTag::BARCODE_QUALITY); }
+
+bool BamRecord::HasLabelQV(void) const
+{ return impl_.HasTag(BamRecordTag::LABEL_QV); }
+
+bool BamRecord::HasDeletionQV(void) const
+{ return impl_.HasTag(BamRecordTag::DELETION_QV); }
+
+bool BamRecord::HasDeletionTag(void) const
+{ return impl_.HasTag(BamRecordTag::DELETION_TAG); }
+
+bool BamRecord::HasHoleNumber(void) const
+{
+    return impl_.HasTag(BamRecordTag::HOLE_NUMBER)
+          && !impl_.TagValue(BamRecordTag::HOLE_NUMBER).IsNull();
+}
+
+bool BamRecord::HasInsertionQV(void) const
+{ return impl_.HasTag(BamRecordTag::INSERTION_QV); }
+
+bool BamRecord::HasNumPasses(void) const
+{ return impl_.HasTag(BamRecordTag::NUM_PASSES); }
+
+bool BamRecord::HasPreBaseFrames(void) const
+{ return HasIPD(); }
+
+bool BamRecord::HasIPD(void) const
+{ return impl_.HasTag(BamRecordTag::IPD); }
+
+bool BamRecord::HasLocalContextFlags(void) const
+{ return impl_.HasTag(BamRecordTag::CONTEXT_FLAGS); }
+
+bool BamRecord::HasMergeQV(void) const
+{ return impl_.HasTag(BamRecordTag::MERGE_QV); }
+
+bool BamRecord::HasPulseMergeQV(void) const
+{ return impl_.HasTag(BamRecordTag::PULSE_MERGE_QV); }
+
+bool BamRecord::HasPkmean(void) const
+{ return impl_.HasTag(BamRecordTag::PKMEAN); }
+
+bool BamRecord::HasPkmean2(void) const
+{ return impl_.HasTag(BamRecordTag::PKMEAN_2); }
+
+bool BamRecord::HasPkmid(void) const
+{ return impl_.HasTag(BamRecordTag::PKMID); }
+
+bool BamRecord::HasPkmid2(void) const
+{ return impl_.HasTag(BamRecordTag::PKMID_2); }
+
+bool BamRecord::HasPrePulseFrames(void) const
+{ return impl_.HasTag(BamRecordTag::PRE_PULSE_FRAMES); }
+
+bool BamRecord::HasPulseCall(void) const
+{ return impl_.HasTag(BamRecordTag::PULSE_CALL)
+          && !impl_.TagValue(BamRecordTag::PULSE_CALL).IsNull();
+}
+
+bool BamRecord::HasPulseCallWidth(void) const
+{ return impl_.HasTag(BamRecordTag::PULSE_CALL_WIDTH); }
+
+bool BamRecord::HasPulseWidth(void) const
+{ return impl_.HasTag(BamRecordTag::PULSE_WIDTH); }
+
+bool BamRecord::HasQueryEnd(void) const
+{ return impl_.HasTag(BamRecordTag::QUERY_END); }
+
+bool BamRecord::HasQueryStart(void) const
+{ return impl_.HasTag(BamRecordTag::QUERY_START); }
+
+bool BamRecord::HasReadAccuracy(void) const
+{ return impl_.HasTag(BamRecordTag::READ_ACCURACY)
+          && !impl_.TagValue(BamRecordTag::READ_ACCURACY).IsNull();
+}
+
+bool BamRecord::HasScrapRegionType(void) const
+{ return impl_.HasTag(BamRecordTag::SCRAP_REGION_TYPE)
+          && !impl_.TagValue(BamRecordTag::SCRAP_REGION_TYPE).IsNull();
+}
+
+bool BamRecord::HasScrapZmwType(void) const
+{ return impl_.HasTag(BamRecordTag::SCRAP_ZMW_TYPE)
+          && !impl_.TagValue(BamRecordTag::SCRAP_ZMW_TYPE).IsNull();
+}
+
+bool BamRecord::HasStartFrame(void) const
+{ return impl_.HasTag(BamRecordTag::START_FRAME); }
+
+bool BamRecord::HasSignalToNoise(void) const
+{ return impl_.HasTag(BamRecordTag::SNR); }
+
+bool BamRecord::HasSubstitutionQV(void) const
+{ return impl_.HasTag(BamRecordTag::SUBSTITUTION_QV); }
+
+bool BamRecord::HasSubstitutionTag(void) const
+{ return impl_.HasTag(BamRecordTag::SUBSTITUTION_TAG); }
+
+BamHeader BamRecord::Header(void) const
+{ return header_; }
+
+int32_t BamRecord::HoleNumber(void) const
+{
+    const Tag& holeNumber = impl_.TagValue(BamRecordTag::HOLE_NUMBER);
+    if (!holeNumber.IsNull())
+        return holeNumber.ToInt32();
+
+    // missing zm tag - try to pull from name
+    return internal::HoleNumberFromName(FullName());
+}
+
+BamRecord& BamRecord::HoleNumber(const int32_t holeNumber)
+{
+    internal::CreateOrEdit(BamRecordTag::HOLE_NUMBER,
+                           holeNumber,
+                           &impl_);
+    return *this;
+}
+
+BamRecordImpl& BamRecord::Impl(void)
+{ return impl_; }
+
+const BamRecordImpl& BamRecord::Impl(void) const
+{ return impl_; }
+
+QualityValues BamRecord::InsertionQV(Orientation orientation,
+                                     bool aligned,
+                                     bool exciseSoftClips) const
+{
+    return FetchQualities(BamRecordTag::INSERTION_QV,
+                          orientation,
+                          aligned,
+                          exciseSoftClips);
+}
+
+BamRecord& BamRecord::InsertionQV(const QualityValues& insertionQVs)
+{
+    internal::CreateOrEdit(BamRecordTag::INSERTION_QV,
+                           insertionQVs.Fastq(),
+                           &impl_);
+    return *this;
+}
+
+Frames BamRecord::IPD(Orientation orientation,
+                      bool aligned,
+                      bool exciseSoftClips) const
+{
+    return FetchFrames(BamRecordTag::IPD,
+                       orientation,
+                       aligned,
+                       exciseSoftClips);
+}
+
+BamRecord& BamRecord::IPD(const Frames& frames,
+                          const FrameEncodingType encoding)
+{
+    if (encoding == FrameEncodingType::LOSSY)
+        internal::CreateOrEdit(BamRecordTag::IPD, frames.Encode(), &impl_);
+    else
+        internal::CreateOrEdit(BamRecordTag::IPD, frames.Data(), &impl_);
+    return *this;
+}
+
+Frames BamRecord::IPDRaw(Orientation orientation) const
+{
+    Frames frames;
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::IPD);
+    const Tag& frameTag = impl_.TagValue(tagName);
+    if (frameTag.IsNull())
+        return frames;
+
+    // lossy frame codes
+    if (frameTag.IsUInt8Array()) {
+        const vector<uint8_t> codes = frameTag.ToUInt8Array();
+        const vector<uint16_t> codes16(codes.begin(), codes.end());
+        frames.Data(std::move(codes16));
+    }
+
+    // lossless frame data
+    else {
+        assert(frameTag.IsUInt16Array());
+        frames.Data(frameTag.ToUInt16Array());
+    }
+
+    // return in requested orientation
+    internal::OrientTagDataAsRequested(&frames,
+                                       Orientation::NATIVE,     // current
+                                       orientation,             // requested
+                                       impl_.IsReverseStrand());
+    return frames;
+}
+
+bool BamRecord::IsMapped(void) const
+{ return impl_.IsMapped(); }
+
+QualityValues BamRecord::LabelQV(Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior) const
+{
+    return FetchQualities(BamRecordTag::LABEL_QV,
+                          orientation,
+                          aligned,
+                          exciseSoftClips,
+                          pulseBehavior);
+}
+
+BamRecord& BamRecord::LabelQV(const QualityValues& labelQVs)
+{
+    internal::CreateOrEdit(BamRecordTag::LABEL_QV,
+                           labelQVs.Fastq(),
+                           &impl_);
+    return *this;
+}
+
+LocalContextFlags BamRecord::LocalContextFlags(void) const
+{
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::CONTEXT_FLAGS);
+    const Tag& cxTag = impl_.TagValue(tagName);
+    return static_cast<PacBio::BAM::LocalContextFlags>(cxTag.ToUInt8());
+}
+
+BamRecord& BamRecord::LocalContextFlags(const PacBio::BAM::LocalContextFlags flags)
+{
+    internal::CreateOrEdit(BamRecordTag::CONTEXT_FLAGS,
+                           static_cast<uint8_t>(flags),
+                           &impl_);
+    return *this;
+}
+
+BamRecord& BamRecord::Map(const int32_t referenceId,
+                          const Position refStart,
+                          const Strand strand,
+                          const Cigar& cigar,
+                          const uint8_t mappingQuality)
+{
+    impl_.Position(refStart);
+    impl_.ReferenceId(referenceId);
+    impl_.CigarData(cigar);
+    impl_.MapQuality(mappingQuality);
+    impl_.SetMapped(true);
+
+    if (strand == Strand::FORWARD)
+        impl_.SetReverseStrand(false);
+
+    else {
+        assert(strand == Strand::REVERSE);
+        impl_.SetReverseStrand(true);
+
+        // switch seq & qual
+        string sequence  = impl_.Sequence();
+        QualityValues qualities = impl_.Qualities();
+
+        internal::ReverseComplement(sequence);
+        internal::Reverse(qualities);
+
+        impl_.SetSequenceAndQualities(sequence, qualities.Fastq());
+    }
+
+    // reset any cached aligned start/end
+    alignedStart_ = PacBio::BAM::UnmappedPosition;
+    alignedEnd_ = PacBio::BAM::UnmappedPosition;
+
+    return *this;
+}
+
+uint8_t BamRecord::MapQuality(void) const
+{ return impl_.MapQuality(); }
+
+QualityValues BamRecord::MergeQV(Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips) const
+{
+    return FetchQualities(BamRecordTag::MERGE_QV,
+                          orientation,
+                          aligned,
+                          exciseSoftClips);
+}
+
+BamRecord& BamRecord::MergeQV(const QualityValues& mergeQVs)
+{
+    internal::CreateOrEdit(BamRecordTag::MERGE_QV,
+                           mergeQVs.Fastq(),
+                           &impl_);
+    return *this;
+}
+
+string BamRecord::MovieName(void) const
+{ return ReadGroup().MovieName(); }
+
+size_t BamRecord::NumDeletedBases(void) const
+{
+    auto tEnd = ReferenceEnd();
+    auto tStart = ReferenceStart();
+    auto numMatchesAndMismatches = NumMatchesAndMismatches();
+    auto nM = numMatchesAndMismatches.first;
+    auto nMM = numMatchesAndMismatches.second;
+    return (tEnd - tStart - nM - nMM);
+}
+
+size_t BamRecord::NumInsertedBases(void) const
+{
+    auto aEnd = AlignedEnd();
+    auto aStart = AlignedStart();
+    auto numMatchesAndMismatches = NumMatchesAndMismatches();
+    auto nM = numMatchesAndMismatches.first;
+    auto nMM = numMatchesAndMismatches.second;
+    return (aEnd - aStart - nM - nMM);
+}
+
+size_t BamRecord::NumMatches(void) const
+{ return NumMatchesAndMismatches().first; }
+
+pair<size_t, size_t> BamRecord::NumMatchesAndMismatches(void) const
+{
+    pair<size_t, size_t> result = make_pair(0,0);
+    PBBAM_SHARED_PTR<bam1_t> b = internal::BamRecordMemory::GetRawData(this);
+    uint32_t* cigarData = bam_get_cigar(b.get());
+    for (uint32_t i = 0; i < b->core.n_cigar; ++i) {
+        const CigarOperationType type = static_cast<CigarOperationType>(bam_cigar_op(cigarData[i]));
+        if (type == CigarOperationType::SEQUENCE_MATCH)
+            result.first += bam_cigar_oplen(cigarData[i]);
+        else if (type == CigarOperationType::SEQUENCE_MISMATCH)
+            result.second += bam_cigar_oplen(cigarData[i]);
+    }
+    return result;
+}
+
+size_t BamRecord::NumMismatches(void) const
+{ return NumMatchesAndMismatches().second; }
+
+int32_t BamRecord::NumPasses(void) const
+{
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::NUM_PASSES);
+    const Tag& numPasses = impl_.TagValue(tagName);
+    return numPasses.ToInt32();
+}
+
+BamRecord& BamRecord::NumPasses(const int32_t numPasses)
+{
+    internal::CreateOrEdit(BamRecordTag::NUM_PASSES,
+                           numPasses,
+                           &impl_);
+    return *this;
+}
+
+vector<float> BamRecord::Pkmean(Orientation orientation,
+                                     bool aligned,
+                                     bool exciseSoftClips,
+                                     PulseBehavior pulseBehavior) const
+{
+    return FetchPhotons(BamRecordTag::PKMEAN,
+                        orientation,
+                        aligned,
+                        exciseSoftClips,
+                        pulseBehavior);
+}
+
+BamRecord& BamRecord::Pkmean(const vector<float>& photons)
+{
+    Pkmean(EncodePhotons(photons));
+    return *this;
+}
+
+BamRecord& BamRecord::Pkmean(const vector<uint16_t>& encodedPhotons)
+{
+    internal::CreateOrEdit(BamRecordTag::PKMEAN,
+                           encodedPhotons,
+                           &impl_);
+    return *this;
+}
+
+vector<float> BamRecord::Pkmid(Orientation orientation,
+                                    bool aligned,
+                                    bool exciseSoftClips,
+                                    PulseBehavior pulseBehavior) const
+{
+    return FetchPhotons(BamRecordTag::PKMID,
+                        orientation,
+                        aligned,
+                        exciseSoftClips,
+                        pulseBehavior);
+}
+
+BamRecord& BamRecord::Pkmid(const vector<float>& photons)
+{
+    Pkmid(EncodePhotons(photons));
+    return *this;
+}
+
+BamRecord& BamRecord::Pkmid(const vector<uint16_t>& encodedPhotons)
+{
+    internal::CreateOrEdit(BamRecordTag::PKMID,
+                           encodedPhotons,
+                           &impl_);
+    return *this;
+}
+
+vector<float> BamRecord::Pkmean2(Orientation orientation,
+                                      bool aligned,
+                                      bool exciseSoftClips,
+                                      PulseBehavior pulseBehavior) const
+{
+    return FetchPhotons(BamRecordTag::PKMEAN_2,
+                        orientation,
+                        aligned,
+                        exciseSoftClips,
+                        pulseBehavior);
+}
+
+BamRecord& BamRecord::Pkmean2(const vector<float>& photons)
+{
+    Pkmean2(EncodePhotons(photons));
+    return *this;
+}
+
+BamRecord& BamRecord::Pkmean2(const vector<uint16_t>& encodedPhotons)
+{
+    internal::CreateOrEdit(BamRecordTag::PKMEAN_2,
+                           encodedPhotons,
+                           &impl_);
+    return *this;
+}
+
+vector<float> BamRecord::Pkmid2(Orientation orientation,
+                                     bool aligned,
+                                     bool exciseSoftClips,
+                                     PulseBehavior pulseBehavior) const
+{
+    return FetchPhotons(BamRecordTag::PKMID_2,
+                        orientation,
+                        aligned,
+                        exciseSoftClips,
+                        pulseBehavior);
+}
+
+BamRecord& BamRecord::Pkmid2(const vector<float>& photons)
+{
+    Pkmid2(EncodePhotons(photons));
+    return *this;
+}
+
+BamRecord& BamRecord::Pkmid2(const vector<uint16_t>& encodedPhotons)
+{
+    internal::CreateOrEdit(BamRecordTag::PKMID_2,
+                           encodedPhotons,
+                           &impl_);
+    return *this;
+}
+
+Frames BamRecord::PreBaseFrames(Orientation orientation,
+                                bool aligned,
+                                bool exciseSoftClips) const
+{ return IPD(orientation, aligned, exciseSoftClips); }
+
+BamRecord& BamRecord::PreBaseFrames(const Frames& frames,
+                                    const FrameEncodingType encoding)
+{ return IPD(frames, encoding); }
+
+Frames BamRecord::PrePulseFrames(Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior) const
+{
+    return FetchFrames(BamRecordTag::PRE_PULSE_FRAMES,
+                       orientation,
+                       aligned,
+                       exciseSoftClips,
+                       pulseBehavior);
+}
+
+BamRecord& BamRecord::PrePulseFrames(const Frames& frames,
+                                     const FrameEncodingType encoding)
+{
+    if (encoding == FrameEncodingType::LOSSY) {
+        internal::CreateOrEdit(BamRecordTag::PRE_PULSE_FRAMES,
+                               frames.Encode(),
+                               &impl_);
+    } else {
+        internal::CreateOrEdit(BamRecordTag::PRE_PULSE_FRAMES,
+                               frames.Data(),
+                               &impl_);
+    }
+    return *this;
+}
+
+Frames BamRecord::PulseWidthRaw(Orientation orientation,
+                                bool aligned,
+                                bool exciseSoftClips) const
+{
+    Frames frames;
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::PULSE_WIDTH);
+    const Tag& frameTag = impl_.TagValue(tagName);
+    if (frameTag.IsNull())
+        return frames;
+
+    // lossy frame codes
+    if (frameTag.IsUInt8Array()) {
+        const vector<uint8_t> codes = frameTag.ToUInt8Array();
+        const vector<uint16_t> codes16(codes.begin(), codes.end());
+        frames.Data(std::move(codes16));
+    }
+
+    // lossless frame data
+    else {
+        assert(frameTag.IsUInt16Array());
+        frames.Data(frameTag.ToUInt16Array());
+    }
+
+    // return in requested orientation
+    internal::OrientTagDataAsRequested(&frames,
+                                       Orientation::NATIVE,  // current
+                                       orientation,          // requested
+                                       impl_.IsReverseStrand());
+    return frames;
+}
+
+
+QualityValues BamRecord::PulseMergeQV(Orientation orientation,
+                                      bool aligned,
+                                      bool exciseSoftClips,
+                                      PulseBehavior pulseBehavior) const
+{
+    return FetchQualities(BamRecordTag::PULSE_MERGE_QV,
+                          orientation,
+                          aligned,
+                          exciseSoftClips,
+                          pulseBehavior);
+}
+
+BamRecord& BamRecord::PulseMergeQV(const QualityValues& mergeQVs)
+{
+    internal::CreateOrEdit(BamRecordTag::PULSE_MERGE_QV,
+                           mergeQVs.Fastq(),
+                           &impl_);
+    return *this;
+}
+
+
+string BamRecord::PulseCall(Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior) const
+{
+    return FetchBases(BamRecordTag::PULSE_CALL,
+                      orientation,
+                      aligned,
+                      exciseSoftClips,
+                      pulseBehavior);
+}
+
+BamRecord& BamRecord::PulseCall(const string& tags)
+{
+    internal::CreateOrEdit(BamRecordTag::PULSE_CALL,
+                           tags,
+                           &impl_);
+    return *this;
+}
+
+Frames BamRecord::PulseCallWidth(Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior) const
+{
+    return FetchFrames(BamRecordTag::PULSE_CALL_WIDTH,
+                       orientation,
+                       aligned,
+                       exciseSoftClips,
+                       pulseBehavior);
+}
+
+BamRecord& BamRecord::PulseCallWidth(const Frames& frames,
+                                     const FrameEncodingType encoding)
+{
+    if (encoding == FrameEncodingType::LOSSY) {
+        internal::CreateOrEdit(BamRecordTag::PULSE_CALL_WIDTH,
+                               frames.Encode(),
+                               &impl_);
+    } else {
+        internal::CreateOrEdit(BamRecordTag::PULSE_CALL_WIDTH,
+                               frames.Data(),
+                               &impl_);
+    }
+    return *this;
+}
+
+Frames BamRecord::PulseWidth(Orientation orientation,
+                             bool aligned,
+                             bool exciseSoftClips) const
+{
+    return FetchFrames(BamRecordTag::PULSE_WIDTH,
+                       orientation,
+                       aligned,
+                       exciseSoftClips,
+                       PulseBehavior::ALL);
+}
+
+BamRecord& BamRecord::PulseWidth(const Frames& frames,
+                                 const FrameEncodingType encoding)
+{
+    if (encoding == FrameEncodingType::LOSSY) {
+        internal::CreateOrEdit(BamRecordTag::PULSE_WIDTH,
+                               frames.Encode(),
+                               &impl_);
+    } else {
+        internal::CreateOrEdit(BamRecordTag::PULSE_WIDTH,
+                               frames.Data(),
+                               &impl_);
+    }
+    return *this;
+}
+
+QualityValues BamRecord::Qualities(Orientation orientation,
+                                   bool aligned,
+                                   bool exciseSoftClips) const
+{
+    return FetchQualities(BamRecordTag::QUAL,
+                          orientation,
+                          aligned,
+                          exciseSoftClips);
+}
+
+Position BamRecord::QueryEnd(void) const
+{
+    // try 'qe' tag
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::QUERY_END);
+    const Tag& qe = impl_.TagValue(tagName);
+    if (!qe.IsNull())
+        return qe.ToInt32();
+
+    // tag missing, need to check movie name (fallback for non-PB BAMs, but ignore for CCS reads)
+    RecordType type;
+    try {
+        type = Type();
+    } catch (std::exception&) {
+        return Position(0);
+    }
+    if (type == RecordType::CCS)
+        throw std::runtime_error("no query end for CCS read type");
+
+    // PacBio BAM, non-CCS
+    try {
+        return internal::QueryEndFromName(FullName());
+    } catch (std::exception&) {
+        // return fallback position
+        return Position(0);
+    }
+}
+
+BamRecord& BamRecord::QueryEnd(const Position pos)
+{
+   internal::CreateOrEdit(BamRecordTag::QUERY_END,
+                          static_cast<int32_t>(pos),
+                          &impl_);
+   UpdateName();
+   return *this;
+}
+
+Position BamRecord::QueryStart(void) const
+{
+    // try 'qs' tag
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::QUERY_START);
+    const Tag& qs = impl_.TagValue(tagName);
+    if (!qs.IsNull())
+        return qs.ToInt32();
+
+    // tag missing, need to check movie name (fallback for non-PB BAMs, but ignore for CCS reads)
+    RecordType type;
+    try {
+        type = Type();
+    } catch (std::exception&) {
+        return Position(0);
+    }
+    if (type == RecordType::CCS)
+        throw std::runtime_error("no query start for CCS read type");
+
+    // PacBio BAM, non-CCS
+    try {
+        return internal::QueryStartFromName(FullName());
+    } catch (std::exception&) {
+        // return fallback position
+        return Position(0);
+    }
+}
+
+BamRecord& BamRecord::QueryStart(const Position pos)
+{
+   internal::CreateOrEdit(BamRecordTag::QUERY_START,
+                          static_cast<int32_t>(pos),
+                          &impl_);
+   UpdateName();
+   return *this;
+}
+
+Accuracy BamRecord::ReadAccuracy(void) const
+{
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::READ_ACCURACY);
+    const Tag& readAccuracy = impl_.TagValue(tagName);
+    return Accuracy(readAccuracy.ToFloat());
+}
+
+BamRecord& BamRecord::ReadAccuracy(const Accuracy& accuracy)
+{
+    internal::CreateOrEdit(BamRecordTag::READ_ACCURACY,
+                           static_cast<float>(accuracy),
+                           &impl_);
+    return *this;
+}
+
+ReadGroupInfo BamRecord::ReadGroup(void) const
+{ return header_.ReadGroup(ReadGroupId()); }
+
+BamRecord& BamRecord::ReadGroup(const ReadGroupInfo& rg)
+{
+   internal::CreateOrEdit(BamRecordTag::READ_GROUP,
+                          rg.Id(),
+                          &impl_);
+   UpdateName();
+   return *this;
+}
+
+string BamRecord::ReadGroupId(void) const
+{
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::READ_GROUP);
+    const Tag& rgTag = impl_.TagValue(tagName);
+    if (rgTag.IsNull())
+        return string();
+    return rgTag.ToString();
+}
+
+BamRecord& BamRecord::ReadGroupId(const string& id)
+{
+   internal::CreateOrEdit(BamRecordTag::READ_GROUP,
+                          id,
+                          &impl_);
+   UpdateName();
+   return *this;
+}
+
+int32_t BamRecord::ReadGroupNumericId(void) const
+{ return ReadGroupInfo::IdToInt(ReadGroupId()); }
+
+Position BamRecord::ReferenceEnd(void) const
+{
+    if (!impl_.IsMapped())
+        return PacBio::BAM::UnmappedPosition;
+    PBBAM_SHARED_PTR<bam1_t> htsData = internal::BamRecordMemory::GetRawData(impl_);
+    if (!htsData)
+        return PacBio::BAM::UnmappedPosition;
+    return bam_endpos(htsData.get());
+}
+
+int32_t BamRecord::ReferenceId(void) const
+{ return impl_.ReferenceId(); }
+
+string BamRecord::ReferenceName(void) const
+{
+    if (IsMapped())
+        return Header().SequenceName(ReferenceId());
+    else
+        throw std::runtime_error("unmapped record has no associated reference name");
+}
+
+Position BamRecord::ReferenceStart(void) const
+{ return impl_.Position(); }
+
+void BamRecord::ResetCachedPositions(void) const
+{
+    alignedEnd_   = PacBio::BAM::UnmappedPosition;
+    alignedStart_ = PacBio::BAM::UnmappedPosition;
+}
+
+void BamRecord::ResetCachedPositions(void)
+{
+    alignedEnd_   = PacBio::BAM::UnmappedPosition;
+    alignedStart_ = PacBio::BAM::UnmappedPosition;
+}
+
+VirtualRegionType BamRecord::ScrapRegionType(void) const
+{
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::SCRAP_REGION_TYPE);
+    const Tag& srTag = impl_.TagValue(tagName);
+    return VirtualRegionTypeMap::ParseChar[srTag.ToUInt8()];
+}
+
+BamRecord& BamRecord::ScrapRegionType(const VirtualRegionType type)
+{
+    internal::CreateOrEdit(BamRecordTag::SCRAP_REGION_TYPE,
+                           static_cast<uint8_t>(type),
+                           &impl_);
+    return *this;
+}
+
+BamRecord& BamRecord::ScrapRegionType(const char type)
+{
+    internal::CreateOrEdit(BamRecordTag::SCRAP_REGION_TYPE,
+                           type,
+                           &impl_);
+    return *this;
+}
+
+ZmwType BamRecord::ScrapZmwType(void) const
+{
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::SCRAP_ZMW_TYPE);
+    const Tag& szTag = impl_.TagValue(tagName);
+    return ZmwTypeMap::ParseChar[szTag.ToUInt8()];
+}
+
+BamRecord& BamRecord::ScrapZmwType(const ZmwType type)
+{
+    internal::CreateOrEdit(BamRecordTag::SCRAP_ZMW_TYPE,
+                           static_cast<uint8_t>(type),
+                           &impl_);
+    return *this;
+}
+
+BamRecord& BamRecord::ScrapZmwType(const char type)
+{
+    internal::CreateOrEdit(BamRecordTag::SCRAP_ZMW_TYPE,
+                           type,
+                           &impl_);
+    return *this;
+}
+
+string BamRecord::Sequence(const Orientation orientation,
+                                bool aligned,
+                                bool exciseSoftClips) const
+{
+    return FetchBases(BamRecordTag::SEQ,
+                      orientation,
+                      aligned,
+                      exciseSoftClips);
+}
+
+vector<float> BamRecord::SignalToNoise(void) const
+{
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::SNR);
+    const Tag& snTag = impl_.TagValue(tagName);
+    return snTag.ToFloatArray();
+}
+
+BamRecord& BamRecord::SignalToNoise(const vector<float>& snr)
+{
+    internal::CreateOrEdit(BamRecordTag::SNR,
+                           snr,
+                           &impl_);
+    return *this;
+}
+
+vector<uint32_t> BamRecord::StartFrame(Orientation orientation,
+                                       bool aligned,
+                                       bool exciseSoftClips,
+                                       PulseBehavior pulseBehavior) const
+{
+    return FetchUInts(BamRecordTag::START_FRAME,
+                      orientation,
+                      aligned,
+                      exciseSoftClips,
+                      pulseBehavior);
+}
+
+BamRecord& BamRecord::StartFrame(const vector<uint32_t>& startFrame)
+{
+    internal::CreateOrEdit(BamRecordTag::START_FRAME,
+                           startFrame,
+                           &impl_);
+    return *this;
+}
+
+QualityValues BamRecord::SubstitutionQV(Orientation orientation,
+                                        bool aligned,
+                                        bool exciseSoftClips) const
+{
+    return FetchQualities(BamRecordTag::SUBSTITUTION_QV,
+                          orientation,
+                          aligned,
+                          exciseSoftClips);
+}
+
+BamRecord& BamRecord::SubstitutionQV(const QualityValues& substitutionQVs)
+{
+    internal::CreateOrEdit(BamRecordTag::SUBSTITUTION_QV,
+                           substitutionQVs.Fastq(),
+                           &impl_);
+    return *this;
+}
+
+string BamRecord::SubstitutionTag(Orientation orientation,
+                                        bool aligned,
+                                        bool exciseSoftClips) const
+{
+    return FetchBases(BamRecordTag::SUBSTITUTION_TAG,
+                      orientation,
+                      aligned,
+                      exciseSoftClips);
+}
+
+BamRecord& BamRecord::SubstitutionTag(const string& tags)
+{
+    internal::CreateOrEdit(BamRecordTag::SUBSTITUTION_TAG,
+                           tags,
+                           &impl_);
+    return *this;
+}
+
+RecordType BamRecord::Type(void) const
+{
+    try {
+        const string& typeName = ReadGroup().ReadType();
+        return internal::NameToType(typeName);
+    } catch (std::exception&) {
+
+        // read group not found
+        // peek at name to see if we're CCS
+        if (FullName().find("ccs") != string::npos)
+            return RecordType::CCS;
+
+        // otherwise unknown
+        else
+            return RecordType::UNKNOWN;
+    }
+}
+
+void BamRecord::UpdateName()
+{
+    string newName;
+    newName.reserve(100);
+
+    newName += MovieName();
+    newName += "/";
+
+    if (HasHoleNumber())
+        newName += std::to_string(HoleNumber());
+    else
+        newName += "?";
+
+    newName += "/";
+
+    if (Type() == RecordType::CCS)
+        newName += "ccs";
+    else {
+        if (HasQueryStart())
+            newName += std::to_string(QueryStart());
+        else
+            newName += "?";
+
+        newName += '_';
+
+        if (HasQueryEnd())
+            newName += std::to_string(QueryEnd());
+        else
+            newName += "?";
+    }
+
+    impl_.Name(newName);
+}
diff --git a/src/BamRecordBuilder.cpp b/src/BamRecordBuilder.cpp

new file mode 100644 (file)

index 0000000..004a0ea
--- /dev/null
+++ b/src/BamRecordBuilder.cpp
@@ -0,0 +1,393 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "pbbam/BamRecordBuilder.h"
+#include "pbbam/BamTagCodec.h"
+#include "AssertUtils.h"
+#include "MemoryUtils.h"
+#include <htslib/sam.h>
+#include <cstring>
+#include <memory>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+BamRecordBuilder::BamRecordBuilder(void)
+{
+    // ensure proper clean slate
+    Reset();
+
+    // initialize with some space for data
+    name_.reserve(256);
+    sequence_.reserve(2096);
+    qualities_.reserve(2096);
+    cigar_.reserve(256);
+}
+
+BamRecordBuilder::BamRecordBuilder(const BamHeader& header)
+    : header_(header)
+{
+    // ensure proper clean slate
+    Reset();
+
+    // initialize with some space for data
+    name_.reserve(256);
+    sequence_.reserve(2096);
+    qualities_.reserve(2096);
+    cigar_.reserve(256);
+}
+
+BamRecordBuilder::BamRecordBuilder(const BamRecord& prototype)
+    : header_(prototype.Header())
+{
+    Reset(prototype);
+}
+
+BamRecordBuilder::BamRecordBuilder(const BamRecordBuilder& other)
+    : core_(other.core_)
+    , name_(other.name_)
+    , sequence_(other.sequence_)
+    , qualities_(other.qualities_)
+    , cigar_(other.cigar_)
+    , tags_(other.tags_)
+{ }
+
+BamRecordBuilder::BamRecordBuilder(BamRecordBuilder&& other)
+    : core_(std::move(other.core_))
+    , name_(std::move(other.name_))
+    , sequence_(std::move(other.sequence_))
+    , qualities_(std::move(other.qualities_))
+    , cigar_(std::move(other.cigar_))
+    , tags_(std::move(other.tags_))
+{  }
+
+BamRecordBuilder& BamRecordBuilder::operator=(const BamRecordBuilder& other)
+{
+    core_ = other.core_;
+    name_ = other.name_;
+    sequence_  = other.sequence_;
+    qualities_ = other.qualities_;
+    cigar_ = other.cigar_;
+    tags_  = other.tags_;
+    return *this;
+}
+
+BamRecordBuilder& BamRecordBuilder::operator=(BamRecordBuilder&& other)
+{
+    core_ = std::move(other.core_);
+    name_ = std::move(other.name_);
+    sequence_  = std::move(other.sequence_);
+    qualities_ = std::move(other.qualities_);
+    cigar_ = std::move(other.cigar_);
+    tags_  = std::move(other.tags_);
+    return *this;
+}
+
+BamRecordBuilder::~BamRecordBuilder(void) { }
+
+BamRecord BamRecordBuilder::Build(void) const
+{
+    BamRecord result(header_);
+    BuildInPlace(result);
+    return result;
+}
+
+bool BamRecordBuilder::BuildInPlace(BamRecord& record) const
+{
+    // initialize with basic 'core data'
+    PBBAM_SHARED_PTR<bam1_t> recordRawData = internal::BamRecordMemory::GetRawData(record); /*   record.impl_.RawData().get();*/
+    PB_ASSERT_OR_RETURN_VALUE(recordRawData, false);
+    PB_ASSERT_OR_RETURN_VALUE(recordRawData->data, false);
+    recordRawData->core = core_;
+
+    // setup variable length data
+    const vector<uint8_t> encodedTags = BamTagCodec::Encode(tags_);
+
+    const size_t nameLength  = name_.size() + 1;
+    const size_t numCigarOps = cigar_.size();
+    const size_t cigarLength = numCigarOps * sizeof(uint32_t);
+    const size_t seqLength   = sequence_.size();
+    const size_t qualLength  = seqLength;
+    const size_t tagLength   = encodedTags.size();
+    const size_t dataLength  = nameLength + cigarLength + seqLength + qualLength + tagLength;
+
+    // realloc if necessary
+    uint8_t* varLengthDataBlock = recordRawData->data;
+    PB_ASSERT_OR_RETURN_VALUE(varLengthDataBlock, false);
+    size_t allocatedDataLength = recordRawData->m_data;
+    if (allocatedDataLength < dataLength) {
+        allocatedDataLength = dataLength;
+        kroundup32(allocatedDataLength);
+        varLengthDataBlock = (uint8_t*)realloc(varLengthDataBlock, allocatedDataLength);
+    }
+    recordRawData->data = varLengthDataBlock;
+    recordRawData->l_data = dataLength;
+    recordRawData->m_data = allocatedDataLength;
+
+    size_t index = 0;
+
+    // name
+    memcpy(&varLengthDataBlock[index], name_.c_str(), nameLength);
+    index += nameLength;
+
+    // cigar
+    if (cigarLength > 0) {
+        vector<uint32_t> encodedCigar(numCigarOps);
+        for (size_t i = 0; i < numCigarOps; ++i) {
+            const CigarOperation& op = cigar_.at(i);
+            encodedCigar[i] = op.Length() << BAM_CIGAR_SHIFT;
+            const uint8_t type = static_cast<uint8_t>(op.Type());
+            PB_ASSERT_OR_RETURN_VALUE(type >= 0 && type < 8, false);
+            encodedCigar[i] |= type;
+        }
+        memcpy(&varLengthDataBlock[index], &encodedCigar[0], cigarLength);
+        index += cigarLength;
+
+        // update bin after we've calculated cigar info
+        const int32_t endPosition = bam_cigar2rlen(recordRawData->core.n_cigar, &encodedCigar[0]);
+        recordRawData->core.bin = hts_reg2bin(core_.pos, endPosition, 14, 5);
+    }
+
+    // seq & qual
+    if (seqLength > 0) {
+
+        uint8_t* s = &varLengthDataBlock[index];
+        for (size_t i = 0; i < seqLength; ++i)
+            s[i>>1] |= ( seq_nt16_table[static_cast<int>(sequence_.at(i))] << ((~i&1)<<2) );
+        index += seqLength;
+
+        uint8_t* q = &varLengthDataBlock[index];
+        if (!qualities_.empty())
+            memset(q, 0xFF, seqLength);
+        else {
+            for (size_t i = 0; i < seqLength; ++i)
+                q[i] = qualities_.at(i) - 33;
+        }
+        index += seqLength;
+    }
+
+    // tags
+    if (tagLength > 0) {
+        PB_ASSERT_OR_RETURN_VALUE(!encodedTags.empty(), false);
+        memcpy(&varLengthDataBlock[index], &encodedTags[0], tagLength);
+        index += tagLength;
+    }
+
+    // sanity check
+    PB_ASSERT_OR_RETURN_VALUE(index == dataLength, false);
+    return true;
+}
+
+BamRecordBuilder& BamRecordBuilder::Cigar(const PacBio::BAM::Cigar& cigar)
+{
+    core_.n_cigar = cigar.size();
+    cigar_ = cigar;
+    return *this;
+}
+
+BamRecordBuilder& BamRecordBuilder::Cigar(PacBio::BAM::Cigar&& cigar)
+{
+    core_.n_cigar = cigar.size();
+    cigar_ = std::move(cigar);
+    return *this;
+}
+
+BamRecordBuilder& BamRecordBuilder::Name(const std::string& name)
+{
+    core_.l_qname = name.size() + 1; // (NULL-term)
+    name_ = name;
+    return *this;
+}
+
+BamRecordBuilder& BamRecordBuilder::Name(std::string&& name)
+{
+    core_.l_qname = name.size() + 1; // (NULL-term)
+    name_ = std::move(name);
+    return *this;
+}
+
+void BamRecordBuilder::Reset(void)
+{
+    // zeroize fixed-length data
+    memset(&core_, 0, sizeof(bam1_core_t));
+    core_.l_qname = 1; // always has a NULL-term
+
+    // reset variable-length data
+    name_.clear();
+    sequence_.clear();
+    qualities_.clear();
+    cigar_.clear();
+    tags_.clear();
+}
+
+void BamRecordBuilder::Reset(const BamRecord& prototype)
+{
+    // ensure clean slate
+    Reset();
+    header_ = prototype.Header();
+
+    // reset core data
+    const PBBAM_SHARED_PTR<bam1_t> rawData = internal::BamRecordMemory::GetRawData(prototype); //  prototype.impl_.RawData().get();
+    PB_ASSERT_OR_RETURN(rawData);
+    core_ = rawData->core;
+
+    // reset variable-length data
+    const BamRecordImpl& impl = internal::BamRecordMemory::GetImpl(prototype);
+    name_ = impl.Name();
+    sequence_ = impl.Sequence();
+    qualities_ = impl.Qualities().Fastq();
+    cigar_ = impl.CigarData();
+    tags_ = impl.Tags();
+}
+
+void BamRecordBuilder::Reset(BamRecord&& prototype)
+{
+    // ensure clean slate
+    Reset();
+    header_ = prototype.Header();
+
+    // reset core data
+    const PBBAM_SHARED_PTR<bam1_t> rawData = internal::BamRecordMemory::GetRawData(prototype); //  prototype.impl_.RawData().get();
+    PB_ASSERT_OR_RETURN(rawData);
+    core_ = std::move(rawData->core);
+
+    // reset variable-length data
+    const BamRecordImpl& impl = internal::BamRecordMemory::GetImpl(prototype);
+    name_ = impl.Name();
+    sequence_ = impl.Sequence();
+    qualities_ = impl.Qualities().Fastq();
+    cigar_ = impl.CigarData();
+    tags_ = impl.Tags();
+}
+
+BamRecordBuilder& BamRecordBuilder::Sequence(const std::string& sequence)
+{
+    core_.l_qseq = sequence.size();
+    sequence_ = sequence;
+    return *this;
+}
+
+BamRecordBuilder& BamRecordBuilder::Sequence(std::string&& sequence)
+{
+    core_.l_qseq = sequence.size();
+    sequence_ = std::move(sequence);
+    return *this;
+}
+
+BamRecordBuilder& BamRecordBuilder::SetDuplicate(bool ok)
+{
+    if (ok) core_.flag |=  BamRecordImpl::DUPLICATE;
+    else    core_.flag &= ~BamRecordImpl::DUPLICATE;
+    return *this;
+}
+
+BamRecordBuilder& BamRecordBuilder::SetFailedQC(bool ok)
+{
+    if (ok) core_.flag |=  BamRecordImpl::FAILED_QC;
+    else    core_.flag &= ~BamRecordImpl::FAILED_QC;
+    return *this;
+}
+
+BamRecordBuilder& BamRecordBuilder::SetFirstMate(bool ok)
+{
+    if (ok) core_.flag |=  BamRecordImpl::MATE_1;
+    else    core_.flag &= ~BamRecordImpl::MATE_1;
+    return *this;
+}
+
+BamRecordBuilder& BamRecordBuilder::SetMapped(bool ok)
+{
+    if (ok) core_.flag &= ~BamRecordImpl::UNMAPPED;
+    else    core_.flag |=  BamRecordImpl::UNMAPPED;
+    return *this;
+}
+
+BamRecordBuilder& BamRecordBuilder::SetMateMapped(bool ok)
+{
+    if (ok) core_.flag &= ~BamRecordImpl::MATE_UNMAPPED;
+    else    core_.flag |=  BamRecordImpl::MATE_UNMAPPED;
+    return *this;
+}
+
+BamRecordBuilder& BamRecordBuilder::SetMateReverseStrand(bool ok)
+{
+    if (ok) core_.flag |=  BamRecordImpl::MATE_REVERSE_STRAND;
+    else    core_.flag &= ~BamRecordImpl::MATE_REVERSE_STRAND;
+    return *this;
+}
+
+BamRecordBuilder& BamRecordBuilder::SetPaired(bool ok)
+{
+    if (ok) core_.flag |=  BamRecordImpl::PAIRED;
+    else    core_.flag &= ~BamRecordImpl::PAIRED;
+    return *this;
+}
+
+BamRecordBuilder& BamRecordBuilder::SetPrimaryAlignment(bool ok)
+{
+    if (ok) core_.flag &= ~BamRecordImpl::SECONDARY;
+    else    core_.flag |=  BamRecordImpl::SECONDARY;
+    return *this;
+}
+
+BamRecordBuilder& BamRecordBuilder::SetProperPair(bool ok)
+{
+    if (ok) core_.flag |=  BamRecordImpl::PROPER_PAIR;
+    else    core_.flag &= ~BamRecordImpl::PROPER_PAIR;
+    return *this;
+}
+
+BamRecordBuilder& BamRecordBuilder::SetReverseStrand(bool ok)
+{
+    if (ok) core_.flag |=  BamRecordImpl::REVERSE_STRAND;
+    else    core_.flag &= ~BamRecordImpl::REVERSE_STRAND;
+    return *this;
+}
+
+BamRecordBuilder& BamRecordBuilder::SetSecondMate(bool ok)
+{
+    if (ok) core_.flag |=  BamRecordImpl::MATE_2;
+    else    core_.flag &= ~BamRecordImpl::MATE_2;
+    return *this;
+}
+
+BamRecordBuilder& BamRecordBuilder::SetSupplementaryAlignment(bool ok)
+{
+    if (ok) core_.flag |=  BamRecordImpl::SUPPLEMENTARY;
+    else    core_.flag &= ~BamRecordImpl::SUPPLEMENTARY;
+    return *this;
+}
diff --git a/src/BamRecordImpl.cpp b/src/BamRecordImpl.cpp

new file mode 100644 (file)

index 0000000..c6c127e
--- /dev/null
+++ b/src/BamRecordImpl.cpp
@@ -0,0 +1,598 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "pbbam/BamRecordImpl.h"
+#include "pbbam/BamTagCodec.h"
+#include "AssertUtils.h"
+#include "BamRecordTags.h"
+#include "MemoryUtils.h"
+#include <algorithm>
+#include <iostream>
+#include <utility>
+#include <cstdlib>
+#include <cstring>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+BamRecordImpl::BamRecordImpl(void)
+    : d_(nullptr)
+{
+    InitializeData();
+}
+
+BamRecordImpl::BamRecordImpl(const BamRecordImpl& other)
+    : d_(bam_dup1(other.d_.get()), internal::HtslibRecordDeleter())
+    , tagOffsets_(other.tagOffsets_)
+{ }
+
+BamRecordImpl::BamRecordImpl(BamRecordImpl&& other)
+    : d_(nullptr)
+    , tagOffsets_(std::move(other.tagOffsets_))
+{
+    d_.swap(other.d_);
+    other.d_.reset();
+}
+
+BamRecordImpl& BamRecordImpl::operator=(const BamRecordImpl& other)
+{
+    if (this != & other) {
+        if (d_ == nullptr)
+            InitializeData();
+        bam_copy1(d_.get(), other.d_.get());
+        tagOffsets_ = other.tagOffsets_;
+    }
+    return *this;
+}
+
+BamRecordImpl& BamRecordImpl::operator=(BamRecordImpl&& other)
+{
+    if (this != & other) {
+        d_.swap(other.d_);
+        other.d_.reset();
+
+        tagOffsets_ = std::move(other.tagOffsets_);
+    }
+    return *this;
+}
+
+BamRecordImpl::~BamRecordImpl(void) { }
+
+bool BamRecordImpl::AddTag(const string& tagName,
+                           const Tag &value)
+{
+    return AddTag(tagName, value, TagModifier::NONE);
+}
+
+bool BamRecordImpl::AddTag(const BamRecordTag tag,
+                           const Tag& value)
+{
+   return AddTag(internal::BamRecordTags::LabelFor(tag),
+                 value,
+                 TagModifier::NONE);
+}
+
+bool BamRecordImpl::AddTag(const string& tagName,
+                           const Tag& value,
+                           const TagModifier additionalModifier)
+{
+    if (tagName.size() != 2 || HasTag(tagName))
+        return false;
+    const bool added = AddTagImpl(tagName, value, additionalModifier);
+    if (added)
+        UpdateTagMap();
+    return added;
+}
+
+bool BamRecordImpl::AddTag(const BamRecordTag tag,
+                           const Tag& value,
+                           const TagModifier additionalModifier)
+{
+    return AddTag(internal::BamRecordTags::LabelFor(tag),
+                  value,
+                  additionalModifier);
+}
+
+bool BamRecordImpl::AddTagImpl(const string& tagName,
+                               const Tag& value,
+                               const TagModifier additionalModifier)
+{
+    const vector<uint8_t> rawData = BamTagCodec::ToRawData(value, additionalModifier);
+    if (rawData.empty())
+        return false;
+
+    bam_aux_append(d_.get(),
+                   tagName.c_str(),
+                   BamTagCodec::TagTypeCode(value, additionalModifier),
+                   rawData.size(),
+                   const_cast<uint8_t*>(rawData.data()));
+    return true;
+}
+
+Cigar BamRecordImpl::CigarData(void) const
+{
+    Cigar result;
+    result.reserve(d_->core.n_cigar);
+    uint32_t* cigarData = bam_get_cigar(d_);
+    for (uint32_t i = 0; i < d_->core.n_cigar; ++i) {
+        const uint32_t length = bam_cigar_oplen(cigarData[i]);
+        const CigarOperationType type = static_cast<CigarOperationType>(bam_cigar_op(cigarData[i]));
+        result.push_back(CigarOperation(type, length));
+    }
+
+    return result;
+}
+
+BamRecordImpl& BamRecordImpl::CigarData(const Cigar& cigar)
+{
+    // determine change in memory needed
+    // diffNumBytes: pos -> growing, neg -> shrinking
+    const size_t numCigarOps = cigar.size();
+    const int diffNumCigars = numCigarOps - d_->core.n_cigar;
+    const int diffNumBytes  = diffNumCigars * sizeof(uint32_t);
+    const int oldLengthData = d_->l_data;
+    d_->l_data += diffNumBytes;
+    MaybeReallocData();
+
+    // shift trailing data (seq, qual, tags) as needed
+    const uint8_t* oldSequenceStart = bam_get_seq(d_);
+    const size_t trailingDataLength = oldLengthData - (oldSequenceStart - d_->data);
+    d_->core.n_cigar = numCigarOps;
+    uint8_t* newSequenceStart = bam_get_seq(d_);
+    memmove(newSequenceStart, oldSequenceStart, trailingDataLength);
+
+    // fill in new CIGAR data
+    uint32_t* cigarDataStart = bam_get_cigar(d_);
+    for (size_t i = 0; i < numCigarOps; ++i) {
+        const CigarOperation& cigarOp = cigar.at(i);
+        cigarDataStart[i] = bam_cigar_gen(cigarOp.Length(), static_cast<int>(cigarOp.Type()));
+    }
+
+    return *this;
+}
+
+BamRecordImpl& BamRecordImpl::CigarData(const std::string& cigarString)
+{
+    return CigarData(Cigar::FromStdString(cigarString));
+}
+
+bool BamRecordImpl::EditTag(const string& tagName,
+                            const Tag& newValue)
+{
+    return EditTag(tagName, newValue, TagModifier::NONE);
+}
+
+bool BamRecordImpl::EditTag(const BamRecordTag tag,
+                            const Tag& newValue)
+{
+    return EditTag(internal::BamRecordTags::LabelFor(tag),
+                   newValue,
+                   TagModifier::NONE);
+}
+
+bool BamRecordImpl::EditTag(const string& tagName,
+                            const Tag& newValue,
+                            const TagModifier additionalModifier)
+{
+    // try remove old value (with delayed tag map update)
+    const bool removed = RemoveTagImpl(tagName);
+    if (!removed)
+        return false;
+
+    // if old value removed, add new value
+    const bool added = AddTagImpl(tagName, newValue, additionalModifier);
+    if (added)
+        UpdateTagMap();
+    return added;
+}
+
+bool BamRecordImpl::EditTag(const BamRecordTag tag,
+                            const Tag& newValue,
+                            const TagModifier additionalModifier)
+{
+    return EditTag(internal::BamRecordTags::LabelFor(tag),
+                   newValue,
+                   additionalModifier);
+}
+
+BamRecordImpl BamRecordImpl::FromRawData(const PBBAM_SHARED_PTR<bam1_t>& rawData)
+{
+    BamRecordImpl result;
+    bam_copy1(result.d_.get(), rawData.get());
+    return result;
+}
+
+bool BamRecordImpl::HasTag(const string& tagName) const
+{
+    if (tagName.size() != 2)
+        return false;
+    return TagOffset(tagName) != -1;
+
+    // 27635
+//    return bam_aux_get(d_.get(), tagName.c_str()) != 0;
+}
+
+bool BamRecordImpl::HasTag(const BamRecordTag tag) const
+{
+    return HasTag(internal::BamRecordTags::LabelFor(tag));
+}
+
+void BamRecordImpl::InitializeData(void)
+{
+    d_.reset(bam_init1(), internal::HtslibRecordDeleter());
+    d_->data = (uint8_t*)(calloc(0x800, sizeof(uint8_t)));   // maybe make this value tune-able later?
+
+    // init unmapped
+    Position(PacBio::BAM::UnmappedPosition);
+    MatePosition(PacBio::BAM::UnmappedPosition);
+    ReferenceId(-1);
+    MateReferenceId(-1);
+    SetMapped(false);
+    MapQuality(255);
+
+    // initialized with NULL term for qname
+    d_->core.l_qname = 1;
+    d_->l_data = 1;
+    d_->m_data = 0x800;
+}
+
+void BamRecordImpl::MaybeReallocData(void)
+{
+    // about to grow data contents to l_data size, but m_data is our current max.
+    // so we may need to grow. if so, use kroundup to double to next power of 2
+    if (d_->m_data < d_->l_data) {
+        d_->m_data = d_->l_data;
+        kroundup32(d_->m_data);
+        d_->data = static_cast<uint8_t*>(realloc(d_->data, d_->m_data));
+    }
+}
+
+string BamRecordImpl::Name(void) const
+{
+    return string(bam_get_qname(d_));
+}
+
+BamRecordImpl& BamRecordImpl::Name(const std::string& name)
+{
+    // determine change in memory needed
+    // diffNumBytes: pos -> growing, neg -> shrinking
+    const size_t numChars = name.size() + 1; // +1 for NULL-term
+    const int diffNumBytes = numChars - d_->core.l_qname;
+    const int oldLengthData = d_->l_data;
+    d_->l_data += diffNumBytes;
+    MaybeReallocData();
+
+    // shift trailing data (cigar, seq, qual, tags) as needed
+    const uint32_t* oldCigarStart = bam_get_cigar(d_);
+    const size_t trailingDataLength = oldLengthData - ((uint8_t*)oldCigarStart - d_->data);
+    d_->core.l_qname = numChars;
+    uint32_t* newCigarStart = bam_get_cigar(d_);
+    memmove(newCigarStart, oldCigarStart, trailingDataLength);
+
+    // fill in new name
+    memcpy(d_->data, name.c_str(), numChars);
+    return *this;
+}
+
+QualityValues BamRecordImpl::Qualities(void) const
+{
+    if (d_->core.l_qseq == 0)
+        return QualityValues();
+
+    uint8_t* qualData = bam_get_qual(d_);
+    if (qualData[0] == 0xff)
+        return QualityValues();
+
+    const size_t numQuals = d_->core.l_qseq;
+    QualityValues result;
+    result.reserve(numQuals);
+    for (size_t i = 0; i < numQuals; ++i)
+        result.push_back(QualityValue(qualData[i]));
+    return result;
+}
+
+bool BamRecordImpl::RemoveTag(const string& tagName)
+{
+    const bool removed = RemoveTagImpl(tagName);
+    if (removed)
+        UpdateTagMap();
+    return removed;
+}
+
+bool BamRecordImpl::RemoveTag(const BamRecordTag tag)
+{
+    return RemoveTag(internal::BamRecordTags::LabelFor(tag));
+}
+
+bool BamRecordImpl::RemoveTagImpl(const string &tagName)
+{
+    if (tagName.size() != 2)
+        return false;
+    uint8_t* data = bam_aux_get(d_.get(), tagName.c_str());
+    if (data == 0)
+        return false;
+    const bool ok = bam_aux_del(d_.get(), data) == 0;
+    return ok;
+}
+
+string BamRecordImpl::Sequence(void) const
+{
+    string result;
+    result.reserve(d_->core.l_qseq);
+    static const string DnaLookup = string("=ACMGRSVTWYHKDBN");
+    const uint8_t* seqData = bam_get_seq(d_);
+    for (int i = 0; i < d_->core.l_qseq; ++i)
+        result.append(1, DnaLookup[bam_seqi(seqData, i)]);
+    return result;
+}
+
+size_t BamRecordImpl::SequenceLength(void) const
+{ return d_->core.l_qseq; }
+
+BamRecordImpl& BamRecordImpl::SetSequenceAndQualities(const std::string& sequence,
+                                                      const std::string& qualities)
+{
+    // TODO: I'm ok with the assert for now, but how to handle at runtime?
+    if (!qualities.empty()) {
+        PB_ASSERT_OR_RETURN_VALUE(sequence.size() == qualities.size(), *this);
+    }
+
+    return SetSequenceAndQualitiesInternal(sequence.c_str(),
+                                           sequence.size(),
+                                           qualities.c_str(),
+                                           false);
+}
+
+BamRecordImpl& BamRecordImpl::SetSequenceAndQualities(const char* sequence,
+                                                      const size_t sequenceLength,
+                                                      const char* qualities)
+{
+    return SetSequenceAndQualitiesInternal(sequence,
+                                           sequenceLength,
+                                           qualities,
+                                           false);
+}
+
+BamRecordImpl& BamRecordImpl::SetPreencodedSequenceAndQualities(const char* encodedSequence,
+                                                                const size_t rawSequenceLength,
+                                                                const char* qualities)
+{
+    return SetSequenceAndQualitiesInternal(encodedSequence,
+                                           rawSequenceLength,
+                                           qualities,
+                                           true);
+}
+
+BamRecordImpl& BamRecordImpl::SetSequenceAndQualitiesInternal(const char* sequence,
+                                                              const size_t sequenceLength,
+                                                              const char* qualities,
+                                                              bool isPreencoded)
+{
+    // determine change in memory needed
+    // diffNumBytes: pos -> growing, neg -> shrinking
+    const int encodedSequenceLength = static_cast<int>((sequenceLength+1)/2);
+    const int oldSeqAndQualLength = static_cast<int>((d_->core.l_qseq+1)/2) + d_->core.l_qseq; // encoded seq + qual
+    const int newSeqAndQualLength = encodedSequenceLength + sequenceLength;                    // encoded seq + qual
+    const int diffNumBytes = newSeqAndQualLength - oldSeqAndQualLength;
+    const int oldLengthData = d_->l_data;
+    d_->l_data += diffNumBytes;
+    MaybeReallocData();
+
+    // shift trailing data (tags) as needed
+    const uint8_t* oldTagStart = bam_get_aux(d_);
+    const size_t trailingDataLength = oldLengthData - ((uint8_t*)oldTagStart - d_->data);
+    d_->core.l_qseq = sequenceLength;
+    uint8_t* newTagStart = bam_get_aux(d_);
+    memmove(newTagStart, oldTagStart, trailingDataLength);
+
+    // fill in new sequence
+    uint8_t* pEncodedSequence = bam_get_seq(d_);
+    if (isPreencoded) {
+        memcpy(pEncodedSequence, sequence, encodedSequenceLength);
+    } else {
+        memset(pEncodedSequence, 0, encodedSequenceLength);
+        for (size_t i = 0; i < sequenceLength; ++i)
+            pEncodedSequence[i>>1] |= seq_nt16_table[(int)sequence[i]] << ((~i&1)<<2);
+    }
+
+    // fill in quality values
+    uint8_t* encodedQualities = bam_get_qual(d_);
+    if ( (qualities == 0 ) || (strlen(qualities) == 0) )
+        memset(encodedQualities, 0xff, sequenceLength);
+    else {
+        for (size_t i = 0; i < sequenceLength; ++i)
+            encodedQualities[i] = qualities[i] - 33;  // FASTQ ASCII -> int conversion
+    }
+    return *this;
+}
+
+int BamRecordImpl::TagOffset(const string& tagName) const
+{
+    if (tagName.size() != 2)
+        throw std::runtime_error("invalid tag name size");
+
+    if (tagOffsets_.empty())
+        UpdateTagMap();
+
+    const uint16_t tagCode = (static_cast<uint8_t>(tagName.at(0)) << 8) | static_cast<uint8_t>(tagName.at(1));
+    const auto found = tagOffsets_.find(tagCode);
+    return (found != tagOffsets_.cend() ? found->second : -1);
+}
+
+BamRecordImpl& BamRecordImpl::Tags(const TagCollection& tags)
+{
+    // convert tags to binary
+    const vector<uint8_t>& tagData = BamTagCodec::Encode(tags);
+    const size_t numBytes = tagData.size();
+    const uint8_t* data = tagData.data();
+
+    // determine change in memory needed
+    uint8_t* tagStart = bam_get_aux(d_);
+    const size_t oldNumBytes = d_->l_data - (tagStart - d_->data);
+    const int diffNumBytes = numBytes - oldNumBytes;
+    d_->l_data += diffNumBytes;
+    MaybeReallocData();
+    tagStart = bam_get_aux(d_);
+
+    // fill in new tag data
+    memcpy((void*)tagStart, data, numBytes);
+
+    // update tag info
+    UpdateTagMap();
+    return *this;
+}
+
+TagCollection BamRecordImpl::Tags(void) const
+{
+    const uint8_t* tagDataStart = bam_get_aux(d_);
+    const size_t numBytes = d_->l_data - (tagDataStart - d_->data);
+    return BamTagCodec::Decode(vector<uint8_t>(tagDataStart, tagDataStart+numBytes));
+}
+
+Tag BamRecordImpl::TagValue(const string& tagName) const
+{
+    if (tagName.size() != 2)
+        return Tag();
+
+    const int offset = TagOffset(tagName);
+    if (offset == -1)
+        return Tag();
+
+    bam1_t* b = d_.get();
+    assert(bam_get_aux(b));
+    uint8_t* tagData = bam_get_aux(b) + offset;
+    if (offset >= b->l_data)
+        return Tag();
+
+    // skip tag name
+    return BamTagCodec::FromRawData(tagData);
+}
+
+Tag BamRecordImpl::TagValue(const BamRecordTag tag) const
+{
+    return TagValue(internal::BamRecordTags::LabelFor(tag));
+}
+
+void BamRecordImpl::UpdateTagMap(void) const
+{
+    // clear out offsets, leave map structure basically intact
+    auto tagIter = tagOffsets_.begin();
+    auto tagEnd  = tagOffsets_.end();
+    for ( ; tagIter != tagEnd; ++tagIter )
+        tagIter->second = -1;
+
+    const uint8_t* tagStart = bam_get_aux(d_);
+    if (tagStart == 0)
+        return;
+    const ptrdiff_t numBytes = d_->l_data - (tagStart - d_->data);
+
+    // NOTE: using a 16-bit 'code' for tag name here instead of string, to avoid
+    // a lot of string constructions & comparisons. All valid tags will be 2 chars
+    // anyway, so this should be a nice lookup mechanism.
+    //
+    uint16_t tagNameCode;
+    int64_t i = 0;
+    while(i < numBytes) {
+
+        // store (tag name code -> start offset into tag data)
+        tagNameCode = static_cast<char>(tagStart[i]) << 8 | static_cast<char>(tagStart[i+1]);
+        i += 2;
+        tagOffsets_[tagNameCode] = i;
+
+        // skip tag contents
+        const char tagType = static_cast<char>(tagStart[i++]);
+        switch (tagType) {
+            case 'A' :
+            case 'a' :
+            case 'c' :
+            case 'C' :
+            {
+                i += 1;
+                break;
+            }
+            case 's' :
+            case 'S' :
+            {
+                i += 2;
+                break;
+            }
+            case 'i' :
+            case 'I' :
+            case 'f' :
+            {
+                i += 4;
+                break;
+            }
+
+            case 'Z' :
+            case 'H' :
+            {
+                // null-terminated string
+                i += strlen((const char*)&tagStart[i]) + 1;
+                break;
+            }
+
+            case 'B' :
+            {
+                const char subTagType = tagStart[i++];
+                size_t elementSize = 0;
+                switch (subTagType) {
+                    case 'c' :
+                    case 'C' : elementSize = 1; break;
+                    case 's' :
+                    case 'S' : elementSize = 2; break;
+                    case 'i' :
+                    case 'I' :
+                    case 'f' : elementSize = 4; break;
+
+                    // unknown subTagType
+                    default:
+                        PB_ASSERT_OR_RETURN(false);
+                }
+
+                uint32_t numElements = 0;
+                memcpy(&numElements, &tagStart[i], sizeof(uint32_t));
+                i += (4 + (elementSize * numElements));
+                break;
+            }
+
+            // unknown tagType
+            default:
+                PB_ASSERT_OR_RETURN(false);
+        }
+    }
+}
diff --git a/src/BamRecordTags.cpp b/src/BamRecordTags.cpp

new file mode 100644 (file)

index 0000000..c039aae
--- /dev/null
+++ b/src/BamRecordTags.cpp
@@ -0,0 +1,99 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamRecordTags.h
+/// \brief Implements the BamRecordTags utility class.
+//
+// Author: Derek Barnett
+
+#include "BamRecordTags.h"
+#include "EnumClassHash.h"
+#include <unordered_map>
+#include <cassert>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+const BamRecordTags::TagLookupType BamRecordTags::tagLookup =
+{
+    //     enum name                   label  isPulse?
+    //     ---------                   -----  --------
+    { BamRecordTag::ALT_LABEL_QV,      {"pv", true}  },
+    { BamRecordTag::ALT_LABEL_TAG,     {"pt", true}  },
+    { BamRecordTag::BARCODE_QUALITY,   {"bq", false} },
+    { BamRecordTag::BARCODES,          {"bc", false} },
+    { BamRecordTag::CONTEXT_FLAGS,     {"cx", false} },
+    { BamRecordTag::DELETION_QV,       {"dq", false} },
+    { BamRecordTag::DELETION_TAG,      {"dt", false} },
+    { BamRecordTag::HOLE_NUMBER,       {"zm", false} },
+    { BamRecordTag::INSERTION_QV,      {"iq", false} },
+    { BamRecordTag::IPD,               {"ip", false} },
+    { BamRecordTag::LABEL_QV,          {"pq", true}  },
+    { BamRecordTag::MERGE_QV,          {"mq", false} },
+    { BamRecordTag::NUM_PASSES,        {"np", false} },
+    { BamRecordTag::PKMEAN,            {"pa", true}  },
+    { BamRecordTag::PKMEAN_2,          {"ps", true}  },
+    { BamRecordTag::PKMID,             {"pm", true}  },
+    { BamRecordTag::PKMID_2,           {"pi", true}  },
+    { BamRecordTag::PRE_PULSE_FRAMES,  {"pd", true}  },
+    { BamRecordTag::PULSE_CALL,        {"pc", true}  },
+    { BamRecordTag::PULSE_CALL_WIDTH,  {"px", true}  },
+    { BamRecordTag::PULSE_MERGE_QV,    {"pg", true}  },
+    { BamRecordTag::PULSE_WIDTH,       {"pw", false} }, // 'pulse' in the name; but stored per-base, not per-pulse
+    { BamRecordTag::QUERY_END,         {"qe", false} },
+    { BamRecordTag::QUERY_START,       {"qs", false} },
+    { BamRecordTag::READ_ACCURACY,     {"rq", false} },
+    { BamRecordTag::READ_GROUP,        {"RG", false} },
+    { BamRecordTag::SCRAP_REGION_TYPE, {"sc", false} },
+    { BamRecordTag::SCRAP_ZMW_TYPE,    {"sz", false} },
+    { BamRecordTag::SNR,               {"sn", false} },
+    { BamRecordTag::START_FRAME,       {"sf", true}  },
+    { BamRecordTag::SUBSTITUTION_QV,   {"sq", false} },
+    { BamRecordTag::SUBSTITUTION_TAG,  {"st", false} },
+
+    // faux tags
+    { BamRecordTag::SEQ,  {"  ",  false} },
+    { BamRecordTag::QUAL, {"  ", false} }
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
diff --git a/src/BamRecordTags.h b/src/BamRecordTags.h

new file mode 100644 (file)

index 0000000..002142a
--- /dev/null
+++ b/src/BamRecordTags.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamRecordTags.h
+/// \brief Defines the BamRecordTags utility class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORDTAGS_H
+#define BAMRECORDTAGS_H
+
+#include "pbbam/BamRecord.h"
+#include "pbbam/BamRecordImpl.h"
+#include "pbbam/BamRecordTag.h"
+#include "EnumClassHash.h"
+#include <string>
+#include <unordered_map>
+#include <cassert>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+class BamRecordTags
+{
+public:
+    // tag info
+    static inline bool IsPulse(const BamRecordTag tag);
+    static inline std::string LabelFor(const BamRecordTag tag);
+
+private:
+    struct BamRecordTagData
+    {
+        const std::string label_; //[3]; // 2-char tag plus NULL
+        const bool isPulse_;
+    };
+    typedef std::unordered_map<BamRecordTag,
+                               BamRecordTagData,
+                               EnumClassHash> TagLookupType;
+
+    static const TagLookupType tagLookup;
+};
+
+inline bool BamRecordTags::IsPulse(const BamRecordTag tag)
+{
+    assert(tagLookup.find(tag) != tagLookup.cend());
+    return tagLookup.at(tag).isPulse_;
+}
+
+inline std::string BamRecordTags::LabelFor(const BamRecordTag tag)
+{
+    assert(tagLookup.find(tag) != tagLookup.cend());
+    return tagLookup.at(tag).label_;
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // BAMRECORDTAGS_H
diff --git a/src/BamTagCodec.cpp b/src/BamTagCodec.cpp

new file mode 100644 (file)

index 0000000..fca2cbe
--- /dev/null
+++ b/src/BamTagCodec.cpp
@@ -0,0 +1,515 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BamTagCodec.cpp
+/// \brief Implements the BamTagCodec class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamTagCodec.h"
+#include "AssertUtils.h"
+#include <htslib/kstring.h>
+#include <cstring>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+template<typename T>
+inline void appendBamValue(const T& value, kstring_t* str)
+{
+    kputsn_((char*)&value, sizeof(value), str);
+}
+
+template<typename T>
+inline void appendBamMultiValue(const vector<T>& container, kstring_t* str)
+{
+    const uint32_t n = container.size();
+    kputsn_(&n, sizeof(n), str);
+    kputsn_((char*)&container[0], n*sizeof(T), str);
+}
+
+template<typename T>
+inline T readBamValue(const uint8_t* src, size_t& offset)
+{
+    T value;
+    memcpy(&value, &src[offset], sizeof(value));
+    offset += sizeof(value);
+    return value;
+}
+
+template<typename T>
+vector<T> readBamMultiValue(const uint8_t* src, size_t& offset)
+{
+    uint32_t numElements;
+    memcpy(&numElements, &src[offset], sizeof(uint32_t));
+    offset += 4;
+
+    vector<T> result;
+    result.reserve(numElements);
+    for (size_t i = 0; i < numElements; ++i) {
+        const T& value = readBamValue<T>(src, offset);
+        result.push_back(value);
+    }
+    return result;
+}
+
+TagCollection BamTagCodec::Decode(const vector<uint8_t>& data)
+{
+    TagCollection tags;
+
+    // NOTE: not completely safe - no real bounds-checking yet on input data
+
+    const uint8_t* pData  = data.data();
+    const size_t numBytes = data.size();
+    size_t i = 0;
+    while (i < numBytes) {
+
+        string tagName;
+        tagName.reserve(2);
+        tagName.append(1, pData[i++]);
+        tagName.append(1, pData[i++]);
+
+        const char tagType = static_cast<char>(pData[i++]);
+        switch (tagType) {
+            case 'A' :
+            case 'a' :
+            {
+                tags[tagName] = readBamValue<uint8_t>(pData, i);
+                tags[tagName].Modifier(TagModifier::ASCII_CHAR);
+                break;
+            }
+
+            case 'c' : tags[tagName] = readBamValue<int8_t>(pData, i);   break;
+            case 'C' : tags[tagName] = readBamValue<uint8_t>(pData, i);  break;
+            case 's' : tags[tagName] = readBamValue<int16_t>(pData, i);  break;
+            case 'S' : tags[tagName] = readBamValue<uint16_t>(pData, i); break;
+            case 'i' : tags[tagName] = readBamValue<int32_t>(pData, i);  break;
+            case 'I' : tags[tagName] = readBamValue<uint32_t>(pData, i); break;
+            case 'f' : tags[tagName] = readBamValue<float>(pData, i);    break;
+
+            case 'Z' :
+            case 'H' :
+            {
+                const size_t dataLength = strlen((const char*)&pData[i]);
+                string value;
+                value.resize(dataLength);
+                memcpy((char*)value.data(), &pData[i], dataLength);
+                tags[tagName] = value;
+                if (tagType == 'H')
+                    tags[tagName].Modifier(TagModifier::HEX_STRING);
+                i += dataLength + 1;
+                break;
+            }
+
+            case 'B' :
+            {
+                const char subTagType = pData[i++];
+                switch (subTagType) {
+                    case 'c' : tags[tagName] = readBamMultiValue<int8_t>(pData, i);   break;
+                    case 'C' : tags[tagName] = readBamMultiValue<uint8_t>(pData, i);  break;
+                    case 's' : tags[tagName] = readBamMultiValue<int16_t>(pData, i);  break;
+                    case 'S' : tags[tagName] = readBamMultiValue<uint16_t>(pData, i); break;
+                    case 'i' : tags[tagName] = readBamMultiValue<int32_t>(pData, i);  break;
+                    case 'I' : tags[tagName] = readBamMultiValue<uint32_t>(pData, i); break;
+                    case 'f' : tags[tagName] = readBamMultiValue<float>(pData, i);    break;
+
+                    // unknown subTagType
+                    default:
+                        PB_ASSERT_OR_RETURN_VALUE(false, TagCollection());
+                }
+                break;
+            }
+
+            // unknown tagType
+            default:
+                PB_ASSERT_OR_RETURN_VALUE(false, TagCollection());
+        }
+    }
+
+    return tags;
+}
+
+vector<uint8_t> BamTagCodec::Encode(const TagCollection& tags)
+{
+    kstring_t str = { 0, 0, NULL };
+
+    const auto tagEnd  = tags.cend();
+    for (auto tagIter = tags.cbegin(); tagIter != tagEnd; ++tagIter) {
+        const string& name = (*tagIter).first;
+        const Tag& tag = (*tagIter).second;
+        PB_ASSERT_OR_CONTINUE(name.size() == 2);
+        if (tag.IsNull())
+            continue;
+
+        // "<TAG>:"
+        kputsn_(name.c_str(), 2, &str);
+
+        // "<TYPE>:<DATA>" for printable, ASCII char
+        if (tag.HasModifier(TagModifier::ASCII_CHAR)) {
+            char c = tag.ToAscii();
+            if (c != '\0') {
+                kputc_('A', &str);
+                kputc_(c, &str);
+                continue;
+            }
+        }
+
+        // "<TYPE>:<DATA>" for all other data
+        switch ( tag.Type() ) {
+            case TagDataType::INT8   :
+            {
+                kputc_('c', &str);
+                appendBamValue(tag.ToInt8(), &str);
+                break;
+            }
+            case TagDataType::UINT8  :
+            {
+                kputc_('C', &str);
+                appendBamValue(tag.ToUInt8(), &str);
+                break;
+            }
+            case TagDataType::INT16  :
+            {
+                kputc_('s', &str);
+                appendBamValue(tag.ToInt16(), &str);
+                break;
+            }
+            case TagDataType::UINT16 :
+            {
+                kputc_('S', &str);
+                appendBamValue(tag.ToUInt16(), &str);
+                break;
+            }
+            case TagDataType::INT32  :
+            {
+                kputc_('i', &str);
+                appendBamValue(tag.ToInt32(), &str);
+                break;
+            }
+            case TagDataType::UINT32 :
+            {
+                kputc_('I', &str);
+                appendBamValue(tag.ToUInt32(), &str);
+                break;
+            }
+            case TagDataType::FLOAT :
+            {
+                kputc_('f', &str);
+                appendBamValue(tag.ToFloat(), &str);
+                break;
+            }
+
+            case TagDataType::STRING :
+            {
+                if (tag.HasModifier(TagModifier::HEX_STRING))
+                    kputc_('H', &str);
+                else
+                    kputc_('Z', &str);
+                const string& s = tag.ToString();
+                kputsn_(s.c_str(), s.size()+1, &str); // this adds the null-term
+                break;
+            }
+
+            case TagDataType::INT8_ARRAY   :
+            {
+                kputc_('B', &str);
+                kputc_('c', &str);
+                appendBamMultiValue(tag.ToInt8Array(), &str);
+                break;
+            }
+            case TagDataType::UINT8_ARRAY  :
+            {
+                kputc_('B', &str);
+                kputc_('C', &str);
+                appendBamMultiValue(tag.ToUInt8Array(), &str);
+                break;
+            }
+            case TagDataType::INT16_ARRAY  :
+            {
+                kputc_('B', &str);
+                kputc_('s', &str);
+                appendBamMultiValue(tag.ToInt16Array(), &str);
+                break;
+            }
+            case TagDataType::UINT16_ARRAY :
+            {
+                kputc_('B', &str);
+                kputc_('S', &str);
+                appendBamMultiValue(tag.ToUInt16Array(), &str);
+                break;
+            }
+            case TagDataType::INT32_ARRAY  :
+            {
+                kputc_('B', &str);
+                kputc_('i', &str);
+                appendBamMultiValue(tag.ToInt32Array(), &str);
+                break;
+            }
+            case TagDataType::UINT32_ARRAY :
+            {
+                kputc_('B', &str);
+                kputc_('I', &str);
+                appendBamMultiValue(tag.ToUInt32Array(), &str);
+                break;
+            }
+            case TagDataType::FLOAT_ARRAY :
+            {
+                kputc_('B', &str);
+                kputc_('f', &str);
+                appendBamMultiValue(tag.ToFloatArray(), &str);
+                break;
+            }
+
+            // unsupported tag type
+            default :
+                free(str.s);
+                PB_ASSERT_OR_RETURN_VALUE(false, vector<uint8_t>());
+        }
+    }
+
+    vector<uint8_t> result;
+    result.resize(str.l);
+    memcpy((char*)&result[0], str.s, str.l);
+    free(str.s);
+    return result;
+}
+
+Tag BamTagCodec::FromRawData(uint8_t* rawData)
+{
+    size_t offset = 0;
+    const char tagType = static_cast<char>(*rawData++);
+    switch (tagType) {
+        case 'A' :
+        case 'a' :
+        {
+            Tag t = Tag(readBamValue<uint8_t>(rawData, offset));
+            t.Modifier(TagModifier::ASCII_CHAR);
+            return t;
+        }
+
+        case 'c' : return Tag(readBamValue<int8_t>(rawData, offset));
+        case 'C' : return Tag(readBamValue<uint8_t>(rawData, offset));
+        case 's' : return Tag(readBamValue<int16_t>(rawData, offset));
+        case 'S' : return Tag(readBamValue<uint16_t>(rawData, offset));
+        case 'i' : return Tag(readBamValue<int32_t>(rawData, offset));
+        case 'I' : return Tag(readBamValue<uint32_t>(rawData, offset));
+        case 'f' : return Tag(readBamValue<float>(rawData, offset));
+
+        case 'Z' :
+        case 'H' :
+        {
+            const size_t dataLength = strlen((const char*)&rawData[0]);
+            string value;
+            value.resize(dataLength);
+            memcpy((char*)value.data(), &rawData[0], dataLength);
+            Tag t(value);
+            if (tagType == 'H')
+                t.Modifier(TagModifier::HEX_STRING);
+            return t;
+        }
+
+        case 'B' :
+        {
+            const char subTagType = *rawData++;
+            switch (subTagType) {
+
+                case 'c' : return Tag(readBamMultiValue<int8_t>(rawData, offset));
+                case 'C' : return Tag(readBamMultiValue<uint8_t>(rawData, offset));
+                case 's' : return Tag(readBamMultiValue<int16_t>(rawData, offset));
+                case 'S' : return Tag(readBamMultiValue<uint16_t>(rawData, offset));
+                case 'i' : return Tag(readBamMultiValue<int32_t>(rawData, offset));
+                case 'I' : return Tag(readBamMultiValue<uint32_t>(rawData, offset));
+                case 'f' : return Tag(readBamMultiValue<float>(rawData, offset));
+
+                // unknown subTagType
+                default:
+                    PB_ASSERT_OR_RETURN_VALUE(false, Tag());
+            }
+            break;
+        }
+
+        // unknown tagType
+        default:
+            PB_ASSERT_OR_RETURN_VALUE(false, Tag());
+    }
+    return Tag(); // to avoid compiler warning
+}
+
+vector<uint8_t> BamTagCodec::ToRawData(const Tag& tag,
+                                       const TagModifier& additionalModifier)
+{
+    // temp raw data destination (for use with htslib methods)
+    kstring_t str = { 0, 0, NULL };
+
+    // "<TYPE>:<DATA>" for printable, ASCII char
+    if (tag.HasModifier(TagModifier::ASCII_CHAR) || additionalModifier == TagModifier::ASCII_CHAR) {
+        const char c = tag.ToAscii();
+        if (c != '\0')
+            kputc_(c, &str);
+    }
+
+    // for all others
+    else {
+        switch (tag.Type()) {
+
+            // single, numeric values
+            case TagDataType::INT8   : appendBamValue(tag.ToInt8(),   &str); break;
+            case TagDataType::UINT8  : appendBamValue(tag.ToUInt8(),  &str); break;
+            case TagDataType::INT16  : appendBamValue(tag.ToInt16(),  &str); break;
+            case TagDataType::UINT16 : appendBamValue(tag.ToUInt16(), &str); break;
+            case TagDataType::INT32  : appendBamValue(tag.ToInt32(),  &str); break;
+            case TagDataType::UINT32 : appendBamValue(tag.ToUInt32(), &str); break;
+            case TagDataType::FLOAT  : appendBamValue(tag.ToFloat(),  &str); break;
+
+            // string & hex-string values
+            case TagDataType::STRING :
+            {
+                const string& s = tag.ToString();
+                kputsn_(s.c_str(), s.size()+1, &str); // this adds the null-term
+                break;
+            }
+
+            // array-type values
+            case TagDataType::INT8_ARRAY   :
+            {
+                kputc_('c', &str);
+                appendBamMultiValue(tag.ToInt8Array(), &str);
+                break;
+            }
+            case TagDataType::UINT8_ARRAY  :
+            {
+                kputc_('C', &str);
+                appendBamMultiValue(tag.ToUInt8Array(), &str);
+                break;
+            }
+            case TagDataType::INT16_ARRAY  :
+            {
+                kputc_('s', &str);
+                appendBamMultiValue(tag.ToInt16Array(), &str);
+                break;
+            }
+            case TagDataType::UINT16_ARRAY :
+            {
+                kputc_('S', &str);
+                appendBamMultiValue(tag.ToUInt16Array(), &str);
+                break;
+            }
+            case TagDataType::INT32_ARRAY  :
+            {
+                kputc_('i', &str);
+                appendBamMultiValue(tag.ToInt32Array(), &str);
+                break;
+            }
+            case TagDataType::UINT32_ARRAY :
+            {
+                kputc_('I', &str);
+                appendBamMultiValue(tag.ToUInt32Array(), &str);
+                break;
+            }
+            case TagDataType::FLOAT_ARRAY :
+            {
+                kputc_('f', &str);
+                appendBamMultiValue(tag.ToFloatArray(), &str);
+                break;
+            }
+
+            // unsupported tag type
+            default :
+                free(str.s);
+                PB_ASSERT_OR_RETURN_VALUE(false, vector<uint8_t>());
+        }
+    }
+
+    // store temp contents in actual destination
+    vector<uint8_t> result;
+    result.resize(str.l);
+    memcpy((char*)&result[0], str.s, str.l);
+    free(str.s);
+    return result;
+}
+
+uint8_t BamTagCodec::TagTypeCode(const Tag& tag,
+                                 const TagModifier& additionalModifier)
+{
+    if (tag.HasModifier(TagModifier::ASCII_CHAR) || additionalModifier == TagModifier::ASCII_CHAR) {
+        int64_t value = 0;
+        switch (tag.Type()) {
+            case TagDataType::INT8   : value = static_cast<int64_t>(tag.ToInt8());   break;
+            case TagDataType::UINT8  : value = static_cast<int64_t>(tag.ToUInt8());  break;
+            case TagDataType::INT16  : value = static_cast<int64_t>(tag.ToInt16());  break;
+            case TagDataType::UINT16 : value = static_cast<int64_t>(tag.ToUInt16()); break;
+            case TagDataType::INT32  : value = static_cast<int64_t>(tag.ToInt32());  break;
+            case TagDataType::UINT32 : value = static_cast<int64_t>(tag.ToUInt32()); break;
+            default:
+                // non integers not allowed
+                PB_ASSERT_OR_RETURN_VALUE(false, 0);
+        }
+        // printable range
+        PB_ASSERT_OR_RETURN_VALUE(value >= 33,  0);
+        PB_ASSERT_OR_RETURN_VALUE(value <= 126, 0);
+        return static_cast<uint8_t>('A');
+    }
+
+    switch (tag.Type()) {
+        case TagDataType::INT8   : return static_cast<uint8_t>('c');
+        case TagDataType::UINT8  : return static_cast<uint8_t>('C');
+        case TagDataType::INT16  : return static_cast<uint8_t>('s');
+        case TagDataType::UINT16 : return static_cast<uint8_t>('S');
+        case TagDataType::INT32  : return static_cast<uint8_t>('i');
+        case TagDataType::UINT32 : return static_cast<uint8_t>('I');
+        case TagDataType::FLOAT  : return static_cast<uint8_t>('f');
+
+        case TagDataType::STRING :
+        {
+            if (tag.HasModifier(TagModifier::HEX_STRING) || additionalModifier == TagModifier::HEX_STRING)
+                return static_cast<uint8_t>('H');
+            else
+                return static_cast<uint8_t>('Z');
+        }
+
+        case TagDataType::INT8_ARRAY   : // fall through
+        case TagDataType::UINT8_ARRAY  : // .
+        case TagDataType::INT16_ARRAY  : // .
+        case TagDataType::UINT16_ARRAY : // .
+        case TagDataType::INT32_ARRAY  : // .
+        case TagDataType::UINT32_ARRAY : // .
+        case TagDataType::FLOAT_ARRAY  : return static_cast<uint8_t>('B');
+
+        default:
+            PB_ASSERT_OR_RETURN_VALUE(false, 0);
+    }
+    return 0; // to avoid compiler warning
+}
diff --git a/src/BamWriter.cpp b/src/BamWriter.cpp

new file mode 100644 (file)

index 0000000..f7d8400
--- /dev/null
+++ b/src/BamWriter.cpp
@@ -0,0 +1,200 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "pbbam/BamWriter.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/Validator.h"
+#include "AssertUtils.h"
+#include "FileProducer.h"
+#include "MemoryUtils.h"
+#include <htslib/bgzf.h>
+#include <htslib/hfile.h>
+#include <htslib/hts.h>
+#include <iostream>
+#include <thread>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+class BamWriterPrivate : public internal::FileProducer
+{
+public:
+    BamWriterPrivate(const std::string& filename,
+                     const PBBAM_SHARED_PTR<bam_hdr_t> rawHeader,
+                     const BamWriter::CompressionLevel compressionLevel,
+                     const size_t numThreads,
+                     const BamWriter::BinCalculationMode binCalculationMode);
+
+public:
+    void Write(const BamRecord& record);
+    void Write(const BamRecord& record, int64_t* vOffset);
+    void Write(const BamRecordImpl& recordImpl);
+
+public:
+    bool calculateBins_;
+    std::unique_ptr<samFile, internal::HtslibFileDeleter> file_;
+    PBBAM_SHARED_PTR<bam_hdr_t> header_;
+};
+
+BamWriterPrivate::BamWriterPrivate(const string& filename,
+                                   const PBBAM_SHARED_PTR<bam_hdr_t> rawHeader,
+                                   const BamWriter::CompressionLevel compressionLevel,
+                                   const size_t numThreads,
+                                   const BamWriter::BinCalculationMode binCalculationMode)
+    : internal::FileProducer(filename)
+    , calculateBins_(binCalculationMode == BamWriter::BinCalculation_ON)
+    , file_(nullptr)
+    , header_(rawHeader)
+{
+    if (!header_)
+        throw std::runtime_error("null header");
+
+    // open file
+    const string& usingFilename = TempFilename();
+    const string& mode = string("wb") + to_string(static_cast<int>(compressionLevel));
+    file_.reset(sam_open(usingFilename.c_str(), mode.c_str()));
+    if (!file_)
+        throw std::runtime_error("could not open file for writing");
+
+    // if no explicit thread count given, attempt built-in check
+    size_t actualNumThreads = numThreads;
+    if (actualNumThreads == 0) {
+        actualNumThreads = thread::hardware_concurrency();
+
+        // if still unknown, default to single-threaded
+        if (actualNumThreads == 0)
+            actualNumThreads = 1;
+    }
+
+    // if multithreading requested, enable it
+    if (actualNumThreads > 1)
+        hts_set_threads(file_.get(), actualNumThreads);
+
+    // write header
+    const int ret = sam_hdr_write(file_.get(), header_.get());
+    if (ret != 0)
+        throw std::runtime_error("could not write header");
+}
+
+void BamWriterPrivate::Write(const BamRecord& record)
+{
+#if PBBAM_AUTOVALIDATE
+    Validator::Validate(record);
+#endif
+
+    const auto rawRecord = internal::BamRecordMemory::GetRawData(record);
+
+    // (probably) store bins
+    // min_shift=14 & n_lvls=5 are BAM "magic numbers"
+    if (calculateBins_)
+        rawRecord->core.bin = hts_reg2bin(rawRecord->core.pos, bam_endpos(rawRecord.get()), 14, 5);
+
+    // write record to file
+    const int ret = sam_write1(file_.get(), header_.get(), rawRecord.get());
+    if (ret <= 0)
+        throw std::runtime_error("could not write record");
+}
+
+void BamWriterPrivate::Write(const BamRecord& record, int64_t* vOffset)
+{
+    BGZF* bgzf = file_.get()->fp.bgzf;
+    assert(bgzf);
+    assert(vOffset);
+
+    // ensure offsets up-to-date
+    bgzf_flush(bgzf);
+
+    // capture virtual offset where we’re about to write
+    const off_t rawTell = htell(bgzf->fp);
+    const int length = bgzf->block_offset;
+    *vOffset = (rawTell << 16) | length ;
+
+    // now write data
+    Write(record);
+}
+
+inline void BamWriterPrivate::Write(const BamRecordImpl& recordImpl)
+{ Write(BamRecord(recordImpl)); }
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+BamWriter::BamWriter(const std::string& filename,
+                     const BamHeader& header,
+                     const BamWriter::CompressionLevel compressionLevel,
+                     const size_t numThreads,
+                     const BinCalculationMode binCalculationMode)
+    : IRecordWriter()
+    , d_(nullptr)
+{
+#if PBBAM_AUTOVALIDATE
+    Validator::Validate(header);
+#endif
+    d_.reset(new internal::BamWriterPrivate{ filename,
+                                             internal::BamHeaderMemory::MakeRawHeader(header),
+                                             compressionLevel,
+                                             numThreads,
+                                             binCalculationMode
+                                           });
+}
+
+BamWriter::~BamWriter(void)
+{
+    bgzf_flush(d_->file_.get()->fp.bgzf);
+}
+
+void BamWriter::TryFlush(void)
+{
+    // TODO: sanity checks on file_ & fp
+    const int ret = bgzf_flush(d_->file_.get()->fp.bgzf);
+    if (ret != 0)
+        throw std::runtime_error("could not flush output buffer contents");
+}
+
+void BamWriter::Write(const BamRecord& record)
+{ d_->Write(record); }
+
+void BamWriter::Write(const BamRecord& record, int64_t* vOffset)
+{ d_->Write(record, vOffset); }
+
+void BamWriter::Write(const BamRecordImpl& recordImpl)
+{ d_->Write(recordImpl); }
diff --git a/src/BarcodeQuery.cpp b/src/BarcodeQuery.cpp

new file mode 100644 (file)

index 0000000..47af230
--- /dev/null
+++ b/src/BarcodeQuery.cpp
@@ -0,0 +1,68 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file BarcodeQuery.cpp
+/// \brief Implements the BarcodeQuery class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BarcodeQuery.h"
+#include "pbbam/PbiFilterTypes.h"
+#include "pbbam/CompositeBamReader.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+struct BarcodeQuery::BarcodeQueryPrivate
+{
+    BarcodeQueryPrivate(const int16_t barcode, const DataSet& dataset)
+        : reader_(PbiBarcodeFilter(barcode), dataset)
+    { }
+
+    PbiFilterCompositeBamReader<Compare::None> reader_; // unsorted
+};
+
+BarcodeQuery::BarcodeQuery(const int16_t barcode,
+                           const DataSet& dataset)
+    : internal::IQuery()
+    , d_(new BarcodeQueryPrivate(barcode, dataset))
+{ }
+
+BarcodeQuery::~BarcodeQuery(void) { }
+
+bool BarcodeQuery::GetNext(BamRecord &r)
+{ return d_->reader_.GetNext(r); }
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt

new file mode 100644 (file)

index 0000000..4b0b2dc
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,89 @@
+
+# grab library source files
+include(files.cmake)
+set(SOURCES
+    ${PacBioBAM_H}
+    ${PacBioBAM_CPP}
+)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PacBioBAM_CXX_FLAGS}")
+
+# define actual library
+add_library(pbbam ${SOURCES})
+
+# library properties
+target_compile_definitions(pbbam
+    PRIVATE "-DPBBAM_LIBRARY"
+)
+set_target_properties(pbbam PROPERTIES
+    ARCHIVE_OUTPUT_DIRECTORY ${PacBioBAM_LibDir}
+    RUNTIME_OUTPUT_DIRECTORY ${PacBioBAM_LibDir}
+    LIBRARY_OUTPUT_DIRECTORY ${PacBioBAM_LibDir}
+)
+
+if(PacBioBAM_wrap_r)
+    # SWIG R does not support std::shared_ptr, but it does support boost::shared_ptr
+    # So force boost if we're wrapping for R.
+    target_compile_definitions(pbbam
+        PUBLIC -DPBBAM_USE_BOOST_SHARED_PTR
+    )
+endif()
+
+if(PacBioBAM_auto_validate)
+    target_compile_definitions(pbbam
+        PUBLIC "-DPBBAM_AUTOVALIDATE=1"
+    )
+endif()
+
+# pbbam includes
+target_include_directories(pbbam
+    PUBLIC
+    ${PacBioBAM_IncludeDir}
+    ${HTSLIB_INCLUDE_DIRS}
+    ${Boost_INCLUDE_DIRS}
+    ${ZLIB_INCLUDE_DIRS}
+)
+
+# set link dependencies
+#   if htslib provided externally
+if(HTSLIB_LIBRARIES)
+    set(pbbam_all_dependency_libs
+        ${HTSLIB_LIBRARIES}
+        ${ZLIB_LIBRARIES}
+        ${SOCKET_LIBRARIES}
+        ${CMAKE_THREAD_LIBS_INIT}
+    )
+# otherwise, use the "in-project" htslib target
+else()
+    set(pbbam_all_dependency_libs
+        $<TARGET_FILE:hts>
+        ${ZLIB_LIBRARIES}
+        ${SOCKET_LIBRARIES}
+        ${CMAKE_THREAD_LIBS_INIT}
+    )
+endif()
+
+target_link_libraries(pbbam
+    PUBLIC
+    ${pbbam_all_dependency_libs}
+)
+
+# define include paths for projects that use pbbam
+set(PacBioBAM_INCLUDE_DIRS
+    ${PacBioBAM_IncludeDir}
+    ${HTSLIB_INCLUDE_DIRS}
+    ${Boost_INCLUDE_DIRS}
+    ${ZLIB_INCLUDE_DIRS}
+    CACHE INTERNAL
+    "${PROJECT_NAME}: Include Directories"
+    FORCE
+)
+set(PacBioBAM_LIBRARIES
+    $<TARGET_FILE:pbbam>
+    ${pbbam_all_dependency_libs}
+    CACHE INTERNAL
+    "${PROJECT_NAME}: Libraries"
+    FORCE
+)
+
+# add SWIG directory
+add_subdirectory(swig)
diff --git a/src/ChemistryTable.cpp b/src/ChemistryTable.cpp

new file mode 100644 (file)

index 0000000..fbc161f
--- /dev/null
+++ b/src/ChemistryTable.cpp
@@ -0,0 +1,85 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Lance Hepler
+
+#include "ChemistryTable.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+extern const std::vector<std::array<std::string, 4>> ChemistryTable = {
+
+    // BindingKit, SequencingKit, BasecallerVersion, Chemistry
+
+    // RS
+    {{"100356300",   "100356200",   "2.1", "P6-C4"}},
+    {{"100356300",   "100356200",   "2.3", "P6-C4"}},
+    {{"100356300",   "100612400",   "2.1", "P6-C4"}},
+    {{"100356300",   "100612400",   "2.3", "P6-C4"}},
+    {{"100372700",   "100356200",   "2.1", "P6-C4"}},
+    {{"100372700",   "100356200",   "2.3", "P6-C4"}},
+    {{"100372700",   "100612400",   "2.1", "P6-C4"}},
+    {{"100372700",   "100612400",   "2.3", "P6-C4"}},
+
+    // 3.0 ("Dromedary"): S/P1-C1/beta
+    {{"100-619-300", "100-620-000", "3.0", "S/P1-C1/beta"}},
+    {{"100-619-300", "100-620-000", "3.1", "S/P1-C1/beta"}},
+
+    // 3.1 ("Echidna"): S/P1-C1.1
+    {{"100-619-300", "100-867-300", "3.1", "S/P1-C1.1"}},
+    {{"100-619-300", "100-867-300", "3.2", "S/P1-C1.1"}},
+    {{"100-619-300", "100-867-300", "3.3", "S/P1-C1.1"}},
+
+    // 3.1.1 ("Flea"): S/P1-C1.2
+    {{"100-619-300", "100-902-100", "3.1", "S/P1-C1.2"}},
+    {{"100-619-300", "100-902-100", "3.2", "S/P1-C1.2"}},
+    {{"100-619-300", "100-902-100", "3.3", "S/P1-C1.2"}},
+    {{"100-619-300", "100-902-100", "4.0", "S/P1-C1.2"}},
+
+    // 3.2 ("Goat"): S/P1-C1.3
+    {{"100-619-300", "100-972-200", "3.2", "S/P1-C1.3"}},
+    {{"100-619-300", "100-972-200", "3.3", "S/P1-C1.3"}},
+    {{"100-619-300", "100-972-200", "4.0", "S/P1-C1.3"}},
+
+    // 4.0 ("Seabiscuit"); S/P2-C2
+    {{"100-862-200", "100-861-800", "4.0", "S/P2-C2"}}
+
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
diff --git a/src/ChemistryTable.h b/src/ChemistryTable.h

new file mode 100644 (file)

index 0000000..6caacaa
--- /dev/null
+++ b/src/ChemistryTable.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Lance Hepler
+
+#ifndef CHEMISTRYTABLE_H
+#define CHEMISTRYTABLE_H
+
+#include <array>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+extern const std::vector<std::array<std::string, 4>> ChemistryTable;
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // CHEMISTRYTABLE_H
diff --git a/src/Cigar.cpp b/src/Cigar.cpp

new file mode 100644 (file)

index 0000000..f099f54
--- /dev/null
+++ b/src/Cigar.cpp
@@ -0,0 +1,74 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Cigar.cpp
+/// \brief Implements the Cigar class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Cigar.h"
+#include <sstream>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+Cigar::Cigar(const string& cigarString)
+    : vector<CigarOperation>()
+{
+    size_t numberStart = 0;
+    const size_t numChars = cigarString.size();
+    for (size_t i = 0; i < numChars; ++i) {
+        const char c = cigarString.at(i);
+        if (!isdigit(c)) {
+            const size_t distance = i - numberStart;
+            const uint32_t length = stoul(cigarString.substr(numberStart, distance));
+            push_back(CigarOperation(c, length));
+            numberStart = i+1;
+        }
+    }
+}
+
+string Cigar::ToStdString(void) const
+{
+    stringstream s;
+    const auto end  = this->cend();
+    for (auto iter = this->cbegin(); iter != end; ++iter) {
+        const CigarOperation& cigar = (*iter);
+        s << cigar.Length()
+          << cigar.Char();
+    }
+    return s.str();
+}
diff --git a/src/CigarOperation.cpp b/src/CigarOperation.cpp

new file mode 100644 (file)

index 0000000..a207a58
--- /dev/null
+++ b/src/CigarOperation.cpp
@@ -0,0 +1,67 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file CigarOperation.cpp
+/// \brief Implements the CigarOperation class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/CigarOperation.h"
+#include <htslib/sam.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+CigarOperationType CigarOperation::CharToType(const char c)
+{
+    switch(c)
+    {
+        case 'S' : return CigarOperationType::SOFT_CLIP;
+        case '=' : return CigarOperationType::SEQUENCE_MATCH;
+        case 'X' : return CigarOperationType::SEQUENCE_MISMATCH;
+        case 'I' : return CigarOperationType::INSERTION;
+        case 'D' : return CigarOperationType::DELETION;
+        case 'N' : return CigarOperationType::REFERENCE_SKIP;
+        case 'H' : return CigarOperationType::HARD_CLIP;
+        case 'P' : return CigarOperationType::PADDING;
+        case 'M' : return CigarOperationType::ALIGNMENT_MATCH;
+        default:
+            return CigarOperationType::UNKNOWN_OP;
+    }
+}
+
+char CigarOperation::TypeToChar(const CigarOperationType type)
+{ return bam_cigar_opchr(static_cast<int>(type)); }
diff --git a/src/Compare.cpp b/src/Compare.cpp

new file mode 100644 (file)

index 0000000..43874f2
--- /dev/null
+++ b/src/Compare.cpp
@@ -0,0 +1,141 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Compare.cpp
+/// \brief Implements the Compare class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Compare.h"
+#include <functional>
+#include <unordered_map>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+struct TypeAlias
+{
+    string name_;
+    string op_;
+    string opAlpha_;
+
+    TypeAlias(const string& name = string(),
+              const string& op = string(),
+              const string& opAlpha = string())
+        : name_(name)
+        , op_(op)
+        , opAlpha_(opAlpha)
+    { }
+};
+
+struct CompareTypeHash
+{
+    size_t operator()(const Compare::Type& t) const
+    { return std::hash<int>()(static_cast<int>(t)); }
+};
+
+static const unordered_map<string, Compare::Type> opToTypeMap =
+{
+    // basic operators plus some permissiveness for other representations
+
+    { "==",    Compare::EQUAL },
+    { "=",     Compare::EQUAL },
+    { "eq",    Compare::EQUAL },
+    { "!=",    Compare::NOT_EQUAL },
+    { "ne",    Compare::NOT_EQUAL },
+    { "<",     Compare::LESS_THAN },
+    { "lt",    Compare::LESS_THAN },
+    { "&lt;",  Compare::LESS_THAN },
+    { "<=",    Compare::LESS_THAN_EQUAL },
+    { "lte",   Compare::LESS_THAN_EQUAL },
+    { "&lt;=", Compare::LESS_THAN_EQUAL },
+    { ">",     Compare::GREATER_THAN },
+    { "gt",    Compare::GREATER_THAN },
+    { "&gt;",  Compare::GREATER_THAN },
+    { ">=",    Compare::GREATER_THAN_EQUAL },
+    { "gte",   Compare::GREATER_THAN_EQUAL },
+    { "&gt;=", Compare::GREATER_THAN_EQUAL },
+    { "&",     Compare::CONTAINS },
+    { "~",     Compare::NOT_CONTAINS }
+};
+
+static const unordered_map<Compare::Type, TypeAlias, CompareTypeHash> typeAliases =
+{
+    { Compare::EQUAL,              TypeAlias{ "Compare::EQUAL",              "==", "eq"  } },
+    { Compare::NOT_EQUAL,          TypeAlias{ "Compare::NOT_EQUAL",          "!=", "ne"  } },
+    { Compare::LESS_THAN,          TypeAlias{ "Compare::LESS_THAN",          "<",  "lt"  } },
+    { Compare::LESS_THAN_EQUAL,    TypeAlias{ "Compare::LESS_THAN_EQUAL",    "<=", "lte" } },
+    { Compare::GREATER_THAN,       TypeAlias{ "Compare::GREATER_THAN",       ">",  "gt"  } },
+    { Compare::GREATER_THAN_EQUAL, TypeAlias{ "Compare::GREATER_THAN_EQUAL", ">=", "gte" } },
+    { Compare::CONTAINS,           TypeAlias{ "Compare::CONTAINS",           "&",  "and" } },
+    { Compare::NOT_CONTAINS,       TypeAlias{ "Compare::NOT_CONTAINS",       "~",  "not" } }
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+Compare::Type Compare::TypeFromOperator(const string& opString)
+{
+    try {
+        return internal::opToTypeMap.at(opString);
+    } catch (std::exception&) {
+        throw std::runtime_error(opString + " is not a valid comparison operator." );
+    }
+}
+
+string Compare::TypeToName(const Compare::Type& type)
+{
+    try {
+        return internal::typeAliases.at(type).name_;
+    } catch (std::exception&) {
+        throw std::runtime_error("invalid comparison type encountered" );
+    }
+}
+
+string Compare::TypeToOperator(const Compare::Type& type, bool asAlpha)
+{
+    try {
+        return asAlpha ? internal::typeAliases.at(type).opAlpha_
+                       : internal::typeAliases.at(type).op_;
+    } catch (std::exception&) {
+        throw std::runtime_error("invalid comparison type encountered" );
+    }
+}
diff --git a/src/Config.cpp b/src/Config.cpp

new file mode 100644 (file)

index 0000000..095aa37
--- /dev/null
+++ b/src/Config.cpp
@@ -0,0 +1,60 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Config.cpp
+/// \brief Initializes global variable defaults.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Config.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace PacBio {
+namespace BAM {
+
+// Initialized to -1 to indicate default. Client code may set this or not.
+//
+// To respect client code or else fallback to default[OFF], this value should be used like this:
+//
+//    hts_verbose = ( PacBio::BAM::HtslibVerbosity == -1 ? 0 : PacBio::BAM::HtslibVerbosity);
+//
+//
+//
+int HtslibVerbosity = -1;
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/src/DataSet.cpp b/src/DataSet.cpp

new file mode 100644 (file)

index 0000000..af6141c
--- /dev/null
+++ b/src/DataSet.cpp
@@ -0,0 +1,298 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file DataSet.cpp
+/// \brief Implements the DataSet class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/DataSet.h"
+#include "pbbam/DataSetTypes.h"
+#include "pbbam/internal/DataSetBaseTypes.h"
+#include "DataSetIO.h"
+#include "FileUtils.h"
+#include "TimeUtils.h"
+#include <boost/algorithm/string.hpp>
+#include <unordered_map>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+static const string defaultVersion{ "4.0.0" };
+
+static inline void InitDefaults(DataSet& ds)
+{
+    // provide default 'CreatedAt' & 'Version' attributes if not already present in XML
+
+    if (ds.CreatedAt().empty())
+        ds.CreatedAt(internal::ToIso8601(CurrentTime()));
+    
+    if (ds.Version().empty())
+        ds.Version(internal::defaultVersion);
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+DataSet::DataSet(void)
+    : DataSet(DataSet::GENERIC)
+{ 
+    InitDefaults(*this);
+}
+
+DataSet::DataSet(const DataSet::TypeEnum type)
+    : d_(nullptr)
+    , path_(FileUtils::CurrentWorkingDirectory())
+{
+    switch(type) {
+        case DataSet::GENERIC             : d_.reset(new DataSetBase); break;
+        case DataSet::ALIGNMENT           : d_.reset(new AlignmentSet); break;
+        case DataSet::BARCODE             : d_.reset(new BarcodeSet); break;
+        case DataSet::CONSENSUS_ALIGNMENT : d_.reset(new ConsensusAlignmentSet); break;
+        case DataSet::CONSENSUS_READ      : d_.reset(new ConsensusReadSet); break;
+        case DataSet::CONTIG              : d_.reset(new ContigSet); break;
+        case DataSet::HDF_SUBREAD         : d_.reset(new HdfSubreadSet); break;
+        case DataSet::REFERENCE           : d_.reset(new ReferenceSet); break;
+        case DataSet::SUBREAD             : d_.reset(new SubreadSet); break;
+        default:
+            throw std::runtime_error("unsupported dataset type"); // unknown type
+    }
+
+    InitDefaults(*this);
+}
+
+DataSet::DataSet(const BamFile& bamFile)
+    : d_(DataSetIO::FromUri(bamFile.Filename()))
+    , path_(FileUtils::CurrentWorkingDirectory())
+{
+    InitDefaults(*this);
+}
+
+DataSet::DataSet(const string& filename)
+    : d_(DataSetIO::FromUri(filename))
+    , path_(FileUtils::DirectoryName(filename))
+{
+    // for FOFN contents and raw BAM filenames, we can just use the current
+    // directory as the starting path.
+    //
+    // (any relative paths in the FOFN have already been resolved)
+    //
+    if (boost::algorithm::iends_with(filename, ".fofn") ||
+        boost::algorithm::iends_with(filename, ".bam"))
+    {
+        path_ = FileUtils::CurrentWorkingDirectory();
+    }
+    InitDefaults(*this);
+}
+
+DataSet::DataSet(const vector<string>& filenames)
+    : d_(DataSetIO::FromUris(filenames))
+    , path_(FileUtils::CurrentWorkingDirectory())
+{ 
+    InitDefaults(*this);
+}
+
+DataSet::DataSet(const DataSet& other)
+    : path_(other.path_)
+{
+    DataSetBase* otherDataset = other.d_.get();
+    DataSetElement* copyDataset = new DataSetElement(*otherDataset);
+    d_.reset(static_cast<DataSetBase*>(copyDataset));
+}
+
+DataSet::DataSet(DataSet&& other)
+    : d_(std::move(other.d_))
+    , path_(std::move(other.path_))
+{
+    assert(other.d_.get() == nullptr);
+}
+
+DataSet& DataSet::operator=(const DataSet& other)
+{
+    DataSetBase* otherDataset = other.d_.get();
+    DataSetElement* copyDataset = new DataSetElement(*otherDataset);
+    d_.reset(static_cast<DataSetBase*>(copyDataset));
+    path_ = other.path_;
+    return *this;
+}
+
+DataSet& DataSet::operator=(DataSet&& other)
+{
+    d_ = std::move(other.d_);
+    path_ = std::move(other.path_);
+    return *this;
+}
+
+DataSet::~DataSet(void) { }
+
+DataSet& DataSet::operator+=(const DataSet& other)
+{
+    *d_.get() += *other.d_.get();
+    return *this;
+}
+
+vector<BamFile> DataSet::BamFiles(void) const
+{
+    const PacBio::BAM::ExternalResources& resources = ExternalResources();
+    
+    vector<BamFile> result;
+    result.reserve(resources.Size());
+    for(const ExternalResource& ext : resources) {
+
+        // only bother resolving file path if this is a BAM file
+        boost::iterator_range<string::const_iterator> bamFound = boost::algorithm::ifind_first(ext.MetaType(), "bam");
+        if (!bamFound.empty()) {
+            const string fn = ResolvePath(ext.ResourceId());
+            result.push_back(BamFile(fn));
+        }
+    }
+    return result;
+}
+
+DataSet DataSet::FromXml(const string& xml)
+{
+    DataSet result;
+    result.d_ = internal::DataSetIO::FromXmlString(xml);
+    InitDefaults(result);
+    return result;
+}
+
+const NamespaceRegistry& DataSet::Namespaces(void) const
+{ return d_->Namespaces(); }
+
+NamespaceRegistry& DataSet::Namespaces(void)
+{ return d_->Namespaces(); }
+
+DataSet::TypeEnum DataSet::NameToType(const string& typeName)
+{
+    static std::unordered_map<std::string, DataSet::TypeEnum> lookup;
+    if (lookup.empty()) {
+        lookup["DataSet"]               = DataSet::GENERIC;
+        lookup["AlignmentSet"]          = DataSet::ALIGNMENT;
+        lookup["BarcodeSet"]            = DataSet::BARCODE;
+        lookup["ConsensusAlignmentSet"] = DataSet::CONSENSUS_ALIGNMENT;
+        lookup["ConsensusReadSet"]      = DataSet::CONSENSUS_READ;
+        lookup["ContigSet"]             = DataSet::CONTIG;
+        lookup["HdfSubreadSet"]         = DataSet::HDF_SUBREAD;
+        lookup["ReferenceSet"]          = DataSet::REFERENCE;
+        lookup["SubreadSet"]            = DataSet::SUBREAD;
+    }
+    return lookup.at(typeName); // throws if unknown typename
+}
+
+vector<string> DataSet::ResolvedResourceIds(void) const
+{
+    const PacBio::BAM::ExternalResources& resources = ExternalResources();
+
+    vector<string> result;
+    result.reserve(resources.Size());
+    for(const ExternalResource& ext : resources) {
+//        const string fn = ;
+//        const string fn = internal::FileUtils::ResolvedFilePath(ext.ResourceId(), path_);
+        result.push_back(ResolvePath(ext.ResourceId()));
+    }
+    return result;
+}
+
+string DataSet::ResolvePath(const string& originalPath) const
+{ return internal::FileUtils::ResolvedFilePath(originalPath, path_); }
+
+void DataSet::Save(const std::string& outputFilename)
+{ DataSetIO::ToFile(d_, outputFilename); }
+
+void DataSet::SaveToStream(ostream& out)
+{ DataSetIO::ToStream(d_, out); }
+
+set<string> DataSet::SequencingChemistries(void) const
+{
+    const vector<BamFile> bamFiles{ BamFiles() };
+
+    set<string> result;
+    for(const BamFile& bf : bamFiles) {
+        if (!bf.IsPacBioBAM())
+            throw std::runtime_error{ "only PacBio BAMs are supported" };
+        const vector<ReadGroupInfo> readGroups{ bf.Header().ReadGroups() };
+        for (const ReadGroupInfo& rg : readGroups)
+            result.insert(rg.SequencingChemistry());
+    }
+    return result;
+}
+
+string DataSet::TypeToName(const DataSet::TypeEnum& type)
+{
+    switch(type) {
+        case DataSet::GENERIC             : return "DataSet";
+        case DataSet::ALIGNMENT           : return "AlignmentSet";
+        case DataSet::BARCODE             : return "BarcodeSet";
+        case DataSet::CONSENSUS_ALIGNMENT : return "ConsensusAlignmentSet";
+        case DataSet::CONSENSUS_READ      : return "ConsensusReadSet";
+        case DataSet::CONTIG              : return "ContigSet";
+        case DataSet::HDF_SUBREAD         : return "HdfSubreadSet";
+        case DataSet::REFERENCE           : return "ReferenceSet";
+        case DataSet::SUBREAD             : return "SubreadSet";
+        default:
+            throw std::runtime_error("unsupported dataset type"); // unknown type
+    }
+}
+
+// Exposed timestamp utils
+
+namespace PacBio {
+namespace BAM {
+
+string CurrentTimestamp(void)
+{ return internal::ToDataSetFormat(internal::CurrentTime()); }
+
+string ToDataSetFormat(const chrono::system_clock::time_point &tp)
+{ return internal::ToDataSetFormat(tp); }
+
+string ToDataSetFormat(const time_t &t)
+{ return ToDataSetFormat(chrono::system_clock::from_time_t(t)); }
+
+string ToIso8601(const chrono::system_clock::time_point &tp)
+{ return internal::ToIso8601(tp); }
+
+string ToIso8601(const time_t &t)
+{ return ToIso8601(chrono::system_clock::from_time_t(t)); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/src/DataSetBaseTypes.cpp b/src/DataSetBaseTypes.cpp

new file mode 100644 (file)

index 0000000..2c19e0b
--- /dev/null
+++ b/src/DataSetBaseTypes.cpp
@@ -0,0 +1,126 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "pbbam/DataSetTypes.h"
+#include "pbbam/internal/DataSetBaseTypes.h"
+#include "DataSetUtils.h"
+#include "TimeUtils.h"
+#include <boost/algorithm/string.hpp>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+// ----------------
+// BaseEntityType
+// ----------------
+
+BaseEntityType::BaseEntityType(const std::string& label, const XsdType& xsd)
+    : DataSetElement(label, xsd)
+{
+    if (Version().empty())
+        Version(internal::XML_VERSION);
+}
+
+DEFINE_ACCESSORS(BaseEntityType, Extensions, Extensions)
+
+BaseEntityType& BaseEntityType::Extensions(const PacBio::BAM::Extensions& extensions)
+{ Extensions() = extensions; return *this; }
+
+// ----------------
+// DataEntityType
+// ----------------
+
+DataEntityType::DataEntityType(const std::string& label, const XsdType& xsd)
+    : BaseEntityType(label, xsd)
+{ }
+
+// -----------------
+// IndexedDataType
+// -----------------
+
+IndexedDataType::IndexedDataType(const string& metatype,
+                                 const string& filename,
+                                 const string& label, 
+                                 const XsdType &xsd)
+    : InputOutputDataType(metatype, filename, label, xsd)
+{ }
+
+DEFINE_ACCESSORS(IndexedDataType, FileIndices, FileIndices)
+
+IndexedDataType& IndexedDataType::FileIndices(const PacBio::BAM::FileIndices& indices)
+{ FileIndices() = indices; return *this; }
+
+// ---------------------
+// InputOutputDataType
+// ---------------------
+
+InputOutputDataType::InputOutputDataType(const string& metatype,
+                                         const string& filename,
+                                         const string& label,
+                                         const XsdType &xsd)
+    : StrictEntityType(metatype, label, xsd)
+{  
+    ResourceId(filename);
+}
+
+// ----------------
+// StrictEntityType
+// ----------------
+
+StrictEntityType::StrictEntityType(const string& metatype, 
+                                   const string& label, 
+                                   const XsdType& xsd)
+    : BaseEntityType(label, xsd)
+{ 
+    // MetaType
+    MetaType(metatype);
+
+    // TimeStampedName
+    const size_t numChars = metatype.size();
+    string transformedMetatype;
+    transformedMetatype.resize(numChars);
+    for (size_t i = 0; i < numChars; ++i) {
+        const char c = metatype.at(i);
+        transformedMetatype[i] = ((c == '.') ? '_' : tolower(c));
+    }
+    const string& tsn = transformedMetatype + "-" + internal::ToDataSetFormat(internal::CurrentTime());
+    TimeStampedName(tsn);
+
+    // UniqueId
+    UniqueId(internal::GenerateUuid());
+}
diff --git a/src/DataSetElement.cpp b/src/DataSetElement.cpp

new file mode 100644 (file)

index 0000000..6854fd2
--- /dev/null
+++ b/src/DataSetElement.cpp
@@ -0,0 +1,47 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "pbbam/internal/DataSetElement.h"
+#include "DataSetUtils.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+
+const std::string& DataSetElement::SharedNullString(void)
+{
+    return internal::NullObject<std::string>();
+}
diff --git a/src/DataSetIO.cpp b/src/DataSetIO.cpp

new file mode 100644 (file)

index 0000000..741ffc8
--- /dev/null
+++ b/src/DataSetIO.cpp
@@ -0,0 +1,162 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "DataSetIO.h"
+#include "FileUtils.h"
+#include "FofnReader.h"
+#include "StringUtils.h"
+#include "XmlReader.h"
+#include "XmlWriter.h"
+#include <boost/algorithm/string.hpp>
+#include <exception>
+#include <fstream>
+#include <iostream>
+#include <cassert>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+typedef std::shared_ptr<DataSetBase> DataSetPtr;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+static
+unique_ptr<DataSetBase> FromXml(const string& xmlFn)
+{
+    ifstream in(xmlFn);
+    if (!in)
+        throw std::runtime_error("could not open XML file for reading");
+    return XmlReader::FromStream(in);
+}
+
+static
+unique_ptr<DataSetBase> FromBam(const string& bamFn)
+{
+    unique_ptr<DataSetBase> dataset(new SubreadSet);
+    ExternalResources& resources = dataset->ExternalResources();
+    resources.Add(ExternalResource(BamFile(bamFn)));
+    return dataset;
+}
+
+static
+unique_ptr<DataSetBase> FromFofn(const string& fofn)
+{
+    const string fofnDir = internal::FileUtils::DirectoryName(fofn);
+    ifstream in(fofn);
+    if (!in)
+        throw std::runtime_error("could not open FOFN for reading");
+
+    vector<string> filenames = FofnReader::Files(in);
+    for (size_t i = 0; i < filenames.size(); ++i)
+        filenames[i] = internal::FileUtils::ResolvedFilePath(filenames[i], fofnDir);
+    return DataSetIO::FromUris(filenames);
+}
+
+static
+unique_ptr<DataSetBase> FromUri(const string& uri)
+{
+    // NOTE: this says URI, but we're not quite handling filenames as true URIs
+    //       basically just treating as a regular filename for now
+
+    // handle on extension
+    if (boost::algorithm::iends_with(uri, ".xml"))
+        return FromXml(uri);
+    else if (boost::algorithm::iends_with(uri, ".bam"))
+        return FromBam(uri);
+    else if (boost::algorithm::iends_with(uri, ".fofn"))
+        return FromFofn(uri);
+
+    // unknown filename extension
+    throw std::runtime_error("unsupported input file extension");
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+std::unique_ptr<DataSetBase> DataSetIO::FromUri(const std::string& uri)
+{
+    return FromUris(vector<string>(1, uri));
+}
+
+std::unique_ptr<DataSetBase> DataSetIO::FromUris(const std::vector<std::string>& uris)
+{
+    if (uris.empty())
+        throw std::runtime_error("empty input URI list"); // or just return empty, generic DataSet?
+
+    // create dataset(s) from URI(s)
+    vector< unique_ptr<DataSetBase> > datasets;
+    datasets.reserve(uris.size());
+    for ( const auto& uri : uris )
+        datasets.push_back(internal::FromUri(uri));
+    assert(!datasets.empty());
+
+    // if only 1, just return
+    if (datasets.size() == 1)
+        return std::unique_ptr<DataSetBase>(datasets.front().release());
+
+    // else merge
+    else {
+        unique_ptr<DataSetBase>& result = datasets.front();
+        for (size_t i = 1; i < datasets.size(); ++i)
+            *result += *datasets.at(i);
+        return unique_ptr<DataSetBase>(result.release());
+    }
+}
+
+std::unique_ptr<DataSetBase> DataSetIO::FromXmlString(const string& xml)
+{
+    if (xml.empty())
+        throw std::runtime_error("empty XML string");
+    stringstream s(xml);
+    return XmlReader::FromStream(s);
+}
+
+void DataSetIO::ToFile(const std::unique_ptr<DataSetBase>& dataset,
+                       const string& fn)
+{
+    ofstream out(fn);
+    if (!out)
+        throw std::runtime_error("could not open XML for writing");
+    XmlWriter::ToStream(dataset, out);
+}
+
+void DataSetIO::ToStream(const std::unique_ptr<DataSetBase>& dataset, ostream &out)
+{ XmlWriter::ToStream(dataset, out); }
diff --git a/src/DataSetIO.h b/src/DataSetIO.h

new file mode 100644 (file)

index 0000000..b03c23b
--- /dev/null
+++ b/src/DataSetIO.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef DATASETIO_H
+#define DATASETIO_H
+
+#include "pbbam/DataSet.h"
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+class DataSetIO
+{
+public:
+
+    // input
+    static std::unique_ptr<DataSetBase> FromUri(const std::string& uri);
+    static std::unique_ptr<DataSetBase> FromUris(const std::vector<std::string>& uris);
+
+    static std::unique_ptr<DataSetBase> FromXmlString(const std::string& xml);
+
+//    static DataSetBase FromUri(const std::string& uri);
+//    static DataSetBase FromUris(const std::vector<std::string>& uris);
+
+//    // output
+    static void ToFile(const std::unique_ptr<DataSetBase>& dataset, const std::string& fn);
+    static void ToStream(const std::unique_ptr<DataSetBase>& dataset, std::ostream& out);
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // DATASETIO_H
diff --git a/src/DataSetTypes.cpp b/src/DataSetTypes.cpp

new file mode 100644 (file)

index 0000000..9dd7b27
--- /dev/null
+++ b/src/DataSetTypes.cpp
@@ -0,0 +1,480 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file DataSetTypes.cpp
+/// \brief Implementations for the public DataSet component classes.
+//
+// Author: Derek Barnett
+
+#include "pbbam/DataSetTypes.h"
+#include "pbbam/internal/DataSetBaseTypes.h"
+#include "DataSetUtils.h"
+#include "FileUtils.h"
+#include "TimeUtils.h"
+#include <set>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+// -------------------
+// AlignmentSet
+// -------------------
+
+AlignmentSet::AlignmentSet(void)
+    : DataSetBase("PacBio.DataSet.AlignmentSet",
+                  "AlignmentSet",
+                  XsdType::DATASETS)
+{ }
+
+// -------------------
+// BarcodeSet
+// -------------------
+
+BarcodeSet::BarcodeSet(void)
+    : DataSetBase("PacBio.DataSet.BarcodeSet",
+                  "BarcodeSet",
+                  XsdType::DATASETS)
+{ }
+
+// -----------------------
+// ConsensusAlignmentSet
+// -----------------------
+
+ConsensusAlignmentSet::ConsensusAlignmentSet(void)
+    : DataSetBase("PacBio.DataSet.ConsensusAlignmentSet",
+                  "ConsensusAlignmentSet",
+                  XsdType::DATASETS)
+{ }
+
+// -------------------
+// ConsensusReadSet
+// -------------------
+
+ConsensusReadSet::ConsensusReadSet(void)
+    : DataSetBase("PacBio.DataSet.ConsensusReadSet",
+                  "ConsensusReadSet",
+                  XsdType::DATASETS)
+{ }
+
+// -------------------
+// ContigSet
+// -------------------
+
+ContigSet::ContigSet(void)
+    : DataSetBase("PacBio.DataSet.ContigSet",
+                  "ContigSet",
+                  XsdType::DATASETS)
+{ }
+
+// -------------------
+// DataSetBase
+// -------------------
+
+DataSetBase::DataSetBase(void)
+    : StrictEntityType("PacBio.DataSet.DataSet",
+                       "DataSet",
+                       XsdType::DATASETS)
+{ }
+
+DataSetBase::DataSetBase(const string& metatype,
+                         const string& label,
+                         const XsdType& xsd)
+    : StrictEntityType(metatype, label, xsd)
+{ }
+
+DEFINE_ACCESSORS(DataSetBase, ExternalResources, ExternalResources)
+
+DataSetBase& DataSetBase::ExternalResources(const PacBio::BAM::ExternalResources& resources)
+{ ExternalResources() = resources; return *this;  }
+
+DEFINE_ACCESSORS(DataSetBase, Filters, Filters)
+
+DataSetBase& DataSetBase::Filters(const PacBio::BAM::Filters& filters)
+{ Filters() = filters; return *this;  }
+
+DEFINE_ACCESSORS(DataSetBase, DataSetMetadata, Metadata)
+
+DataSetBase& DataSetBase::Metadata(const PacBio::BAM::DataSetMetadata& metadata)
+{ Metadata() = metadata; return *this; }
+
+const PacBio::BAM::SubDataSets& DataSetBase::SubDataSets(void) const
+{
+    try {
+        return Child<PacBio::BAM::SubDataSets>("DataSets");
+    } catch (std::exception&) {
+        return internal::NullObject<PacBio::BAM::SubDataSets>();
+    }
+}
+
+PacBio::BAM::SubDataSets& DataSetBase::SubDataSets(void)
+{
+    if (!HasChild("DataSets"))
+        AddChild(internal::NullObject<PacBio::BAM::SubDataSets>());
+    return Child<PacBio::BAM::SubDataSets>("DataSets");
+}
+
+DataSetBase& DataSetBase::SubDataSets(const PacBio::BAM::SubDataSets &subdatasets)
+{ SubDataSets() = subdatasets; return *this;  }
+
+DataSetBase* DataSetBase::DeepCopy(void) const
+{
+    DataSetElement* copyDataset = new DataSetElement(*this);
+    DataSetBase* result = static_cast<DataSetBase*>(copyDataset);
+    result->registry_ = registry_;
+    return result;
+}
+
+DataSetBase& DataSetBase::operator+=(const DataSetBase& other)
+{
+    // must be same dataset types (or 'other' must be generic)
+    if (other.LocalNameLabel() != LocalNameLabel() && other.LocalNameLabel() != "DataSet")
+        throw std::runtime_error("cannot merge incompatible dataset types");
+
+    // check filter match
+    // check object metadata
+    Metadata() += other.Metadata();
+    ExternalResources() += other.ExternalResources();
+    Filters() += other.Filters();
+    SubDataSets() += other;
+
+    return *this;
+}
+
+std::shared_ptr<DataSetBase> DataSetBase::Create(const string& typeName)
+{
+    if (typeName == string("DataSet"))       return make_shared<DataSetBase>();
+    if (typeName == string("SubreadSet"))    return make_shared<SubreadSet>();
+    if (typeName == string("AlignmentSet"))  return make_shared<AlignmentSet>();
+    if (typeName == string("BarcodeSet"))    return make_shared<BarcodeSet>();
+    if (typeName == string("ConsensusAlignmentSet")) return make_shared<ConsensusAlignmentSet>();
+    if (typeName == string("ConsensusReadSet"))      return make_shared<ConsensusReadSet>();
+    if (typeName == string("ContigSet"))     return make_shared<ContigSet>();
+    if (typeName == string("HdfSubreadSet")) return make_shared<HdfSubreadSet>();
+    if (typeName == string("ReferenceSet"))  return make_shared<ReferenceSet>();
+
+    // unknown typename
+    throw std::runtime_error("unsupported dataset type");
+}
+
+// -------------------
+// DataSetMetadata
+// -------------------
+
+DataSetMetadata::DataSetMetadata(const std::string& numRecords,
+                                 const std::string& totalLength)
+    : DataSetElement("DataSetMetadata", XsdType::DATASETS)
+{
+    NumRecords(numRecords);
+    TotalLength(totalLength);
+}
+
+DEFINE_ACCESSORS(DataSetMetadata, Provenance, Provenance)
+
+DataSetMetadata& DataSetMetadata::Provenance(const PacBio::BAM::Provenance& provenance)
+{ Provenance() = provenance; return *this; }
+
+DataSetMetadata& DataSetMetadata::operator+=(const DataSetMetadata& other)
+{
+    NumRecords() = NumRecords() + other.NumRecords();
+    TotalLength() = TotalLength() + other.TotalLength();
+    // merge add'l
+    return *this;
+}
+
+// -------------------
+// ExtensionElement
+// -------------------
+
+ExtensionElement::ExtensionElement(void)
+    : DataSetElement("ExtensionElement", XsdType::BASE_DATA_MODEL)
+{ }
+
+// -------------------
+// Extensions
+// -------------------
+
+Extensions::Extensions(void)
+    : DataSetListElement<ExtensionElement>("Extensions", XsdType::BASE_DATA_MODEL)
+{ }
+
+// -------------------
+// ExternalResource
+// -------------------
+
+ExternalResource::ExternalResource(const BamFile& bamFile)
+    : IndexedDataType("PacBio.SubreadFile.SubreadBamFile",
+                      bamFile.Filename(),
+                      "ExternalResource",
+                      XsdType::BASE_DATA_MODEL)
+{ }
+
+ExternalResource::ExternalResource(const string& metatype,
+                                   const string& filename)
+    : IndexedDataType(metatype,
+                      filename,
+                      "ExternalResource",
+                      XsdType::BASE_DATA_MODEL)
+{ }
+
+DEFINE_ACCESSORS(ExternalResource, ExternalResources, ExternalResources)
+
+ExternalResource& ExternalResource::ExternalResources(const PacBio::BAM::ExternalResources& resources)
+{ ExternalResources() = resources; return *this; }
+
+BamFile ExternalResource::ToBamFile(void) const
+{ return BamFile(ResourceId()); }
+
+// -------------------
+// ExternalResources
+// -------------------
+
+ExternalResources::ExternalResources(void)
+    : DataSetListElement<ExternalResource>("ExternalResources",
+                                           XsdType::BASE_DATA_MODEL)
+{ }
+
+ExternalResources& ExternalResources::operator+=(const ExternalResources& other)
+{
+    // only keep unique resource ids
+
+    set<std::string> myResourceIds;
+    for (size_t i = 0; i < Size(); ++i) {
+        const ExternalResource& resource = this->operator[](i);
+        myResourceIds.insert(resource.ResourceId());
+    }
+
+    vector<size_t> newResourceIndices;
+    const size_t numOtherResourceIds = other.Size();
+    for (size_t i = 0; i < numOtherResourceIds; ++i) {
+        const string& resourceId = other[i].ResourceId();
+        auto found = myResourceIds.find(resourceId);
+        if (found == myResourceIds.cend())
+            newResourceIndices.push_back(i);
+    }
+
+    for (size_t index : newResourceIndices)
+        Add(other[index]);
+
+    return *this;
+}
+
+void ExternalResources::Add(const ExternalResource& ext)
+{
+    // disallow external resources w/ duplicate ResourceIds
+    set<std::string> myResourceIds;
+    for (size_t i = 0; i < Size(); ++i) {
+        const ExternalResource& resource = this->operator[](i);
+        myResourceIds.insert(resource.ResourceId());
+    }
+    if (myResourceIds.find(ext.ResourceId()) == myResourceIds.cend())
+        AddChild(ext);
+}
+
+vector<BamFile> ExternalResources::BamFiles(void) const
+{
+    vector<BamFile> result;
+    const int numResources = Size();
+    result.reserve(numResources);
+    for( const ExternalResource& ext : *this )
+        result.push_back(ext.ToBamFile());
+    return result;
+}
+
+void ExternalResources::Remove(const ExternalResource& ext)
+{ RemoveChild(ext); }
+
+// -------------------
+// FileIndex
+// -------------------
+
+FileIndex::FileIndex(const string& metatype, const string& filename)
+    : InputOutputDataType(metatype,
+                          filename,
+                          "FileIndex",
+                          XsdType::BASE_DATA_MODEL)
+{ }
+
+// -------------------
+// FileIndices
+// -------------------
+
+FileIndices::FileIndices(void)
+    : DataSetListElement<FileIndex>("FileIndices", XsdType::BASE_DATA_MODEL)
+{ }
+
+void FileIndices::Add(const FileIndex& index)
+{ AddChild(index); }
+
+void FileIndices::Remove(const FileIndex& index)
+{ RemoveChild(index); }
+
+// -------------------
+// Filter
+// -------------------
+
+Filter::Filter(void)
+    : DataSetElement("Filter", XsdType::DATASETS)
+{ }
+
+DEFINE_ACCESSORS(Filter, Properties, Properties)
+
+Filter& Filter::Properties(const PacBio::BAM::Properties& properties)
+{ Properties() = properties; return *this; }
+
+// -------------------
+// Filters
+// -------------------
+
+Filters::Filters(void)
+    : DataSetListElement<Filter>("Filters", XsdType::DATASETS)
+{ }
+
+Filters& Filters::operator+=(const Filters& other)
+{
+    for (auto& newFilter : other)
+        AddChild(newFilter);
+    return *this;
+}
+
+void Filters::Add(const Filter& filter)
+{ AddChild(filter); }
+
+void Filters::Remove(const Filter& filter)
+{ RemoveChild(filter); }
+
+// -------------------
+// HdfSubreadSet
+// -------------------
+
+HdfSubreadSet::HdfSubreadSet(void)
+    : DataSetBase("PacBio.DataSet.HdfSubreadSet",
+                  "HdfSubreadSet",
+                  XsdType::DATASETS)
+{ }
+
+// -------------------
+// ParentTool
+// -------------------
+
+ParentTool::ParentTool(void)
+    : BaseEntityType("ParentTool", XsdType::DATASETS)
+{ }
+
+// -------------------
+// Properties
+// -------------------
+
+Properties::Properties(void)
+    : DataSetListElement<Property>("Properties", XsdType::BASE_DATA_MODEL)
+{ }
+
+void Properties::Add(const Property &property)
+{ AddChild(property); }
+
+void Properties::Remove(const Property& property)
+{ RemoveChild(property); }
+
+// -------------------
+// Property
+// -------------------
+
+Property::Property(const std::string& name,
+                   const std::string& value,
+                   const std::string& op)
+    : DataSetElement("Property", XsdType::BASE_DATA_MODEL)
+{
+    Name(name);
+    Value(value);
+    Operator(op);
+}
+
+// -------------------
+// Provenance
+// -------------------
+
+Provenance::Provenance(void)
+    : DataSetElement("Provenance", XsdType::DATASETS)
+{ }
+
+DEFINE_ACCESSORS(Provenance, ParentTool, ParentTool)
+
+// -------------------
+// ReferenceSet
+// -------------------
+
+ReferenceSet::ReferenceSet(void)
+    : DataSetBase("PacBio.DataSet.ReferenceSet",
+                  "ReferenceSet",
+                  XsdType::DATASETS)
+{ }
+
+// -------------------
+// SubDataSets
+// -------------------
+
+SubDataSets::SubDataSets(void)
+    : internal::DataSetListElement<DataSetBase>("DataSets", XsdType::DATASETS)
+{ }
+
+SubDataSets& SubDataSets::operator+=(const DataSetBase& other)
+{
+    AddChild(other);
+    return *this;
+}
+
+SubDataSets& SubDataSets::operator+=(const SubDataSets& other)
+{
+    for (auto& newSubDataset : other)
+        AddChild(newSubDataset);
+    return *this;
+}
+
+void SubDataSets::Add(const DataSetBase& subdataset)
+{ AddChild(subdataset); }
+
+void SubDataSets::Remove(const DataSetBase& subdataset)
+{ RemoveChild(subdataset); }
+
+// -------------------
+// SubreadSet
+// -------------------
+
+SubreadSet::SubreadSet(void)
+    : DataSetBase("PacBio.DataSet.SubreadSet",
+                  "SubreadSet",
+                  XsdType::DATASETS)
+{ }
diff --git a/src/DataSetUtils.h b/src/DataSetUtils.h

new file mode 100644 (file)

index 0000000..dcf234c
--- /dev/null
+++ b/src/DataSetUtils.h
@@ -0,0 +1,107 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef DATASETUTILS_H
+#define DATASETUTILS_H
+
+#include "pbbam/DataSetTypes.h"
+#include <boost/uuid/random_generator.hpp>
+#include <boost/uuid/uuid_io.hpp>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+static const std::string XML_VERSION = std::string { "3.0.1" };
+
+template<typename T>
+inline const T& NullObject(void)
+{
+    static const T empty;
+    return empty;
+}
+
+template<>
+inline const PacBio::BAM::DataSetMetadata& NullObject(void)
+{
+    static const PacBio::BAM::DataSetMetadata empty("", "");
+    return empty;
+}
+
+inline
+std::string GenerateUuid(void)
+{
+    static boost::uuids::random_generator gen;
+    const boost::uuids::uuid uuid = gen();
+    return boost::uuids::to_string(uuid);
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#ifndef FETCH_CHILD_CONST_REF
+#define FETCH_CHILD_CONST_REF(Class, Type, Method) \
+    \
+    const PacBio::BAM::Type& Class::Method(void) const \
+    { \
+        try { \
+            return Child<PacBio::BAM::Type>(#Type); \
+        } catch (std::exception&) { \
+            return internal::NullObject<PacBio::BAM::Type>(); \
+        } \
+    }
+#endif
+
+#ifndef FETCH_CHILD_REF
+#define FETCH_CHILD_REF(Class, Type, Method) \
+    \
+    PacBio::BAM::Type& Class::Method(void) \
+    { \
+        if (!HasChild(#Type)) \
+            AddChild(internal::NullObject<PacBio::BAM::Type>()); \
+        return Child<PacBio::BAM::Type>(#Type); \
+    }
+#endif
+
+#ifndef DEFINE_ACCESSORS
+#define DEFINE_ACCESSORS(Class, Type, Method) \
+    FETCH_CHILD_CONST_REF(Class, Type, Method) \
+    FETCH_CHILD_REF(Class, Type, Method)
+#endif
+
+#endif // DATASETUTILS_H
diff --git a/src/DataSetXsd.cpp b/src/DataSetXsd.cpp

new file mode 100644 (file)

index 0000000..88d2d91
--- /dev/null
+++ b/src/DataSetXsd.cpp
@@ -0,0 +1,263 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file DataSetXsd.cpp
+/// \brief Implements the XSD- and namespace-related classes for DataSetXML.
+//
+// Author: Derek Barnett
+
+#include "pbbam/DataSetXsd.h"
+#include <unordered_map>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+static map<XsdType, NamespaceInfo> DefaultRegistry(void)
+{
+    const auto result = map<XsdType, NamespaceInfo>
+    {
+        { XsdType::NONE,                   NamespaceInfo{ "", "" } },
+        { XsdType::AUTOMATION_CONSTRAINTS, NamespaceInfo{ "",       "http://pacificbiosciences.com/PacBioAutomationConstraints.xsd" } },
+        { XsdType::BASE_DATA_MODEL,        NamespaceInfo{ "pbbase", "http://pacificbiosciences.com/PacBioBaseDataModel.xsd" } },
+        { XsdType::COLLECTION_METADATA,    NamespaceInfo{ "pbmeta", "http://pacificbiosciences.com/PacBioCollectionMetadata.xsd" } },
+        { XsdType::COMMON_MESSAGES,        NamespaceInfo{ "",       "http://pacificbiosciences.com/PacBioCommonMessages.xsd" } },
+        { XsdType::DATA_MODEL,             NamespaceInfo{ "pbdm",   "http://pacificbiosciences.com/PacBioDataModel.xsd" } },
+        { XsdType::DATA_STORE,             NamespaceInfo{ "",       "http://pacificbiosciences.com/PacBioDataStore.xsd" } },
+        { XsdType::DATASETS,               NamespaceInfo{ "pbds",   "http://pacificbiosciences.com/PacBioDatasets.xsd" } },
+        { XsdType::DECL_DATA,              NamespaceInfo{ "",       "http://pacificbiosciences.com/PacBioDeclData.xsd" } },
+        { XsdType::PART_NUMBERS,           NamespaceInfo{ "pbpn",   "http://pacificbiosciences.com/PacBioPartNumbers.xsd" } },
+        { XsdType::PRIMARY_METRICS,        NamespaceInfo{ "",       "http://pacificbiosciences.com/PacBioPrimaryMetrics.xsd" } },
+        { XsdType::REAGENT_KIT,            NamespaceInfo{ "pbrk",   "http://pacificbiosciences.com/PacBioReagentKit.xsd" } },
+        { XsdType::RIGHTS_AND_ROLES,       NamespaceInfo{ "",       "http://pacificbiosciences.com/PacBioRightsAndRoles.xsd" } },
+        { XsdType::SAMPLE_INFO,            NamespaceInfo{ "pbsample", "http://pacificbiosciences.com/PacBioSampleInfo.xsd" } },
+        { XsdType::SEEDING_DATA,           NamespaceInfo{ "",       "http://pacificbiosciences.com/PacBioSeedingData.xsd" } }
+    };
+    return result;
+}
+
+static const auto elementRegistry = unordered_map<string, XsdType>
+{
+    // 'pbbase' elements
+    //
+    { "AutomationParameter" ,  XsdType::BASE_DATA_MODEL },
+    { "AutomationParameters" , XsdType::BASE_DATA_MODEL },
+    { "BinCount" ,             XsdType::BASE_DATA_MODEL },
+    { "BinCounts" ,            XsdType::BASE_DATA_MODEL },
+    { "BinLabel" ,             XsdType::BASE_DATA_MODEL },
+    { "BinLabels" ,            XsdType::BASE_DATA_MODEL },
+    { "BinWidth" ,             XsdType::BASE_DATA_MODEL },
+    { "ExternalResource" ,     XsdType::BASE_DATA_MODEL },
+    { "ExternalResources" ,    XsdType::BASE_DATA_MODEL },
+    { "FileIndex" ,            XsdType::BASE_DATA_MODEL },
+    { "FileIndices" ,          XsdType::BASE_DATA_MODEL },
+    { "MaxBinValue" ,          XsdType::BASE_DATA_MODEL },
+    { "MaxOutlierValue" ,      XsdType::BASE_DATA_MODEL },
+    { "MetricDescription" ,    XsdType::BASE_DATA_MODEL },
+    { "NumBins" ,              XsdType::BASE_DATA_MODEL },
+    { "Properties" ,           XsdType::BASE_DATA_MODEL },
+    { "Property" ,             XsdType::BASE_DATA_MODEL },
+    { "Sample95thPct" ,        XsdType::BASE_DATA_MODEL },
+    { "SampleMean" ,           XsdType::BASE_DATA_MODEL },
+    { "SampleMed" ,            XsdType::BASE_DATA_MODEL },
+    { "SampleSize" ,           XsdType::BASE_DATA_MODEL },
+    { "SampleStd" ,            XsdType::BASE_DATA_MODEL },
+
+    // 'pbds' elements
+    //
+    { "AdapterDimerFraction",  XsdType::DATASETS },
+    { "AlignmentSet",          XsdType::DATASETS },
+    { "BarcodeConstruction",   XsdType::DATASETS },
+    { "BarcodeSet",            XsdType::DATASETS },
+    { "ConsensusAlignmentSet", XsdType::DATASETS },
+    { "ConsensusReadSet",      XsdType::DATASETS },
+    { "Contig",                XsdType::DATASETS },
+    { "Contigs",               XsdType::DATASETS },
+    { "ContigSet",             XsdType::DATASETS },
+    { "ControlReadLenDist",    XsdType::DATASETS },
+    { "ControlReadQualDist",   XsdType::DATASETS },
+    { "DataSetMetdata",        XsdType::DATASETS },
+    { "DataSet",               XsdType::DATASETS },
+    { "DataSets",              XsdType::DATASETS },
+    { "Filter",                XsdType::DATASETS },
+    { "Filters",               XsdType::DATASETS },
+    { "HdfSubreadSet",         XsdType::DATASETS },
+    { "InsertReadLenDist",     XsdType::DATASETS },
+    { "InsertReadQualDist" ,   XsdType::DATASETS },
+    { "MedianInsertDist",      XsdType::DATASETS },
+    { "NumRecords",            XsdType::DATASETS },
+    { "NumSequencingZmws",     XsdType::DATASETS },
+    { "Organism",              XsdType::DATASETS },
+    { "ParentTool",            XsdType::DATASETS },
+    { "Ploidy",                XsdType::DATASETS },
+    { "ProdDist",              XsdType::DATASETS },
+    { "Provenance",            XsdType::DATASETS },
+    { "ReadLenDist",           XsdType::DATASETS },
+    { "ReadQualDist",          XsdType::DATASETS },
+    { "ReadTypeDist",          XsdType::DATASETS },
+    { "ReferenceSet",          XsdType::DATASETS },
+    { "ShortInsertFraction",   XsdType::DATASETS },
+    { "SubreadSet",            XsdType::DATASETS },
+    { "SummaryStats",          XsdType::DATASETS },
+    { "TotalLength",           XsdType::DATASETS },
+
+    // 'pbmeta' elements
+    //
+    { "Automation",           XsdType::COLLECTION_METADATA },
+    { "AutomationName",       XsdType::COLLECTION_METADATA },
+    { "CellIndex",            XsdType::COLLECTION_METADATA },
+    { "CellPac",              XsdType::COLLECTION_METADATA },
+    { "CollectionFileCopy",   XsdType::COLLECTION_METADATA },
+    { "CollectionMetadata",   XsdType::COLLECTION_METADATA },
+    { "CollectionNumber",     XsdType::COLLECTION_METADATA },
+    { "CollectionPathUri",    XsdType::COLLECTION_METADATA },
+    { "Collections",          XsdType::COLLECTION_METADATA },
+    { "Concentration",        XsdType::COLLECTION_METADATA },
+    { "ConfigFileName",       XsdType::COLLECTION_METADATA },
+    { "CopyFiles",            XsdType::COLLECTION_METADATA },
+    { "InstCtrlVer",          XsdType::COLLECTION_METADATA },
+    { "MetricsVerbosity",     XsdType::COLLECTION_METADATA },
+    { "Name",                 XsdType::COLLECTION_METADATA },
+    { "OutputOptions",        XsdType::COLLECTION_METADATA },
+    { "PlateId",              XsdType::COLLECTION_METADATA },
+    { "Primary",              XsdType::COLLECTION_METADATA },
+    { "Readout",              XsdType::COLLECTION_METADATA },
+    { "ResultsFolder",        XsdType::COLLECTION_METADATA },
+    { "RunDetails",           XsdType::COLLECTION_METADATA },
+    { "RunId",                XsdType::COLLECTION_METADATA },
+    { "SampleReuseEnabled",   XsdType::COLLECTION_METADATA },
+    { "SequencingCondition",  XsdType::COLLECTION_METADATA },
+    { "SigProcVer",           XsdType::COLLECTION_METADATA },
+    { "SizeSelectionEnabled", XsdType::COLLECTION_METADATA },
+    { "StageHotstartEnabled", XsdType::COLLECTION_METADATA },
+    { "UseCount",             XsdType::COLLECTION_METADATA },
+    { "WellName",             XsdType::COLLECTION_METADATA },
+    { "WellSample",           XsdType::COLLECTION_METADATA },
+
+    // 'pbsample' elements
+    //
+    { "BioSample",         XsdType::SAMPLE_INFO },
+    { "BioSamplePointer",  XsdType::SAMPLE_INFO },
+    { "BioSamplePointers", XsdType::SAMPLE_INFO },
+    { "BioSamples",        XsdType::SAMPLE_INFO }
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+// ---------------
+// NamespaceInfo
+// ---------------
+
+NamespaceInfo::NamespaceInfo(void) { }
+
+NamespaceInfo::NamespaceInfo(const string& name,
+                             const string& uri)
+    : name_(name)
+    , uri_(uri)
+{ }
+
+// -------------------
+// NamespaceRegistry
+// -------------------
+
+NamespaceRegistry::NamespaceRegistry(void)
+    : data_(internal::DefaultRegistry())
+    , defaultXsdType_(XsdType::DATASETS)
+{ }
+
+NamespaceRegistry::NamespaceRegistry(const NamespaceRegistry &other)
+    : data_(other.data_)
+    , defaultXsdType_(other.defaultXsdType_)
+{ }
+
+NamespaceRegistry::NamespaceRegistry(NamespaceRegistry &&other)
+    : data_(std::move(other.data_))
+    , defaultXsdType_(std::move(other.defaultXsdType_))
+{ }
+
+NamespaceRegistry& NamespaceRegistry::operator=(const NamespaceRegistry& other)
+{
+    data_ = other.data_;
+    defaultXsdType_ = other.defaultXsdType_;
+    return *this;
+}
+
+NamespaceRegistry& NamespaceRegistry::operator=(NamespaceRegistry&& other)
+{
+    data_ = std::move(other.data_);
+    defaultXsdType_ = std::move(other.defaultXsdType_);
+    return *this;
+}
+
+NamespaceRegistry::~NamespaceRegistry(void) { }
+
+const NamespaceInfo& NamespaceRegistry::DefaultNamespace(void) const
+{ return Namespace(DefaultXsd()); }
+
+XsdType NamespaceRegistry::DefaultXsd(void) const
+{ return defaultXsdType_; }
+
+const NamespaceInfo& NamespaceRegistry::Namespace(const XsdType& xsd) const
+{ return data_.at(xsd); }
+
+void NamespaceRegistry::Register(const XsdType& xsd, const NamespaceInfo& namespaceInfo)
+{ data_[xsd] = namespaceInfo; }
+
+void NamespaceRegistry::SetDefaultXsd(const XsdType& xsd)
+{ defaultXsdType_ = xsd; }
+
+XsdType NamespaceRegistry::XsdForElement(const std::string& elementLabel) const
+{
+    const auto iter = internal::elementRegistry.find(elementLabel);
+    return (iter == internal::elementRegistry.cend() ? XsdType::NONE : iter->second);
+}
+
+XsdType NamespaceRegistry::XsdForUri(const std::string& uri) const
+{
+    map<XsdType, NamespaceInfo>::const_iterator iter = data_.cbegin();
+    map<XsdType, NamespaceInfo>::const_iterator end  = data_.cend();
+    for ( ; iter != end; ++iter ) {
+        const NamespaceInfo& info = iter->second;
+        if (info.Uri() == uri)
+            return iter->first;
+    }
+    return XsdType::NONE;
+}
diff --git a/src/EntireFileQuery.cpp b/src/EntireFileQuery.cpp

new file mode 100644 (file)

index 0000000..6813492
--- /dev/null
+++ b/src/EntireFileQuery.cpp
@@ -0,0 +1,65 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file EntireFileQuery.cpp
+/// \brief Implements the EntireFileQuery class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/EntireFileQuery.h"
+#include "pbbam/CompositeBamReader.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+struct EntireFileQuery::EntireFileQueryPrivate
+{
+    EntireFileQueryPrivate(const DataSet& dataset)
+        : reader_(dataset)
+    { }
+
+    SequentialCompositeBamReader reader_;
+};
+
+EntireFileQuery::EntireFileQuery(const DataSet &dataset)
+    : internal::IQuery()
+    , d_(new EntireFileQueryPrivate(dataset))
+{ }
+
+EntireFileQuery::~EntireFileQuery(void) { }
+
+bool EntireFileQuery::GetNext(BamRecord &r)
+{ return d_->reader_.GetNext(r); }
diff --git a/src/EnumClassHash.h b/src/EnumClassHash.h

new file mode 100644 (file)

index 0000000..53740f7
--- /dev/null
+++ b/src/EnumClassHash.h
@@ -0,0 +1,85 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file EnumClassHash.h
+/// \brief Defines the EnumClassHash class.
+//
+// Author: Derek Barnett
+
+#ifndef ENUMCLASSHASH_H
+#define ENUMCLASSHASH_H
+
+#include <cstddef>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+///
+/// \brief The EnumClassHash struct enables the use of enum class types as keys
+///        for std::unordered_map.
+///
+/// Allows something like:
+///
+/// \code{.cpp}
+///    std::unordered_map<Key_t, Value_t, EnumClassHash> myLookup;
+/// \endcode
+///
+/// where Key_t is an enum class. Without this sort of extra hand-holding to
+/// provide a 'manual' hash value, enum classes as keys will fail to compile.
+///
+/// \note This approach might be unnecessary in C++14, if I understand some of
+/// the changes correctly. But this works for C++11 and should continue beyond.
+///
+/// \sa http://stackoverflow.com/questions/18837857/cant-use-enum-class-as-unordered-map-key
+///
+struct EnumClassHash
+{
+    // *** NOTE ***
+    //
+    // Remove this when we integrate pbcopper.
+    // This is a duplicate of pbcopper/utility/EnumClassHash.h
+    //
+
+    template<typename T> size_t operator()(const T t) const
+    { return static_cast<size_t>(t); }
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // ENUMCLASSHASH_H
diff --git a/src/FastaReader.cpp b/src/FastaReader.cpp

new file mode 100644 (file)

index 0000000..f82d635
--- /dev/null
+++ b/src/FastaReader.cpp
@@ -0,0 +1,155 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file FastaReader.cpp
+/// \brief Implements the FastaReader class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/FastaReader.h"
+#include <htslib/faidx.h>
+#include <stdexcept>
+#include <fstream>
+#include <iostream>
+#include <limits>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+struct FastaReaderPrivate
+{
+    ifstream stream_;
+    string name_;
+    string bases_;
+
+    FastaReaderPrivate(const std::string& fn)
+        : stream_(fn)
+    {
+        if (!stream_)
+            throw std::runtime_error("FastaReader - could not open " + fn + " for reading");
+        FetchNext();
+    }
+
+    bool GetNext(FastaSequence& record)
+    {
+        if (name_.empty() && bases_.empty())
+            return false;
+        record = FastaSequence { name_, bases_ };
+        FetchNext();
+        return true;
+    }
+
+private:
+    void FetchNext(void)
+    {
+        name_.clear();
+        bases_.clear();
+
+        SkipNewlines();
+        ReadName();
+        ReadBases();
+    }
+
+    inline void SkipNewlines(void)
+    {
+        if (!stream_)
+            return;
+        if (stream_.peek() == '\n')
+            stream_.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
+    }
+
+    void ReadName(void) {
+        if (!stream_)
+            return;
+        if (stream_.get() == '>')
+            std::getline(stream_, name_,  '\n');
+    }
+
+    void ReadBases(void)
+    {
+        if (!stream_)
+            return;
+        char c = static_cast<char>(stream_.peek());
+        string line;
+        while (c != '>') {
+            if (!stream_)
+                return;
+            std::getline(stream_, line, '\n');
+            bases_ += line;
+            if (!stream_)
+                return;
+            c = static_cast<char>(stream_.peek());
+        }
+    }
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+FastaReader::FastaReader(const std::string& fn)
+    : d_{ new internal::FastaReaderPrivate{ fn } }
+{ }
+
+FastaReader::FastaReader(FastaReader&& other)
+    : d_{ std::move(other.d_) }
+{ }
+
+FastaReader& FastaReader::operator=(FastaReader&& other)
+{
+    d_.swap(other.d_);
+    return *this;
+}
+
+FastaReader::~FastaReader(void) { }
+
+bool FastaReader::GetNext(FastaSequence& record)
+{ return d_->GetNext(record); }
+
+vector<FastaSequence> FastaReader::ReadAll(const string& fn)
+{
+    vector<FastaSequence> result;
+    result.reserve(256);
+    FastaReader reader{ fn };
+    FastaSequence s;
+    while(reader.GetNext(s))
+        result.emplace_back(s);
+    return result;
+}
diff --git a/src/FileProducer.cpp b/src/FileProducer.cpp

new file mode 100644 (file)

index 0000000..8cec89e
--- /dev/null
+++ b/src/FileProducer.cpp
@@ -0,0 +1,71 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "FileProducer.h"
+#include <exception>
+#include <cstdio>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+FileProducer::FileProducer(const string& targetFilename)
+    : FileProducer(targetFilename, targetFilename + ".tmp")
+{ }
+
+FileProducer::FileProducer(const string& targetFilename,
+                           const string& tempFilename)
+    : targetFilename_(targetFilename)
+    , tempFilename_(tempFilename)
+{
+    // override renaming if writing to stdout
+    //
+    // setting temp filename to '-' keeps consistent interfaces
+    // for derived classes to actually operate on temp filename
+    if (targetFilename_ == "-")
+        tempFilename_ = "-";
+}
+
+FileProducer::~FileProducer(void)
+{
+    // skip renaming if there is a 'live' exception
+    // or if writing to stdout
+    if ((std::current_exception() == nullptr) && (tempFilename_ != "-")) {
+        std::rename(tempFilename_.c_str(),
+                    targetFilename_.c_str());
+    }
+}
diff --git a/src/FileProducer.h b/src/FileProducer.h

new file mode 100644 (file)

index 0000000..aee8c85
--- /dev/null
+++ b/src/FileProducer.h
@@ -0,0 +1,96 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef FILEPRODUCER_H
+#define FILEPRODUCER_H
+
+#include <string>
+#include <stdio.h>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// The FileProducer class provides functionality for working with a temp
+// file until successful destruction of a FileProducer-derived class.
+//
+// Derived classes should be sure to flush/close the temp file, and the
+// FileProducer's destructor will ensure that the temp file will be renamed to
+// the target filename.
+//
+// If destruction is triggered by an exception, no renaming will occur.
+//
+class FileProducer {
+
+protected:
+    FileProducer(void) = delete;
+
+    // Initializes FileProducer with specified target filename. Temp filename is
+    // set to target filename plus ".tmp" suffix.
+    FileProducer(const std::string& targetFilename);
+
+    // Initializes FileProducer with specified target filename & explicit temp
+    // filename.
+    FileProducer(const std::string& targetFilename,
+                 const std::string& tempFilename);
+
+    // Renames temp file to target filename.
+    //
+    // Derived classes should ensure that data is flushed and file handle closed
+    // before or during their destructor.
+    //
+    // Remaming will not occur if there is a 'live' exception being thrown.
+    //
+    ~FileProducer(void);
+
+protected:
+    const std::string& TargetFilename(void) const
+    { return targetFilename_; }
+
+    const std::string& TempFilename(void) const
+    { return tempFilename_; }
+
+private:
+    std::string targetFilename_;
+    std::string tempFilename_;
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // FILEPRODUCER_H
diff --git a/src/FileUtils.cpp b/src/FileUtils.cpp

new file mode 100644 (file)

index 0000000..a0a59af
--- /dev/null
+++ b/src/FileUtils.cpp
@@ -0,0 +1,246 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "FileUtils.h"
+#include "StringUtils.h"
+#include <boost/algorithm/string.hpp>
+#include <exception>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <cassert>
+#include <sys/stat.h>
+#include <unistd.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// pops "file://" scheme off the front of a URI/filepath, if found
+static string removeFileUriScheme(const string& uri)
+{
+    assert(!uri.empty());
+
+    auto schemeLess = uri;
+    const auto fileScheme = string{"file://"};
+    const auto schemeFound = schemeLess.find(fileScheme);
+    if (schemeFound != string::npos) {
+        if (schemeFound != 0)
+            throw runtime_error("Malformed URI: scheme not at beginning");
+        schemeLess = schemeLess.substr(fileScheme.size());
+    }
+    return schemeLess;
+}
+
+#ifdef PBBAM_WIN_FILEPATHS
+
+static
+string removeDiskName(const string& filePath)
+{
+    if (filePath.size() >= 2) {
+        const char firstChar = filePath.at(0);
+        if ((isalpha(firstChar) != 0) && (filePath.at(1) == ':'))
+            return filePath.substr(2);
+    }
+    return filePath;
+}
+
+static const char native_pathSeparator = '\\';
+
+static bool native_pathIsAbsolute(const string& filePath)
+{
+    assert(!filePath.empty());
+
+    // if starts with single slash or double slash
+    if (boost::algorithm::starts_with(filePath, "\\"))
+        return true;
+
+    // if starts with single or double-dots -> not absolute
+    if (boost::algorithm::starts_with(filePath, "."))
+        return false;
+
+    // if starts with disk drive name and colon ("C:\foo\bar.txt")
+    // strip the drive name and check to see if the remaining path is absolute
+    if (filePath.size() >= 2) {
+        const char firstChar = filePath.at(0);
+        if ((isalpha(firstChar) != 0) && (filePath.at(1) == ':'))
+            return native_pathIsAbsolute(removeDiskName(filePath));
+    }
+
+    // otherwise, likely relative
+    return false;
+}
+
+static string native_resolvedFilePath(const string& filePath,
+                                      const string& from)
+{
+    // strip file:// scheme if present
+    auto schemeLess = removeFileUriScheme(filePath);
+
+    // if empty or already absolute path, just return it
+    // upfront empty check simplifies further parsing logic
+    if (schemeLess.empty() || native_pathIsAbsolute(schemeLess))
+        return schemeLess;
+
+    // else make relative from the provided 'from' directory
+    //
+    // first pop disk name, then any leading single-dot '.'
+    //
+    // since we're prepending the 'from' directory, we can remove
+    // any leading './' form our file path. this may just mean that
+    // we pop it off to add it right back (when from == '.'), but this
+    // keeps it consistent with other 'from' parent directories
+    //
+    schemeLess = removeDiskName(schemeLess);
+
+    const bool thisDirAtStart = (schemeLess.find(".") == 0);
+    if (thisDirAtStart) {
+        if (schemeLess.find(native_pathSeparator) == 1)
+            schemeLess = schemeLess.substr(2);
+    }
+    return from + native_pathSeparator + schemeLess;
+}
+
+#else // else for non-Windows systems
+
+static const char native_pathSeparator = '/';
+
+static bool native_pathIsAbsolute(const string& filePath)
+{ return filePath.at(0) == '/'; }
+
+static string native_resolvedFilePath(const string& filePath,
+                                      const string& from)
+{
+    // strip file:// scheme if present
+    auto schemeLess = removeFileUriScheme(filePath);
+
+    // if empty or already absolute path, just return it
+    // upfront empty check simplifies further parsing logic
+    if (schemeLess.empty() || native_pathIsAbsolute(schemeLess))
+        return schemeLess;
+
+    // else make relative from the provided 'from' directory
+    //
+    // since we're prepending the 'from' directory, we can remove
+    // any leading './' form our file path. this may just mean that
+    // we pop it off to add it right back (when from == '.'), but this
+    // keeps it consistent with other 'from' parent directories
+    //
+    const bool thisDirAtStart = (schemeLess.find(".") == 0);
+    if (thisDirAtStart) {
+        if (schemeLess.find(native_pathSeparator) == 1)
+            schemeLess = schemeLess.substr(2);
+    }
+    return from + native_pathSeparator + schemeLess;
+}
+
+#endif // PBBAM_WIN_FILEPATHS
+
+// see http://stackoverflow.com/questions/2869594/how-return-a-stdstring-from-cs-getcwd-function
+string FileUtils::CurrentWorkingDirectory(void)
+{
+    const size_t chunkSize = 1024;
+    const size_t maxNumChunks = 20;
+
+    // stack-based buffer for 'normal' case
+    char buffer[chunkSize];
+    if (getcwd(buffer, sizeof(buffer)) != NULL)
+        return string(buffer);
+
+    // if error is not ERANGE, then it's not a problem of too-long name... something else happened
+    if (errno != ERANGE)
+        throw runtime_error("could not determine current working directory path");
+
+    // long path - use heap, trying progressively longer buffers
+    for (size_t chunks = 2; chunks < maxNumChunks; ++chunks) {
+        unique_ptr<char> cwd(new char[chunkSize*chunks]);
+        if (getcwd(cwd.get(), chunkSize*chunks) != NULL)
+            return string(cwd.get());
+
+        // if error is not ERANGE, then it's not a problem of too-long name... something else happened
+        if (errno != ERANGE)
+            throw runtime_error("could not determine current working directory path");
+    }
+
+    // crazy long path name
+    throw runtime_error("could determine current working directory - extremely long path");
+}
+
+string FileUtils::DirectoryName(const string& file)
+{
+    const size_t found = file.rfind(Separator(), file.length());
+    if (found != string::npos)
+        return file.substr(0, found);
+    return string(".");
+}
+
+bool FileUtils::Exists(const char* fn)
+{
+    struct stat buf;
+    return (stat(fn, &buf) != -1);
+}
+
+chrono::system_clock::time_point FileUtils::LastModified(const char* fn)
+{
+    struct stat s;
+    if (stat(fn, &s) != 0)
+        throw runtime_error("could not get file timestamp");
+    return chrono::system_clock::from_time_t(s.st_mtime);
+}
+
+string FileUtils::ResolvedFilePath(const string& filePath,
+                                   const string& from)
+{ return native_resolvedFilePath(filePath, from); }
+
+constexpr char FileUtils::Separator(void)
+{ return native_pathSeparator; }
+
+off_t FileUtils::Size(const char* fn)
+{
+    struct stat s;
+    if (stat(fn, &s) != 0)
+        throw runtime_error("could not determine file size");
+    return s.st_size;
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
diff --git a/src/FileUtils.h b/src/FileUtils.h

new file mode 100644 (file)

index 0000000..112223e
--- /dev/null
+++ b/src/FileUtils.h
@@ -0,0 +1,145 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef FILEUTILS_H
+#define FILEUTILS_H
+
+#include <chrono>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+struct FileUtils
+{
+public:
+
+    /// \returns application's current working directory
+    static std::string CurrentWorkingDirectory(void);
+
+    /// Parses a filepath for the the directory name for a file.
+    ///
+    /// Essentially this method strips the filename from the string provided (/path/to/file => /path/to).
+    /// If only a filename is provided, then "." is returned to indicate the current directory.
+    ///
+    /// \param[in] file name of file (can be just a filename or path/to/filename)
+    /// \returns file's directory name
+    ///
+    static std::string DirectoryName(const std::string& file);
+
+    /// Check for existence of a file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns true if file exists & can be opened
+    ///
+    static bool Exists(const char* fn);
+
+    /// Check for existence of a file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns true if file exists & can be opened
+    ///
+    static bool Exists(const std::string& fn);
+
+    /// Check "last modified" timestamp for a file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns time of last modification
+    /// \throws runtime_error if file info can't be accessed
+    ///
+    static std::chrono::system_clock::time_point LastModified(const char* fn);
+
+    /// Check "last modified" timestamp for a file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns time of last modification
+    /// \throws runtime_error if file info can't be accessed
+    ///
+    static std::chrono::system_clock::time_point LastModified(const std::string& fn);
+
+    /// Resolves input file path using optional starting directory.
+    ///
+    /// \verbatim
+    ///   /absolute/path/to/file.txt   => /absolute/path/to/file.txt
+    ///   ../relative/path/to/file.txt => <from>/../relative/path/to/file.txt
+    ///   file.txt                     => <from>/file.txt
+    /// \endverbatim
+    ///
+    /// \note This method will strip any URI scheme as well ("file://") so that the result is immediately ready from I/O operations.
+    ///
+    /// \param[in] filePath file path to be resolved
+    /// \param[in] from     optional starting directory (useful if not same as application's working directory)
+    /// \returns resolved file path
+    ///
+    static std::string ResolvedFilePath(const std::string& filePath,
+                                        const std::string& from = ".");
+
+    /// \returns native path separator
+    constexpr static char Separator(void);
+
+    /// Check size of file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns file size in bytes
+    /// \throws runtime_error if file info can't be accessed
+    ///
+    static off_t Size(const char* fn);
+
+    /// Check size of file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns file size in bytes
+    /// \throws runtime_error if file info can't be accessed
+    ///
+    static off_t Size(const std::string& fn);
+};
+
+inline bool FileUtils::Exists(const std::string& fn)
+{ return FileUtils::Exists(fn.c_str()); }
+
+inline std::chrono::system_clock::time_point FileUtils::LastModified(const std::string& fn)
+{ return FileUtils::LastModified(fn.c_str()); }
+
+inline off_t FileUtils::Size(const std::string& fn)
+{ return FileUtils::Size(fn.c_str()); }
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // FILEUTILS_H
diff --git a/src/FofnReader.cpp b/src/FofnReader.cpp

new file mode 100644 (file)

index 0000000..a0d9280
--- /dev/null
+++ b/src/FofnReader.cpp
@@ -0,0 +1,52 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "FofnReader.h"
+#include <iostream>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+vector<string> FofnReader::Files(istream& in)
+{
+    vector<string> files;
+    string fn;
+    while (getline(in, fn))
+        files.push_back(fn);
+    return files;
+}
diff --git a/src/FofnReader.h b/src/FofnReader.h

new file mode 100644 (file)

index 0000000..ee09fc5
--- /dev/null
+++ b/src/FofnReader.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef FOFNREADER_H
+#define FOFNREADER_H
+
+#include "pbbam/DataSet.h"
+#include <iosfwd>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+class FofnReader
+{
+public:
+    static std::vector<std::string> Files(std::istream& in);
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // FOFNREADER_H
diff --git a/src/Frames.cpp b/src/Frames.cpp

new file mode 100644 (file)

index 0000000..e54729a
--- /dev/null
+++ b/src/Frames.cpp
@@ -0,0 +1,181 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Frames.cpp
+/// \brief Implements the Frames class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Frames.h"
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+static vector<uint16_t> framepoints;
+static vector<uint8_t> frameToCode;
+static uint16_t maxFramepoint;
+
+static
+void InitIpdDownsampling(void)
+{
+    if (!framepoints.empty())
+        return;
+
+    // liftover from Dave's python code:
+    // .../bioinformatics/tools/kineticsTools/kineticsTools/_downsampling.py
+
+    const int B = 2;
+    const int t = 6;
+    const double T = pow(B, t);
+
+    int next = 0;
+    double grain;
+    const int end = 256/T;
+    for (int i = 0; i < end; ++i) {
+        grain = pow(B, i);
+        vector<uint16_t> nextOnes;
+        for (double j = 0; j < T; ++j)
+            nextOnes.push_back(j*grain + next);
+        next = nextOnes.back() + grain;
+        framepoints.insert(framepoints.end(), nextOnes.cbegin(), nextOnes.cend());
+    }
+    assert(framepoints.size()-1 <= UINT8_MAX);
+
+    const uint16_t maxElement = (*max_element(framepoints.cbegin(), framepoints.cend()));
+    frameToCode.assign(maxElement+1, 0);
+
+    const int fpEnd = framepoints.size() - 1;
+    uint8_t i = 0;
+    uint16_t fl = 0;
+    uint16_t fu = 0;
+    for (; i < fpEnd; ++i) {
+        fl = framepoints[i];
+        fu = framepoints[i+1];
+        if (fu > fl+1) {
+            const int middle = (fl+fu)/2;
+            for (int f = fl; f < middle; ++f)
+                frameToCode[f] = i;
+            for (int f = middle; f < fu; ++f)
+                frameToCode[f] = i+1;
+        } else
+            frameToCode[fl] = i;
+    }
+
+    // this next line differs from the python implementation (there, it's "i+1")
+    // our C++ for loop has incremented our index counter one more time than the indexes from python enumerate(...)
+    frameToCode[fu] = i;
+    maxFramepoint = fu;
+}
+
+static inline
+uint16_t CodeToFrames(const uint8_t code)
+{
+    return framepoints[code];
+}
+
+static
+vector<uint16_t> CodeToFrames(const vector<uint8_t>& codedData)
+{
+    InitIpdDownsampling();
+
+    const size_t length = codedData.size();
+    vector<uint16_t> frames(length, 0);
+    for (size_t i = 0; i < length; ++i)
+        frames[i] = CodeToFrames(codedData[i]);
+    return frames;
+}
+
+static inline
+uint8_t FramesToCode(const uint16_t frame)
+{
+    return frameToCode[std::min(maxFramepoint, frame)];
+}
+
+static
+vector<uint8_t> FramesToCode(const vector<uint16_t>& frames)
+{
+    InitIpdDownsampling();
+
+    const size_t length = frames.size();
+    vector<uint8_t> result(length, 0);
+    for (size_t i = 0; i < length; ++i)
+        result[i] = FramesToCode(frames[i]);
+    return result;
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+Frames::Frames(void)
+{ }
+
+Frames::Frames(const std::vector<uint16_t>& frames)
+    : data_(frames)
+{ }
+
+Frames::Frames(std::vector<uint16_t>&& frames)
+    : data_(std::move(frames))
+{ }
+
+Frames::Frames(const Frames& other)
+    : data_(other.data_)
+{ }
+
+Frames::Frames(Frames&& other)
+    : data_(std::move(other.data_))
+{ }
+
+Frames::~Frames(void) { }
+
+Frames& Frames::operator=(const Frames& other)
+{ data_ = other.data_; return *this; }
+
+Frames& Frames::operator=(Frames&& other)
+{ data_ = std::move(other.data_); return *this; }
+
+Frames Frames::Decode(const std::vector<uint8_t>& codedData)
+{ return Frames(internal::CodeToFrames(codedData)); }
+
+std::vector<uint8_t> Frames::Encode(const std::vector<uint16_t>& frames)
+{ return internal::FramesToCode(frames); }
diff --git a/src/GenomicInterval.cpp b/src/GenomicInterval.cpp

new file mode 100644 (file)

index 0000000..10ebc23
--- /dev/null
+++ b/src/GenomicInterval.cpp
@@ -0,0 +1,136 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file GenomicInterval.cpp
+/// \brief Implements the GenomicInterval class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/GenomicInterval.h"
+#include "AssertUtils.h"
+#include "StringUtils.h"
+#include <cstdlib>
+#include <cstring>
+#include <ctype.h>
+#include <stdexcept>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// returns sequence name & sets begin/end, from input regionString
+string parseRegionString(const string& reg,
+                         PacBio::BAM::Position* begin,
+                         PacBio::BAM::Position* end)
+{
+    const vector<string> parts = internal::Split(reg, ':');
+    if (parts.empty() || parts.size() > 2)
+        throw std::runtime_error("malformed region string");
+
+    // given name only, default min,max intervals
+    if (parts.size() == 1) {
+        *begin = 0;
+        *end = 1<<29;
+    }
+
+    // parse interval from input
+    else if (parts.size() == 2) {
+        const vector<string> intervalParts = internal::Split(parts.at(1), '-');
+        if (intervalParts.empty() || intervalParts.size() >2 )
+            throw std::runtime_error("malformed region string");
+        *begin = std::stoi(intervalParts.at(0));
+        *end   = std::stoi(intervalParts.at(1));
+    }
+
+    return parts.at(0);
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+GenomicInterval::GenomicInterval(void) { }
+
+GenomicInterval::GenomicInterval(const std::string& name,
+                                 const Position& start,
+                                 const Position& stop)
+    : name_(name)
+    , interval_(start, stop)
+{ }
+
+GenomicInterval::GenomicInterval(const string& samtoolsRegionString)
+{
+    Position begin;
+    Position end;
+    name_ = internal::parseRegionString(samtoolsRegionString, &begin, &end);
+    interval_ = PacBio::BAM::Interval<Position>(begin, end);
+}
+
+GenomicInterval::GenomicInterval(const GenomicInterval& other)
+    : name_(other.name_)
+    , interval_(other.interval_)
+{ }
+
+GenomicInterval& GenomicInterval::operator=(const GenomicInterval& other)
+{
+    name_ = other.name_;
+    interval_ = other.interval_;
+    return *this;
+}
+
+bool GenomicInterval::CoveredBy(const GenomicInterval& other) const
+{
+    if (name_ != other.name_)
+        return false;
+    return interval_.CoveredBy(other.interval_);
+}
+
+bool GenomicInterval::Covers(const GenomicInterval& other) const
+{
+    if (name_ != other.name_)
+        return false;
+    return interval_.Covers(other.interval_);
+}
+
+bool GenomicInterval::Intersects(const GenomicInterval& other) const
+{
+    if (name_ != other.name_)
+        return false;
+    return interval_.Intersects(other.interval_);
+}
diff --git a/src/GenomicIntervalQuery.cpp b/src/GenomicIntervalQuery.cpp

new file mode 100644 (file)

index 0000000..b6ead9f
--- /dev/null
+++ b/src/GenomicIntervalQuery.cpp
@@ -0,0 +1,73 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file GenomicIntervalQuery.cpp
+/// \brief Implements the GenomicIntervalQuery class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/GenomicIntervalQuery.h"
+#include "pbbam/CompositeBamReader.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+struct GenomicIntervalQuery::GenomicIntervalQueryPrivate
+{
+    GenomicIntervalQueryPrivate(const GenomicInterval& interval,
+                                const DataSet& dataset)
+        : reader_(interval, dataset)
+    { }
+
+    GenomicIntervalCompositeBamReader reader_;
+};
+
+GenomicIntervalQuery::GenomicIntervalQuery(const GenomicInterval& interval,
+                                           const DataSet &dataset)
+    : internal::IQuery()
+    , d_(new GenomicIntervalQueryPrivate(interval, dataset))
+{ }
+
+GenomicIntervalQuery::~GenomicIntervalQuery(void) { }
+
+bool GenomicIntervalQuery::GetNext(BamRecord &r)
+{ return d_->reader_.GetNext(r); }
+
+GenomicIntervalQuery& GenomicIntervalQuery::Interval(const GenomicInterval& interval)
+{ d_->reader_.Interval(interval); return *this; }
+
+const GenomicInterval& GenomicIntervalQuery::Interval(void) const
+{ return d_->reader_.Interval(); }
diff --git a/src/IRecordWriter.cpp b/src/IRecordWriter.cpp

new file mode 100644 (file)

index 0000000..7333182
--- /dev/null
+++ b/src/IRecordWriter.cpp
@@ -0,0 +1,48 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file IRecordWriter.cpp
+/// \brief Implements the IRecordWriter class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/IRecordWriter.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+IRecordWriter::IRecordWriter(void) { }
+
+IRecordWriter::~IRecordWriter(void) { }
diff --git a/src/IndexedFastaReader.cpp b/src/IndexedFastaReader.cpp

new file mode 100644 (file)

index 0000000..715dd03
--- /dev/null
+++ b/src/IndexedFastaReader.cpp
@@ -0,0 +1,236 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file IndexedFastaReader.cpp
+/// \brief Implements the IndexedFastaReader class.
+//
+// Author: David Alexander
+
+#include "pbbam/IndexedFastaReader.h"
+
+#include "pbbam/BamRecord.h"
+#include "pbbam/GenomicInterval.h"
+#include "pbbam/Orientation.h"
+#include "SequenceUtils.h"
+#include <htslib/faidx.h>
+#include <iostream>
+#include <cstdlib>
+
+namespace PacBio {
+namespace BAM {
+
+IndexedFastaReader::IndexedFastaReader(const std::string& filename)
+{
+    Open(filename);
+}
+
+IndexedFastaReader::IndexedFastaReader(const IndexedFastaReader& src)
+{
+    if (!Open(src.filename_))
+        throw std::runtime_error("Cannot open file " + src.filename_);
+}
+
+IndexedFastaReader& IndexedFastaReader::operator=(const IndexedFastaReader& rhs)
+{
+    if (&rhs == this)
+        return *this;
+
+    Open(rhs.filename_);
+    return *this;
+}
+
+IndexedFastaReader::~IndexedFastaReader(void)
+{
+    Close();
+}
+
+bool IndexedFastaReader::Open(const std::string &filename)
+{
+    faidx_t* handle = fai_load(filename.c_str());
+    if (handle == nullptr)
+        return false;
+    else
+    {
+        filename_ = filename;
+        handle_ = handle;
+        return true;
+    }
+}
+
+void IndexedFastaReader::Close(void)
+{
+    filename_ = "";
+    if (handle_ != nullptr)
+        fai_destroy(handle_);
+    handle_ = nullptr;
+}
+
+#define REQUIRE_FAIDX_LOADED if (handle_ == nullptr) throw std::exception()
+
+std::string IndexedFastaReader::Subsequence(const std::string& id,
+                                            Position begin,
+                                            Position end) const
+{
+    REQUIRE_FAIDX_LOADED;
+
+    int len;
+    // Derek: *Annoyingly* htslib seems to interpret "end" as inclusive in
+    // faidx_fetch_seq, whereas it considers it exclusive in the region spec in
+    // fai_fetch.  Can you please verify?
+    char* rawSeq = faidx_fetch_seq(handle_, id.c_str(), begin, end - 1, &len);
+    if (rawSeq == nullptr)
+        throw std::runtime_error("could not fetch FASTA sequence");
+    else {
+        std::string seq(rawSeq);
+        free(rawSeq);
+        return seq;
+    }
+}
+
+std::string IndexedFastaReader::Subsequence(const GenomicInterval& interval) const
+{
+    REQUIRE_FAIDX_LOADED;
+    return Subsequence(interval.Name(), interval.Start(), interval.Stop());
+}
+
+std::string IndexedFastaReader::Subsequence(const char *htslibRegion) const
+{
+    REQUIRE_FAIDX_LOADED;
+
+    int len;
+    char* rawSeq = fai_fetch(handle_, htslibRegion, &len);
+    if (rawSeq == nullptr)
+        throw std::runtime_error("could not fetch FASTA sequence");
+    else {
+        std::string seq(rawSeq);
+        free(rawSeq);
+        return seq;
+    }
+}
+
+
+std::string
+IndexedFastaReader::ReferenceSubsequence(const BamRecord& bamRecord,
+                                         const Orientation orientation,
+                                         const bool gapped,
+                                         const bool exciseSoftClips) const
+{
+    REQUIRE_FAIDX_LOADED;
+
+    std::string subseq = Subsequence(bamRecord.ReferenceName(),
+                                     bamRecord.ReferenceStart(),
+                                     bamRecord.ReferenceEnd());
+    const auto reverse = orientation != Orientation::GENOMIC &&
+                         bamRecord.Impl().IsReverseStrand();
+
+    if (bamRecord.Impl().IsMapped() && gapped)
+    {
+        size_t seqIndex = 0;
+        const Cigar& cigar = bamRecord.Impl().CigarData();
+        Cigar::const_iterator cigarIter = cigar.cbegin();
+        Cigar::const_iterator cigarEnd = cigar.cend();
+        for (; cigarIter != cigarEnd; ++cigarIter)
+        {
+            const CigarOperation& op = (*cigarIter);
+            const CigarOperationType& type = op.Type();
+
+            // do nothing for hard clips
+            if (type != CigarOperationType::HARD_CLIP)
+            {
+                const size_t opLength = op.Length();
+
+                // maybe remove soft clips
+                if (type == CigarOperationType::SOFT_CLIP)
+                {
+                    if (!exciseSoftClips)
+                    {
+                        subseq.reserve(subseq.size() + opLength);
+                        subseq.insert(seqIndex, opLength, '-');
+                        seqIndex += opLength;
+                    }
+                }
+
+                // for non-clipping operations
+                else {
+
+                    // maybe add gaps/padding
+                    if (type == CigarOperationType::INSERTION)
+                    {
+                        subseq.reserve(subseq.size() + opLength);
+                        subseq.insert(seqIndex, opLength, '-');
+                    }
+                    else if (type == CigarOperationType::PADDING)
+                    {
+                        subseq.reserve(subseq.size() + opLength);
+                        subseq.insert(seqIndex, opLength, '*');
+                    }
+
+                    // update index
+                    seqIndex += opLength;
+                }
+            }
+        }
+    }
+
+    if (reverse)
+        internal::ReverseComplementCaseSens(subseq);
+
+    return subseq;
+}
+
+
+int IndexedFastaReader::NumSequences(void) const
+{
+    REQUIRE_FAIDX_LOADED;
+    return faidx_nseq(handle_);
+}
+
+bool IndexedFastaReader::HasSequence(const std::string& name) const
+{
+    REQUIRE_FAIDX_LOADED;
+    return (faidx_has_seq(handle_, name.c_str()) != 0);
+}
+
+int IndexedFastaReader::SequenceLength(const std::string& name) const
+{
+    REQUIRE_FAIDX_LOADED;
+    int len = faidx_seq_len(handle_, name.c_str());
+    if (len < 0)
+        throw std::runtime_error("could not determine FASTA sequence length");
+    else return len;
+}
+
+}}  // PacBio::BAM
diff --git a/src/MD5.cpp b/src/MD5.cpp

new file mode 100644 (file)

index 0000000..b2262d5
--- /dev/null
+++ b/src/MD5.cpp
@@ -0,0 +1,70 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file MD5.cpp
+/// \brief Implements basic MD5 hash utilities
+//
+// Author: Brett Bowman
+
+#include "pbbam/MD5.h"
+#include <cram/md5.h>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief MD5 hash of a string as a 32-digit hexadecimal string
+///
+std::string MD5Hash(const std::string& str)
+{
+    MD5_CTX md5;
+    unsigned char digest[16];
+    char hexdigest[33];
+
+    MD5_Init(&md5);
+    MD5_Update(&md5, reinterpret_cast<void*>(const_cast<char*>(str.c_str())), str.size());
+    MD5_Final(digest, &md5);
+
+    for (int i = 0; i < 16; ++i)
+        sprintf(&hexdigest[2*i], "%02x", digest[i]);
+
+   return std::string{hexdigest, 32};
+}
+
+} // namespace BAM
+} // namespace PacBio
+
+
+
diff --git a/src/MemoryUtils.cpp b/src/MemoryUtils.cpp

new file mode 100644 (file)

index 0000000..e3ec6a2
--- /dev/null
+++ b/src/MemoryUtils.cpp
@@ -0,0 +1,82 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "MemoryUtils.h"
+#include <string>
+#include <cstdlib>
+#include <cstring>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+// -----------------
+// BamHeaderMemory
+// -----------------
+
+BamHeader BamHeaderMemory::FromRawData(bam_hdr_t* hdr)
+{
+    // null input - error
+    if (hdr == nullptr)
+        throw std::runtime_error("invalid BAM header");
+
+    // empty text input - ok
+    if (hdr->text == nullptr || hdr->l_text == 0)
+        return BamHeader();
+
+    // parse normal SAM text input
+    return BamHeader(string(hdr->text, hdr->l_text));
+}
+
+PBBAM_SHARED_PTR<bam_hdr_t> BamHeaderMemory::MakeRawHeader(const BamHeader& header)
+{
+    const string& text = header.ToSam();
+    PBBAM_SHARED_PTR<bam_hdr_t> rawData(sam_hdr_parse(text.size(), text.c_str()), internal::HtslibHeaderDeleter());
+    rawData->ignore_sam_err = 0;
+    rawData->cigar_tab = NULL;
+    rawData->l_text = text.size();
+    rawData->text = (char*)calloc(rawData->l_text + 1, 1);
+    memcpy(rawData->text, text.c_str(), rawData->l_text);
+    return rawData;
+}
+
+//PBBAM_SHARED_PTR<bam_hdr_t> BamHeaderMemory::MakeRawHeader(const BamHeader& header)
+//{
+//    if (!header)
+//        return PBBAM_SHARED_PTR<bam_hdr_t>(nullptr);
+//    return MakeRawHeader(*header.get());
+//}
diff --git a/src/MemoryUtils.h b/src/MemoryUtils.h

new file mode 100644 (file)

index 0000000..c22f9f5
--- /dev/null
+++ b/src/MemoryUtils.h
@@ -0,0 +1,168 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef MEMORYUTILS_H
+#define MEMORYUTILS_H
+
+#include "pbbam/Config.h"
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/BamRecordImpl.h"
+#include <htslib/bgzf.h>
+#include <htslib/sam.h>
+#include <memory>
+
+namespace PacBio {
+namespace BAM {
+
+class BamHeader;
+
+namespace internal {
+
+// intended for use with PBBAM_SHARED_PTR<T>, std::unique_ptr<T>, etc
+
+struct HtslibBgzfDeleter
+{
+    void operator()(BGZF* bgzf)
+    {
+        if (bgzf) 
+            bgzf_close(bgzf);
+        bgzf = nullptr;
+    }
+};
+
+struct HtslibFileDeleter
+{
+    void operator()(samFile* file)
+    {
+        if (file)
+            sam_close(file);
+        file = nullptr;
+    }
+};
+
+struct HtslibHeaderDeleter
+{
+    void operator()(bam_hdr_t* hdr)
+    {
+        if (hdr)
+            bam_hdr_destroy(hdr);
+        hdr = nullptr;
+    }
+};
+
+struct HtslibIndexDeleter
+{
+    void operator()(hts_idx_t* index)
+    {
+        if (index)
+            hts_idx_destroy(index);
+        index = nullptr;
+    }
+};
+
+struct HtslibIteratorDeleter
+{
+    void operator()(hts_itr_t* iter)
+    {
+        if (iter)
+            hts_itr_destroy(iter);
+        iter = nullptr;
+    }
+};
+
+struct HtslibRecordDeleter
+{
+    void operator()(bam1_t* b)
+    {
+        if (b)
+            bam_destroy1(b);
+        b = nullptr;
+    }
+};
+
+class BamHeaderMemory
+{
+public:
+    static BamHeader FromRawData(bam_hdr_t* header);
+    static PBBAM_SHARED_PTR<bam_hdr_t> MakeRawHeader(const BamHeader& header);
+//    static PBBAM_SHARED_PTR<bam_hdr_t> MakeRawHeader(const BamHeader& header);
+};
+
+class BamRecordMemory
+{
+public:
+    static const BamRecordImpl& GetImpl(const BamRecord& r);
+    static const BamRecordImpl& GetImpl(const BamRecord* r);
+    static PBBAM_SHARED_PTR<bam1_t> GetRawData(const BamRecord& r);
+    static PBBAM_SHARED_PTR<bam1_t> GetRawData(const BamRecord* r);
+    static PBBAM_SHARED_PTR<bam1_t> GetRawData(const BamRecordImpl& impl);
+    static PBBAM_SHARED_PTR<bam1_t> GetRawData(const BamRecordImpl* impl);
+
+    static void UpdateRecordTags(const BamRecord& r);
+    static void UpdateRecordTags(const BamRecordImpl& r);
+};
+
+inline const BamRecordImpl& BamRecordMemory::GetImpl(const BamRecord& r)
+{ return r.impl_; }
+
+inline const BamRecordImpl& BamRecordMemory::GetImpl(const BamRecord* r)
+{ return r->impl_; }
+
+inline PBBAM_SHARED_PTR<bam1_t> BamRecordMemory::GetRawData(const BamRecord& r)
+{ return GetRawData(r.impl_); }
+
+inline PBBAM_SHARED_PTR<bam1_t> BamRecordMemory::GetRawData(const BamRecord* r)
+{ return GetRawData(r->impl_); }
+
+inline PBBAM_SHARED_PTR<bam1_t> BamRecordMemory::GetRawData(const BamRecordImpl& impl)
+{ return impl.d_; }
+
+inline PBBAM_SHARED_PTR<bam1_t> BamRecordMemory::GetRawData(const BamRecordImpl* impl)
+{ return impl->d_; }
+
+inline void BamRecordMemory::UpdateRecordTags(const BamRecord& r)
+{ UpdateRecordTags(r.impl_); }
+
+inline void BamRecordMemory::UpdateRecordTags(const BamRecordImpl& r)
+{ r.UpdateTagMap(); }
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // MEMORYUTILS_H
diff --git a/src/PbiBuilder.cpp b/src/PbiBuilder.cpp

new file mode 100644 (file)

index 0000000..5cb6d6c
--- /dev/null
+++ b/src/PbiBuilder.cpp
@@ -0,0 +1,405 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiBuilder.cpp
+/// \brief Implements the PbiBuilder class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiBuilder.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/PbiRawData.h"
+#include "FileProducer.h"
+#include "MemoryUtils.h"
+#include "PbiIndexIO.h"
+#include <htslib/bgzf.h>
+#include <thread>
+#include <cassert>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// -------------------------------------------
+// PbiRawReferenceDataBuilder implementation
+// -------------------------------------------
+
+// helper for reference data
+class PbiRawReferenceDataBuilder
+{
+public:
+    PbiRawReferenceDataBuilder(const size_t numReferenceSequences);
+
+public:
+    bool AddRecord(const BamRecord& record,
+                   const PbiReferenceEntry::Row rowNumber);
+    PbiRawReferenceData Result(void) const;
+
+private:
+    int32_t lastRefId_;
+    Position lastPos_;
+    map<uint32_t, PbiReferenceEntry> rawReferenceEntries_;
+};
+
+PbiRawReferenceDataBuilder::PbiRawReferenceDataBuilder(const size_t numReferenceSequences)
+    : lastRefId_(-1)
+    , lastPos_(-1)
+{
+    // initialize with number of references we expect to see
+    //
+    // we can add more later, but want to ensure known references have an entry
+    // even if no records are observed mapping to it
+    //
+    for (size_t i = 0; i < numReferenceSequences; ++i)
+        rawReferenceEntries_[i] = PbiReferenceEntry(i);
+
+    // also create an "unmapped" entry
+    rawReferenceEntries_[PbiReferenceEntry::UNMAPPED_ID] = PbiReferenceEntry();
+}
+
+bool PbiRawReferenceDataBuilder::AddRecord(const BamRecord& record,
+                                           const PbiReferenceEntry::Row rowNumber)
+{
+    // fetch ref ID & pos for record
+    const int32_t tId = record.ReferenceId();
+    const int32_t pos = record.ReferenceStart();
+
+    // sanity checks to protect against non-coordinate-sorted BAMs
+    if (lastRefId_ != tId || (lastRefId_ >= 0 && tId < 0)) {
+        if (tId >= 0) {
+
+            // if we've already seen unmapped reads, but our current tId is valid
+            //
+            // error: unmapped reads should all be at the end (can stop checking refs)
+            //
+            PbiReferenceEntry& unmappedEntry =
+                    rawReferenceEntries_[PbiReferenceEntry::UNMAPPED_ID];
+            if (unmappedEntry.beginRow_ != PbiReferenceEntry::UNSET_ROW)
+                return false;
+
+            // if we've already seen data for this new tId
+            // (remember we're coming from another tId)
+            //
+            // error: refs are out of order (can stop checking refs)
+            //
+            PbiReferenceEntry& currentEntry =
+                    rawReferenceEntries_[(uint32_t)tId];
+            if (currentEntry.beginRow_ != PbiReferenceEntry::UNSET_ROW)
+                return false;
+        }
+        lastRefId_ = tId;
+    }
+    else if (tId >= 0 && lastPos_ > pos)
+        return false; //error: positions out of order
+
+    // update row numbers
+    PbiReferenceEntry& entry = rawReferenceEntries_[(uint32_t)tId];
+    if (entry.beginRow_ == PbiReferenceEntry::UNSET_ROW)
+        entry.beginRow_ = rowNumber;
+    entry.endRow_ = rowNumber+1;
+
+    // update pos (for sorting check next go-round)
+    lastPos_ = pos;
+    return true;
+}
+
+PbiRawReferenceData PbiRawReferenceDataBuilder::Result(void) const {
+    // PbiReferenceEntries will be sorted thanks to std::map
+    // tId will be at end since we're sorting on the uint cast of -1
+    PbiRawReferenceData result;
+    result.entries_.reserve(rawReferenceEntries_.size());
+    auto refIter = rawReferenceEntries_.cbegin();
+    const auto refEnd  = rawReferenceEntries_.cend();
+    for ( ; refIter != refEnd; ++refIter )
+        result.entries_.push_back(refIter->second);
+    return result;
+}
+
+// ----------------------------------
+// PbiBuilderPrivate implementation
+// ----------------------------------
+
+class PbiBuilderPrivate : public internal::FileProducer
+{
+public:
+    PbiBuilderPrivate(const string& filename,
+                      const size_t numReferenceSequences,
+                      const PbiBuilder::CompressionLevel compressionLevel,
+                      const size_t numThreads);
+    PbiBuilderPrivate(const string& filename,
+                      const size_t numReferenceSequences,
+                      const bool isCoordinateSorted,
+                      const PbiBuilder::CompressionLevel compressionLevel,
+                      const size_t numThreads);
+    ~PbiBuilderPrivate(void);
+
+public:
+    void AddRecord(const BamRecord& record, const int64_t vOffset);
+
+public:
+    bool HasBarcodeData(void) const;
+    bool HasMappedData(void) const;
+    bool HasReferenceData(void) const;
+
+public:
+    unique_ptr<BGZF, HtslibBgzfDeleter> bgzf_;
+    PbiRawData rawData_;
+    PbiReferenceEntry::Row currentRow_;
+    unique_ptr<PbiRawReferenceDataBuilder> refDataBuilder_;    
+};
+
+PbiBuilderPrivate::PbiBuilderPrivate(const string& filename,
+                                     const size_t numReferenceSequences,
+                                     const PbiBuilder::CompressionLevel compressionLevel,
+                                     const size_t numThreads)
+    : internal::FileProducer(filename)
+    , bgzf_(nullptr)
+    , currentRow_(0)
+    , refDataBuilder_(nullptr)
+{
+    const string& usingFilename = TempFilename();
+    const string& mode = string("wb") + to_string(static_cast<int>(compressionLevel));
+    bgzf_.reset(bgzf_open(usingFilename.c_str(), mode.c_str()));
+    if (bgzf_.get() == 0)
+        throw std::runtime_error("could not open PBI file for writing");
+
+    size_t actualNumThreads = numThreads;
+    if (actualNumThreads == 0) {
+        actualNumThreads = thread::hardware_concurrency();
+
+        // if still unknown, default to single-threaded
+        if (actualNumThreads == 0)
+            actualNumThreads = 1;
+    }
+
+    // if multithreading requested, enable it
+    if (actualNumThreads > 1)
+        bgzf_mt(bgzf_.get(), actualNumThreads, 256);
+
+    if (numReferenceSequences > 0)
+        refDataBuilder_.reset(new PbiRawReferenceDataBuilder(numReferenceSequences));
+}
+
+PbiBuilderPrivate::PbiBuilderPrivate(const string& filename,
+                                     const size_t numReferenceSequences,
+                                     const bool isCoordinateSorted,
+                                     const PbiBuilder::CompressionLevel compressionLevel,
+                                     const size_t numThreads)
+    : internal::FileProducer(filename)
+    , bgzf_(nullptr)
+    , currentRow_(0)
+    , refDataBuilder_(nullptr)
+{
+    const string& usingFilename = TempFilename();
+    const string& mode = string("wb") + to_string(static_cast<int>(compressionLevel));
+    bgzf_.reset(bgzf_open(usingFilename.c_str(), mode.c_str()));
+    if (bgzf_.get() == 0)
+        throw std::runtime_error("could not open PBI file for writing");
+
+    size_t actualNumThreads = numThreads;
+    if (actualNumThreads == 0) {
+        actualNumThreads = thread::hardware_concurrency();
+
+        // if still unknown, default to single-threaded
+        if (actualNumThreads == 0)
+            actualNumThreads = 1;
+    }
+
+    // if multithreading requested, enable it
+    if (actualNumThreads > 1)
+        bgzf_mt(bgzf_.get(), actualNumThreads, 256);
+
+    if (isCoordinateSorted && numReferenceSequences > 0)
+        refDataBuilder_.reset(new PbiRawReferenceDataBuilder(numReferenceSequences));
+}
+
+PbiBuilderPrivate::~PbiBuilderPrivate(void)
+{
+    rawData_.NumReads(currentRow_);
+
+    const auto hasBarcodeData   = HasBarcodeData();
+    const auto hasMappedData    = HasMappedData();
+    const auto hasReferenceData = HasReferenceData();
+
+    // fetch reference data, if available
+    if (hasReferenceData) {
+        assert(refDataBuilder_);
+        rawData_.ReferenceData() = refDataBuilder_->Result();
+    }
+
+    // determine flags
+    PbiFile::Sections sections = PbiFile::BASIC;
+    if (hasMappedData)    sections |= PbiFile::MAPPED;
+    if (hasBarcodeData)   sections |= PbiFile::BARCODE;
+    if (hasReferenceData) sections |= PbiFile::REFERENCE;
+    rawData_.FileSections(sections);
+
+    // write index contents to file
+    BGZF* fp = bgzf_.get();
+    PbiIndexIO::WriteHeader(rawData_, fp);
+    const uint32_t numReads = rawData_.NumReads();
+    if (numReads > 0) {
+        PbiIndexIO::WriteBasicData(rawData_.BasicData(), numReads, fp);
+        if (hasMappedData)    PbiIndexIO::WriteMappedData(rawData_.MappedData(), numReads, fp);
+        if (hasReferenceData) PbiIndexIO::WriteReferenceData(rawData_.ReferenceData(), fp);
+        if (hasBarcodeData)   PbiIndexIO::WriteBarcodeData(rawData_.BarcodeData(), numReads, fp);
+    }
+}
+
+void PbiBuilderPrivate::AddRecord(const BamRecord& record, const int64_t vOffset)
+{
+    // ensure updated data
+    record.ResetCachedPositions();
+
+    // store data
+    rawData_.BarcodeData().AddRecord(record);
+    rawData_.BasicData().AddRecord(record, vOffset);
+    rawData_.MappedData().AddRecord(record);
+
+    if (refDataBuilder_) {
+
+        // stop storing coordinate-sorted reference data if we encounter out-of-order record
+        const bool sorted = refDataBuilder_->AddRecord(record, currentRow_);
+        if (!sorted)
+            refDataBuilder_.reset();
+    }
+
+    // increment row counter
+    ++currentRow_;
+}
+
+bool PbiBuilderPrivate::HasBarcodeData(void) const
+{
+    // fetch data components
+    const auto& barcodeData = rawData_.BarcodeData();
+    const auto& bcForward   = barcodeData.bcForward_;
+    const auto& bcReverse   = barcodeData.bcReverse_;
+    const auto& bcQuality   = barcodeData.bcQual_;
+
+    // ensure valid sizes
+    if (bcForward.size() != bcReverse.size() &&
+        bcForward.size() != bcQuality.size())
+    {
+        auto msg = string{ "error: inconsistency in PBI barcode data:\n" };
+        msg +=     string{ "  bcForward has " } + to_string(bcForward.size()) + string{ " elements\n" };
+        msg +=     string{ "  bcReverse has " } + to_string(bcReverse.size()) + string{ " elements\n" };
+        msg +=     string{ "  bcQuality has " } + to_string(bcQuality.size()) + string{ " elements\n" };
+        msg +=     string{ "\n" };
+        msg +=     string{ "  these containers should contain equal number of elements.\n" };
+        throw std::runtime_error(msg);
+    }
+    assert(bcForward.size() == rawData_.NumReads());
+
+    // check for data
+    for (uint32_t i = 0; i < rawData_.NumReads(); ++i) {
+        if (bcForward.at(i) != -1 ||
+            bcReverse.at(i)  != -1 ||
+            bcQuality.at(i)  != -1 )
+        {
+            return true;
+        }
+    }
+    // no actual data found
+    return false;
+}
+
+bool PbiBuilderPrivate::HasMappedData(void) const
+{
+    const auto& mappedData = rawData_.MappedData();
+    const auto& tIds = mappedData.tId_;
+    assert(tIds.size() == rawData_.NumReads());
+    for (const auto tId : tIds) {
+        if (tId >= 0)
+            return true;
+    }
+    return false; // all reads unmapped
+}
+
+bool PbiBuilderPrivate::HasReferenceData(void) const
+{ return bool(refDataBuilder_); }
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+// ---------------------------
+// PbiBuilder implementation
+// ---------------------------
+
+PbiBuilder::PbiBuilder(const string& pbiFilename,
+                       const CompressionLevel compressionLevel,
+                       const size_t numThreads)
+    : d_(new internal::PbiBuilderPrivate(pbiFilename,
+                                         0,
+                                         compressionLevel,
+                                         numThreads))
+{ }
+
+PbiBuilder::PbiBuilder(const string& pbiFilename,
+                       const size_t numReferenceSequences,
+                       const CompressionLevel compressionLevel,
+                       const size_t numThreads)
+    : d_(new internal::PbiBuilderPrivate(pbiFilename,
+                                         numReferenceSequences,
+                                         compressionLevel,
+                                         numThreads))
+{ }
+
+PbiBuilder::PbiBuilder(const string& pbiFilename,
+                       const size_t numReferenceSequences,
+                       const bool isCoordinateSorted,
+                       const CompressionLevel compressionLevel,
+                       const size_t numThreads)
+    : d_(new internal::PbiBuilderPrivate(pbiFilename,
+                                         numReferenceSequences,
+                                         isCoordinateSorted,
+                                         compressionLevel,
+                                         numThreads))
+{ }
+
+PbiBuilder::~PbiBuilder(void) { }
+
+void PbiBuilder::AddRecord(const BamRecord& record, const int64_t vOffset)
+{
+    internal::BamRecordMemory::UpdateRecordTags(record);
+    d_->AddRecord(record, vOffset);
+}
+
+const PbiRawData& PbiBuilder::Index(void) const
+{ return d_->rawData_; }
diff --git a/src/PbiFile.cpp b/src/PbiFile.cpp

new file mode 100644 (file)

index 0000000..144c847
--- /dev/null
+++ b/src/PbiFile.cpp
@@ -0,0 +1,74 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiFile.cpp
+/// \brief Implements the PbiFile methods.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiFile.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/PbiBuilder.h"
+#include "pbbam/BamReader.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::PbiFile;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace PbiFile {
+
+void CreateFrom(const BamFile& bamFile,
+                const PbiBuilder::CompressionLevel compressionLevel,
+                const size_t numThreads)
+{
+    PbiBuilder builder(bamFile.PacBioIndexFilename(),
+                       bamFile.Header().Sequences().size(),
+                       compressionLevel,
+                       numThreads);
+    BamReader reader(bamFile);
+    BamRecord b;
+    int64_t offset = reader.VirtualTell();
+    while (reader.GetNext(b)) {
+        builder.AddRecord(b, offset);
+        offset = reader.VirtualTell();
+    }
+}
+
+} // namespace PbiFile
+} // namespace BAM
+} // namespace PacBio
diff --git a/src/PbiFilter.cpp b/src/PbiFilter.cpp

new file mode 100644 (file)

index 0000000..0ce8930
--- /dev/null
+++ b/src/PbiFilter.cpp
@@ -0,0 +1,365 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.\r
+//\r
+// All rights reserved.\r
+//\r
+// Redistribution and use in source and binary forms, with or without\r
+// modification, are permitted (subject to the limitations in the\r
+// disclaimer below) provided that the following conditions are met:\r
+//\r
+//  * Redistributions of source code must retain the above copyright\r
+//    notice, this list of conditions and the following disclaimer.\r
+//\r
+//  * Redistributions in binary form must reproduce the above\r
+//    copyright notice, this list of conditions and the following\r
+//    disclaimer in the documentation and/or other materials provided\r
+//    with the distribution.\r
+//\r
+//  * Neither the name of Pacific Biosciences nor the names of its\r
+//    contributors may be used to endorse or promote products derived\r
+//    from this software without specific prior written permission.\r
+//\r
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE\r
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC\r
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED\r
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES\r
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\r
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS\r
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF\r
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND\r
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\r
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF\r
+// SUCH DAMAGE.\r
+//\r
+// File Description\r
+/// \file PbiFilter.cpp\r
+/// \brief Implements the PbiFilter class.\r
+//\r
+// Author: Derek Barnett\r
+\r
+#include "pbbam/PbiFilter.h"\r
+#include "pbbam/PbiFilterTypes.h"\r
+#include "StringUtils.h"\r
+#include <boost/algorithm/string/case_conv.hpp>\r
+#include <boost/algorithm/string/trim.hpp>\r
+#include <boost/numeric/conversion/cast.hpp>\r
+#include <algorithm>\r
+#include <fstream>\r
+#include <iostream>\r
+#include <sstream>\r
+#include <string>\r
+#include <unordered_map>\r
+#include <cctype>\r
+using namespace PacBio;\r
+using namespace PacBio::BAM;\r
+using namespace PacBio::BAM::internal;\r
+using namespace std;\r
+\r
+namespace PacBio {\r
+namespace BAM {\r
+namespace internal {\r
+\r
+enum class BuiltIn\r
+{\r
+    AlignedEndFilter\r
+  , AlignedLengthFilter\r
+  , AlignedStartFilter\r
+  , AlignedStrandFilter\r
+  , BarcodeFilter\r
+  , BarcodeForwardFilter\r
+  , BarcodeQualityFilter\r
+  , BarcodeReverseFilter\r
+  , BarcodesFilter\r
+  , IdentityFilter\r
+  , LocalContextFilter\r
+  , MovieNameFilter\r
+  , NumDeletedBasesFilter\r
+  , NumInsertedBasesFilter\r
+  , NumMatchesFilter\r
+  , NumMismatchesFilter\r
+  , QueryEndFilter\r
+  , QueryLengthFilter\r
+  , QueryNameFilter\r
+  , QueryNamesFromFileFilter\r
+  , QueryStartFilter\r
+  , ReadAccuracyFilter\r
+  , ReadGroupFilter\r
+  , ReferenceEndFilter\r
+  , ReferenceIdFilter\r
+  , ReferenceNameFilter\r
+  , ReferenceStartFilter\r
+  , ZmwFilter\r
+};\r
+\r
+static const unordered_map<string, BuiltIn> builtInLookup =\r
+{\r
+    // property name   built-in filter\r
+    { "ae",            BuiltIn::AlignedEndFilter },\r
+    { "aend",          BuiltIn::AlignedEndFilter },\r
+    { "alignedlength", BuiltIn::AlignedLengthFilter },\r
+    { "as",            BuiltIn::AlignedStartFilter },\r
+    { "astart",        BuiltIn::AlignedStartFilter },\r
+    { "readstart",     BuiltIn::AlignedStartFilter },\r
+    { "bc",            BuiltIn::BarcodeFilter },\r
+    { "barcode",       BuiltIn::BarcodeFilter },\r
+    { "bcf",           BuiltIn::BarcodeForwardFilter },\r
+    { "bq",            BuiltIn::BarcodeQualityFilter },\r
+    { "bcq",           BuiltIn::BarcodeQualityFilter },\r
+    { "bcr",           BuiltIn::BarcodeReverseFilter },\r
+    { "accuracy",      BuiltIn::IdentityFilter },\r
+    { "identity",      BuiltIn::IdentityFilter },\r
+    { "cx",            BuiltIn::LocalContextFilter },\r
+    { "movie",         BuiltIn::MovieNameFilter },\r
+    { "qe",            BuiltIn::QueryEndFilter },\r
+    { "qend",          BuiltIn::QueryEndFilter },\r
+    { "length",        BuiltIn::QueryLengthFilter },\r
+    { "querylength",   BuiltIn::QueryLengthFilter },\r
+    { "qname",         BuiltIn::QueryNameFilter },\r
+    { "qname_file",    BuiltIn::QueryNamesFromFileFilter },\r
+    { "qs",            BuiltIn::QueryStartFilter },\r
+    { "qstart",        BuiltIn::QueryStartFilter },\r
+    { "rq",            BuiltIn::ReadAccuracyFilter },\r
+    { "te",            BuiltIn::ReferenceEndFilter },\r
+    { "tend",          BuiltIn::ReferenceEndFilter },\r
+    { "rname",         BuiltIn::ReferenceNameFilter },\r
+    { "ts",            BuiltIn::ReferenceStartFilter },\r
+    { "tstart",        BuiltIn::ReferenceStartFilter },\r
+    { "pos",           BuiltIn::ReferenceStartFilter },\r
+    { "zm",            BuiltIn::ZmwFilter },\r
+    { "zmw",           BuiltIn::ZmwFilter }\r
+};\r
+\r
+static const unordered_map<string, LocalContextFlags> contextFlagNames =\r
+{\r
+    { "NO_LOCAL_CONTEXT", LocalContextFlags::NO_LOCAL_CONTEXT },\r
+    { "ADAPTER_BEFORE",   LocalContextFlags::ADAPTER_BEFORE },\r
+    { "ADAPTER_AFTER",    LocalContextFlags::ADAPTER_AFTER },\r
+    { "BARCODE_BEFORE",   LocalContextFlags::BARCODE_BEFORE },\r
+    { "BARCODE_AFTER",    LocalContextFlags::BARCODE_AFTER },\r
+    { "FORWARD_PASS",     LocalContextFlags::FORWARD_PASS },\r
+    { "REVERSE_PASS",     LocalContextFlags::REVERSE_PASS }\r
+};\r
+\r
+// helper methods (for handling maybe-list strings))\r
+static inline bool isBracketed(const string& value)\r
+{\r
+    static const string openBrackets = "[({";\r
+    static const string closeBrackets = "])}";\r
+    return openBrackets.find(value.at(0)) != string::npos &&\r
+           closeBrackets.find(value.at(value.length()-1)) != string::npos;\r
+};\r
+\r
+static inline bool isList(const string& value)\r
+{\r
+    return value.find(',') != string::npos;\r
+}\r
+\r
+static\r
+PbiFilter CreateBarcodeFilter(string value,\r
+                              const Compare::Type compareType)\r
+{\r
+    if (value.empty())\r
+        throw std::runtime_error("empty value for barcode filter property");\r
+\r
+    if (isBracketed(value)) {\r
+        value.erase(0,1);\r
+        value.pop_back();\r
+    }\r
+\r
+    if (isList(value)) {\r
+        vector<string> barcodes = internal::Split(value, ',');\r
+        if (barcodes.size() != 2)\r
+            throw std::runtime_error("only 2 barcode values expected");\r
+        return PbiBarcodesFilter{ boost::numeric_cast<int16_t>(stoi(barcodes.at(0))),\r
+                                  boost::numeric_cast<int16_t>(stoi(barcodes.at(1))),\r
+                                  compareType\r
+                                };\r
+    } else\r
+        return PbiBarcodeFilter{ boost::numeric_cast<int16_t>(stoi(value)), compareType };\r
+}\r
+\r
+static\r
+PbiFilter CreateBarcodeForwardFilter(string value,\r
+                                     const Compare::Type compareType)\r
+{\r
+    if (value.empty())\r
+        throw std::runtime_error("empty value for barcode_forward filter property");\r
+\r
+    if (isBracketed(value)) {\r
+        value.erase(0,1);\r
+        value.pop_back();\r
+    }\r
+\r
+    if (isList(value)) {\r
+        vector<string> tokens = internal::Split(value, ',');\r
+        vector<int16_t> barcodes;\r
+        barcodes.reserve(tokens.size());\r
+        for (const auto& t : tokens) \r
+            barcodes.push_back(boost::numeric_cast<int16_t>(stoi(t)));\r
+        return PbiBarcodeForwardFilter{ std::move(barcodes) };\r
+    } else\r
+        return PbiBarcodeForwardFilter{ boost::numeric_cast<int16_t>(stoi(value)), compareType };\r
+}\r
+\r
+static\r
+PbiFilter CreateBarcodeReverseFilter(string value,\r
+                                     const Compare::Type compareType)\r
+{\r
+    if (value.empty())\r
+        throw std::runtime_error("empty value for barcode_reverse filter property");\r
+\r
+    if (isBracketed(value)) {\r
+        value.erase(0,1);\r
+        value.pop_back();\r
+    }\r
+\r
+    if (isList(value)) {\r
+        vector<string> tokens = internal::Split(value, ',');\r
+        vector<int16_t> barcodes;\r
+        barcodes.reserve(tokens.size());\r
+        for (const auto& t : tokens)\r
+            barcodes.push_back(boost::numeric_cast<int16_t>(stoi(t)));\r
+        return PbiBarcodeReverseFilter{ std::move(barcodes) };\r
+    } else\r
+        return PbiBarcodeReverseFilter{ boost::numeric_cast<int16_t>(stoi(value)), compareType };\r
+}\r
+\r
+static\r
+PbiFilter CreateLocalContextFilter(const string& value,\r
+                                   const Compare::Type compareType)\r
+{\r
+    if (value.empty())\r
+        throw std::runtime_error("empty value for local context filter property");\r
+\r
+    LocalContextFlags filterValue = LocalContextFlags::NO_LOCAL_CONTEXT;\r
+\r
+    // if raw integer\r
+    if (isdigit(value.at(0)))\r
+        filterValue = static_cast<LocalContextFlags>(stoi(value));\r
+\r
+    // else interpret as flag names\r
+    else {\r
+        vector<string> tokens = internal::Split(value, '|');\r
+        for (string& token : tokens) {\r
+            boost::algorithm::trim(token); // trim whitespace\r
+            filterValue = (filterValue | contextFlagNames.at(token));\r
+        }\r
+    }\r
+\r
+    return PbiFilter{ PbiLocalContextFilter{filterValue, compareType} };\r
+}\r
+\r
+static\r
+PbiFilter CreateQueryNamesFilterFromFile(const string& value,\r
+                                         const DataSet& dataset)\r
+{\r
+    // resolve file from dataset, value\r
+    const string resolvedFilename = dataset.ResolvePath(value);\r
+    vector<string> whitelist;\r
+    string fn;\r
+    ifstream in(resolvedFilename);\r
+    while (getline(in, fn))\r
+        whitelist.push_back(fn);\r
+    return PbiQueryNameFilter{ whitelist };\r
+}\r
+\r
+static\r
+PbiFilter FromDataSetProperty(const Property& property,\r
+                              const DataSet& dataset)\r
+{\r
+    try {\r
+        const string& value = property.Value();\r
+        const Compare::Type compareType = Compare::TypeFromOperator(property.Operator());\r
+        const BuiltIn builtInCode = builtInLookup.at(boost::algorithm::to_lower_copy(property.Name()));\r
+        switch (builtInCode) {\r
+\r
+            // single-value filters\r
+            case BuiltIn::AlignedEndFilter     : return PbiAlignedEndFilter{ static_cast<uint32_t>(stoul(value)), compareType };\r
+            case BuiltIn::AlignedLengthFilter  : return PbiAlignedLengthFilter{ static_cast<uint32_t>(stoul(value)), compareType };\r
+            case BuiltIn::AlignedStartFilter   : return PbiAlignedStartFilter{ static_cast<uint32_t>(stoul(value)), compareType };\r
+            case BuiltIn::BarcodeQualityFilter : return PbiBarcodeQualityFilter{ static_cast<uint8_t>(stoul(value)), compareType };\r
+            case BuiltIn::IdentityFilter       : return PbiIdentityFilter{ stof(value), compareType };\r
+            case BuiltIn::MovieNameFilter      : return PbiMovieNameFilter{ value };\r
+            case BuiltIn::QueryEndFilter       : return PbiQueryEndFilter{ stoi(value), compareType };\r
+            case BuiltIn::QueryLengthFilter    : return PbiQueryLengthFilter{ stoi(value), compareType };\r
+            case BuiltIn::QueryNameFilter      : return PbiQueryNameFilter{ value };\r
+            case BuiltIn::QueryStartFilter     : return PbiQueryStartFilter{ stoi(value), compareType };\r
+            case BuiltIn::ReadAccuracyFilter   : return PbiReadAccuracyFilter{ stof(value), compareType };\r
+            case BuiltIn::ReadGroupFilter      : return PbiReadGroupFilter{ value, compareType };\r
+            case BuiltIn::ReferenceEndFilter   : return PbiReferenceEndFilter{ static_cast<uint32_t>(stoul(value)), compareType };\r
+            case BuiltIn::ReferenceIdFilter    : return PbiReferenceIdFilter{ stoi(value), compareType };\r
+            case BuiltIn::ReferenceNameFilter  : return PbiReferenceNameFilter{ value };\r
+            case BuiltIn::ReferenceStartFilter : return PbiReferenceStartFilter{ static_cast<uint32_t>(stoul(value)), compareType };\r
+            case BuiltIn::ZmwFilter            : return PbiZmwFilter{ stoi(value), compareType };\r
+\r
+            // (maybe) list-value filters\r
+            case BuiltIn::BarcodeFilter        : return CreateBarcodeFilter(value, compareType);\r
+            case BuiltIn::BarcodeForwardFilter : return CreateBarcodeForwardFilter(value, compareType);\r
+            case BuiltIn::BarcodeReverseFilter : return CreateBarcodeReverseFilter(value, compareType); \r
+            case BuiltIn::LocalContextFilter   : return CreateLocalContextFilter(value, compareType);\r
+\r
+            // other built-ins\r
+            case BuiltIn::QueryNamesFromFileFilter : return CreateQueryNamesFilterFromFile(value, dataset); // compareType ignored\r
+\r
+            default :\r
+                throw std::exception();\r
+        }\r
+        // unreachable\r
+        return PbiFilter{ };\r
+\r
+    } catch (std::exception& e) {\r
+        stringstream s;\r
+        s << "error: could not create filter from XML Property element: " << endl\r
+          << "  Name:     " << property.Name()     << endl\r
+          << "  Value:    " << property.Value()    << endl\r
+          << "  Operator: " << property.Operator() << endl\r
+          << "  reason:   " << e.what() << endl;\r
+        throw std::runtime_error(s.str());\r
+    }\r
+}\r
+\r
+} // namespace internal\r
+} // namespace BAM\r
+} // namespace PacBio\r
+\r
+PbiFilter PbiFilter::FromDataSet(const DataSet& dataset)\r
+{\r
+    auto datasetFilter = PbiFilter{ PbiFilter::UNION };\r
+    for (auto&& xmlFilter : dataset.Filters()) {\r
+        auto propertiesFilter = PbiFilter{ };\r
+        for (auto&& xmlProperty : xmlFilter.Properties())\r
+            propertiesFilter.Add(internal::FromDataSetProperty(xmlProperty, dataset));\r
+        datasetFilter.Add(propertiesFilter);\r
+    }\r
+    return datasetFilter;\r
+}\r
+\r
+PbiFilter PbiFilter::Intersection(const std::vector<PbiFilter>& filters)\r
+{\r
+    auto result = PbiFilter{ PbiFilter::INTERSECT };\r
+    result.Add(filters);\r
+    return result;\r
+}\r
+\r
+PbiFilter PbiFilter::Intersection(std::vector<PbiFilter>&& filters)\r
+{\r
+    auto result = PbiFilter{ PbiFilter::INTERSECT };\r
+    result.Add(std::move(filters));\r
+    return result;\r
+}\r
+\r
+PbiFilter PbiFilter::Union(const std::vector<PbiFilter>& filters)\r
+{\r
+    auto result = PbiFilter{ PbiFilter::UNION };\r
+    result.Add(filters);\r
+    return result;\r
+}\r
+\r
+PbiFilter PbiFilter::Union(std::vector<PbiFilter>&& filters)\r
+{\r
+    auto result = PbiFilter{ PbiFilter::UNION };\r
+    result.Add(std::move(filters));\r
+    return result;\r
+}\r
diff --git a/src/PbiFilterQuery.cpp b/src/PbiFilterQuery.cpp

new file mode 100644 (file)

index 0000000..19d2b31
--- /dev/null
+++ b/src/PbiFilterQuery.cpp
@@ -0,0 +1,70 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiFilterQuery.cpp
+/// \brief Implements the PbiFilterQuery class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiFilterQuery.h"
+#include "pbbam/CompositeBamReader.h"
+
+
+#include <iostream>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+struct PbiFilterQuery::PbiFilterQueryPrivate
+{
+    PbiFilterQueryPrivate(const PbiFilter& filter, const DataSet& dataset)
+        : reader_(filter, dataset)
+    { }
+
+    PbiFilterCompositeBamReader<Compare::None> reader_; // unsorted
+};
+
+PbiFilterQuery::PbiFilterQuery(const PbiFilter& filter, const DataSet& dataset)
+    : internal::IQuery()
+    , d_(new PbiFilterQueryPrivate(filter, dataset))
+{ }
+
+PbiFilterQuery::~PbiFilterQuery(void) { }
+
+bool PbiFilterQuery::GetNext(BamRecord &r)
+{ return d_->reader_.GetNext(r); }
diff --git a/src/PbiFilterTypes.cpp b/src/PbiFilterTypes.cpp

new file mode 100644 (file)

index 0000000..13ff375
--- /dev/null
+++ b/src/PbiFilterTypes.cpp
@@ -0,0 +1,450 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiFilterTypes.cpp
+/// \brief Implements the built-in PBI filters.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiFilterTypes.h"
+#include "StringUtils.h"
+#include <boost/algorithm/string.hpp>
+#include <sstream>
+#include <string>
+#include <cassert>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+template<typename T>
+IndexList readLengthHelper(const std::vector<T>& start,
+                           const std::vector<T>& end,
+                           const T& value,
+                           const Compare::Type cmp)
+{
+    assert(start.size() == end.size());
+
+    auto result = IndexList{ };
+    const auto numElements = start.size();
+    for (size_t i = 0; i < numElements; ++i) {
+        const auto readLength = end[i] - start[i];
+        bool keep = false;
+        switch(cmp) {
+            case Compare::EQUAL              : keep = (readLength == value); break;
+            case Compare::NOT_EQUAL          : keep = (readLength != value); break;
+            case Compare::LESS_THAN          : keep = (readLength < value); break;
+            case Compare::LESS_THAN_EQUAL    : keep = (readLength <= value); break;
+            case Compare::GREATER_THAN       : keep = (readLength > value); break;
+            case Compare::GREATER_THAN_EQUAL : keep = (readLength >= value); break;
+            default:
+                assert(false);
+                throw std::runtime_error(string{"read length filter encountered unknown Compare::Type: "} +
+                                         Compare::TypeToName(cmp));
+        }
+
+        if (keep)
+            result.push_back(i);
+    }
+    return result;
+}
+
+static
+PbiFilter filterFromMovieName(const string& movieName, bool includeCcs)
+{
+    // we'll match on any rgIds from our candidate list
+    auto filter = PbiFilter{ PbiFilter::UNION };
+    filter.Add(
+    {
+        PbiReadGroupFilter{ MakeReadGroupId(movieName, "POLYMERASE") },
+        PbiReadGroupFilter{ MakeReadGroupId(movieName, "HQREGION") },
+        PbiReadGroupFilter{ MakeReadGroupId(movieName, "SUBREAD") },
+        PbiReadGroupFilter{ MakeReadGroupId(movieName, "SCRAP") },
+        PbiReadGroupFilter{ MakeReadGroupId(movieName, "UNKNOWN") }
+    });
+    if (includeCcs)
+        filter.Add(PbiReadGroupFilter{ MakeReadGroupId(movieName, "CCS") });
+
+    return filter;
+}
+
+//static
+//PbiFilter filterFromQueryName(const string& queryName)
+//{
+//    // split full name into moviename, holenumber
+//    const auto nameParts = internal::Split(queryName, '/');
+//    if (nameParts.size() != 3) {
+//        auto msg = string{ "PbiQueryNameFilter error: requested QNAME (" } + queryName;
+//        msg += string{ ") is not a valid PacBio BAM QNAME. See spec for details"};
+//        throw std::runtime_error(msg);
+//    }
+//
+//    // main filter: {union of candidate rgIds} && zmw [&& qStart && qEnd](non-CCS reads)
+//    auto filter = PbiFilter{ };
+//    filter.Add(PbiZmwFilter{ stoi(nameParts.at(1)) }); // hole number
+//
+//    const auto movieName = nameParts.at(0);
+//
+//    // CCS (only 1 possible candidate rgId)
+//    if (nameParts.at(2) == "ccs")
+//        filter.Add(PbiReadGroupFilter{ MakeReadGroupId(movieName, "CCS") });
+//
+//    // all other read types
+//    else {
+//        // we'll match on any read type that matches our qname
+//        // (except for CCS since it has a different QNAME anyway)
+//        const auto rgIdFilter = filterFromMovieName(movieName, false);
+//        filter.Add(rgIdFilter);
+//
+//        // add qStart/qEnd filters to our main filter
+//        const auto queryIntervalParts = internal::Split(nameParts.at(2), '_');
+//        if (queryIntervalParts.size() != 2) {
+//            auto msg = string{ "PbiQueryNameFilter error: requested QNAME (" } + queryName;
+//            msg += string{ ") is not a valid PacBio BAM QNAME. See spec for details"};
+//            throw std::runtime_error(msg);
+//        }
+//        filter.Add(PbiQueryStartFilter{ stoi(queryIntervalParts.at(0)) });
+//        filter.Add(PbiQueryEndFilter{ stoi(queryIntervalParts.at(1)) });
+//    }
+//    return filter;
+//}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+// PbiAlignedLengthFilter
+
+bool PbiAlignedLengthFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    const auto& mappedData = idx.MappedData();
+    const auto& aEnd    = mappedData.aEnd_.at(row) ;
+    const auto& aStart  = mappedData.aStart_.at(row);
+    const auto aLength = aEnd - aStart;
+    return CompareHelper(aLength);
+}
+
+// PbiIdentityFilter
+
+bool PbiIdentityFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    const auto& mappedData = idx.MappedData();
+    const auto& nMM  = mappedData.nMM_.at(row);
+    const auto& nIndels = mappedData.NumDeletedAndInsertedBasesAt(row);
+    const auto& nDel = nIndels.first;
+    const auto& nIns = nIndels.second;
+
+    const auto& basicData = idx.BasicData();
+    const auto& qStart = basicData.qStart_.at(row);
+    const auto& qEnd   = basicData.qEnd_.at(row);
+
+    const auto readLength = qEnd - qStart;
+    const auto nonMatches = nMM + nDel + nIns;
+    const float identity  = 1.0 - (static_cast<float>(nonMatches)/static_cast<float>(readLength));
+
+    return CompareHelper(identity);
+}
+
+// PbiMovieNameFilter
+
+PbiMovieNameFilter::PbiMovieNameFilter(const std::string& movieName)
+    : compositeFilter_(internal::filterFromMovieName(movieName, true)) // include CCS
+{ }
+
+PbiMovieNameFilter::PbiMovieNameFilter(const std::vector<std::string>& whitelist)
+    : compositeFilter_(PbiFilter::UNION)
+{
+    for (const auto& movieName : whitelist)
+        compositeFilter_.Add(internal::filterFromMovieName(movieName, true)); // include CCS
+}
+
+PbiMovieNameFilter::PbiMovieNameFilter(std::vector<std::string>&& whitelist)
+    : compositeFilter_(PbiFilter::UNION)
+{
+    for (auto&& movieName : whitelist)
+        compositeFilter_.Add(internal::filterFromMovieName(movieName, true)); // include CCS
+}
+
+// PbiQueryLengthFilter
+
+bool PbiQueryLengthFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    const auto& basicData = idx.BasicData();
+    const auto& qStart = basicData.qStart_.at(row);
+    const auto& qEnd   = basicData.qEnd_.at(row);
+    const auto readLength = qEnd - qStart;
+    return CompareHelper(readLength);
+}
+
+// PbiQueryNameFilter
+
+struct PbiQueryNameFilter::PbiQueryNameFilterPrivate
+{
+public:
+    typedef pair<int32_t, int32_t>                 QueryInterval;
+    typedef set<QueryInterval>                     QueryIntervals;
+    typedef unordered_map<int32_t, QueryIntervals> ZmwLookup;
+    typedef shared_ptr<ZmwLookup>                  ZmwLookupPtr;
+    typedef unordered_map<int32_t, ZmwLookupPtr>   RgIdLookup;
+
+public:
+    PbiQueryNameFilterPrivate(const vector<string>& whitelist)
+    {
+        for (const auto& queryName : whitelist) {
+
+            // split name into main parts
+            auto nameParts = internal::Split(queryName, '/');
+            if (nameParts.size() != 3) {
+                auto msg = string{ "PbiQueryNameFilter error: requested QNAME (" } + queryName;
+                msg += string{ ") is not a valid PacBio BAM QNAME. See spec for details"};
+                throw std::runtime_error(msg);
+            }
+
+            //
+            // generate candidate read group IDs from movie name
+            //
+            // then, ensure read group IDs in lookup table, creating or fetching
+            // shared ZmwLookup table if new movie
+            //
+            const string& movieName = nameParts.at(0);
+            const bool isCCS = (nameParts.at(2) == "ccs" || nameParts.at(2) == "CCS");
+            vector<int32_t> rgIds;
+            if (isCCS) {
+                rgIds.push_back( ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "CCS")) );
+            } else {
+                rgIds.reserve(6);
+                rgIds.push_back( ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "POLYMERASE")));
+                rgIds.push_back( ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "HQREGION")));
+                rgIds.push_back( ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "SUBREAD")));
+                rgIds.push_back( ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "SCRAP")));
+                rgIds.push_back( ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "UNKNOWN")));
+                rgIds.push_back( ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "ZMW")));
+            }
+            assert(!rgIds.empty());
+            auto rgFound = lookup_.find(rgIds.front());
+            ZmwLookupPtr zmwPtr = nullptr;
+            if (rgFound == lookup_.end()) {
+                zmwPtr = ZmwLookupPtr(new ZmwLookup);
+                for (const auto& rg : rgIds) {
+                    assert(lookup_.find(rg) == lookup_.end());
+                    lookup_.emplace(rg, zmwPtr);
+                }
+            }
+            else {
+#ifndef NDEBUG
+                for (const auto& rg : rgIds)
+                    assert(lookup_.find(rg) != lookup_.end());
+#endif
+                zmwPtr = rgFound->second;
+            }
+
+            // fetch ZMW & QueryStart/QEnd from query name
+            const int32_t zmw = stoi(nameParts.at(1));
+            int32_t queryStart = -1;
+            int32_t queryEnd   = -1;
+            if (!isCCS) {
+                const auto queryIntervalParts = internal::Split(nameParts.at(2), '_');
+                if (queryIntervalParts.size() != 2) {
+                    auto msg = string{ "PbiQueryNameFilter error: requested QNAME (" } + queryName;
+                    msg += string{ ") is not a valid PacBio BAM QNAME. See spec for details"};
+                    throw std::runtime_error(msg);
+                }
+                queryStart = stoi(queryIntervalParts.at(0));
+                queryEnd   = stoi(queryIntervalParts.at(1));
+            }
+
+            // creating new ZMW entry if not yet seen & store QS/QE pair
+            //
+            const auto zmwFound = zmwPtr->find(zmw);
+            if (zmwFound == zmwPtr->end())
+                zmwPtr->emplace(zmw, QueryIntervals{});
+            QueryIntervals& queryIntervals = zmwPtr->at(zmw);
+            queryIntervals.emplace(make_pair(queryStart, queryEnd));
+        }
+    }
+
+    PbiQueryNameFilterPrivate(const unique_ptr<PbiQueryNameFilterPrivate>& other)
+    {
+        if (other)
+            lookup_ = other->lookup_;
+    }
+
+    bool Accepts(const PbiRawData& idx, const size_t row) const
+    {
+        const auto& basicData = idx.BasicData();
+
+        // see if row's RGID known
+        const auto& rgId = basicData.rgId_.at(row);
+        const auto rgFound = lookup_.find(rgId);
+        if (rgFound == lookup_.end())
+            return false;
+
+        // see if row's ZMW known
+        const auto& zmwPtr = rgFound->second;
+        const auto zmw = basicData.holeNumber_.at(row);
+        const auto zmwFound = zmwPtr->find(zmw);
+        if (zmwFound == zmwPtr->end())
+            return false;
+
+        // see if row's QueryStart/QueryEnd known
+        // CCS names already covered in lookup construction phase
+        const auto& queryIntervals = zmwFound->second;
+        const auto qStart = basicData.qStart_.at(row);
+        const auto qEnd   = basicData.qEnd_.at(row);
+        const auto queryInterval = make_pair(qStart, qEnd);
+        return queryIntervals.find(queryInterval) != queryIntervals.end();
+    }
+
+private:
+    RgIdLookup lookup_;
+};
+
+PbiQueryNameFilter::PbiQueryNameFilter(const std::string& qname)
+    : d_(new PbiQueryNameFilter::PbiQueryNameFilterPrivate(vector<string>{1, qname}))
+{ }
+//    : compositeFilter_(internal::filterFromQueryName(qname))
+//{ }
+
+PbiQueryNameFilter::PbiQueryNameFilter(const std::vector<std::string>& whitelist)
+    : d_(new PbiQueryNameFilter::PbiQueryNameFilterPrivate(whitelist))
+{ }
+//    : compositeFilter_(PbiFilter::UNION)
+//{
+//    try {
+//        for (const auto& qname : whitelist)
+//            compositeFilter_.Add(internal::filterFromQueryName(qname));
+//    }
+//    // simply re-throw our own exception
+//    catch (std::runtime_error&) {
+//        throw;
+//    }
+//    // we may hit other exceptions (e.g. in stoi()) - but we'll pin on a bit of extra data
+//    catch (std::exception& e) {
+//        auto msg = string{ "PbiQueryNameFilter encountered error: " } + e.what();
+//        throw std::runtime_error(msg);
+//    }
+//}
+
+//PbiQueryNameFilter::PbiQueryNameFilter(std::vector<std::string>&& whitelist)
+//    : d_(new PbiQueryNameFilter::PbiQueryNameFilterPrivate(whitelist))
+//{ }
+//    : compositeFilter_(PbiFilter::UNION)
+//{
+//    try {
+//        for (const auto& qname : whitelist)
+//            compositeFilter_.Add(internal::filterFromQueryName(qname));
+//    }
+//    // simply re-throw our own exception
+//    catch (std::runtime_error&) {
+//        throw;
+//    }
+//    // we may hit other exceptions (e.g. in stoi()) - but we'll pin on a bit of extra data
+//    catch (std::exception& e) {
+//        auto msg = string{ "PbiQueryNameFilter encountered error: " } + e.what();
+//        throw std::runtime_error(msg);
+//    }
+//}
+
+PbiQueryNameFilter::PbiQueryNameFilter(const PbiQueryNameFilter& other)
+    : d_(new PbiQueryNameFilter::PbiQueryNameFilterPrivate(other.d_))
+{ }
+
+PbiQueryNameFilter::~PbiQueryNameFilter(void) { }
+
+bool PbiQueryNameFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{ return d_->Accepts(idx, row); }
+//{ return compositeFilter_.Accepts(idx, row); }
+
+// PbiReferenceNameFilter
+
+PbiReferenceNameFilter::PbiReferenceNameFilter(const std::string& rname,
+                                               const Compare::Type cmp)
+    : initialized_(false)
+    , rname_(rname)
+    , cmp_(cmp)
+{
+    if (cmp != Compare::EQUAL && cmp != Compare::NOT_EQUAL) {
+        auto msg = std::string{ "Compare type: " };
+        msg += Compare::TypeToName(cmp);
+        msg += " not supported for PbiReferenceNameFilter (use one of Compare::EQUAL or Compare::NOT_EQUAL).";
+        throw std::runtime_error(msg);
+    }
+}
+
+PbiReferenceNameFilter::PbiReferenceNameFilter(const std::vector<std::string>& whitelist)
+    : initialized_(false)
+    , rnameWhitelist_(whitelist)
+    , cmp_(Compare::EQUAL)
+{ }
+
+PbiReferenceNameFilter::PbiReferenceNameFilter(std::vector<std::string>&& whitelist)
+    : initialized_(false)
+    , rnameWhitelist_(std::move(whitelist))
+    , cmp_(Compare::EQUAL)
+{ }
+
+bool PbiReferenceNameFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    if (!initialized_)
+        Initialize(idx);
+    return subFilter_.Accepts(idx, row);
+}
+
+void PbiReferenceNameFilter::Initialize(const PbiRawData& idx) const
+{
+    const auto pbiFilename = idx.Filename();
+    const auto bamFilename = pbiFilename.substr(0, pbiFilename.length() - 4);
+    const auto bamFile = BamFile{ bamFilename };
+
+    // single-value
+    if (rnameWhitelist_ == boost::none) {
+        const auto tId = bamFile.ReferenceId(rname_);
+        subFilter_ = PbiReferenceIdFilter{ tId, cmp_ };
+    }
+
+    // multi-value whitelist
+    else {
+        subFilter_ = PbiFilter(PbiFilter::UNION);
+        for (const auto& rname : rnameWhitelist_.get())
+            subFilter_.Add(PbiReferenceIdFilter{ bamFile.ReferenceId(rname) });
+    }
+    initialized_ = true;
+}
+
diff --git a/src/PbiIndex.cpp b/src/PbiIndex.cpp

new file mode 100644 (file)

index 0000000..da874b7
--- /dev/null
+++ b/src/PbiIndex.cpp
@@ -0,0 +1,218 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiIndex.cpp
+/// \brief Implements the PbiIndex class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiIndex.h"
+#include "PbiIndexIO.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+// ----------------------------------
+// SubreadLookupData implementation
+// ----------------------------------
+
+BasicLookupData::BasicLookupData(void) { }
+
+BasicLookupData::BasicLookupData(const PbiRawBasicData& rawData)
+    : rgId_(rawData.rgId_)
+    , qStart_(rawData.qStart_)
+    , qEnd_(rawData.qEnd_)
+    , holeNumber_(rawData.holeNumber_)
+    , readQual_(rawData.readQual_)
+    , ctxtFlag_(rawData.ctxtFlag_)
+    , fileOffset_(rawData.fileOffset_)
+{ }
+
+// ----------------------------------
+// MappedLookupData implementation
+// ----------------------------------
+
+MappedLookupData::MappedLookupData(void) { }
+
+MappedLookupData::MappedLookupData(const PbiRawMappedData& rawData)
+    : tId_(rawData.tId_)
+    , tStart_(rawData.tStart_)
+    , tEnd_(rawData.tEnd_)
+    , aStart_(rawData.aStart_)
+    , aEnd_(rawData.aEnd_)
+    , nM_(rawData.nM_)
+    , nMM_(rawData.nMM_)
+    , mapQV_(rawData.mapQV_)
+{
+    const size_t numElements = rawData.revStrand_.size();
+    reverseStrand_.reserve(numElements/2);
+    forwardStrand_.reserve(numElements/2);
+
+    std::map<uint32_t, IndexList> insRawData;
+    std::map<uint32_t, IndexList> delRawData;
+    for (size_t i = 0; i < numElements; ++i) {
+
+        // nDel, nIns
+        const auto indels = rawData.NumDeletedAndInsertedBasesAt(i);
+        delRawData[indels.first].push_back(i);
+        insRawData[indels.second].push_back(i);
+
+        // strand
+        if (rawData.revStrand_.at(i) == 0)
+            forwardStrand_.push_back(i);
+        else
+            reverseStrand_.push_back(i);
+    }
+
+    nIns_ = OrderedLookup<uint32_t>(std::move(insRawData));
+    nDel_ = OrderedLookup<uint32_t>(std::move(delRawData));
+}
+
+// ----------------------------------
+// BarcodeLookupData implementation
+// ----------------------------------
+
+BarcodeLookupData::BarcodeLookupData(void) { }
+
+BarcodeLookupData::BarcodeLookupData(const PbiRawBarcodeData& rawData)
+    : bcForward_(rawData.bcForward_)
+    , bcReverse_(rawData.bcReverse_)
+    , bcQual_(rawData.bcQual_)
+
+{  }
+
+// ----------------------------------
+// ReferenceLookupData implementation
+// ----------------------------------
+
+ReferenceLookupData::ReferenceLookupData(void) { }
+
+ReferenceLookupData::ReferenceLookupData(const PbiRawReferenceData& rawData)
+{
+    const size_t numEntries = rawData.entries_.size();
+    references_.reserve(numEntries);
+    for (size_t i = 0; i < numEntries; ++i) {
+        const PbiReferenceEntry& entry = rawData.entries_.at(i);
+        references_[entry.tId_] = IndexRange(entry.beginRow_, entry.endRow_);
+    }
+}
+
+// --------------------------------
+// PbiIndexPrivate implementation
+// --------------------------------
+
+PbiIndexPrivate::PbiIndexPrivate(void)
+    : version_(PbiFile::CurrentVersion)
+    , sections_(PbiFile::BASIC)
+    , numReads_(0)
+{ }
+
+PbiIndexPrivate::PbiIndexPrivate(const PbiRawData& rawIndex)
+    : filename_(rawIndex.Filename())
+    , version_(rawIndex.Version())
+    , sections_(rawIndex.FileSections())
+    , numReads_(rawIndex.NumReads())
+    , basicData_(rawIndex.BasicData())
+    , mappedData_(rawIndex.MappedData())
+    , referenceData_(rawIndex.ReferenceData())
+    , barcodeData_(rawIndex.BarcodeData())
+{ }
+
+PbiIndexPrivate::PbiIndexPrivate(PbiRawData&& rawIndex)
+    : filename_(rawIndex.Filename())
+    , version_(std::move(rawIndex.Version()))
+    , sections_(std::move(rawIndex.FileSections()))
+    , numReads_(std::move(rawIndex.NumReads()))
+    , basicData_(std::move(rawIndex.BasicData()))
+    , mappedData_(std::move(rawIndex.MappedData()))
+    , referenceData_(std::move(rawIndex.ReferenceData()))
+    , barcodeData_(std::move(rawIndex.BarcodeData()))
+{ }
+
+unique_ptr<PbiIndexPrivate> PbiIndexPrivate::DeepCopy(void) const
+{
+    std::unique_ptr<PbiIndexPrivate> copy(new PbiIndexPrivate);
+    copy->filename_ = filename_;
+    copy->version_  = version_;
+    copy->sections_ = sections_;
+    copy->numReads_ = numReads_;
+    copy->basicData_     = basicData_;
+    copy->mappedData_    = mappedData_;
+    copy->referenceData_ = referenceData_;
+    copy->barcodeData_   = barcodeData_;
+    return copy;
+}
+
+// -------------------------
+// PbiIndex implementation
+// -------------------------
+
+PbiIndex::PbiIndex(void)
+    : d_(new PbiIndexPrivate)
+{ }
+
+PbiIndex::PbiIndex(const string& pbiFilename)
+    : d_(new PbiIndexPrivate(PbiRawData(pbiFilename)))
+{ }
+
+PbiIndex::PbiIndex(const PbiIndex& other)
+    : d_(std::forward<unique_ptr<PbiIndexPrivate>>(other.d_->DeepCopy()))
+{
+    // move is ok, since it's a deep-copied, new object
+}
+
+PbiIndex::PbiIndex(PbiIndex&& other)
+    : d_(std::move(other.d_))
+{ }
+
+PbiIndex& PbiIndex::operator=(const PbiIndex& other)
+{
+    // move is ok, since it's a deep-copied, new object
+    d_ = other.d_->DeepCopy();
+    return *this;
+}
+
+PbiIndex& PbiIndex::operator=(PbiIndex&& other)
+{
+    d_ = std::move(other.d_);
+    return *this;
+}
+
+PbiIndex::~PbiIndex(void) { }
+
+string PbiIndex::Filename(void) const
+{ return d_->filename_; }
diff --git a/src/PbiIndexIO.cpp b/src/PbiIndexIO.cpp

new file mode 100644 (file)

index 0000000..7d1f615
--- /dev/null
+++ b/src/PbiIndexIO.cpp
@@ -0,0 +1,474 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// Author: Derek Barnett
+
+#include "PbiIndexIO.h"
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/EntireFileQuery.h"
+#include "pbbam/PbiBuilder.h"
+#include "MemoryUtils.h"
+#include <boost/algorithm/string.hpp>
+#include <vector>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// \brief Appends content of src vector to dst vector using move semantics.
+///
+/// \param[in]     src  Input vector that will be empty after execution
+/// \param[in,out] dst  Output vector that will be appended to
+///
+template <typename T>
+inline void MoveAppend(std::vector<T>& src, std::vector<T>& dst) noexcept
+{
+    if (dst.empty())
+    {
+        dst = std::move(src);
+    }
+    else
+    {
+        dst.reserve(dst.size() + src.size());
+        std::move(src.begin(), src.end(), std::back_inserter(dst));
+        src.clear();
+    }
+}
+
+/// \brief Appends content of src vector to dst vector using move semantics.
+///
+/// \param[in]     src  Input vector via perfect forwarding
+/// \param[in,out] dst  Output vector that will be appended to
+///
+template <typename T>
+inline void MoveAppend(std::vector<T>&& src, std::vector<T>& dst) noexcept
+{
+    if (dst.empty())
+    {
+        dst = std::move(src);
+    }
+    else
+    {
+        dst.reserve(dst.size() + src.size());
+        std::move(src.begin(), src.end(), std::back_inserter(dst));
+        src.clear();
+    }
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+// ---------------------------
+// PbiIndexIO implementation
+// ---------------------------
+
+PbiRawData PbiIndexIO::Load(const std::string& pbiFilename)
+{
+     PbiRawData rawData;
+     Load(rawData, pbiFilename);
+     return rawData;
+}
+
+void PbiIndexIO::Load(PbiRawData& rawData,
+                      const string& filename)
+{
+    // open file for reading
+    if (!boost::algorithm::iends_with(filename, ".pbi"))
+        throw std::runtime_error("unsupported file extension");
+    std::unique_ptr<BGZF, HtslibBgzfDeleter> bgzf(bgzf_open(filename.c_str(), "rb"));
+    BGZF* fp = bgzf.get();
+    if (fp == 0)
+        throw std::runtime_error("could not open PBI file for reading");
+
+    // load data
+    LoadHeader(rawData, fp);
+    const uint32_t numReads = rawData.NumReads();
+    if (numReads > 0) {
+        LoadBasicData(rawData.BasicData(), numReads, fp);
+        if (rawData.HasMappedData())
+            LoadMappedData(rawData.MappedData(), numReads, fp);
+        if (rawData.HasReferenceData())
+            LoadReferenceData(rawData.ReferenceData(), fp);
+        if (rawData.HasBarcodeData())
+            LoadBarcodeData(rawData.BarcodeData(), numReads, fp);
+    }
+}
+
+void PbiIndexIO::LoadFromDataSet(PbiRawData& aggregateData,
+                                 const DataSet& dataset)
+{
+    aggregateData.NumReads(0);
+    aggregateData.FileSections(PbiFile::BASIC | PbiFile::MAPPED | PbiFile::BARCODE);
+    aggregateData.Version(PbiFile::CurrentVersion);
+
+    const auto bamFiles = dataset.BamFiles();
+    uint16_t fileNumber = 0;
+    for (const auto& bamFile : bamFiles) {
+        PbiRawData currentPbi{bamFile.PacBioIndexFilename()};
+        const size_t currentPbiCount = currentPbi.NumReads();
+
+        // read count
+        aggregateData.NumReads(aggregateData.NumReads()+currentPbiCount);
+
+        // BasicData
+        PbiRawBasicData& aggregateBasicData = aggregateData.BasicData();
+        PbiRawBasicData& currentBasicData   = currentPbi.BasicData();
+        MoveAppend(std::move(currentBasicData.rgId_),       aggregateBasicData.rgId_);
+        MoveAppend(std::move(currentBasicData.qStart_),     aggregateBasicData.qStart_);
+        MoveAppend(std::move(currentBasicData.qEnd_),       aggregateBasicData.qEnd_);
+        MoveAppend(std::move(currentBasicData.holeNumber_), aggregateBasicData.holeNumber_);
+        MoveAppend(std::move(currentBasicData.readQual_),   aggregateBasicData.readQual_);
+        MoveAppend(std::move(currentBasicData.ctxtFlag_),   aggregateBasicData.ctxtFlag_);
+        MoveAppend(std::move(currentBasicData.fileOffset_), aggregateBasicData.fileOffset_);
+        MoveAppend(std::vector<uint16_t>(currentPbiCount, fileNumber), aggregateBasicData.fileNumber_);
+
+        // BarcodeData
+        PbiRawBarcodeData& aggregateBarcodeData = aggregateData.BarcodeData();
+        if (currentPbi.HasBarcodeData()) {
+            PbiRawBarcodeData& currentBarcodeData  = currentPbi.BarcodeData();
+            MoveAppend(std::move(currentBarcodeData.bcForward_), aggregateBarcodeData.bcForward_);
+            MoveAppend(std::move(currentBarcodeData.bcReverse_), aggregateBarcodeData.bcReverse_);
+            MoveAppend(std::move(currentBarcodeData.bcQual_),    aggregateBarcodeData.bcQual_);
+        } else {
+            MoveAppend(std::vector<int16_t>(currentPbiCount, -1), aggregateBarcodeData.bcForward_);
+            MoveAppend(std::vector<int16_t>(currentPbiCount, -1), aggregateBarcodeData.bcReverse_);
+            MoveAppend(std::vector<int8_t>(currentPbiCount, -1),  aggregateBarcodeData.bcQual_);
+        }
+
+        // MappedData
+        PbiRawMappedData& aggregateMappedData = aggregateData.MappedData();
+        if (currentPbi.HasMappedData()) {
+            PbiRawMappedData& currentMappedData  = currentPbi.MappedData();
+            MoveAppend(std::move(currentMappedData.tId_),       aggregateMappedData.tId_);
+            MoveAppend(std::move(currentMappedData.tStart_),    aggregateMappedData.tStart_);
+            MoveAppend(std::move(currentMappedData.tEnd_),      aggregateMappedData.tEnd_);
+            MoveAppend(std::move(currentMappedData.aStart_),    aggregateMappedData.aStart_);
+            MoveAppend(std::move(currentMappedData.aEnd_),      aggregateMappedData.aEnd_);
+            MoveAppend(std::move(currentMappedData.revStrand_), aggregateMappedData.revStrand_);
+            MoveAppend(std::move(currentMappedData.nM_),        aggregateMappedData.nM_);
+            MoveAppend(std::move(currentMappedData.nMM_),       aggregateMappedData.nMM_);
+            MoveAppend(std::move(currentMappedData.mapQV_),     aggregateMappedData.mapQV_);
+        } else {
+            MoveAppend(std::vector<int32_t>(currentPbiCount, -1), aggregateMappedData.tId_);
+            MoveAppend(std::vector<uint32_t>(currentPbiCount, UnmappedPosition), aggregateMappedData.tStart_);
+            MoveAppend(std::vector<uint32_t>(currentPbiCount, UnmappedPosition), aggregateMappedData.tEnd_);
+            MoveAppend(std::vector<uint32_t>(currentPbiCount, UnmappedPosition), aggregateMappedData.aStart_);
+            MoveAppend(std::vector<uint32_t>(currentPbiCount, UnmappedPosition), aggregateMappedData.aEnd_);
+            MoveAppend(std::vector<uint8_t>(currentPbiCount, 0),   aggregateMappedData.revStrand_);
+            MoveAppend(std::vector<uint32_t>(currentPbiCount, 0),  aggregateMappedData.nM_);
+            MoveAppend(std::vector<uint32_t>(currentPbiCount, 0),  aggregateMappedData.nMM_);
+            MoveAppend(std::vector<uint8_t>(currentPbiCount, 255), aggregateMappedData.mapQV_);
+        }
+
+        ++fileNumber;
+    }
+}
+
+void PbiIndexIO::LoadBarcodeData(PbiRawBarcodeData& barcodeData,
+                                 const uint32_t numReads,
+                                 BGZF* fp)
+{
+    assert(numReads > 0);
+    (void)numReads; // quash warnings building in release mode
+
+    LoadBgzfVector(fp, barcodeData.bcForward_, numReads);
+    LoadBgzfVector(fp, barcodeData.bcReverse_, numReads);
+    LoadBgzfVector(fp, barcodeData.bcQual_,    numReads);
+
+    assert(barcodeData.bcForward_.size() == numReads);
+    assert(barcodeData.bcReverse_.size() == numReads);
+    assert(barcodeData.bcQual_.size()    == numReads);
+}
+
+void PbiIndexIO::LoadHeader(PbiRawData& index,
+                            BGZF* fp)
+{
+    size_t bytesRead = 0;
+
+    // 'magic' string
+    char magic[4];
+    bytesRead = bgzf_read(fp, magic, 4);
+    if (bytesRead != 4 || strncmp(magic, "PBI\1", 4))
+        throw std::runtime_error("expected PBI file, found unknown format instead");
+
+    // version, pbi_flags, & n_reads
+    uint32_t version;
+    uint16_t sections;
+    uint32_t numReads;
+    bgzf_read(fp, &version,  sizeof(version));
+    bgzf_read(fp, &sections, sizeof(sections));
+    bgzf_read(fp, &numReads, sizeof(numReads));
+    if (fp->is_be) {
+        version  = ed_swap_4(version);
+        sections = ed_swap_2(sections);
+        numReads = ed_swap_4(numReads);
+    }
+
+    index.Version(PbiFile::VersionEnum(version));
+    index.FileSections(PbiFile::Sections(sections));
+    index.NumReads(numReads);
+
+    // skip reserved section
+    size_t reservedLength = 18;
+    // adjust depending on version
+    char reserved[18];
+    bytesRead = bgzf_read(fp, &reserved, reservedLength);
+}
+
+void PbiIndexIO::LoadMappedData(PbiRawMappedData& mappedData,
+                                const uint32_t numReads,
+                                BGZF* fp)
+{
+    assert(numReads > 0);
+    (void)numReads; // quash warnings building in release mode
+
+    LoadBgzfVector(fp, mappedData.tId_,       numReads);
+    LoadBgzfVector(fp, mappedData.tStart_,    numReads);
+    LoadBgzfVector(fp, mappedData.tEnd_,      numReads);
+    LoadBgzfVector(fp, mappedData.aStart_,    numReads);
+    LoadBgzfVector(fp, mappedData.aEnd_,      numReads);
+    LoadBgzfVector(fp, mappedData.revStrand_, numReads);
+    LoadBgzfVector(fp, mappedData.nM_,        numReads);
+    LoadBgzfVector(fp, mappedData.nMM_,       numReads);
+    LoadBgzfVector(fp, mappedData.mapQV_,     numReads);
+
+    assert(mappedData.tId_.size()       == numReads);
+    assert(mappedData.tStart_.size()    == numReads);
+    assert(mappedData.tEnd_.size()      == numReads);
+    assert(mappedData.aStart_.size()    == numReads);
+    assert(mappedData.aEnd_.size()      == numReads);
+    assert(mappedData.revStrand_.size() == numReads);
+    assert(mappedData.nM_.size()        == numReads);
+    assert(mappedData.nMM_.size()       == numReads);
+    assert(mappedData.mapQV_.size()     == numReads);
+}
+
+void PbiIndexIO::LoadReferenceData(PbiRawReferenceData& referenceData,
+                                   BGZF* fp)
+{
+    assert(sizeof(PbiReferenceEntry::ID)  == 4);
+    assert(sizeof(PbiReferenceEntry::Row) == 4);
+
+    // num refs
+    uint32_t numRefs;
+    bgzf_read(fp, &numRefs, 4);
+    if (fp->is_be)
+        numRefs = ed_swap_4(numRefs);
+
+    // reference entries
+    referenceData.entries_.clear();
+    referenceData.entries_.resize(numRefs);
+    for (size_t i = 0; i < numRefs; ++i) {
+        PbiReferenceEntry& entry = referenceData.entries_[i];
+        bgzf_read(fp, &entry.tId_,      4);
+        bgzf_read(fp, &entry.beginRow_, 4);
+        bgzf_read(fp, &entry.endRow_,   4);
+        if (fp->is_be) {
+            entry.tId_      = ed_swap_4(entry.tId_);
+            entry.beginRow_ = ed_swap_4(entry.beginRow_);
+            entry.endRow_   = ed_swap_4(entry.endRow_);
+        }
+    }
+}
+
+void PbiIndexIO::LoadBasicData(PbiRawBasicData& basicData,
+                                 const uint32_t numReads,
+                                 BGZF* fp)
+{
+    assert(numReads > 0);
+    (void)numReads; // quash warnings building in release mode
+
+    LoadBgzfVector(fp, basicData.rgId_,       numReads);
+    LoadBgzfVector(fp, basicData.qStart_,     numReads);
+    LoadBgzfVector(fp, basicData.qEnd_,       numReads);
+    LoadBgzfVector(fp, basicData.holeNumber_, numReads);
+    LoadBgzfVector(fp, basicData.readQual_,   numReads);
+    LoadBgzfVector(fp, basicData.ctxtFlag_,   numReads);
+    LoadBgzfVector(fp, basicData.fileOffset_, numReads);
+
+    assert(basicData.rgId_.size()       == numReads);
+    assert(basicData.qStart_.size()     == numReads);
+    assert(basicData.qEnd_.size()       == numReads);
+    assert(basicData.holeNumber_.size() == numReads);
+    assert(basicData.readQual_.size()   == numReads);
+    assert(basicData.ctxtFlag_.size()   == numReads);
+    assert(basicData.fileOffset_.size() == numReads);
+}
+
+void PbiIndexIO::Save(const PbiRawData& index,
+                      const std::string& filename)
+{
+    std::unique_ptr<BGZF, HtslibBgzfDeleter> bgzf(bgzf_open(filename.c_str(), "wb"));
+    BGZF* fp = bgzf.get();
+    if (fp == 0)
+        throw std::runtime_error("could not open PBI file for writing");
+
+    WriteHeader(index, fp);
+    const uint32_t numReads = index.NumReads();
+    if (numReads > 0) {
+        WriteBasicData(index.BasicData(), numReads, fp);
+
+        if (index.HasMappedData())
+            WriteMappedData(index.MappedData(), numReads, fp);
+        if (index.HasReferenceData())
+            WriteReferenceData(index.ReferenceData(), fp);
+        if (index.HasBarcodeData())
+            WriteBarcodeData(index.BarcodeData(), numReads, fp);
+    }
+}
+
+void PbiIndexIO::WriteBarcodeData(const PbiRawBarcodeData& barcodeData,
+                                  const uint32_t numReads,
+                                  BGZF* fp)
+{
+    assert(numReads > 0);
+    assert(barcodeData.bcForward_.size()   == numReads);
+    assert(barcodeData.bcReverse_.size()   == numReads);
+    assert(barcodeData.bcQual_.size()      == numReads);
+    (void)numReads; // quash warnings building in release mode
+
+    WriteBgzfVector(fp, barcodeData.bcForward_);
+    WriteBgzfVector(fp, barcodeData.bcReverse_);
+    WriteBgzfVector(fp, barcodeData.bcQual_);
+}
+
+void PbiIndexIO::WriteHeader(const PbiRawData& index,
+                             BGZF* fp)
+{
+    // 'magic' string
+    char magic[4];
+    strncpy(magic, "PBI\1", 4);
+    bgzf_write(fp, magic, 4);
+
+    // version, pbi_flags, & n_reads
+    uint32_t version   = static_cast<uint32_t>(index.Version());
+    uint16_t pbi_flags = static_cast<uint16_t>(index.FileSections());
+    uint32_t numReads  = index.NumReads();
+    if (fp->is_be) {
+        version   = ed_swap_4(version);
+        pbi_flags = ed_swap_2(pbi_flags);
+        numReads  = ed_swap_4(numReads);
+    }
+    bgzf_write(fp, &version,   4);
+    bgzf_write(fp, &pbi_flags, 2);
+    bgzf_write(fp, &numReads,  4);
+
+    // reserved space
+    char reserved[18];
+    memset(reserved, 0, 18);
+    bgzf_write(fp, reserved, 18);
+}
+
+void PbiIndexIO::WriteMappedData(const PbiRawMappedData& mappedData,
+                                 const uint32_t numReads,
+                                 BGZF* fp)
+{
+    assert(mappedData.tId_.size()       == numReads);
+    assert(mappedData.tStart_.size()    == numReads);
+    assert(mappedData.tEnd_.size()      == numReads);
+    assert(mappedData.aStart_.size()    == numReads);
+    assert(mappedData.aEnd_.size()      == numReads);
+    assert(mappedData.revStrand_.size() == numReads);
+    assert(mappedData.nM_.size()        == numReads);
+    assert(mappedData.nMM_.size()       == numReads);
+    assert(mappedData.mapQV_.size()     == numReads);
+    (void)numReads; // quash warnings building in release mode
+
+    WriteBgzfVector(fp, mappedData.tId_);
+    WriteBgzfVector(fp, mappedData.tStart_);
+    WriteBgzfVector(fp, mappedData.tEnd_);
+    WriteBgzfVector(fp, mappedData.aStart_);
+    WriteBgzfVector(fp, mappedData.aEnd_);
+    WriteBgzfVector(fp, mappedData.revStrand_);
+    WriteBgzfVector(fp, mappedData.nM_);
+    WriteBgzfVector(fp, mappedData.nMM_);
+    WriteBgzfVector(fp, mappedData.mapQV_);
+}
+
+void PbiIndexIO::WriteReferenceData(const PbiRawReferenceData& referenceData,
+                                    BGZF* fp)
+{
+    // num_refs
+    uint32_t numRefs = referenceData.entries_.size();
+    if (fp->is_be)
+        numRefs = ed_swap_4(numRefs);
+    bgzf_write(fp, &numRefs, 4);
+
+    // reference entries
+    numRefs = referenceData.entries_.size(); // need to reset after maybe endian-swapping
+    for (size_t i = 0; i < numRefs; ++i) {
+        const PbiReferenceEntry& entry = referenceData.entries_[i];
+        uint32_t tId      = entry.tId_;
+        uint32_t beginRow = entry.beginRow_;
+        uint32_t endRow   = entry.endRow_;
+        if (fp->is_be) {
+            tId      = ed_swap_4(tId);
+            beginRow = ed_swap_4(beginRow);
+            endRow   = ed_swap_4(endRow);
+        }
+        bgzf_write(fp, &tId,      4);
+        bgzf_write(fp, &beginRow, 4);
+        bgzf_write(fp, &endRow,   4);
+    }
+}
+
+void PbiIndexIO::WriteBasicData(const PbiRawBasicData& basicData,
+                                const uint32_t numReads,
+                                BGZF* fp)
+{
+    assert(basicData.rgId_.size()       == numReads);
+    assert(basicData.qStart_.size()     == numReads);
+    assert(basicData.qEnd_.size()       == numReads);
+    assert(basicData.holeNumber_.size() == numReads);
+    assert(basicData.readQual_.size()   == numReads);
+    assert(basicData.ctxtFlag_.size()   == numReads);
+    assert(basicData.fileOffset_.size() == numReads);
+    (void)numReads; // quash warnings building in release mode
+
+    WriteBgzfVector(fp, basicData.rgId_);
+    WriteBgzfVector(fp, basicData.qStart_);
+    WriteBgzfVector(fp, basicData.qEnd_);
+    WriteBgzfVector(fp, basicData.holeNumber_);
+    WriteBgzfVector(fp, basicData.readQual_);
+    WriteBgzfVector(fp, basicData.ctxtFlag_);
+    WriteBgzfVector(fp, basicData.fileOffset_);
+}
diff --git a/src/PbiIndexIO.h b/src/PbiIndexIO.h

new file mode 100644 (file)

index 0000000..927173c
--- /dev/null
+++ b/src/PbiIndexIO.h
@@ -0,0 +1,165 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// Author: Derek Barnett
+
+#ifndef PBIINDEXIO_H
+#define PBIINDEXIO_H
+
+#include "pbbam/BamFile.h"
+#include "pbbam/DataSet.h"
+#include "pbbam/PbiFile.h"
+#include "pbbam/PbiRawData.h"
+#include <htslib/bgzf.h>
+#include <htslib/sam.h>
+#include <memory>
+#include <string>
+#include <vector>
+#include <cassert>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+class PbiIndexIO
+{
+public:
+    // top-level entry points
+    static PbiRawData Load(const std::string& filename);
+    static void Load(PbiRawData& rawData, const std::string& filename);
+    static void LoadFromDataSet(PbiRawData& aggregateData, const DataSet& dataset);
+    static void Save(const PbiRawData& rawData, const std::string& filename);
+
+public:
+    // per-component load
+    static void LoadBarcodeData(PbiRawBarcodeData& barcodeData,
+                                const uint32_t numReads,
+                                BGZF* fp);
+    static void LoadHeader(PbiRawData& index,
+                           BGZF* fp);
+    static void LoadMappedData(PbiRawMappedData& mappedData,
+                               const uint32_t numReads,
+                               BGZF* fp);
+    static void LoadReferenceData(PbiRawReferenceData& referenceData,
+                                  BGZF* fp);
+    static void LoadBasicData(PbiRawBasicData& basicData,
+                              const uint32_t numReads,
+                              BGZF* fp);
+
+    // per-data-field load
+    template<typename T>
+    static void LoadBgzfVector(BGZF* fp,
+                               std::vector<T>& data,
+                               const uint32_t numReads);
+
+public:
+    // per-component write
+    static void WriteBarcodeData(const PbiRawBarcodeData& barcodeData,
+                                 const uint32_t numReads,
+                                 BGZF* fp);
+    static void WriteHeader(const PbiRawData& index,
+                            BGZF* fp);
+    static void WriteMappedData(const PbiRawMappedData& mappedData,
+                                const uint32_t numReads,
+                                BGZF* fp);
+    static void WriteReferenceData(const PbiRawReferenceData& referenceData,
+                                   BGZF* fp);
+    static void WriteBasicData(const PbiRawBasicData& subreadData,
+                                 const uint32_t numReads,
+                                 BGZF* fp);
+
+    // per-data-field write
+    template<typename T>
+    static void WriteBgzfVector(BGZF* fp,
+                                const std::vector<T>& data);
+
+private:
+    // helper functions
+    template<typename T>
+    static void SwapEndianness(std::vector<T>& data);
+};
+
+template<typename T>
+inline void PbiIndexIO::LoadBgzfVector(BGZF* fp,
+                                       std::vector<T>& data,
+                                       const uint32_t numReads)
+{
+    assert(fp);
+    data.resize(numReads);
+    bgzf_read(fp, &data[0], numReads*sizeof(T));
+    if (fp->is_be)
+        SwapEndianness(data);
+}
+
+template<typename T>
+inline void PbiIndexIO::SwapEndianness(std::vector<T>& data)
+{
+    const size_t elementSize = sizeof(T);
+    const size_t numReads = data.size();
+    switch (elementSize) {
+        case 1 : break; // no swapping necessary
+        case 2 :
+            for (size_t i = 0; i < numReads; ++i)
+                ed_swap_2p(&data[i]);
+            break;
+        case 4:
+            for (size_t i = 0; i < numReads; ++i)
+                ed_swap_4p(&data[i]);
+            break;
+        case 8:
+            for (size_t i = 0; i < numReads; ++i)
+                ed_swap_8p(&data[i]);
+            break;
+        default:
+            throw std::runtime_error("unsupported element size");
+    }
+}
+
+template<typename T>
+inline void PbiIndexIO::WriteBgzfVector(BGZF* fp,
+                                        const std::vector<T>& data)
+{
+    assert(fp);
+    std::vector<T> output = data;
+    if (fp->is_be)
+        SwapEndianness(output);
+    bgzf_write(fp, &output[0], data.size()*sizeof(T));
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // PBIINDEXIO_H
diff --git a/src/PbiIndexedBamReader.cpp b/src/PbiIndexedBamReader.cpp

new file mode 100644 (file)

index 0000000..e9aeeb7
--- /dev/null
+++ b/src/PbiIndexedBamReader.cpp
@@ -0,0 +1,189 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiIndexedBamReader.cpp
+/// \brief Implements the PbiIndexedBamReader class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiIndexedBamReader.h"
+#include <htslib/bgzf.h>
+
+#include <iostream>
+
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+struct PbiIndexedBamReaderPrivate
+{
+public:
+    PbiIndexedBamReaderPrivate(const string& pbiFilename)
+        : index_(pbiFilename)
+        , currentBlockReadCount_(0)
+    { }
+
+    void ApplyOffsets(void)
+    {
+        const std::vector<int64_t>& fileOffsets = index_.BasicData().fileOffset_;
+        for (IndexResultBlock& block : blocks_)
+            block.virtualOffset_ = fileOffsets.at(block.firstIndex_);
+    }
+
+    void Filter(const PbiFilter& filter)
+    {
+        // store request & reset counters
+        filter_ = filter;
+        currentBlockReadCount_ = 0;
+        blocks_.clear();
+
+        // find blocks of reads passing filter criteria
+        const uint32_t numReads = index_.NumReads();
+        if (numReads == 0) {               // empty PBI - no reads to use
+            return;
+        } else if (filter_.IsEmpty()) {    // empty filter - use all reads
+            blocks_.push_back(IndexResultBlock{0, numReads});
+        } else {
+            IndexList indices;
+            indices.reserve(numReads);
+            for (size_t i = 0; i < numReads; ++i) {
+                if (filter_.Accepts(index_, i))
+                    indices.push_back(i);
+            }
+            blocks_ = mergedIndexBlocks(std::move(indices));
+        }
+
+        // apply offsets
+        ApplyOffsets();
+    }
+
+    int ReadRawData(BGZF* bgzf, bam1_t* b)
+    {
+        // no data to fetch, return false
+        if (blocks_.empty())
+            return -1; // "EOF"
+
+        // if on new block, seek to its first record
+        if (currentBlockReadCount_ == 0) {
+            auto seekResult = bgzf_seek(bgzf, blocks_.at(0).virtualOffset_, SEEK_SET);
+            if (seekResult == -1)
+                throw std::runtime_error("could not seek in BAM file");
+        }
+
+        // read next record
+        auto result = bam_read1(bgzf, b);
+
+        // update counters. if block finished, pop & reset
+        ++currentBlockReadCount_;
+        if (currentBlockReadCount_ == blocks_.at(0).numReads_) {
+            blocks_.pop_front();
+            currentBlockReadCount_ = 0;
+        }
+
+        return result;
+    }
+
+public:
+    PbiFilter filter_;
+    PbiRawData index_;
+    IndexResultBlocks blocks_;
+    size_t currentBlockReadCount_;
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+PbiIndexedBamReader::PbiIndexedBamReader(const PbiFilter& filter,
+                                         const std::string& filename)
+    : PbiIndexedBamReader(filter, BamFile(filename))
+{ }
+
+PbiIndexedBamReader::PbiIndexedBamReader(const PbiFilter& filter,
+                                         const BamFile& bamFile)
+    : PbiIndexedBamReader(bamFile)
+{
+    Filter(filter);
+}
+
+PbiIndexedBamReader::PbiIndexedBamReader(const PbiFilter& filter,
+                                         BamFile&& bamFile)
+    : PbiIndexedBamReader(std::move(bamFile))
+{
+    Filter(filter);
+}
+
+PbiIndexedBamReader::PbiIndexedBamReader(const std::string& bamFilename)
+    : PbiIndexedBamReader(BamFile(bamFilename))
+{ }
+
+PbiIndexedBamReader::PbiIndexedBamReader(const BamFile& bamFile)
+    : BamReader(bamFile)
+    , d_(new internal::PbiIndexedBamReaderPrivate(File().PacBioIndexFilename()))
+{ }
+
+PbiIndexedBamReader::PbiIndexedBamReader(BamFile&& bamFile)
+    : BamReader(std::move(bamFile))
+    , d_(new internal::PbiIndexedBamReaderPrivate(File().PacBioIndexFilename()))
+{ }
+
+PbiIndexedBamReader::~PbiIndexedBamReader(void) { }
+
+int PbiIndexedBamReader::ReadRawData(BGZF* bgzf, bam1_t* b)
+{
+    assert(d_);
+    return d_->ReadRawData(bgzf, b);
+}
+
+const PbiFilter& PbiIndexedBamReader::Filter(void) const
+{
+    assert(d_);
+    return d_->filter_;
+}
+
+PbiIndexedBamReader& PbiIndexedBamReader::Filter(const PbiFilter& filter)
+{
+    assert(d_);
+    d_->Filter(filter);
+    return *this;
+}
+
diff --git a/src/PbiRawData.cpp b/src/PbiRawData.cpp

new file mode 100644 (file)

index 0000000..a0e9d1d
--- /dev/null
+++ b/src/PbiRawData.cpp
@@ -0,0 +1,508 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file PbiRawData.cpp
+/// \brief Implements the classes used for working with raw PBI data.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiRawData.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/BamRecord.h"
+#include "PbiIndexIO.h"
+#include <boost/numeric/conversion/cast.hpp>
+#include <map>
+#include <cassert>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+static
+string ToString(const RecordType type)
+{
+    static const auto lookup = map<RecordType, string>
+    {
+        { RecordType::ZMW,        "ZMW" },
+        { RecordType::HQREGION,   "HQREGION" },
+        { RecordType::SUBREAD,    "SUBREAD" },
+        { RecordType::CCS,        "CCS" },
+        { RecordType::SCRAP,      "SCRAP" },
+        { RecordType::UNKNOWN,    "UNKNOWN" }
+    };
+
+    try {
+        return lookup.at(type);
+    } catch (std::exception&) {
+        throw std::runtime_error("error: unknown RecordType encountered");
+    }
+}
+
+} // namespace internal
+} // namespace BAM
+} // namesapce PacBio
+
+// ----------------------------------
+// PbiRawBarcodeData implementation
+// ----------------------------------
+
+PbiRawBarcodeData::PbiRawBarcodeData(void) { }
+
+PbiRawBarcodeData::PbiRawBarcodeData(uint32_t numReads)
+{
+    bcForward_.reserve(numReads);
+    bcReverse_.reserve(numReads);
+    bcQual_.reserve(numReads);
+}
+
+PbiRawBarcodeData::PbiRawBarcodeData(const PbiRawBarcodeData& other)
+    : bcForward_(other.bcForward_)
+    , bcReverse_(other.bcReverse_)
+    , bcQual_(other.bcQual_)
+{ }
+
+PbiRawBarcodeData::PbiRawBarcodeData(PbiRawBarcodeData&& other)
+    : bcForward_(std::move(other.bcForward_))
+    , bcReverse_(std::move(other.bcReverse_))
+    , bcQual_(std::move(other.bcQual_))
+{ }
+
+PbiRawBarcodeData& PbiRawBarcodeData::operator=(const PbiRawBarcodeData& other)
+{
+    bcForward_ = other.bcForward_;
+    bcReverse_ = other.bcReverse_;
+    bcQual_ = other.bcQual_;
+    return *this;
+}
+
+PbiRawBarcodeData& PbiRawBarcodeData::operator=(PbiRawBarcodeData&& other)
+{
+    bcForward_ = std::move(other.bcForward_);
+    bcReverse_ = std::move(other.bcReverse_);
+    bcQual_ = std::move(other.bcQual_);
+    return *this;
+}
+
+void PbiRawBarcodeData::AddRecord(const BamRecord& b)
+{
+    // check for any barcode data (both required)
+    if (b.HasBarcodes() && b.HasBarcodeQuality()) {
+
+        // fetch data from record
+        const auto barcodes = b.Barcodes();
+        const auto barcodeQuality = b.BarcodeQuality();
+        const auto bcForward = barcodes.first;
+        const auto bcReverse = barcodes.second;
+        const auto bcQuality = boost::numeric_cast<int8_t>(barcodeQuality);
+
+        // only store actual data if all values >= 0
+        if (bcForward >= 0 && bcReverse >=0 && bcQuality >= 0) {
+            bcForward_.push_back(bcForward);
+            bcReverse_.push_back(bcReverse);
+            bcQual_.push_back(bcQuality);
+            return;
+        }
+    }
+
+    // if we get here, at least one value is either missing or is -1
+    bcForward_.push_back(-1);
+    bcReverse_.push_back(-1);
+    bcQual_.push_back(-1);
+}
+
+// ----------------------------------
+// PbiRawMappedData implementation
+// ----------------------------------
+
+PbiRawMappedData::PbiRawMappedData(void) { }
+
+PbiRawMappedData::PbiRawMappedData(uint32_t numReads)
+{
+    tId_.reserve(numReads);
+    tStart_.reserve(numReads);
+    tEnd_.reserve(numReads);
+    aStart_.reserve(numReads);
+    aEnd_.reserve(numReads);
+    revStrand_.reserve(numReads);
+    nM_.reserve(numReads);
+    nMM_.reserve(numReads);
+    mapQV_.reserve(numReads);
+}
+
+PbiRawMappedData::PbiRawMappedData(const PbiRawMappedData& other)
+    : tId_(other.tId_)
+    , tStart_(other.tStart_)
+    , tEnd_(other.tEnd_)
+    , aStart_(other.aStart_)
+    , aEnd_(other.aEnd_)
+    , revStrand_(other.revStrand_)
+    , nM_(other.nM_)
+    , nMM_(other.nMM_)
+    , mapQV_(other.mapQV_)
+{ }
+
+PbiRawMappedData::PbiRawMappedData(PbiRawMappedData&& other)
+    : tId_(std::move(other.tId_))
+    , tStart_(std::move(other.tStart_))
+    , tEnd_(std::move(other.tEnd_))
+    , aStart_(std::move(other.aStart_))
+    , aEnd_(std::move(other.aEnd_))
+    , revStrand_(std::move(other.revStrand_))
+    , nM_(std::move(other.nM_))
+    , nMM_(std::move(other.nMM_))
+    , mapQV_(std::move(other.mapQV_))
+{ }
+
+PbiRawMappedData& PbiRawMappedData::operator=(const PbiRawMappedData& other)
+{
+    tId_ = other.tId_;
+    tStart_ = other.tStart_;
+    tEnd_ = other.tEnd_;
+    aStart_ = other.aStart_;
+    aEnd_ = other.aEnd_;
+    revStrand_ = other.revStrand_;
+    nM_ = other.nM_;
+    nMM_ = other.nMM_;
+    mapQV_ = other.mapQV_;
+    return *this;
+}
+
+PbiRawMappedData& PbiRawMappedData::operator=(PbiRawMappedData&& other)
+{
+    tId_ = std::move(other.tId_);
+    tStart_ = std::move(other.tStart_);
+    tEnd_ = std::move(other.tEnd_);
+    aStart_ = std::move(other.aStart_);
+    aEnd_ = std::move(other.aEnd_);
+    revStrand_ = std::move(other.revStrand_);
+    nM_ = std::move(other.nM_);
+    nMM_ = std::move(other.nMM_);
+    mapQV_ = std::move(other.mapQV_);
+    return *this;
+}
+
+void PbiRawMappedData::AddRecord(const BamRecord& b)
+{
+    tId_.push_back(b.ReferenceId());
+    tStart_.push_back(b.ReferenceStart());
+    tEnd_.push_back(b.ReferenceEnd());
+    aStart_.push_back(b.AlignedStart());
+    aEnd_.push_back(b.AlignedEnd());
+    revStrand_.push_back( (b.AlignedStrand() == Strand::REVERSE ? 1 : 0) );
+    mapQV_.push_back(b.MapQuality());
+
+    const auto matchesAndMismatches = b.NumMatchesAndMismatches();
+    nM_.push_back(matchesAndMismatches.first);
+    nMM_.push_back(matchesAndMismatches.second);
+}
+
+uint32_t PbiRawMappedData::NumDeletedBasesAt(size_t recordIndex) const
+{ return NumDeletedAndInsertedBasesAt(recordIndex).first; }
+
+std::pair<uint32_t, uint32_t> PbiRawMappedData::NumDeletedAndInsertedBasesAt(size_t recordIndex) const
+{
+    const auto aStart = aStart_.at(recordIndex);
+    const auto aEnd   = aEnd_.at(recordIndex);
+    const auto tStart = tStart_.at(recordIndex);
+    const auto tEnd   = tEnd_.at(recordIndex);
+    const auto nM     = nM_.at(recordIndex);
+    const auto nMM    = nMM_.at(recordIndex);
+    const auto numIns = (aEnd - aStart - nM - nMM);
+    const auto numDel = (tEnd - tStart - nM - nMM);
+    return std::make_pair(numDel, numIns);
+}
+
+uint32_t PbiRawMappedData::NumInsertedBasesAt(size_t recordIndex) const
+{ return NumDeletedAndInsertedBasesAt(recordIndex).second; }
+
+// ------------------------------------
+// PbiReferenceEntry implementation
+// ------------------------------------
+
+const PbiReferenceEntry::ID  PbiReferenceEntry::UNMAPPED_ID = static_cast<PbiReferenceEntry::ID>(-1);
+const PbiReferenceEntry::Row PbiReferenceEntry::UNSET_ROW   = static_cast<PbiReferenceEntry::Row>(-1);
+
+PbiReferenceEntry::PbiReferenceEntry(void)
+    : tId_(UNMAPPED_ID)
+    , beginRow_(UNSET_ROW)
+    , endRow_(UNSET_ROW)
+{ }
+
+PbiReferenceEntry::PbiReferenceEntry(ID id)
+    : tId_(id)
+    , beginRow_(UNSET_ROW)
+    , endRow_(UNSET_ROW)
+{ }
+
+PbiReferenceEntry::PbiReferenceEntry(ID id, Row beginRow, Row endRow)
+    : tId_(id)
+    , beginRow_(beginRow)
+    , endRow_(endRow)
+{ }
+
+PbiReferenceEntry::PbiReferenceEntry(const PbiReferenceEntry& other)
+    : tId_(other.tId_)
+    , beginRow_(other.beginRow_)
+    , endRow_(other.endRow_)
+{ }
+
+PbiReferenceEntry::PbiReferenceEntry(PbiReferenceEntry&& other)
+    : tId_(std::move(other.tId_))
+    , beginRow_(std::move(other.beginRow_))
+    , endRow_(std::move(other.endRow_))
+{ }
+
+PbiReferenceEntry& PbiReferenceEntry::operator=(const PbiReferenceEntry& other)
+{
+    tId_ = other.tId_;
+    beginRow_ = other.beginRow_;
+    endRow_ = other.endRow_;
+    return *this;
+}
+
+PbiReferenceEntry& PbiReferenceEntry::operator=(PbiReferenceEntry&& other)
+{
+    tId_ = std::move(other.tId_);
+    beginRow_ = std::move(other.beginRow_);
+    endRow_ = std::move(other.endRow_);
+    return *this;
+}
+
+// ------------------------------------
+// PbiRawReferenceData implementation
+// ------------------------------------
+
+PbiRawReferenceData::PbiRawReferenceData(void) { }
+
+PbiRawReferenceData::PbiRawReferenceData(uint32_t numRefs)
+{  entries_.reserve(numRefs); }
+
+PbiRawReferenceData::PbiRawReferenceData(const PbiRawReferenceData& other)
+    : entries_(other.entries_)
+{ }
+
+PbiRawReferenceData::PbiRawReferenceData(PbiRawReferenceData&& other)
+    : entries_(std::move(other.entries_))
+{ }
+
+PbiRawReferenceData& PbiRawReferenceData::operator=(const PbiRawReferenceData& other)
+{
+    entries_ = other.entries_;
+    return *this;
+}
+
+PbiRawReferenceData& PbiRawReferenceData::operator=(PbiRawReferenceData&& other)
+{
+    entries_ = std::move(other.entries_);
+    return *this;
+}
+
+// ----------------------------------
+// PbiRawSubreadData implementation
+// ----------------------------------
+
+PbiRawBasicData::PbiRawBasicData(void) { }
+
+PbiRawBasicData::PbiRawBasicData(uint32_t numReads)
+{
+    rgId_.reserve(numReads);
+    qStart_.reserve(numReads);
+    qEnd_.reserve(numReads);
+    holeNumber_.reserve(numReads);
+    readQual_.reserve(numReads);
+    ctxtFlag_.reserve(numReads);
+    fileOffset_.reserve(numReads);
+    fileNumber_.reserve(numReads);
+}
+
+PbiRawBasicData::PbiRawBasicData(const PbiRawBasicData& other)
+    : rgId_(other.rgId_)
+    , qStart_(other.qStart_)
+    , qEnd_(other.qEnd_)
+    , holeNumber_(other.holeNumber_)
+    , readQual_(other.readQual_)
+    , ctxtFlag_(other.ctxtFlag_)
+    , fileOffset_(other.fileOffset_)
+    , fileNumber_(other.fileNumber_)
+{ }
+
+PbiRawBasicData::PbiRawBasicData(PbiRawBasicData&& other)
+    : rgId_(std::move(other.rgId_))
+    , qStart_(std::move(other.qStart_))
+    , qEnd_(std::move(other.qEnd_))
+    , holeNumber_(std::move(other.holeNumber_))
+    , readQual_(std::move(other.readQual_))
+    , ctxtFlag_(std::move(other.ctxtFlag_))
+    , fileOffset_(std::move(other.fileOffset_))
+    , fileNumber_(std::move(other.fileNumber_))
+{ }
+
+PbiRawBasicData& PbiRawBasicData::operator=(const PbiRawBasicData& other)
+{
+    rgId_ = other.rgId_;
+    qStart_ = other.qStart_;
+    qEnd_ = other.qEnd_;
+    holeNumber_ = other.holeNumber_;
+    readQual_ = other.readQual_;
+    ctxtFlag_ = other.ctxtFlag_;
+    fileOffset_ = other.fileOffset_;
+    fileNumber_ = other.fileNumber_;
+    return *this;
+}
+
+PbiRawBasicData& PbiRawBasicData::operator=(PbiRawBasicData&& other)
+{
+    rgId_ = std::move(other.rgId_);
+    qStart_ = std::move(other.qStart_);
+    qEnd_ = std::move(other.qEnd_);
+    holeNumber_ = std::move(other.holeNumber_);
+    readQual_ = std::move(other.readQual_);
+    ctxtFlag_ = std::move(other.ctxtFlag_);
+    fileOffset_ = std::move(other.fileOffset_);
+    fileNumber_ = std::move(other.fileNumber_);
+    return *this;
+}
+
+void PbiRawBasicData::AddRecord(const BamRecord& b, int64_t offset)
+{
+    // read group ID
+    auto rgId = b.ReadGroupId();
+    if (rgId.empty())
+        rgId = MakeReadGroupId(b.MovieName(), internal::ToString(b.Type()));
+    const uint32_t rawid = std::stoul(rgId, nullptr, 16);
+    const int32_t id = static_cast<int32_t>(rawid);
+    rgId_.push_back(id);
+
+    // query start/end
+    if (b.Type() == RecordType::CCS) {
+        qStart_.push_back(-1);
+        qEnd_.push_back(-1);
+    } else {
+        qStart_.push_back(b.QueryStart());
+        qEnd_.push_back(b.QueryEnd());
+    }
+
+    // add'l basic data
+    holeNumber_.push_back(b.HasHoleNumber() ? b.HoleNumber() : 0);
+    readQual_.push_back(b.HasReadAccuracy() ? static_cast<float>(b.ReadAccuracy()) : 0.0f);
+    ctxtFlag_.push_back(b.HasLocalContextFlags() ? b.LocalContextFlags() : LocalContextFlags::NO_LOCAL_CONTEXT);
+
+    // virtual offset of record start
+    fileOffset_.push_back(offset);
+
+    // default file number
+    fileNumber_.push_back(0);
+}
+
+// ----------------------------------
+// PbiRawData implementation
+// ----------------------------------
+
+PbiRawData::PbiRawData(void)
+    : version_(PbiFile::CurrentVersion)
+    , sections_(PbiFile::ALL)
+    , numReads_(0)
+{ }
+
+PbiRawData::PbiRawData(const string& pbiFilename)
+    : filename_(pbiFilename)
+    , version_(PbiFile::CurrentVersion)
+    , sections_(PbiFile::ALL)
+    , numReads_(0)
+{
+    internal::PbiIndexIO::Load(*this, pbiFilename);
+}
+
+PbiRawData::PbiRawData(const DataSet& dataset)
+    : version_(PbiFile::CurrentVersion)
+    , sections_(PbiFile::BASIC | PbiFile::MAPPED | PbiFile::BARCODE)
+    , numReads_(0)
+{
+    internal::PbiIndexIO::LoadFromDataSet(*this, dataset);
+}
+
+PbiRawData::PbiRawData(const PbiRawData& other)
+    : filename_(other.filename_)
+    , version_(other.version_)
+    , sections_(other.sections_)
+    , numReads_(other.numReads_)
+    , barcodeData_(other.barcodeData_)
+    , mappedData_(other.mappedData_)
+    , referenceData_(other.referenceData_)
+    , basicData_(other.basicData_)
+{ }
+
+PbiRawData::PbiRawData(PbiRawData&& other)
+    : filename_(std::move(other.filename_))
+    , version_(std::move(other.version_))
+    , sections_(std::move(other.sections_))
+    , numReads_(std::move(other.numReads_))
+    , barcodeData_(std::move(other.barcodeData_))
+    , mappedData_(std::move(other.mappedData_))
+    , referenceData_(std::move(other.referenceData_))
+    , basicData_(std::move(other.basicData_))
+{ }
+
+PbiRawData& PbiRawData::operator=(const PbiRawData& other)
+{
+    filename_ = other.filename_;
+    version_ = other.version_;
+    sections_ = other.sections_;
+    numReads_ = other.numReads_;
+    barcodeData_ = other.barcodeData_;
+    mappedData_ = other.mappedData_;
+    referenceData_ = other.referenceData_;
+    basicData_ = other.basicData_;
+    return *this;
+}
+
+PbiRawData& PbiRawData::operator=(PbiRawData&& other)
+{
+    filename_ = std::move(other.filename_);
+    version_ = std::move(other.version_);
+    sections_ = std::move(other.sections_);
+    numReads_ = std::move(other.numReads_);
+    barcodeData_ = std::move(other.barcodeData_);
+    mappedData_ = std::move(other.mappedData_);
+    referenceData_ = std::move(other.referenceData_);
+    basicData_ = std::move(other.basicData_);
+    return *this;
+}
+
+PbiRawData::~PbiRawData(void) { }
diff --git a/src/ProgramInfo.cpp b/src/ProgramInfo.cpp

new file mode 100644 (file)

index 0000000..75f193a
--- /dev/null
+++ b/src/ProgramInfo.cpp
@@ -0,0 +1,165 @@
+
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ProgramInfo.cpp
+/// \brief Implements the ProgramInfo class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/ProgramInfo.h"
+#include "SequenceUtils.h"
+#include <sstream>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+static string token_ID = string("ID");
+static string token_CL = string("CL");
+static string token_DS = string("DS");
+static string token_PN = string("PN");
+static string token_PP = string("PP");
+static string token_VN = string("VN");
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+ProgramInfo::ProgramInfo(void) { }
+
+ProgramInfo::ProgramInfo(const std::string& id)
+    : id_(id)
+{ }
+
+ProgramInfo::ProgramInfo(const ProgramInfo& other)
+    : commandLine_(other.commandLine_)
+    , description_(other.description_)
+    , id_(other.id_)
+    , name_(other.name_)
+    , previousProgramId_(other.previousProgramId_)
+    , version_(other.version_)
+{  }
+
+ProgramInfo::ProgramInfo(ProgramInfo&& other)
+    : commandLine_(std::move(other.commandLine_))
+    , description_(std::move(other.description_))
+    , id_(std::move(other.id_))
+    , name_(std::move(other.name_))
+    , previousProgramId_(std::move(other.previousProgramId_))
+    , version_(std::move(other.version_))
+{ }
+
+ProgramInfo::~ProgramInfo(void) { }
+
+ProgramInfo& ProgramInfo::operator=(const ProgramInfo& other)
+{
+    commandLine_ = other.commandLine_;
+    description_ = other.description_;
+    id_ = other.id_;
+    name_ = other.name_;
+    previousProgramId_ = other.previousProgramId_;
+    version_ = other.version_;
+    return *this;
+}
+
+ProgramInfo& ProgramInfo::operator=(ProgramInfo&& other)
+{
+    commandLine_ = std::move(other.commandLine_);
+    description_ = std::move(other.description_);
+    id_ = std::move(other.id_);
+    name_ = std::move(other.name_);
+    previousProgramId_ = std::move(other.previousProgramId_);
+    version_ = std::move(other.version_);
+    return *this;
+}
+
+ProgramInfo ProgramInfo::FromSam(const string& sam)
+{
+    // pop off '@PG\t', then split rest of line into tokens
+    const vector<string>& tokens = internal::Split(sam.substr(4), '\t');
+    if (tokens.empty())
+        return ProgramInfo();
+
+    ProgramInfo prog;
+    map<string, string> custom;
+
+    // iterate over tokens
+    for (const string& token : tokens) {
+        const string& tokenTag   = token.substr(0,2);
+        const string& tokenValue = token.substr(3);
+
+        // set program contents
+        if      (tokenTag == internal::token_ID) prog.Id(tokenValue);
+        else if (tokenTag == internal::token_CL) prog.CommandLine(tokenValue);
+        else if (tokenTag == internal::token_DS) prog.Description(tokenValue);
+        else if (tokenTag == internal::token_PN) prog.Name(tokenValue);
+        else if (tokenTag == internal::token_PP) prog.PreviousProgramId(tokenValue);
+        else if (tokenTag == internal::token_VN) prog.Version(tokenValue);
+
+        // otherwise, "custom" tag
+        else
+            custom[tokenTag] = tokenValue;
+    }
+
+    prog.CustomTags(custom);
+    return prog;
+}
+
+string ProgramInfo::ToSam(void) const
+{
+    stringstream out;
+    out << "@PG"
+        << internal::MakeSamTag(internal::token_ID, id_);
+
+    if (!name_.empty())              out << internal::MakeSamTag(internal::token_PN, name_);
+    if (!version_.empty())           out << internal::MakeSamTag(internal::token_VN, version_);
+    if (!description_.empty())       out << internal::MakeSamTag(internal::token_DS, description_);
+    if (!previousProgramId_.empty()) out << internal::MakeSamTag(internal::token_PP, previousProgramId_);
+    if (!commandLine_.empty())       out << internal::MakeSamTag(internal::token_CL, commandLine_);
+
+    // append any custom tags
+    map<string, string>::const_iterator customIter = custom_.cbegin();
+    map<string, string>::const_iterator customEnd  = custom_.cend();
+    for ( ; customIter != customEnd; ++customIter )
+        out << internal::MakeSamTag(customIter->first, customIter->second);
+
+    return out.str();
+}
+
diff --git a/src/Pulse2BaseCache.h b/src/Pulse2BaseCache.h

new file mode 100644 (file)

index 0000000..ae5bb1c
--- /dev/null
+++ b/src/Pulse2BaseCache.h
@@ -0,0 +1,154 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// Author: Derek Barnett
+
+#ifndef PULSE2BASECACHE_H
+#define PULSE2BASECACHE_H
+
+#include "pbbam/Config.h"
+#include <boost/dynamic_bitset.hpp>
+#include <string>
+#include <cassert>
+#include <cctype>
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+class Pulse2BaseCache
+{
+public:
+    /// \brief Creates a Pulse2BaseCache from pulseCall data ('pc' tag)
+    ///
+    /// Computes & stores cache of basecalled vs. squashed pulse positions for
+    /// later masking of pulse data.
+    ///
+    /// \param pulseCalls[in]   string contents of 'pc' tag
+    ///
+    Pulse2BaseCache(const std::string& pulseCalls)
+        : data_(pulseCalls.size())
+    {
+        // basecalled pulse -> data[i] == 1
+        // squashed pulse   -> data[i] == 0
+        //
+        const auto numPulses = pulseCalls.size();
+        for (size_t i = 0; i < numPulses; ++i)
+            data_[i] = std::isupper(pulseCalls.at(i));
+    }
+
+    Pulse2BaseCache(void) = delete;
+    Pulse2BaseCache(const Pulse2BaseCache& other) = default;
+    Pulse2BaseCache(Pulse2BaseCache&& other) = default;
+    Pulse2BaseCache& operator=(const Pulse2BaseCache&) = default;
+    Pulse2BaseCache& operator=(Pulse2BaseCache&&) = default;
+    ~Pulse2BaseCache(void) noexcept {}
+
+public:
+
+    ///
+    /// \brief FindFirst
+    /// \return
+    ///
+    size_t FindFirst(void) const
+    { return data_.find_first(); }
+
+    ///
+    /// \brief FindNext
+    /// \param from
+    /// \return
+    ///
+    size_t FindNext(size_t from) const
+    { return data_.find_next(from); }
+
+    ///
+    /// \brief IsBasecallAt
+    /// \param pos
+    /// \return
+    ///
+    bool IsBasecallAt(const size_t pos) const
+    { return data_[pos]; }
+
+    /// \returns the total number of pulses (basecalled & squashed)
+    ///
+    size_t NumPulses(void) const
+    {
+        return data_.size();
+    }
+
+    /// \returns the total number of basecalled pulses
+    ///
+    size_t NumBases(void) const
+    {
+        return data_.count();
+    }
+
+    /// \brief Removes squashed pulse positions from input data.
+    ///
+    /// \param[in]  Contents of any per-pulse tag.
+    /// \returns    Input \p pulseData less all squashed pulses
+    ///
+    template<typename T>
+    T RemoveSquashedPulses(const T& pulseData) const
+    {
+        const auto numPulses = pulseData.size();
+        assert(numPulses == data_.size());
+
+        // The reserve() below overshoots the required space, but numPulses is cheap
+        // to compute, and by definition will be sufficient to hold the result. Thus
+        // we only ever need to do one allocation.
+        //
+        T result;
+        result.reserve(numPulses);
+
+        // Only include data at positions that match our cached pulse data.
+        //
+        size_t inputIndex = 0;
+        for (size_t i = 0; i < numPulses; ++i) {
+            if (data_[i])
+                result.push_back(pulseData.at(inputIndex));
+            ++inputIndex;
+        }
+        return result;
+    }
+
+private:
+    boost::dynamic_bitset<> data_;
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // PULSE2BASECACHE_H
diff --git a/src/QNameQuery.cpp b/src/QNameQuery.cpp

new file mode 100644 (file)

index 0000000..e544664
--- /dev/null
+++ b/src/QNameQuery.cpp
@@ -0,0 +1,104 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file QNameQuery.cpp
+/// \brief Implements the QNameQuery class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/QNameQuery.h"
+#include "pbbam/CompositeBamReader.h"
+#include <boost/optional.hpp>
+#include <cassert>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+struct QNameQuery::QNameQueryPrivate
+{
+public:
+    QNameQueryPrivate(const DataSet& dataset)
+        : reader_(new SequentialCompositeBamReader(dataset))
+        , nextRecord_(boost::none)
+    { }
+
+    bool GetNext(vector<BamRecord>& records)
+    {
+        records.clear();
+
+        string groupRecordName;
+
+        if (nextRecord_.is_initialized()) {
+            BamRecord r = nextRecord_.get();
+            groupRecordName = r.FullName();
+            records.push_back(std::move(r));
+            nextRecord_ = boost::none;
+        }
+
+        BamRecord record;
+        while (reader_->GetNext(record)) {
+            if (records.empty()) {
+                groupRecordName = record.FullName();
+                records.push_back(record);
+            }
+            else {
+                assert(!records.empty());
+                if (record.FullName() == groupRecordName)
+                    records.push_back(record);
+                else {
+                    nextRecord_ = record;
+                    return true;
+                }
+            }
+        }
+        return !records.empty();
+    }
+
+public:
+    unique_ptr<SequentialCompositeBamReader> reader_;
+    boost::optional<BamRecord> nextRecord_;
+};
+
+QNameQuery::QNameQuery(const DataSet& dataset)
+    : internal::IGroupQuery()
+    , d_(new QNameQueryPrivate(dataset))
+{ }
+
+QNameQuery::~QNameQuery(void) { }
+
+bool QNameQuery::GetNext(vector<BamRecord>& records)
+{ return d_->GetNext(records); }
diff --git a/src/QualityValue.cpp b/src/QualityValue.cpp

new file mode 100644 (file)

index 0000000..e9f63c9
--- /dev/null
+++ b/src/QualityValue.cpp
@@ -0,0 +1,47 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file QualityValue.h
+/// \brief Implements the QualityValue class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/QualityValue.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+const uint8_t QualityValue::MAX = 93;
diff --git a/src/ReadAccuracyQuery.cpp b/src/ReadAccuracyQuery.cpp

new file mode 100644 (file)

index 0000000..8535189
--- /dev/null
+++ b/src/ReadAccuracyQuery.cpp
@@ -0,0 +1,71 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ReadAccuracyQuery.cpp
+/// \brief Implements the ReadAccuracyQuery class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/ReadAccuracyQuery.h"
+#include "pbbam/PbiFilterTypes.h"
+#include "pbbam/CompositeBamReader.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+struct ReadAccuracyQuery::ReadAccuracyQueryPrivate
+{
+    ReadAccuracyQueryPrivate(const Accuracy accuracy,
+                             const Compare::Type compareType,
+                             const DataSet& dataset)
+        : reader_(PbiReadAccuracyFilter(accuracy, compareType), dataset)
+    { }
+
+    PbiFilterCompositeBamReader<Compare::None> reader_; // unsorted
+};
+
+ReadAccuracyQuery::ReadAccuracyQuery(const Accuracy accuracy,
+                                     const Compare::Type compareType,
+                                     const DataSet& dataset)
+    : internal::IQuery()
+    , d_(new ReadAccuracyQueryPrivate(accuracy, compareType, dataset))
+{ }
+
+ReadAccuracyQuery::~ReadAccuracyQuery(void) { }
+
+bool ReadAccuracyQuery::GetNext(BamRecord &r)
+{ return d_->reader_.GetNext(r); }
diff --git a/src/ReadGroupInfo.cpp b/src/ReadGroupInfo.cpp

new file mode 100644 (file)

index 0000000..765c60e
--- /dev/null
+++ b/src/ReadGroupInfo.cpp
@@ -0,0 +1,803 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ReadGroupInfo.cpp
+/// \brief Implements the ReadGroupInfo class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/ReadGroupInfo.h"
+#include "pbbam/MD5.h"
+#include "ChemistryTable.h"
+#include "SequenceUtils.h"
+#include <iomanip>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <cstdio>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+static const string sam_ID = string{ "ID" };
+static const string sam_CN = string{ "CN" };
+static const string sam_DS = string{ "DS" };
+static const string sam_DT = string{ "DT" };
+static const string sam_FO = string{ "FO" };
+static const string sam_KS = string{ "KS" };
+static const string sam_LB = string{ "LB" };
+static const string sam_PG = string{ "PG" };
+static const string sam_PI = string{ "PI" };
+static const string sam_PL = string{ "PL" };
+static const string sam_PM = string{ "PM" };
+static const string sam_PU = string{ "PU" };
+static const string sam_SM = string{ "SM" };
+
+static const string feature_DQ = string{ "DeletionQV" };
+static const string feature_DT = string{ "DeletionTag" };
+static const string feature_IQ = string{ "InsertionQV" };
+static const string feature_MQ = string{ "MergeQV" };
+static const string feature_SQ = string{ "SubstitutionQV" };
+static const string feature_ST = string{ "SubstitutionTag" };
+static const string feature_IP = string{ "Ipd" };
+static const string feature_PW = string{ "PulseWidth" };
+static const string feature_PM = string{ "PkMid" };
+static const string feature_PA = string{ "PkMean" };
+static const string feature_PI = string{ "PkMid2" };
+static const string feature_PS = string{ "PkMean2" };
+static const string feature_LT = string{ "Label" };
+static const string feature_PQ = string{ "LabelQV" };
+static const string feature_PT = string{ "AltLabel" };
+static const string feature_PV = string{ "AltLabelQV" };
+static const string feature_PG = string{ "PulseMergeQV" };
+static const string feature_PC = string{ "PulseCall" };
+static const string feature_PD = string{ "PrePulseFrames" };
+static const string feature_PX = string{ "PulseCallWidth" };
+static const string feature_SF = string{ "StartFrame" };
+
+static const string token_RT = string{ "READTYPE" };
+static const string token_BK = string{ "BINDINGKIT" };
+static const string token_SK = string{ "SEQUENCINGKIT" };
+static const string token_BV = string{ "BASECALLERVERSION" };
+static const string token_FR = string{ "FRAMERATEHZ" };
+static const string token_CT = string{ "CONTROL" };
+
+static const string token_BF = string{ "BarcodeFile" };
+static const string token_BH = string{ "BarcodeHash" };
+static const string token_BC = string{ "BarcodeCount" };
+static const string token_BM = string{ "BarcodeMode" };
+static const string token_BQ = string{ "BarcodeQuality" };
+
+static const string codec_RAW = string{ "Frames" };
+static const string codec_V1  = string{ "CodecV1" };
+
+static const string barcodemode_NONE = string{ "None" };
+static const string barcodemode_SYM  = string{ "Symmetric" };
+static const string barcodemode_ASYM = string{ "Asymmetric" };
+
+static const string barcodequal_NONE  = string{ "None" };
+static const string barcodequal_SCORE = string{ "Score" };
+static const string barcodequal_PROB  = string{ "Probability" };
+
+static const string platformModelType_ASTRO  = string{ "ASTRO" };
+static const string platformModelType_RS     = string{ "RS" };
+static const string platformModelType_SEQUEL = string{ "SEQUEL" };
+
+static
+string BaseFeatureName(const BaseFeature& feature)
+{
+    switch(feature) {
+        case BaseFeature::DELETION_QV      : return feature_DQ;
+        case BaseFeature::DELETION_TAG     : return feature_DT;
+        case BaseFeature::INSERTION_QV     : return feature_IQ;
+        case BaseFeature::MERGE_QV         : return feature_MQ;
+        case BaseFeature::SUBSTITUTION_QV  : return feature_SQ;
+        case BaseFeature::SUBSTITUTION_TAG : return feature_ST;
+        case BaseFeature::IPD              : return feature_IP;
+        case BaseFeature::PULSE_WIDTH      : return feature_PW;
+        case BaseFeature::PKMID            : return feature_PM;
+        case BaseFeature::PKMEAN           : return feature_PA;
+        case BaseFeature::PKMID2           : return feature_PI;
+        case BaseFeature::PKMEAN2          : return feature_PS;
+        case BaseFeature::LABEL_QV         : return feature_PQ;
+        case BaseFeature::ALT_LABEL        : return feature_PT;
+        case BaseFeature::ALT_LABEL_QV     : return feature_PV;
+        case BaseFeature::PULSE_MERGE_QV   : return feature_PG;
+        case BaseFeature::PULSE_CALL       : return feature_PC;
+        case BaseFeature::PRE_PULSE_FRAMES : return feature_PD;
+        case BaseFeature::PULSE_CALL_WIDTH : return feature_PX;
+    case BaseFeature::START_FRAME          : return feature_SF;
+        default:
+            throw std::runtime_error{ "unrecognized base feature" };
+    }
+    return string{ }; // unreachable
+}
+
+static
+string FrameCodecName(const FrameCodec& codec)
+{
+    switch (codec) {
+        case FrameCodec::RAW : return codec_RAW;
+        case FrameCodec::V1  : return codec_V1;
+        default:
+            throw std::runtime_error{ "unrecognized frame codec" };
+    }
+    return string{ }; // unreachable
+}
+
+static
+string BarcodeModeName(const BarcodeModeType& mode)
+{
+    switch (mode) {
+        case BarcodeModeType::NONE       : return barcodemode_NONE;
+        case BarcodeModeType::SYMMETRIC  : return barcodemode_SYM;
+        case BarcodeModeType::ASYMMETRIC : return barcodemode_ASYM;
+        default:
+            throw std::runtime_error{ "unrecognized barcode mode" };
+    }
+    return string{ }; // unreachable
+}
+
+static
+string BarcodeQualityName(const BarcodeQualityType& type)
+{
+    switch (type) {
+        case BarcodeQualityType::NONE  : return barcodequal_NONE;
+        case BarcodeQualityType::SCORE : return barcodequal_SCORE;
+        case BarcodeQualityType::PROBABILITY : return barcodequal_PROB;
+        default:
+            throw std::runtime_error{ "unrecognized barcode quality type" };
+    }
+    return string{ }; // unreachable
+}
+
+static
+string PlatformModelName(const PlatformModelType& type)
+{
+    switch (type) {
+        case PlatformModelType::ASTRO  : return platformModelType_ASTRO;
+        case PlatformModelType::RS     : return platformModelType_RS;
+        case PlatformModelType::SEQUEL : return platformModelType_SEQUEL;
+        default:
+            throw std::runtime_error{ "unrecognized platform model" };
+    }
+    return string{ }; // unreachable
+}
+
+static map<string, BaseFeature>        nameToFeature;
+static map<string, FrameCodec>         nameToCodec;
+static map<string, BarcodeModeType>    nameToBarcodeMode;
+static map<string, BarcodeQualityType> nameToBarcodeQuality;
+static map<string, PlatformModelType>  nameToPlatformModel;
+
+static inline
+void InitNameToFeature(void)
+{
+    if (nameToFeature.empty()) {
+        nameToFeature[feature_DQ] = BaseFeature::DELETION_QV;
+        nameToFeature[feature_DT] = BaseFeature::DELETION_TAG;
+        nameToFeature[feature_IQ] = BaseFeature::INSERTION_QV;
+        nameToFeature[feature_MQ] = BaseFeature::MERGE_QV;
+        nameToFeature[feature_SQ] = BaseFeature::SUBSTITUTION_QV;
+        nameToFeature[feature_ST] = BaseFeature::SUBSTITUTION_TAG;
+        nameToFeature[feature_IP] = BaseFeature::IPD;
+        nameToFeature[feature_PW] = BaseFeature::PULSE_WIDTH;
+        nameToFeature[feature_PM] = BaseFeature::PKMID;
+        nameToFeature[feature_PA] = BaseFeature::PKMEAN;
+        nameToFeature[feature_PI] = BaseFeature::PKMID2;
+        nameToFeature[feature_PS] = BaseFeature::PKMEAN2;
+        nameToFeature[feature_PQ] = BaseFeature::LABEL_QV;
+        nameToFeature[feature_PT] = BaseFeature::ALT_LABEL;
+        nameToFeature[feature_PV] = BaseFeature::ALT_LABEL_QV;
+        nameToFeature[feature_PC] = BaseFeature::PULSE_CALL;
+        nameToFeature[feature_PG] = BaseFeature::PULSE_MERGE_QV;
+        nameToFeature[feature_PD] = BaseFeature::PRE_PULSE_FRAMES;
+        nameToFeature[feature_PX] = BaseFeature::PULSE_CALL_WIDTH;
+        nameToFeature[feature_SF] = BaseFeature::START_FRAME;
+    }
+}
+
+static inline
+void InitNameToCodec(void)
+{
+    if (nameToCodec.empty()) {
+        nameToCodec[codec_RAW] = FrameCodec::RAW;
+        nameToCodec[codec_V1]  = FrameCodec::V1;
+    }
+}
+
+static inline
+void InitNameToBarcodeMode(void)
+{
+    if (nameToBarcodeMode.empty()) {
+        nameToBarcodeMode[barcodemode_NONE] = BarcodeModeType::NONE;
+        nameToBarcodeMode[barcodemode_SYM]  = BarcodeModeType::SYMMETRIC;
+        nameToBarcodeMode[barcodemode_ASYM] = BarcodeModeType::ASYMMETRIC;
+    }
+}
+
+static inline
+void InitNameToBarcodeQuality(void)
+{
+    if (nameToBarcodeQuality.empty()) {
+        nameToBarcodeQuality[barcodequal_NONE]  = BarcodeQualityType::NONE;
+        nameToBarcodeQuality[barcodequal_SCORE] = BarcodeQualityType::SCORE;
+        nameToBarcodeQuality[barcodequal_PROB]  = BarcodeQualityType::PROBABILITY;
+    }
+}
+
+static inline
+void InitNameToPlatformModel(void)
+{
+    if (nameToPlatformModel.empty()) {
+        nameToPlatformModel[platformModelType_ASTRO]  = PlatformModelType::ASTRO;
+        nameToPlatformModel[platformModelType_RS]     = PlatformModelType::RS;
+        nameToPlatformModel[platformModelType_SEQUEL] = PlatformModelType::SEQUEL;
+    }
+}
+
+static inline
+bool IsLikelyBarcodeKey(const string& name)
+{ return name.find("Barcode") == 0; }
+
+static inline
+bool IsBaseFeature(const string& name)
+{
+    InitNameToFeature();
+    return nameToFeature.find(name) != nameToFeature.cend();
+}
+
+static inline
+BaseFeature BaseFeatureFromName(const string& name)
+{
+    InitNameToFeature();
+    return nameToFeature.at(name);
+}
+
+static inline
+FrameCodec FrameCodecFromName(const string& name)
+{
+    InitNameToCodec();
+    return nameToCodec.at(name);
+}
+
+static inline
+BarcodeModeType BarcodeModeFromName(const string& name)
+{
+    InitNameToBarcodeMode();
+    return nameToBarcodeMode.at(name);
+}
+
+static inline
+BarcodeQualityType BarcodeQualityFromName(const string& name)
+{
+    InitNameToBarcodeQuality();
+    return nameToBarcodeQuality.at(name);
+}
+
+static inline
+PlatformModelType PlatformModelFromName(const string& name)
+{
+    InitNameToPlatformModel();
+    return nameToPlatformModel.at(name);
+}
+
+} // namespace internal
+
+ReadGroupInfo::ReadGroupInfo(void)
+    : platformModel_(PlatformModelType::SEQUEL)
+    , readType_("UNKNOWN")
+    , ipdCodec_(FrameCodec::V1)
+    , pulseWidthCodec_(FrameCodec::V1)
+{ }
+
+ReadGroupInfo::ReadGroupInfo(const std::string& id)
+    : id_(id)
+    , platformModel_(PlatformModelType::SEQUEL)
+    , readType_("UNKNOWN")
+    , ipdCodec_(FrameCodec::V1)
+    , pulseWidthCodec_(FrameCodec::V1)
+{ }
+
+ReadGroupInfo::ReadGroupInfo(const std::string& movieName,
+                             const std::string& readType)
+    : id_(MakeReadGroupId(movieName, readType))
+    , movieName_(movieName)
+    , platformModel_(PlatformModelType::SEQUEL)
+    , readType_(readType)
+    , ipdCodec_(FrameCodec::V1)
+    , pulseWidthCodec_(FrameCodec::V1)
+{ }
+
+ReadGroupInfo::ReadGroupInfo(const std::string& movieName,
+                             const std::string& readType,
+                             const PlatformModelType platform)
+    : id_(MakeReadGroupId(movieName, readType))
+    , movieName_(movieName)
+    , platformModel_(platform)
+    , readType_(readType)
+    , ipdCodec_(FrameCodec::V1)
+    , pulseWidthCodec_(FrameCodec::V1)
+{ }
+
+ReadGroupInfo::ReadGroupInfo(const ReadGroupInfo& other)
+    : id_(other.id_)
+    , sequencingCenter_(other.sequencingCenter_)
+    , date_(other.date_)
+    , flowOrder_(other.flowOrder_)
+    , keySequence_(other.keySequence_)
+    , library_(other.library_)
+    , programs_(other.programs_)
+    , predictedInsertSize_(other.predictedInsertSize_)
+    , movieName_(other.movieName_)
+    , sample_(other.sample_)
+    , platformModel_(other.platformModel_)
+    , readType_(other.readType_)
+    , bindingKit_(other.bindingKit_)
+    , sequencingKit_(other.sequencingKit_)
+    , basecallerVersion_(other.basecallerVersion_)
+    , frameRateHz_(other.frameRateHz_)
+    , control_(other.control_)
+    , ipdCodec_(other.ipdCodec_)
+    , pulseWidthCodec_(other.pulseWidthCodec_)
+    , hasBarcodeData_(other.hasBarcodeData_)
+    , barcodeFile_(other.barcodeFile_)
+    , barcodeHash_(other.barcodeHash_)
+    , barcodeCount_(other.barcodeCount_)
+    , barcodeMode_(other.barcodeMode_)
+    , barcodeQuality_(other.barcodeQuality_)
+    , features_(other.features_)
+{  }
+
+ReadGroupInfo::ReadGroupInfo(ReadGroupInfo&& other)
+    : id_(std::move(other.id_))
+    , sequencingCenter_(std::move(other.sequencingCenter_))
+    , date_(std::move(other.date_))
+    , flowOrder_(std::move(other.flowOrder_))
+    , keySequence_(std::move(other.keySequence_))
+    , library_(std::move(other.library_))
+    , programs_(std::move(other.programs_))
+    , predictedInsertSize_(std::move(other.predictedInsertSize_))
+    , movieName_(std::move(other.movieName_))
+    , sample_(std::move(other.sample_))
+    , platformModel_(std::move(other.platformModel_))
+    , readType_(std::move(other.readType_))
+    , bindingKit_(std::move(other.bindingKit_))
+    , sequencingKit_(std::move(other.sequencingKit_))
+    , basecallerVersion_(std::move(other.basecallerVersion_))
+    , frameRateHz_(std::move(other.frameRateHz_))
+    , control_(std::move(other.control_))
+    , ipdCodec_(std::move(other.ipdCodec_))
+    , pulseWidthCodec_(std::move(other.pulseWidthCodec_))
+    , hasBarcodeData_(std::move(other.hasBarcodeData_))
+    , barcodeFile_(std::move(other.barcodeFile_))
+    , barcodeHash_(std::move(other.barcodeHash_))
+    , barcodeCount_(std::move(other.barcodeCount_))
+    , barcodeMode_(std::move(other.barcodeMode_))
+    , barcodeQuality_(std::move(other.barcodeQuality_))
+    , features_(std::move(other.features_))
+{ }
+
+ReadGroupInfo::~ReadGroupInfo(void) { }
+
+ReadGroupInfo& ReadGroupInfo::operator=(const ReadGroupInfo& other)
+{
+    id_ = other.id_;
+    sequencingCenter_ = other.sequencingCenter_;
+    date_ = other.date_;
+    flowOrder_ = other.flowOrder_;
+    keySequence_ = other.keySequence_;
+    library_ = other.library_;
+    programs_ = other.programs_;
+    platformModel_ = other.platformModel_;
+    predictedInsertSize_ = other.predictedInsertSize_;
+    movieName_ = other.movieName_;
+    sample_ = other.sample_;
+    readType_ = other.readType_;
+    bindingKit_ = other.bindingKit_;
+    sequencingKit_ = other.sequencingKit_;
+    basecallerVersion_ = other.basecallerVersion_;
+    frameRateHz_ = other.frameRateHz_;
+    control_ = other.control_;
+    ipdCodec_ = other.ipdCodec_;
+    pulseWidthCodec_ = other.pulseWidthCodec_;
+    hasBarcodeData_ = other.hasBarcodeData_;
+    barcodeFile_  = other.barcodeFile_;
+    barcodeHash_ = other.barcodeHash_;
+    barcodeCount_ = other.barcodeCount_;
+    barcodeMode_ = other.barcodeMode_;
+    barcodeQuality_ = other.barcodeQuality_;
+    features_ = other.features_;
+    return *this;
+}
+
+ReadGroupInfo& ReadGroupInfo::operator=(ReadGroupInfo&& other)
+{
+    id_ = std::move(other.id_);
+    sequencingCenter_ = std::move(other.sequencingCenter_);
+    date_ = std::move(other.date_);
+    flowOrder_ = std::move(other.flowOrder_);
+    keySequence_ = std::move(other.keySequence_);
+    library_ = std::move(other.library_);
+    programs_ = std::move(other.programs_);
+    platformModel_ = std::move(other.platformModel_);
+    predictedInsertSize_ = std::move(other.predictedInsertSize_);
+    movieName_ = std::move(other.movieName_);
+    sample_ = std::move(other.sample_);
+    readType_ = std::move(other.readType_);
+    bindingKit_ = std::move(other.bindingKit_);
+    sequencingKit_ = std::move(other.sequencingKit_);
+    basecallerVersion_ = std::move(other.basecallerVersion_);
+    frameRateHz_ = std::move(other.frameRateHz_);
+    control_ = std::move(other.control_);
+    ipdCodec_ = std::move(other.ipdCodec_);
+    pulseWidthCodec_ = std::move(other.pulseWidthCodec_);
+    hasBarcodeData_ = std::move(other.hasBarcodeData_);
+    barcodeFile_  = std::move(other.barcodeFile_);
+    barcodeHash_ = std::move(other.barcodeHash_);
+    barcodeCount_ = std::move(other.barcodeCount_);
+    barcodeMode_ = std::move(other.barcodeMode_);
+    barcodeQuality_ = std::move(other.barcodeQuality_);
+    features_ = std::move(other.features_);
+    return *this;
+}
+
+void ReadGroupInfo::DecodeSamDescription(const std::string& description)
+{
+    // split on semicolons
+    // for each, split on equal
+    //    determine name ->
+
+    auto tokens = internal::Split(description, ';');
+    if (tokens.empty())
+        return;
+
+    bool hasBarcodeFile = false;
+    bool hasBarcodeHash = false;
+    bool hasBarcodeCount = false;
+    bool hasBarcodeMode = false;
+    bool hasBarcodeQuality = false;
+
+    // iterate over tokens
+    for (auto&& token : tokens) {
+
+        const auto foundEqual = token.find('=');
+        if (foundEqual == string::npos)
+            continue;
+
+        const auto key = token.substr(0,foundEqual);
+        const auto value = token.substr(foundEqual+1);
+
+        // 'mandatory' items
+        if      (key == internal::token_RT) readType_ = value;
+        else if (key == internal::token_BK) bindingKit_ = value;
+        else if (key == internal::token_BV) basecallerVersion_ = value;
+        else if (key == internal::token_SK) sequencingKit_ = value;
+        else if (key == internal::token_FR) frameRateHz_ = value;
+        else if (key == internal::token_CT) control_ = (value == "TRUE");
+
+        // base features
+        else if (internal::IsBaseFeature(key))
+            features_[internal::BaseFeatureFromName(key)] = value;
+
+        // barcode data
+        else if (internal::IsLikelyBarcodeKey(key)) {
+            if (key == internal::token_BF) {
+                barcodeFile_ = value;
+                hasBarcodeFile = true;
+            }
+            else if (key == internal::token_BH) {
+                barcodeHash_ = value;
+                hasBarcodeHash = true;
+            }
+            else if (key == internal::token_BC) {
+                barcodeCount_ = static_cast<size_t>(std::stoul(value));
+                hasBarcodeCount = true;
+            }
+            else if (key == internal::token_BM) {
+                barcodeMode_ = internal::BarcodeModeFromName(value);
+                hasBarcodeMode = true;
+            }
+            else if (key == internal::token_BQ) {
+                barcodeQuality_ = internal::BarcodeQualityFromName(value);
+                hasBarcodeQuality = true;
+            }
+        }
+
+        // frame codecs
+        else {
+            const auto keyParts = internal::Split(key, ':');
+            if (keyParts.size() == 2) {
+                const auto& subkey = keyParts.at(0);
+                if (subkey == internal::feature_IP) {
+                    ipdCodec_ = internal::FrameCodecFromName(keyParts.at(1));
+                    features_[BaseFeature::IPD] = value;
+                } 
+                else if (subkey == internal::feature_PW) {
+                    pulseWidthCodec_ = internal::FrameCodecFromName(keyParts.at(1));
+                    features_[BaseFeature::PULSE_WIDTH] = value;
+                }
+            }
+        }
+    }
+
+    hasBarcodeData_ = (hasBarcodeFile  &&
+                       hasBarcodeHash  &&
+                       hasBarcodeCount &&
+                       hasBarcodeMode  &&
+                       hasBarcodeQuality);
+}
+
+std::string ReadGroupInfo::EncodeSamDescription(void) const
+{
+    auto result = string{ };
+    result.reserve(256);
+    result.append(std::string(internal::token_RT+"=" + readType_));
+
+    static const auto SEP   = string{";"};
+    static const auto COLON = string{":"};
+    static const auto EQ    = string{"="};
+
+    auto featureName = string{ };
+    const auto featureEnd = features_.cend();
+    auto featureIter = features_.cbegin();
+    for ( ; featureIter != featureEnd; ++featureIter ) {
+        featureName = internal::BaseFeatureName(featureIter->first);
+        if (featureName.empty() || featureIter->second.empty())
+            continue;
+        else if (featureName == internal::feature_IP) {
+            featureName.append(COLON);
+            featureName.append(internal::FrameCodecName(ipdCodec_));
+        }
+        else if (featureName == internal::feature_PW) {
+            featureName.append(COLON);
+            featureName.append(internal::FrameCodecName(pulseWidthCodec_));
+        }
+        result.append(string(SEP + featureName + EQ + featureIter->second));
+    }
+
+    if (!bindingKit_.empty())        result.append(SEP + internal::token_BK +EQ + bindingKit_);
+    if (!sequencingKit_.empty())     result.append(SEP + internal::token_SK +EQ + sequencingKit_);
+    if (!basecallerVersion_.empty()) result.append(SEP + internal::token_BV +EQ + basecallerVersion_);
+    if (!frameRateHz_.empty())       result.append(SEP + internal::token_FR +EQ + frameRateHz_);
+    if (control_)                    result.append(SEP + internal::token_CT +EQ + (control_ ? "TRUE"
+                                                                                            : "FALSE"));
+
+    if (hasBarcodeData_) {
+        const auto barcodeData =
+            string {
+                SEP + internal::token_BF + EQ + barcodeFile_ +
+                SEP + internal::token_BH + EQ + barcodeHash_ +
+                SEP + internal::token_BC + EQ + std::to_string(barcodeCount_) +
+                SEP + internal::token_BM + EQ + internal::BarcodeModeName(barcodeMode_) +
+                SEP + internal::token_BQ + EQ + internal::BarcodeQualityName(barcodeQuality_)
+            };
+        result.reserve(result.size() + barcodeData.size());
+        result.append(barcodeData);
+    }
+
+    return result;
+}
+
+ReadGroupInfo ReadGroupInfo::FromSam(const string& sam)
+{
+    // pop off '@RG\t', then split rest of line into tokens
+    const auto tokens = internal::Split(sam.substr(4), '\t');
+    if (tokens.empty())
+        return ReadGroupInfo{ };
+
+    auto rg = ReadGroupInfo{ };
+    auto custom = map<string, string>{ };
+
+    for (auto&& token : tokens) {
+        const auto tokenTag   = token.substr(0,2);
+        const auto tokenValue = token.substr(3);
+
+        // set read group info
+        if      (tokenTag == internal::sam_ID) rg.Id(tokenValue);
+        else if (tokenTag == internal::sam_CN) rg.SequencingCenter(tokenValue);
+        else if (tokenTag == internal::sam_DT) rg.Date(tokenValue);
+        else if (tokenTag == internal::sam_FO) rg.FlowOrder(tokenValue);
+        else if (tokenTag == internal::sam_KS) rg.KeySequence(tokenValue);
+        else if (tokenTag == internal::sam_LB) rg.Library(tokenValue);
+        else if (tokenTag == internal::sam_PG) rg.Programs(tokenValue);
+        else if (tokenTag == internal::sam_PI) rg.PredictedInsertSize(tokenValue);
+        else if (tokenTag == internal::sam_PU) rg.MovieName(tokenValue);
+        else if (tokenTag == internal::sam_SM) rg.Sample(tokenValue);
+        else if (tokenTag == internal::sam_DS) rg.DecodeSamDescription(tokenValue);
+        else if (tokenTag == internal::sam_PM) rg.PlatformModel(internal::PlatformModelFromName(tokenValue));
+
+        // otherwise, "custom" tag
+        else
+            custom[tokenTag] = tokenValue;
+    }
+    rg.CustomTags(custom);
+
+    return rg;
+}
+
+string ReadGroupInfo::IntToId(const int32_t id)
+{
+    stringstream s;
+    s << std::setfill('0') << std::setw(8) << std::hex << id;
+    return s.str();
+}
+
+ReadGroupInfo& ReadGroupInfo::IpdCodec(const FrameCodec& codec,
+                                       const string& tag)
+{
+    // store desired codec type
+    ipdCodec_ = codec;
+
+    // update base features map
+    auto actualTag = tag;
+    if (actualTag.empty())
+        actualTag = "ip";
+    BaseFeatureTag(BaseFeature::IPD, actualTag);
+    return *this;
+}
+
+ReadGroupInfo& ReadGroupInfo::PulseWidthCodec(const FrameCodec& codec,
+                                              const string& tag)
+{
+    // store desired codec type
+    pulseWidthCodec_ = codec;
+
+    // update base features map
+    auto actualTag = tag;
+    if (actualTag.empty())
+        actualTag = "pw";
+    BaseFeatureTag(BaseFeature::PULSE_WIDTH, actualTag);
+    return *this;
+}
+
+string ReadGroupInfo::SequencingChemistryFromTriple(const string& bindingKit,
+                                                    const string& sequencingKit,
+                                                    const string& basecallerVersion)
+{
+    const auto verFields = internal::Split(basecallerVersion, '.');
+    if (verFields.size() < 2)
+        throw std::runtime_error("basecaller version too short: " + basecallerVersion);
+    const string ver = verFields.at(0) + "." + verFields.at(1);
+//    const string ver{ basecallerVersion.substr(0, 3) };
+    for (const auto& row : internal::ChemistryTable) {
+        if (bindingKit == row[0] && sequencingKit == row[1] && ver == row[2])
+            return row[3];
+    }
+
+    // not found
+    throw InvalidSequencingChemistryException(bindingKit,
+                                              sequencingKit,
+                                              basecallerVersion);
+}
+
+std::string ReadGroupInfo::ToSam(void) const
+{
+    stringstream out;
+    out << "@RG"
+        << internal::MakeSamTag(internal::sam_ID, id_)
+        << internal::MakeSamTag(internal::sam_PL, Platform());
+
+    auto description = EncodeSamDescription();
+    if (!description.empty())
+        out << internal::MakeSamTag(internal::sam_DS, description);
+
+    if (!sequencingCenter_.empty())    out << internal::MakeSamTag(internal::sam_CN, sequencingCenter_);
+    if (!date_.empty())                out << internal::MakeSamTag(internal::sam_DT, date_);
+    if (!flowOrder_.empty())           out << internal::MakeSamTag(internal::sam_FO, flowOrder_);
+    if (!keySequence_.empty())         out << internal::MakeSamTag(internal::sam_KS, keySequence_);
+    if (!library_.empty())             out << internal::MakeSamTag(internal::sam_LB, library_);
+    if (!programs_.empty())            out << internal::MakeSamTag(internal::sam_PG, programs_);
+    if (!predictedInsertSize_.empty()) out << internal::MakeSamTag(internal::sam_PI, predictedInsertSize_);
+    if (!movieName_.empty())           out << internal::MakeSamTag(internal::sam_PU, movieName_);
+    if (!sample_.empty())              out << internal::MakeSamTag(internal::sam_SM, sample_);
+
+    out << internal::MakeSamTag(internal::sam_PM, internal::PlatformModelName(platformModel_));
+
+    // append any custom tags
+    auto customIter = custom_.cbegin();
+    auto customEnd  = custom_.cend();
+    for ( ; customIter != customEnd; ++customIter )
+        out << internal::MakeSamTag(customIter->first, customIter->second);
+
+    return out.str();
+}
+
+std::string MakeReadGroupId(const std::string& movieName,
+                            const std::string& readType)
+{
+/*{
+    MD5_CTX md5;
+    unsigned char digest[16];
+    char hexdigest[9];
+
+    MD5_Init(&md5);
+    MD5_Update(&md5, reinterpret_cast<void*>(const_cast<char*>(movieName.c_str())), movieName.size());
+    MD5_Update(&md5, reinterpret_cast<void*>(const_cast<char*>("//")), 2);
+    MD5_Update(&md5, reinterpret_cast<void*>(const_cast<char*>(readType.c_str())), readType.size());
+    MD5_Final(digest, &md5);
+
+    for (int i = 0; i < 4; ++i)
+        sprintf(&hexdigest[2*i], "%02x", digest[i]);
+
+    return std::string{hexdigest, 8};
+*/
+    return MD5Hash(movieName + "//" + readType).substr(0,8);
+}
+
+bool ReadGroupInfo::operator==(const ReadGroupInfo& other) const
+{
+    return id_ == other.id_ 
+            && sequencingCenter_ == other.sequencingCenter_        
+            && date_ == other.date_                    
+            && flowOrder_ == other.flowOrder_               
+            && keySequence_ == other.keySequence_             
+            && library_ == other.library_                 
+            && programs_ == other.programs_                
+            && platformModel_ == other.platformModel_                
+            && predictedInsertSize_ == other.predictedInsertSize_     
+            && movieName_ == other.movieName_               
+            && sample_ == other.sample_                  
+            && readType_ == other.readType_ 
+            && bindingKit_ == other.bindingKit_ 
+            && sequencingKit_ == other.sequencingKit_ 
+            && basecallerVersion_ == other.basecallerVersion_ 
+            && frameRateHz_ == other.frameRateHz_ 
+            && control_ == other.control_ 
+            && ipdCodec_ == other.ipdCodec_
+            && pulseWidthCodec_ == other.pulseWidthCodec_
+            && hasBarcodeData_ == other.hasBarcodeData_
+            && barcodeFile_ == other.barcodeFile_
+            && barcodeHash_ == other.barcodeHash_
+            && barcodeCount_ == other.barcodeCount_
+            && barcodeMode_ == other.barcodeMode_
+            && barcodeQuality_ == other.barcodeQuality_
+            && features_.size() == other.features_.size()
+            && std::equal(features_.cbegin(),
+                          features_.cend(),
+                          other.features_.cbegin())
+            && custom_.size() == other.custom_.size()
+            && std::equal(custom_.begin(),
+                          custom_.end(),
+                          other.custom_.cbegin());
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/src/SamTagCodec.cpp b/src/SamTagCodec.cpp

new file mode 100644 (file)

index 0000000..43064b8
--- /dev/null
+++ b/src/SamTagCodec.cpp
@@ -0,0 +1,300 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file SamTagCodec.h
+/// \brief Implements the SamTagCodec class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/SamTagCodec.h"
+#include "AssertUtils.h"
+#include <boost/lexical_cast.hpp>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+template<typename T>
+inline void appendSamValue(const T& value,
+                           string& result,
+                           bool force8BitInt = false)
+{
+    if (force8BitInt)
+        result.append(boost::lexical_cast<string>(static_cast<int>(value)));
+    else
+        result.append(boost::lexical_cast<string>(value));
+}
+
+template<typename T>
+void appendSamMultiValue(const T& container,
+                         string& result,
+                         bool force8BitInt = false)
+{
+    auto end = container.cend();
+    for (auto iter = container.cbegin(); iter != end; ++iter) {
+        result.append(1, ',');
+        if ( force8BitInt )
+            result.append(boost::lexical_cast<string>(static_cast<int>(*iter)));
+        else
+            result.append(boost::lexical_cast<string>(*iter));
+    }
+}
+
+static
+vector<string>& split(const string& s, char delim, vector<string>& elems)
+{
+    stringstream ss(s);
+    string item;
+    while (getline(ss, item, delim))
+        elems.push_back(item);
+    return elems;
+}
+
+static
+vector<string> split(const string& s, char delim) {
+    vector<string> elems;
+    split(s, delim, elems);
+    return elems;
+}
+
+vector<float> readFloatSamMultiValue(const string& data)
+{
+    vector<float> result;
+    char* c = (char*)data.c_str();
+    const char* end = c + data.length();
+    while (c+1 < end) {
+        const float value = strtof(c+1, &c); // c+1 to skip comma
+        result.push_back(value);
+    }
+    return result;
+}
+
+template<typename T>
+vector<T> readSignedSamMultiValue(const string& data)
+{
+    vector<T> result;
+    char* c = (char*)data.c_str();
+    const char* end = c + data.length();
+    while (c+1 < end) {
+        const T value = strtol(c+1, &c, 0); // c+1 to skip comma
+        result.push_back(value);
+    }
+
+    return result;
+}
+
+template<typename T>
+vector<T> readUnsignedSamMultiValue(const string& data)
+{
+    vector<T> result;
+    char* c = (char*)data.c_str();
+    const char* end = c + data.length();
+    while (c+1 < end) {
+        const T value = strtoul(c+1, &c, 0); // c+1 to skip comma
+        result.push_back(value);
+    }
+    return result;
+}
+
+TagCollection SamTagCodec::Decode(const string& tagString)
+{
+    TagCollection tags;
+
+    const vector<string>& tokens = split(tagString, '\t');
+    const auto end = tokens.cend();
+    for (auto iter = tokens.cbegin(); iter != end; ++iter ) {
+        const string& token = (*iter);
+        if (token.size() < 6)          // TT:t:X
+            continue;
+
+        const string& name = token.substr(0, 2);
+        const char type = token.at(3);
+        const string& remainder = token.substr(5);
+        PB_ASSERT_OR_CONTINUE(!remainder.empty());
+
+        switch (type) {
+
+            // technically only 'A' is allowed in SAM chars,
+            // but we'll be a little permissive
+            case 'A' :
+            case 'a' :
+            {
+                tags[name] = Tag(static_cast<char>(remainder.at(0), TagModifier::ASCII_CHAR));
+                break;
+            }
+
+            // technically only 'i' is allowed in SAM ints, but we'll be a little
+            // permissive since SAM might be a bit more "user-edited" than BAM
+            case 'c' :
+            case 'C' :
+            case 's' :
+            case 'S' :
+            case 'i' :
+            case 'I' :
+            {
+                // check out boost::numeric cast for these conversions
+
+                // negative value (force signed int)
+                if (remainder.at(0) == '-') {
+                    const int32_t x = boost::lexical_cast<int32_t>(remainder);
+                    if ( x >= INT8_MIN )
+                        tags[name] = static_cast<int8_t>(x);
+                    else if ( x >= INT16_MIN )
+                        tags[name] = static_cast<int16_t>(x);
+                    else
+                        tags[name] = x;
+                }
+
+                // unsigned int
+                else {
+                    const uint32_t x = boost::lexical_cast<uint32_t>(remainder);
+                    if ( x <= UINT8_MAX )
+                        tags[name] = static_cast<uint8_t>(x);
+                    else if ( x <= UINT16_MAX )
+                        tags[name] = static_cast<uint16_t>(x);
+                    else
+                        tags[name] = x;
+                }
+                break;
+            }
+
+            case 'f' :
+            {
+                tags[name] = boost::lexical_cast<float>(remainder);
+                break;
+            }
+
+            case 'Z' :
+            {
+                tags[name] = remainder;
+                break;
+            }
+
+            case 'H' :
+            {
+                tags[name] = Tag(remainder, TagModifier::HEX_STRING);
+                break;
+            }
+
+            case 'B' :
+            {
+                const char elementType = remainder.at(0);
+                const string& arrayData = remainder.substr(1);
+                switch (elementType) {
+                    case 'c' : tags[name] = readSignedSamMultiValue<int8_t>(arrayData);     break;
+                    case 'C' : tags[name] = readUnsignedSamMultiValue<uint8_t>(arrayData);  break;
+                    case 's' : tags[name] = readSignedSamMultiValue<int16_t>(arrayData);    break;
+                    case 'S' : tags[name] = readUnsignedSamMultiValue<uint16_t>(arrayData); break;
+                    case 'i' : tags[name] = readSignedSamMultiValue<int32_t>(arrayData);    break;
+                    case 'I' : tags[name] = readUnsignedSamMultiValue<uint32_t>(arrayData); break;
+                    case 'f' : tags[name] = readFloatSamMultiValue(arrayData);              break;
+                    default:
+                        PB_ASSERT_OR_CONTINUE(false);
+                }
+                break;
+            }
+
+            // unsupported SAM tag type
+            default :
+                PB_ASSERT_OR_CONTINUE(false);
+        }
+    }
+
+    return tags;
+}
+
+string SamTagCodec::Encode(const TagCollection& tags)
+{
+    string result;
+    result.reserve(1024);
+
+    const auto tagEnd = tags.cend();
+    for (auto tagIter = tags.cbegin(); tagIter != tagEnd; ++tagIter) {
+        const string& name = (*tagIter).first;
+        const Tag& tag = (*tagIter).second;
+        PB_ASSERT_OR_CONTINUE(name.size() == 2);
+        if (tag.IsNull())
+            continue;
+
+        // tab separator
+        if (!result.empty())
+            result.append(1, '\t');
+
+        // "<TAG>:"
+        result.append(name);
+        result.append(1, ':');
+
+        // "<TYPE>:<DATA>" for printable, ASCII char
+        if (tag.HasModifier(TagModifier::ASCII_CHAR)) {
+            char c = tag.ToAscii();
+            if (c != '\0') {
+                result.append("A:");
+                result.append(1, c);
+                continue;
+            }
+        }
+
+        // "<TYPE>:<DATA>" for all other data
+        switch (tag.Type()) {
+            case TagDataType::INT8   : result.append("i:"); appendSamValue(tag.ToInt8(),   result, true); break;
+            case TagDataType::UINT8  : result.append("i:"); appendSamValue(tag.ToUInt8(),  result, true); break;
+            case TagDataType::INT16  : result.append("i:"); appendSamValue(tag.ToInt16(),  result); break;
+            case TagDataType::UINT16 : result.append("i:"); appendSamValue(tag.ToUInt16(), result); break;
+            case TagDataType::INT32  : result.append("i:"); appendSamValue(tag.ToInt32(),  result); break;
+            case TagDataType::UINT32 : result.append("i:"); appendSamValue(tag.ToUInt32(), result); break;
+            case TagDataType::FLOAT  : result.append("f:"); appendSamValue(tag.ToFloat(),  result); break;
+
+            case TagDataType::STRING :
+            {
+                result.append(tag.HasModifier(TagModifier::HEX_STRING) ? "H:" : "Z:");
+                result.append(tag.ToString());
+                break;
+            }
+
+            case TagDataType::INT8_ARRAY   : result.append("B:c"); appendSamMultiValue(tag.ToInt8Array(),   result, true); break;
+            case TagDataType::UINT8_ARRAY  : result.append("B:C"); appendSamMultiValue(tag.ToUInt8Array(),  result, true); break;
+            case TagDataType::INT16_ARRAY  : result.append("B:s"); appendSamMultiValue(tag.ToInt16Array(),  result); break;
+            case TagDataType::UINT16_ARRAY : result.append("B:S"); appendSamMultiValue(tag.ToUInt16Array(), result); break;
+            case TagDataType::INT32_ARRAY  : result.append("B:i"); appendSamMultiValue(tag.ToInt32Array(),  result); break;
+            case TagDataType::UINT32_ARRAY : result.append("B:I"); appendSamMultiValue(tag.ToUInt32Array(), result); break;
+            case TagDataType::FLOAT_ARRAY  : result.append("B:f"); appendSamMultiValue(tag.ToFloatArray(),  result); break;
+
+            default :
+                PB_ASSERT_OR_RETURN_VALUE(false, string());
+        }
+    }
+
+    return result;
+}
diff --git a/src/SamWriter.cpp b/src/SamWriter.cpp

new file mode 100644 (file)

index 0000000..38a28d2
--- /dev/null
+++ b/src/SamWriter.cpp
@@ -0,0 +1,142 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "pbbam/SamWriter.h"
+#include "pbbam/Validator.h"
+#include "FileProducer.h"
+#include "MemoryUtils.h"
+#include <htslib/hfile.h>
+#include <htslib/sam.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+class SamWriterPrivate : public internal::FileProducer
+{
+public:
+    SamWriterPrivate(const std::string& filename,
+                      const PBBAM_SHARED_PTR<bam_hdr_t> rawHeader)
+        : internal::FileProducer(filename)
+        , file_(nullptr)
+        , header_(rawHeader)
+    {
+        if (!header_)
+            throw std::runtime_error("null header");
+
+        // open file
+        const string& usingFilename = TempFilename();
+        const string& mode = string("w");
+        file_.reset(sam_open(usingFilename.c_str(), mode.c_str()));
+        if (!file_)
+            throw std::runtime_error("could not open file for writing");
+
+        // write header
+        const int ret = sam_hdr_write(file_.get(), header_.get());
+        if (ret != 0)
+            throw std::runtime_error("could not write header");
+    }
+
+    void TryFlush(void);
+    void Write(const BamRecord& record);
+
+private:
+    std::unique_ptr<samFile, internal::HtslibFileDeleter> file_;
+    PBBAM_SHARED_PTR<bam_hdr_t> header_;
+};
+
+void SamWriterPrivate::TryFlush(void)
+{
+    const auto ret = file_.get()->fp.hfile;
+    if (ret != 0)
+        throw std::runtime_error("could not flush output buffer contents");
+}
+
+void SamWriterPrivate::Write(const BamRecord& record)
+{
+#if PBBAM_AUTOVALIDATE
+    Validator::Validate(record);
+#endif
+
+    const auto rawRecord = internal::BamRecordMemory::GetRawData(record);
+
+    // store bin number
+    // min_shift=14 & n_lvls=5 are SAM/BAM "magic numbers"
+    rawRecord->core.bin = hts_reg2bin(rawRecord->core.pos,
+                                      bam_endpos(rawRecord.get()), 14, 5);
+
+    // write record to file
+    const int ret = sam_write1(file_.get(), header_.get(), rawRecord.get());
+    if (ret <= 0)
+        throw std::runtime_error("could not write record");
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+SamWriter::SamWriter(const string& filename, const BamHeader& header)
+    : IRecordWriter()
+    , d_(nullptr)
+{
+#if PBBAM_AUTOVALIDATE
+    Validator::Validate(header);
+#endif
+    d_.reset(new internal::SamWriterPrivate{ filename,
+                                             internal::BamHeaderMemory::MakeRawHeader(header)
+                                           });
+}
+
+SamWriter::~SamWriter(void) { }
+
+void SamWriter::TryFlush(void)
+{
+    d_->TryFlush();
+}
+
+void SamWriter::Write(const BamRecord& record)
+{
+    d_->Write(record);
+}
+
+void SamWriter::Write(const BamRecordImpl& recordImpl)
+{
+    d_->Write( BamRecord{recordImpl} );
+}
diff --git a/src/SequenceInfo.cpp b/src/SequenceInfo.cpp

new file mode 100644 (file)

index 0000000..43e4343
--- /dev/null
+++ b/src/SequenceInfo.cpp
@@ -0,0 +1,176 @@
+
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file SequenceInfo.cpp
+/// \brief Implements the SequenceInfo class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/SequenceInfo.h"
+#include "SequenceUtils.h"
+#include <sstream>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+static string token_SN = string("SN");
+static string token_LN = string("LN");
+static string token_AS = string("AS");
+static string token_M5 = string("M5");
+static string token_SP = string("SP");
+static string token_UR = string("UR");
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+SequenceInfo::SequenceInfo(void) { }
+
+SequenceInfo::SequenceInfo(const std::string& name,
+                           const std::string& length)
+    : name_(name)
+    , length_(length)
+{ }
+
+SequenceInfo::SequenceInfo(const SequenceInfo& other)
+    : name_(other.name_)
+    , length_(other.length_)
+    , assemblyId_(other.assemblyId_)
+    , checksum_(other.checksum_)
+    , species_(other.species_)
+    , uri_(other.uri_)
+{  }
+
+SequenceInfo::SequenceInfo(SequenceInfo&& other)
+    : name_(std::move(other.name_))
+    , length_(std::move(other.length_))
+    , assemblyId_(std::move(other.assemblyId_))
+    , checksum_(std::move(other.checksum_))
+    , species_(std::move(other.species_))
+    , uri_(std::move(other.uri_))
+{ }
+
+SequenceInfo::~SequenceInfo(void) { }
+
+SequenceInfo& SequenceInfo::operator=(const SequenceInfo& other)
+{
+    name_ = other.name_;
+    length_ = other.length_;
+    assemblyId_ = other.assemblyId_;
+    checksum_ = other.checksum_;
+    species_ = other.species_;
+    uri_ = other.uri_;
+    return *this;
+}
+
+SequenceInfo& SequenceInfo::operator=(SequenceInfo&& other)
+{
+    name_ = std::move(other.name_);
+    length_ = std::move(other.length_);
+    assemblyId_ = std::move(other.assemblyId_);
+    checksum_ = std::move(other.checksum_);
+    species_ = std::move(other.species_);
+    uri_ = std::move(other.uri_);
+    return *this;
+}
+
+SequenceInfo SequenceInfo::FromSam(const std::string& sam)
+{
+    // pop off '@SQ\t', then split rest of line into tokens
+    const vector<string>& tokens = internal::Split(sam.substr(4), '\t');
+    if (tokens.empty())
+        return SequenceInfo();
+
+    SequenceInfo seq;
+    map<string, string> custom;
+
+    // iterate over tokens
+    for (const string& token : tokens) {
+        const string& tokenTag   = token.substr(0,2);
+        const string& tokenValue = token.substr(3);
+
+        // set sequence info
+        if      (tokenTag == internal::token_SN) seq.Name(tokenValue);
+        else if (tokenTag == internal::token_LN) seq.Length(tokenValue);
+        else if (tokenTag == internal::token_AS) seq.AssemblyId(tokenValue);
+        else if (tokenTag == internal::token_M5) seq.Checksum(tokenValue);
+        else if (tokenTag == internal::token_SP) seq.Species(tokenValue);
+        else if (tokenTag == internal::token_UR) seq.Uri(tokenValue);
+
+        // otherwise, "custom" tag
+        else
+            custom[tokenTag] = tokenValue;
+    }
+
+    seq.CustomTags(custom);
+    return seq;
+}
+
+bool SequenceInfo::IsValid(void) const
+{
+    if (name_.empty())
+        return false;
+
+    // use long instead of int32_t, just to make sure we can catch overflow
+    const long l = atol(length_.c_str());
+    return l >= 0 && l <= INT32_MAX;
+}
+
+std::string SequenceInfo::ToSam(void) const
+{
+    stringstream out;
+    out << "@SQ"
+        << internal::MakeSamTag(internal::token_SN, name_);
+
+    if (!length_.empty())     out << internal::MakeSamTag(internal::token_LN, length_);
+    if (!assemblyId_.empty()) out << internal::MakeSamTag(internal::token_AS, assemblyId_);
+    if (!checksum_.empty())   out << internal::MakeSamTag(internal::token_M5, checksum_);
+    if (!species_.empty())    out << internal::MakeSamTag(internal::token_SP, species_);
+    if (!uri_.empty())        out << internal::MakeSamTag(internal::token_UR, uri_);
+
+    // append any custom tags
+    map<string, string>::const_iterator customIter = custom_.cbegin();
+    map<string, string>::const_iterator customEnd  = custom_.cend();
+    for ( ; customIter != customEnd; ++customIter )
+        out << internal::MakeSamTag(customIter->first, customIter->second);
+
+    return out.str();
+}
diff --git a/src/SequenceUtils.h b/src/SequenceUtils.h

new file mode 100644 (file)

index 0000000..14ad898
--- /dev/null
+++ b/src/SequenceUtils.h
@@ -0,0 +1,145 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef SEQUENCEUTILS_H
+#define SEQUENCEUTILS_H
+
+#include "StringUtils.h"
+#include <algorithm>
+#include <string>
+#include <ctype.h>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+inline char Complement(const char character)
+{
+    static char const complementLookup[] =
+    {
+        '\0', 'T', 'V', 'G', 'H',
+        '\0', '\0', 'C', 'D', '\0',
+        '\0', 'M', '\0', 'K', 'N',
+        '\0', '\0', '\0', 'Y', 'S',
+        'A', 'A', 'B', 'W', '\0', 'R'
+    };
+    if (character == '-' || character == '*')
+        return character;
+    return complementLookup[toupper(character) & 0x1f];
+}
+
+//inline void Reverse(std::string& s)
+//{ std::reverse(s.begin(), s.end()); }
+
+template<typename T>
+void Reverse(T& input)
+{ std::reverse(input.begin(), input.end()); }
+
+template<typename T>
+T MaybeReverse(T&& input, bool reverse)
+{ 
+    if (reverse) std::reverse(input.begin(), input.end()); 
+    return input;
+}
+
+template<typename T>
+T Reversed(const T& input)
+{
+    T result = input;
+    Reverse(result);
+    return result;
+}
+
+//inline std::string Reversed(const std::string& input)
+//{
+//    std::string result = input;
+//    Reverse(result);
+//    return result;
+//}
+
+inline void ReverseComplement(std::string& seq) {
+
+    std::string::iterator sIter = seq.begin();
+    std::string::iterator sEnd  = seq.end();
+    for ( ; sIter != sEnd; ++sIter )
+        *sIter = Complement(*sIter);
+    Reverse(seq);
+}
+
+inline std::string MaybeReverseComplement(std::string&& seq, bool reverse)
+{
+    if (reverse) ReverseComplement(seq);
+    return seq;
+}
+
+/// Reverse complement a DNA sequence case-sensitive
+inline void ReverseComplementCaseSens(std::string& seq)
+{
+    const std::string original = seq;
+    int8_t rc_table[128] = {
+        4, 4, 4,   4,  4,   4, 4, 4,  4,  4,  4,  4, 4, 4,  4,  4, 4, 4, 4,
+        4, 4, 4,   4,  4,   4, 4, 4,  4,  4,  4,  4, 4, 32, 4,  4, 4, 4, 4,
+        4, 4, 4,   4,  42,  4, 4, 45, 4,  4,  4,  4, 4, 4,  4,  4, 4, 4, 4,
+        4, 4, 4,   4,  4,   4, 4, 4,  84, 4,  71, 4, 4, 4,  67, 4, 4, 4, 4,
+        4, 4, 78,  4,  4,   4, 4, 4,  65, 65, 4,  4, 4, 4,  4,  4, 4, 4, 4,
+        4, 4, 116, 4,  103, 4, 4, 4,  99, 4,  4,  4, 4, 4,  4,  4, 4, 4, 4,
+        4, 4, 97,  97, 4,   4, 4, 4,  4,  4,  4,  4, 4, 4};
+    std::string reverseCompl(original.length(), 'N');
+    for (uint32_t i = 0; i < original.length(); ++i)
+        reverseCompl[original.length()-i-1] = (char)rc_table[(int8_t)original[i]];
+    seq = reverseCompl;
+}
+
+inline std::string MaybeReverseComplementCaseSens(std::string&& seq, bool reverse)
+{
+    if (reverse) ReverseComplementCaseSens(seq);
+    return seq;
+}
+
+
+inline std::string ReverseComplemented(const std::string& input)
+{
+    std::string result = input;
+    ReverseComplement(result);
+    return result;
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // SEQUENCEUTILS_H
diff --git a/src/StringUtils.h b/src/StringUtils.h

new file mode 100644 (file)

index 0000000..24562fb
--- /dev/null
+++ b/src/StringUtils.h
@@ -0,0 +1,74 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef STRINGUTILS_H
+#define STRINGUTILS_H
+
+#include <algorithm>
+#include <exception>
+#include <ios>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+inline std::string MakeSamTag(const std::string& tag,
+                              const std::string& value)
+{
+    return std::string('\t' + tag + ':' + value);
+}
+
+inline std::vector<std::string> Split(const std::string& line,
+                                      const char delim = '\t')
+{
+    std::vector<std::string> tokens;
+    std::stringstream lineStream(line);
+    std::string token;
+    while (std::getline(lineStream, token, delim))
+        tokens.push_back(token);
+    return tokens;
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // STRINGUTILS_H
diff --git a/src/SubreadLengthQuery.cpp b/src/SubreadLengthQuery.cpp

new file mode 100644 (file)

index 0000000..1c7ce41
--- /dev/null
+++ b/src/SubreadLengthQuery.cpp
@@ -0,0 +1,71 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file SubreadLengthQuery.cpp
+/// \brief Implements the SubreadLengthQuery class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/SubreadLengthQuery.h"
+#include "pbbam/PbiFilterTypes.h"
+#include "pbbam/CompositeBamReader.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+struct SubreadLengthQuery::SubreadLengthQueryPrivate
+{
+    SubreadLengthQueryPrivate(const int32_t length,
+                              const Compare::Type compareType,
+                              const DataSet& dataset)
+        : reader_(PbiQueryLengthFilter(length, compareType), dataset)
+    { }
+
+    PbiFilterCompositeBamReader<Compare::None> reader_; // unsorted
+};
+
+SubreadLengthQuery::SubreadLengthQuery(const int32_t length,
+                                       const Compare::Type compareType,
+                                       const DataSet& dataset)
+    : internal::IQuery()
+    , d_(new SubreadLengthQueryPrivate(length, compareType, dataset))
+{ }
+
+SubreadLengthQuery::~SubreadLengthQuery(void) { }
+
+bool SubreadLengthQuery::GetNext(BamRecord &r)
+{ return d_->reader_.GetNext(r); }
diff --git a/src/Tag.cpp b/src/Tag.cpp

new file mode 100644 (file)

index 0000000..5c51321
--- /dev/null
+++ b/src/Tag.cpp
@@ -0,0 +1,124 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Tag.cpp
+/// \brief Defines the Tag class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Tag.h"
+#include <stdexcept>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+Tag::Tag(void)           : data_(),      modifier_(TagModifier::NONE) { }
+Tag::Tag(int8_t value)   : data_(value), modifier_(TagModifier::NONE) { }
+Tag::Tag(uint8_t value)  : data_(value), modifier_(TagModifier::NONE) { }
+Tag::Tag(int16_t value)  : data_(value), modifier_(TagModifier::NONE) { }
+Tag::Tag(uint16_t value) : data_(value), modifier_(TagModifier::NONE) { }
+Tag::Tag(int32_t value)  : data_(value), modifier_(TagModifier::NONE) { }
+Tag::Tag(uint32_t value) : data_(value), modifier_(TagModifier::NONE) { }
+Tag::Tag(float value)    : data_(value), modifier_(TagModifier::NONE) { }
+Tag::Tag(const std::string& value)      : data_(value), modifier_(TagModifier::NONE) { }
+Tag::Tag(const vector<int8_t>& value)   : data_(value), modifier_(TagModifier::NONE) { }
+Tag::Tag(const vector<uint8_t>& value)  : data_(value), modifier_(TagModifier::NONE) { }
+Tag::Tag(const vector<int16_t>& value)  : data_(value), modifier_(TagModifier::NONE) { }
+Tag::Tag(const vector<uint16_t>& value) : data_(value), modifier_(TagModifier::NONE) { }
+Tag::Tag(const vector<int32_t>& value)  : data_(value), modifier_(TagModifier::NONE) { }
+Tag::Tag(const vector<uint32_t>& value) : data_(value), modifier_(TagModifier::NONE) { }
+Tag::Tag(const vector<float>& value)    : data_(value), modifier_(TagModifier::NONE) { }
+
+Tag::Tag(int8_t value, const TagModifier mod)
+    : data_(value)
+    , modifier_(mod)
+{
+    if (mod == TagModifier::HEX_STRING)
+        throw runtime_error("HEX_STRING is not a valid tag modifier for int8_t data. "
+                            "It is intended for string-type data only.");
+}
+
+Tag::Tag(const std::string& value, const TagModifier mod)
+    : data_(value)
+    , modifier_(mod)
+{
+    if (mod == TagModifier::ASCII_CHAR)
+        throw runtime_error("ASCII_CHAR is not a valid tag modifier for string-type data. "
+                            "To construct an ASCII char tag, use a single-quoted value (e.g. 'X' instead of \"X\")");
+}
+
+Tag::Tag(const Tag& other)
+    : data_(other.data_)
+    , modifier_(other.modifier_)
+{ }
+
+Tag::Tag(Tag&& other)
+    : data_(std::move(other.data_))
+    , modifier_(std::move(other.modifier_))
+{ }
+
+Tag::~Tag(void) { }
+
+Tag& Tag::operator=(boost::blank value) { data_ = value; return *this; }
+Tag& Tag::operator=(int8_t value)   { data_ = value; return *this; }
+Tag& Tag::operator=(uint8_t value)  { data_ = value; return *this; }
+Tag& Tag::operator=(int16_t value)  { data_ = value; return *this; }
+Tag& Tag::operator=(uint16_t value) { data_ = value; return *this; }
+Tag& Tag::operator=(int32_t value)  { data_ = value; return *this; }
+Tag& Tag::operator=(uint32_t value) { data_ = value; return *this; }
+Tag& Tag::operator=(float value)    { data_ = value; return *this; }
+Tag& Tag::operator=(const std::string& value)      { data_ = value; return *this; }
+Tag& Tag::operator=(const vector<int8_t>& value)   { data_ = value; return *this; }
+Tag& Tag::operator=(const vector<uint8_t>& value)  { data_ = value; return *this; }
+Tag& Tag::operator=(const vector<int16_t>& value)  { data_ = value; return *this; }
+Tag& Tag::operator=(const vector<uint16_t>& value) { data_ = value; return *this; }
+Tag& Tag::operator=(const vector<int32_t>& value)  { data_ = value; return *this; }
+Tag& Tag::operator=(const vector<uint32_t>& value) { data_ = value; return *this; }
+Tag& Tag::operator=(const vector<float>& value)    { data_ = value; return *this; }
+
+Tag& Tag::operator=(const Tag& other)
+{
+    data_ = other.data_;
+    modifier_ = other.modifier_;
+    return *this;
+}
+
+Tag& Tag::operator=(Tag&& other)
+{
+    data_ = std::move(other.data_);
+    modifier_ = std::move(other.modifier_);
+    return *this;
+}
diff --git a/src/TagCollection.cpp b/src/TagCollection.cpp

new file mode 100644 (file)

index 0000000..98ed22b
--- /dev/null
+++ b/src/TagCollection.cpp
@@ -0,0 +1,50 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file TagCollection.cpp
+/// \brief Implements the TagCollection class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/TagCollection.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+bool TagCollection::Contains(const string& name) const
+{
+    return count(name) != 0;
+}
diff --git a/src/TimeUtils.h b/src/TimeUtils.h

new file mode 100644 (file)

index 0000000..b3fd75f
--- /dev/null
+++ b/src/TimeUtils.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef TIMEUTILS_H
+#define TIMEUTILS_H
+
+#include <chrono>
+#include <stdexcept>
+#include <string>
+#include <cassert>
+#include <ctime>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+inline
+std::string ToIso8601(const std::chrono::system_clock::time_point& tp)
+{
+    // get time info
+    const time_t ttime_t = std::chrono::system_clock::to_time_t(tp);
+    const std::chrono::system_clock::time_point tp_sec = std::chrono::system_clock::from_time_t(ttime_t);
+    const std::chrono::milliseconds ms = std::chrono::duration_cast<std::chrono::milliseconds>(tp - tp_sec);
+    const std::tm* ttm = gmtime(&ttime_t);  // static obj, no free needed (may not be thread-safe though)
+
+    // format output
+    char date_time_format[] = "%FT%T";
+    char date_time_str[50];
+    strftime(date_time_str, sizeof(date_time_str), date_time_format, ttm);
+    std::string result(date_time_str);
+    if (ms.count() > 0) {
+        result.append(".");
+        result.append(std::to_string(ms.count()));
+    }
+    result.append("Z");
+    return result;
+}
+
+inline
+std::string ToDataSetFormat(const std::chrono::system_clock::time_point& tp)
+{
+    // get time info
+    const time_t ttime_t = std::chrono::system_clock::to_time_t(tp);
+    const std::chrono::system_clock::time_point tp_sec = std::chrono::system_clock::from_time_t(ttime_t);
+    const std::chrono::milliseconds ms = std::chrono::duration_cast<std::chrono::milliseconds>(tp - tp_sec);
+    const std::tm* ttm = gmtime(&ttime_t);  // static obj, no free needed (may not be thread-safe though)
+
+    // format output
+    char date_time_format[] = "%y%m%d_%H%M%S";
+    char date_time_str[50];
+    strftime(date_time_str, sizeof(date_time_str), date_time_format, ttm);
+    std::string result(date_time_str);
+    if (ms.count() > 0)
+        result.append(std::to_string(ms.count()));
+    return result;
+}
+
+inline
+std::chrono::system_clock::time_point CurrentTime(void)
+{ return std::chrono::system_clock::now(); }
+
+} // namespace PacBio
+} // namespace BAM
+} // namespace internal
+
+#endif // TIMEUTILS_H
diff --git a/src/UnmappedReadsQuery.cpp b/src/UnmappedReadsQuery.cpp

new file mode 100644 (file)

index 0000000..9b60657
--- /dev/null
+++ b/src/UnmappedReadsQuery.cpp
@@ -0,0 +1,133 @@
+// Copyright (c) 2014, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+//#include "pbbam/UnmappedReadsQuery.h"
+//#include "pbbam/BamFile.h"
+//#include "MemoryUtils.h"
+
+//#include <iostream>
+
+//using namespace PacBio;
+//using namespace PacBio::BAM;
+//using namespace std;
+
+//UnmappedReadsQuery::UnmappedReadsQuery(const BamFile& file)
+//    : QueryBase()
+//{
+//    // open file
+//    file_.reset(sam_open(file.Filename().c_str(), "rb"), internal::HtslibFileDeleter());
+//    if (!file_) {
+//        error_ = UnmappedReadsQuery::FileOpenError;
+//        return;
+//    }
+
+//    // open index
+//    index_.reset(bam_index_load(file.Filename().c_str()), internal::HtslibIndexDeleter());
+//    if (!index_) {
+//        error_ = UnmappedReadsQuery::IndexFileOpenError;
+//        return;
+//    }
+
+//    // initialize query
+//    iterator_.reset(bam_itr_queryi(index_.get(), HTS_IDX_NOCOOR, 0, 0), internal::HtslibIteratorDeleter());
+//    if (iterator_) {
+
+//        cerr << endl
+//             << "UnmappedQueryReads::iterator" << endl
+//             << "read_rest: " << iterator_->read_rest << endl
+//             << "finished: " << iterator_->finished << endl
+//             << "dummy: " << iterator_->dummy << endl
+//             << "tid: " << iterator_->tid << endl
+//             << "beg: " << iterator_->beg << endl
+//             << "end: " << iterator_->end << endl
+//             << "n_off: " << iterator_->n_off << endl
+//             << "i: " << iterator_->i << endl
+//             << "curr_off: " << iterator_->curr_off << endl
+//             << endl;
+
+
+////        uint32_t read_rest:1, finished:1, dummy:29;
+////        int tid, beg, end, n_off, i;
+////        uint64_t curr_off;
+////        hts_pair64_t *off;
+////        hts_readrec_func *readrec;
+////        struct {
+////            int n, m;
+////            int *a;
+////        } bins;
+
+//    }
+//}
+
+//bool UnmappedReadsQuery::GetNext(BamRecord& record)
+//{
+//    if (error_ == UnmappedReadsQuery::NoError && iterator_) {
+//        const int result = bam_itr_next(file_.get(), iterator_.get(), record.RawData().get());
+//        if ( result > 0 )
+//            return true;
+//        else {
+//            cerr << "ERROR - result: " << result << endl;
+//            if ( result == -4 ) {
+
+//                bam1_t* b = record.RawData().get();
+//                bam1_core_t* c = &b->core;
+//                bool nonBgzfErrorFound = false;
+
+//                if (b->l_data < 0) {
+//                    cerr << "ERROR: bam1_t::l_data < 0" << endl;
+//                    nonBgzfErrorFound = true;
+//                }
+//                if (c->l_qseq < 0) {
+//                    cerr << "ERROR: bam1_t::core::l_qseq < 0" << endl;
+//                    nonBgzfErrorFound = true;
+//                }
+//                if (!b->data) {
+//                    cerr << "ERROR: bam1_t::data is null" << endl;
+//                    nonBgzfErrorFound = true;
+//                }
+//                if  (!nonBgzfErrorFound)
+//                    cerr << "ERROR: in bam_read1(), bgzf_read(fp, b->data, b->l_data) returned unexpected value" << endl;
+//            }
+//        }
+//    }
+//    else {
+//        cerr << "UnmappedReadsQuery::HasError() - " << Error() << endl;
+//    }
+
+
+//    return false;
+//}
diff --git a/src/ValidationErrors.cpp b/src/ValidationErrors.cpp

new file mode 100644 (file)

index 0000000..fe6e69c
--- /dev/null
+++ b/src/ValidationErrors.cpp
@@ -0,0 +1,144 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ValidationErrors.cpp
+/// \brief Implements the ValidationErrors class.
+//
+// Author: Derek Barnett
+
+#include "ValidationErrors.h"
+#include "pbbam/exception/ValidationException.h"
+#include "StringUtils.h"
+#include <sstream>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+const size_t ValidationErrors::MAX;
+
+ValidationErrors::ValidationErrors(const size_t maxNumErrors)
+    : maxNumErrors_(maxNumErrors)
+    , currentNumErrors_(0)
+{
+    if (maxNumErrors_ == 0)
+        maxNumErrors_ = ValidationErrors::MAX;
+}
+
+void ValidationErrors::AddFileError(const std::string& fn,
+                                    const std::string& details)
+{
+    string copy = details;
+    AddFileError(fn, std::move(copy));
+}
+
+void ValidationErrors::AddFileError(const std::string& fn,
+                                    std::string&& details)
+{
+    fileErrors_[fn].push_back(std::move(details));
+    OnErrorAdded();
+}
+
+void ValidationErrors::AddReadGroupError(const std::string& rg,
+                                         const std::string& details)
+{
+    string copy = details;
+    AddReadGroupError(rg, std::move(copy));
+}
+
+void ValidationErrors::AddReadGroupError(const std::string& rg,
+                                         std::string&& details)
+{
+    readGroupErrors_[rg].push_back(std::move(details));
+    OnErrorAdded();
+}
+
+void ValidationErrors::AddRecordError(const std::string& name,
+                                      const std::string& details)
+{
+    string copy = details;
+    AddRecordError(name, std::move(copy));
+}
+
+void ValidationErrors::AddRecordError(const std::string& name,
+                                      std::string&& details)
+{
+    recordErrors_[name].push_back(std::move(details));
+    OnErrorAdded();
+}
+
+void ValidationErrors::AddTagLengthError(const string& name,
+                                         const string& tagLabel,
+                                         const string& tagName,
+                                         const size_t observed,
+                                         const size_t expected)
+{
+    string copy  = tagLabel;
+    string copy2 = tagName;
+    AddTagLengthError(name, std::move(copy), std::move(copy2), observed, expected);
+}
+
+void ValidationErrors::AddTagLengthError(const string& name,
+                                         string&& tagLabel,
+                                         string&& tagName,
+                                         const size_t observed,
+                                         const size_t expected)
+{
+    // format
+    stringstream s;
+    s << tagLabel << " tag (" << tagName << ") length: " << observed
+      << ", does not match expected length: " << expected;
+    AddRecordError(name, s.str());
+}
+
+bool ValidationErrors::IsEmpty(void) const
+{
+    return currentNumErrors_ == 0;
+}
+
+void ValidationErrors::OnErrorAdded(void)
+{
+    ++currentNumErrors_;
+    if (currentNumErrors_ == maxNumErrors_)
+        ThrowErrors();
+}
+
+void ValidationErrors::ThrowErrors(void)
+{
+    throw ValidationException(std::move(fileErrors_),
+                              std::move(readGroupErrors_),
+                              std::move(recordErrors_));
+}
diff --git a/src/ValidationErrors.h b/src/ValidationErrors.h

new file mode 100644 (file)

index 0000000..af68ac6
--- /dev/null
+++ b/src/ValidationErrors.h
@@ -0,0 +1,115 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ValidationErrors.h
+/// \brief Defines the ValidationErrors class.
+//
+// Author: Derek Barnett
+
+#ifndef VALIDATIONERRORS_H
+#define VALIDATIONERRORS_H
+
+#include <limits>
+#include <map>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// The ValidationErrors class catches error messages accumulated during
+/// validation (see Validator).
+///
+/// Convenience methods are provided for different BAM components, to help
+/// format the displayed output.
+///
+/// A maximum number of errors can be provided at construction, and this class
+/// will automatially throw a ValidationException whenever that count is reached.
+/// Otherwise, the Validator will check IsEmpty() and call ThrowErrors() if true.
+///
+class ValidationErrors
+{
+public:
+    typedef std::vector<std::string>         ErrorList;
+    typedef std::map<std::string, ErrorList> ErrorMap;
+public:
+    static const size_t MAX = std::numeric_limits<size_t>::max();
+
+public:
+    ValidationErrors(const size_t maxNumErrors = ValidationErrors::MAX);
+
+public:
+    void AddFileError(const std::string& fn, const std::string& details);
+    void AddFileError(const std::string& fn, std::string&& details);
+
+    void AddReadGroupError(const std::string& rg, const std::string& details);
+    void AddReadGroupError(const std::string& rg, std::string&& details);
+
+    void AddRecordError(const std::string& name, const std::string& details);
+    void AddRecordError(const std::string& name, std::string&& details);
+
+    void AddTagLengthError(const std::string& name,
+                           const std::string& tagLabel,
+                           const std::string& tagName,
+                           const size_t observed,
+                           const size_t expected);
+    void AddTagLengthError(const std::string& name,
+                           std::string&& tagLabel,
+                           std::string&& tagName,
+                           const size_t observed,
+                           const size_t expected);
+
+public:
+    bool IsEmpty(void) const;
+    void ThrowErrors(void);
+
+private:
+    size_t maxNumErrors_;
+    size_t currentNumErrors_;
+    ErrorMap fileErrors_;
+    ErrorMap readGroupErrors_;
+    ErrorMap recordErrors_;
+
+private:
+    void OnErrorAdded(void);
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // VALIDATIONERRORS_H
diff --git a/src/ValidationException.cpp b/src/ValidationException.cpp

new file mode 100644 (file)

index 0000000..2f7c5bc
--- /dev/null
+++ b/src/ValidationException.cpp
@@ -0,0 +1,122 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ValidationException.cpp
+/// \brief Implements the ValidationException class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/exception/ValidationException.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+ValidationException::ValidationException(const ErrorMap& fileErrors,
+                                         const ErrorMap& readGroupErrors,
+                                         const ErrorMap& recordErrors)
+    : std::runtime_error("")
+    , fileErrors_(fileErrors)
+    , readGroupErrors_(readGroupErrors)
+    , recordErrors_(recordErrors)
+{
+    FormatMessage();
+}
+
+ValidationException::ValidationException(ErrorMap&& fileErrors,
+                                         ErrorMap&& readGroupErrors,
+                                         ErrorMap&& recordErrors)
+    : std::runtime_error("")
+    , fileErrors_(std::move(fileErrors))
+    , readGroupErrors_(std::move(readGroupErrors))
+    , recordErrors_(std::move(recordErrors))
+{
+    FormatMessage();
+}
+
+const ValidationException::ErrorMap& ValidationException::FileErrors(void) const
+{ return fileErrors_; }
+
+const ValidationException::ErrorMap& ValidationException::ReadGroupErrors(void) const
+{ return readGroupErrors_; }
+
+const ValidationException::ErrorMap& ValidationException::RecordErrors(void) const
+{ return recordErrors_; }
+
+const char* ValidationException::what(void) const noexcept
+{ return msg_.c_str(); }
+
+void ValidationException::FormatMessage(void)
+{
+    std::stringstream s;
+    s << "Validation failed: " << std::endl;
+
+    // file errors
+    if (!fileErrors_.empty()) {
+        auto fileIter = fileErrors_.cbegin();
+        auto fileEnd  = fileErrors_.cend();
+        for ( ; fileIter != fileEnd; ++fileIter) {
+            s << "  In file (" << fileIter->first << ") : " << std::endl;
+            const auto& errors = fileIter->second;
+            for (const auto& e : errors)
+                s << "    " << e << std::endl;
+        }
+    }
+
+    // read group errors
+    if (!readGroupErrors_.empty()) {
+        auto rgIter = readGroupErrors_.cbegin();
+        auto rgEnd  = readGroupErrors_.cend();
+        for ( ; rgIter != rgEnd; ++rgIter) {
+            s << "  In read group (" << rgIter->first << ") : " << std::endl;
+            const auto& errors = rgIter->second;
+            for (const auto& e : errors)
+                s << "    " << e << std::endl;
+        }
+    }
+
+    // record errors
+    if (!recordErrors_.empty()) {
+        auto recIter = recordErrors_.cbegin();
+        auto recEnd  = recordErrors_.cend();
+        for ( ; recIter != recEnd; ++recIter) {
+            s << "  In record (" << recIter->first << ") : " << std::endl;
+            const auto& errors = recIter->second;
+            for (const auto& e : errors)
+                s << "    " << e << std::endl;
+        }
+    }
+
+    msg_ = s.str();
+}
diff --git a/src/Validator.cpp b/src/Validator.cpp

new file mode 100644 (file)

index 0000000..158f466
--- /dev/null
+++ b/src/Validator.cpp
@@ -0,0 +1,470 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Validator.cpp
+/// \brief Implements the Validator class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Validator.h"
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/EntireFileQuery.h"
+#include "pbbam/ReadGroupInfo.h"
+#include "ValidationErrors.h"
+#include "Version.h"
+#include <boost/algorithm/string.hpp>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <stdexcept>
+#include <set>
+#include <vector>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+struct ilexcompare_wrapper {
+    bool operator()(const string& lhs, const string& rhs) const
+    { return boost::ilexicographical_compare(lhs, rhs); }
+};
+
+static const set<string, ilexcompare_wrapper> AcceptedSortOrders = {
+    "unknown",
+    "unsorted",
+    "queryname",
+    "coordinate"
+};
+
+static const set<string> AcceptedReadTypes = {
+    "POLYMERASE",
+    "HQREGION",
+    "SUBREAD",
+    "CCS",
+    "SCRAP",
+    "UNKNOWN"
+};
+
+static
+void ValidateReadGroup(const ReadGroupInfo& rg,
+                       unique_ptr<ValidationErrors>& errors)
+{
+    const string& id = rg.Id();
+
+    // has required fields
+    if (id.empty())
+        errors->AddReadGroupError(id, "missing ID");
+    if (rg.MovieName().empty())
+        errors->AddReadGroupError(id, "missing movie name (PU tag)");
+    // 3.0.2 adds required RG:PM - do not check for now, we'll add version-aware
+    // validation down the road
+
+    // description tag has required components
+    if (rg.ReadType().empty())
+        errors->AddReadGroupError(id, "missing READTYPE in description");
+    if (rg.BindingKit().empty())
+        errors->AddReadGroupError(id, "missing BINDINGKIT in description");
+    if (rg.SequencingKit().empty())
+        errors->AddReadGroupError(id, "missing SEQUENCINGKIT in description");
+    if (rg.BasecallerVersion().empty())
+        errors->AddReadGroupError(id, "missing BASECALLERVERSION in description");
+    if (rg.FrameRateHz().empty())
+        errors->AddReadGroupError(id, "missing FRAMERATEHZ in description");
+
+    // stored ID matches expected ID (as calculated from movie & type)
+    if (!id.empty()) {
+        const auto expectedId = MakeReadGroupId(rg.MovieName(), rg.ReadType());
+        if (expectedId != id) {
+            const string msg = "stored ID: " + id +
+                " does not match computed ID: " + expectedId;
+            errors->AddReadGroupError(id, std::move(msg));
+        }
+    }
+
+    // valid read type
+    if (!rg.ReadType().empty()) {
+        if (internal::AcceptedReadTypes.find(rg.ReadType()) == internal::AcceptedReadTypes.cend())
+            errors->AddReadGroupError(id, "read type: " + rg.ReadType() + " is unknown");
+    }
+
+    // valid read chemistry (binding, sequencing, chemistry)
+    if (!rg.BindingKit().empty() &&
+        !rg.SequencingKit().empty() &&
+        !rg.BasecallerVersion().empty())
+    {
+        try {
+            auto chem = rg.SequencingChemistry();
+            (void)chem;
+        } catch (std::exception& e) {
+            errors->AddReadGroupError(id, e.what());
+        }
+    }
+
+    // frame rate convertable to floating point
+    if (!rg.FrameRateHz().empty()) {
+        try {
+            const float frameRate = stof(rg.FrameRateHz());
+            (void)frameRate;
+        } catch (std::exception& e) {
+            errors->AddReadGroupError(id, e.what());
+        }
+    }
+}
+
+static
+void ValidateHeader(const BamHeader& header,
+                    const string& filename,
+                    unique_ptr<ValidationErrors>& errors)
+{
+    const string& fn = filename;
+
+    // SAM/BAM version
+    try {
+        Version v(header.Version());
+        (void)v;
+    } catch (std::exception& e) {
+        errors->AddFileError(fn, string("SAM version (@HD:VN) failed: ") + e.what());
+    }
+
+    // sort order
+    const string sortOrder = header.SortOrder();
+    if (AcceptedSortOrders.find(sortOrder) == AcceptedSortOrders.end())
+        errors->AddFileError(fn, string("unknown sort order: ") + sortOrder);
+
+    // PacBio version
+    try {
+        const Version v(header.PacBioBamVersion());
+        const Version minimum(3,0,1);
+        if (v < minimum) {
+            string msg = "PacBioBAM version (@HD:pb) ";
+            msg += v.ToString();
+            msg += string{ " is older than the minimum supported version " };
+            msg += ( "(" + minimum.ToString() + ")" );
+            errors->AddFileError(fn, std::move(msg));
+        }
+    } catch (std::exception& e) {
+        errors->AddFileError(fn, string("PacBioBAM version (@HD:pb) failed to parse: ") + e.what());
+    }
+
+    // sequences?
+
+    // read groups
+    for (const ReadGroupInfo& rg : header.ReadGroups() )
+        ValidateReadGroup(rg, errors);
+}
+
+static
+void ValidateMetadata(const BamFile& file,
+                      unique_ptr<ValidationErrors>& errors)
+{
+    // filename
+    const string fn = file.Filename();
+    if (fn == "-") {
+        errors->AddFileError(fn, "validation not is available for streamed BAM. Please "
+                                 "write to a file and run validation on it.");
+        errors->ThrowErrors(); // quit early
+    }
+    if (boost::algorithm::ends_with(fn, ".bam") ||
+        boost::algorithm::ends_with(fn, ".bam.tmp"))
+    {
+        errors->AddFileError(fn, "non-standard file extension");
+    }
+
+    // EOF
+    if (!file.HasEOF())
+        errors->AddFileError(fn, "missing end-of-file marker");
+
+    // has PBI
+    if (!file.PacBioIndexExists())
+        errors->AddFileError(fn, "missing PBI file");
+
+    // header
+    ValidateHeader(file.Header(), file.Filename(), errors);
+}
+
+void ValidateMappedRecord(const BamRecord& b,
+                          unique_ptr<ValidationErrors>& errors)
+{
+    const string& name = b.FullName();
+    if (b.ReferenceStart() < 0)
+        errors->AddRecordError(name, "mapped record position is invalid");
+    if (b.ReferenceId() < 0)
+        errors->AddRecordError(name, "mapped record reference ID is invalid");
+
+    // what else??
+}
+
+void ValidateRecordCore(const BamRecord& b,
+                        unique_ptr<ValidationErrors>& errors)
+{
+    const string& name = b.FullName();
+
+    if (b.Type() != RecordType::CCS) {
+        const auto qStart = b.QueryStart();
+        const auto qEnd   = b.QueryEnd();
+        if (qStart >= qEnd)
+            errors->AddRecordError(name, "queryStart (qs) should be < queryEnd (qe)");
+    }
+}
+
+void ValidateRecordReadGroup(const BamRecord& b,
+                             unique_ptr<ValidationErrors>& errors)
+{
+    try {
+        auto rg = b.ReadGroup();
+        (void)rg;
+    } catch (std::exception& e) {
+        errors->AddRecordError(b.FullName(), e.what());
+    }
+}
+
+void ValidateRecordRequiredTags(const BamRecord& b,
+                                unique_ptr<ValidationErrors>& errors)
+{
+    const string& name = b.FullName();
+
+    if (b.Type() != RecordType::CCS) {
+
+        // qe/qs
+        const bool hasQueryStart = b.HasQueryStart();
+        const bool hasQueryEnd   = b.HasQueryEnd();
+        if (hasQueryStart && hasQueryEnd) {
+            const auto qStart = b.QueryStart();
+            const auto qEnd   = b.QueryEnd();
+            if (qStart >= qEnd)
+                errors->AddRecordError(name, "queryStart (qs) should be < queryEnd (qe)");
+        } else {
+            if (!hasQueryStart)
+                errors->AddRecordError(name, "missing tag: qs (queryStart)");
+            if (!hasQueryEnd)
+                errors->AddRecordError(name, "missing tag: qe (queryEnd)");
+        }
+    }
+
+    // zm
+    if (!b.HasHoleNumber())
+        errors->AddRecordError(name, "missing tag: zm (ZMW hole number)");
+
+    // np
+    if (!b.HasNumPasses())
+        errors->AddRecordError(name, "missing tag: np (num passes)");
+    else {
+        const auto numPasses = b.NumPasses();
+        if (b.Type() != RecordType::CCS && numPasses != 1)
+            errors->AddRecordError(name, "np (numPasses) tag for non-CCS records should be 1");
+    }
+
+    // rq
+    if (!b.HasReadAccuracy())
+        errors->AddRecordError(name, "missing tag: rq (read accuracy)");
+
+    // sn
+    if (!b.HasSignalToNoise())
+        errors->AddRecordError(name, "missing tag: sn (signal-to-noise ratio)");
+}
+
+void ValidateRecordTagLengths(const BamRecord& b,
+                              unique_ptr<ValidationErrors>& errors)
+{
+    const string& name = b.FullName();
+    const size_t expectedLength = (b.Type() == RecordType::CCS ? b.Sequence().size()
+                                                               : (b.QueryEnd() - b.QueryStart()));
+
+    // check "per-base"-type data lengths are compatible
+    if (b.Sequence().size() != expectedLength)
+        errors->AddRecordError(name, "sequence length does not match expected length");
+
+    if (b.HasDeletionQV()) {
+        if (b.DeletionQV().size() != expectedLength)
+            errors->AddTagLengthError(name, "DeletionQV", "dq", b.DeletionQV().size(), expectedLength);
+    }
+    if (b.HasDeletionTag()) {
+        if (b.DeletionTag().size() != expectedLength)
+            errors->AddTagLengthError(name, "DeletionTag", "dt", b.DeletionTag().size(), expectedLength);
+    }
+    if (b.HasInsertionQV()) {
+        if (b.InsertionQV().size() != expectedLength)
+            errors->AddTagLengthError(name, "InsertionQV", "iq", b.InsertionQV().size(), expectedLength);
+    }
+    if (b.HasMergeQV()) {
+        if (b.MergeQV().size() != expectedLength)
+            errors->AddTagLengthError(name, "MergeQV", "mq", b.MergeQV().size(), expectedLength);
+    }
+    if (b.HasSubstitutionQV()) {
+        if (b.SubstitutionQV().size() != expectedLength)
+            errors->AddTagLengthError(name, "SubstitutionQV", "sq", b.SubstitutionQV().size(), expectedLength);
+    }
+    if (b.HasSubstitutionTag()) {
+        if (b.SubstitutionTag().size() != expectedLength)
+            errors->AddTagLengthError(name, "SubstitutionTag", "st", b.SubstitutionTag().size(), expectedLength);
+    }
+    if (b.HasIPD()) {
+        if (b.IPD().size() != expectedLength)
+            errors->AddTagLengthError(name, "IPD", "ip", b.IPD().size(), expectedLength);
+    }
+
+    // NOTE: disabling "internal" tag checks for now, only checking "standard"
+    //       PacBioBAM tags
+
+//    if (b.HasAltLabelQV()) {
+//        if (b.AltLabelQV().size() != expectedLength)
+//            errors->AddTagLengthError(name, "AltLabelQV", "pv", b.AltLabelQV().size(), expectedLength);
+//    }
+//    if (b.HasAltLabelTag()) {
+//        if (b.AltLabelTag().size() != expectedLength)
+//            errors->AddTagLengthError(name, "AltLabelTag", "pt", b.AltLabelTag().size(), expectedLength);
+//    }
+//    if (b.HasLabelQV()) {
+//        if (b.LabelQV().size() != expectedLength)
+//            errors->AddTagLengthError(name, "LabelQV", "pq", b.LabelQV().size(), expectedLength);
+//    }
+//    if (b.HasPkmean()) {
+//        if (b.Pkmean().size() != expectedLength)
+//            errors->AddTagLengthError(name, "Pkmean", "pa", b.Pkmean().size(), expectedLength);
+//    }
+//    if (b.HasPkmean2()) {
+//        if (b.Pkmean2().size() != expectedLength)
+//            errors->AddTagLengthError(name, "Pkmean2", "ps", b.Pkmean2().size(), expectedLength);
+//    }
+//    if (b.HasPkmid()) {
+//        if (b.Pkmid().size() != expectedLength)
+//            errors->AddTagLengthError(name, "Pkmid", "pm", b.Pkmid().size(), expectedLength);
+//    }
+//    if (b.HasPkmid2()) {
+//        if (b.Pkmid2().size() != expectedLength)
+//            errors->AddTagLengthError(name, "Pkmid2", "pi", b.Pkmid2().size(), expectedLength);
+//    }
+//    if (b.HasPrePulseFrames()) {
+//        if (b.PrePulseFrames().size() != expectedLength)
+//            errors->AddTagLengthError(name, "PrePulseFrames", "pd", b.PrePulseFrames().size(), expectedLength);
+//    }
+//    if (b.HasPulseCall()) {
+//        if (b.PulseCall().size() != expectedLength)
+//            errors->AddTagLengthError(name, "PulseCall", "pc", b.PulseCall().size(), expectedLength);
+//    }
+//    if (b.HasPulseCallWidth()) {
+//        if (b.PulseCallWidth().size() != expectedLength)
+//            errors->AddTagLengthError(name, "PulseCallWidth", "px", b.PulseCallWidth().size(), expectedLength);
+//    }
+//    if (b.HasPulseMergeQV()) {
+//        if (b.PulseMergeQV().size() != expectedLength)
+//            errors->AddTagLengthError(name, "PulseMergeQV", "pg", b.PulseMergeQV().size(), expectedLength);
+//    }
+//    if (b.HasPulseWidth()) {
+//        if (b.PulseWidth().size() != expectedLength)
+//            errors->AddTagLengthError(name, "PulseWidth", "pw", b.PulseWidth().size(), expectedLength);
+//    }
+}
+
+void ValidateUnmappedRecord(const BamRecord& b,
+                            unique_ptr<ValidationErrors>& errors)
+{
+    const string& name = b.FullName();
+    if (b.ReferenceStart() != -1)
+        errors->AddRecordError(name, "unmapped record has a position");
+    if (b.ReferenceId() != -1)
+        errors->AddRecordError(name, "unmapped record has a reference ID");
+}
+
+static
+void ValidateRecord(const BamRecord& b,
+                    unique_ptr<ValidationErrors>& errors)
+{
+    ValidateRecordCore(b, errors);
+    ValidateRecordReadGroup(b, errors);
+    ValidateRecordRequiredTags(b, errors);
+    ValidateRecordTagLengths(b, errors);
+    if (b.IsMapped())
+        ValidateMappedRecord(b, errors);
+    else
+        ValidateUnmappedRecord(b, errors);
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+void Validator::Validate(const BamHeader& header, const size_t maxErrors)
+{
+    unique_ptr<ValidationErrors> errors{ new ValidationErrors(maxErrors) };
+    internal::ValidateHeader(header, "unknown", errors);
+    if (!errors->IsEmpty())
+        errors->ThrowErrors();
+}
+
+void Validator::Validate(const ReadGroupInfo& rg,  const size_t maxErrors)
+{
+    unique_ptr<ValidationErrors> errors{ new ValidationErrors(maxErrors) };
+    internal::ValidateReadGroup(rg, errors);
+    if (!errors->IsEmpty())
+        errors->ThrowErrors();
+}
+
+void Validator::Validate(const BamRecord& b, const size_t maxErrors)
+{
+    unique_ptr<ValidationErrors> errors{ new ValidationErrors(maxErrors) };
+    internal::ValidateRecord(b, errors);
+    if (!errors->IsEmpty())
+        errors->ThrowErrors();
+}
+
+void Validator::ValidateEntireFile(const BamFile& file, const size_t maxErrors)
+{
+    unique_ptr<ValidationErrors> errors{ new ValidationErrors(maxErrors) };
+    internal::ValidateMetadata(file, errors);
+
+    EntireFileQuery query(file);
+    for (const BamRecord& record : query)
+        internal::ValidateRecord(record, errors);
+
+    if (!errors->IsEmpty())
+        errors->ThrowErrors();
+}
+
+void Validator::ValidateFileMetadata(const BamFile& file, const size_t maxErrors)
+{
+    unique_ptr<ValidationErrors> errors{ new ValidationErrors(maxErrors) };
+    internal::ValidateMetadata(file, errors);
+    if (!errors->IsEmpty())
+        errors->ThrowErrors();
+}
diff --git a/src/Version.cpp b/src/Version.cpp

new file mode 100644 (file)

index 0000000..b9089e3
--- /dev/null
+++ b/src/Version.cpp
@@ -0,0 +1,87 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Version.cpp
+/// \brief Implements the Version class.
+//
+// Author: Derek Barnett
+
+#include "Version.h"
+#include "SequenceUtils.h"
+#include <sstream>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+const Version Version::Current = Version(3,0,3);
+const Version Version::Minimum = Version(3,0,1);
+
+// string must be "<major>.<minor>.<version>"
+Version::Version(const std::string& v)
+    : major_(0)
+    , minor_(0)
+    , revision_(0)
+{
+    // parse string
+    try {
+        const auto fields = internal::Split(v, '.');
+        const auto numFields = fields.size();
+        if (numFields == 0)
+            throw std::runtime_error("invalid version number - empty string");
+        major_ = std::stoi(fields.at(0));
+        if (numFields > 1) {
+            minor_ = std::stoi(fields.at(1));
+            if (numFields > 2 )
+                revision_ = std::stoi(fields.at(2));
+        }
+    } catch (std::exception&) {
+        auto msg = string{ "invalid version number (" + v + "): failed to parse" };
+        throw std::runtime_error(msg);
+    }
+
+    // ensure valid numbers
+    Check();
+}
+
+std::string Version::ToString(void) const
+{
+    std::stringstream s;
+    s << major_ << '.' << minor_ << '.' << revision_;
+    return s.str();
+}
+
diff --git a/src/Version.h b/src/Version.h

new file mode 100644 (file)

index 0000000..70427c3
--- /dev/null
+++ b/src/Version.h
@@ -0,0 +1,209 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file Version.h
+/// \brief Defines the Version class.
+//
+// Author: Derek Barnett
+
+#ifndef PACBIOBAM_VERSION_H
+#define PACBIOBAM_VERSION_H
+
+#include <ostream>
+#include <stdexcept>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+class Version
+{
+public:
+    static const Version Current;
+    static const Version Minimum;
+
+public:
+    constexpr Version(void);
+
+    Version(int major, int minor, int revision);
+
+    // string must be "<major>.<minor>.<version>"
+    Version(const std::string& v);
+
+    Version(const Version& other) = default;
+    Version(Version&& other) = default;
+    Version& operator=(const Version&) = default;
+    Version& operator=(Version&&) = default;
+    ~Version(void) = default;
+
+public:
+    bool operator==(const Version& other) const;
+    bool operator!=(const Version& other) const;
+    bool operator<(const Version& other) const;
+    bool operator<=(const Version& other) const;
+    bool operator>(const Version& other) const;
+    bool operator>=(const Version& other) const;
+
+public:
+    std::string ToString(void) const;
+    operator std::string(void) const;
+
+public:
+    int Major(void) const;
+    int Minor(void) const;
+    int Revision(void) const;
+
+public:
+    Version& Major(int major);
+    Version& Minor(int minor);
+    Version& Revision(int revision);
+
+private:
+    int major_;
+    int minor_;
+    int revision_;
+
+private:
+    void Check(void) const;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const Version& version)
+{
+    out << version.ToString();
+    return out;
+}
+
+inline constexpr Version::Version(void)
+    : major_(0)
+    , minor_(0)
+    , revision_(0)
+{ }
+
+inline Version::Version(int major, int minor, int revision)
+    : major_(major)
+    , minor_(minor)
+    , revision_(revision)
+{ Check(); }
+
+inline bool Version::operator==(const Version& other) const
+{
+    return major_ == other.major_ &&
+           minor_ == other.minor_ &&
+           revision_ == other.revision_;
+}
+
+inline bool Version::operator!=(const Version& other) const
+{ return !(*this == other); }
+
+inline bool Version::operator<(const Version& other) const
+{
+    // 2.* < 3.*
+    if (major_ < other.major_)
+        return true;
+
+    // 3. ==  3.
+    else if (major_ == other.major_) {
+
+        // 3.1.* < 3.2.*
+        if (minor_ < other.minor_)
+            return true;
+
+        // 3.2. == 3.2.
+        else if (minor_ == other.minor_) {
+
+            // 3.2.1 < 3.2.2
+            if (revision_ < other.revision_)
+                return true;
+        }
+    }
+
+    // otherwise not less-than
+    return false;
+}
+inline bool Version::operator<=(const Version& other) const
+{ return !(*this > other); }
+
+inline bool Version::operator>(const Version& other) const
+{ return other < *this; }
+
+inline bool Version::operator>=(const Version& other) const
+{ return !(*this < other); }
+
+inline Version::operator std::string(void) const
+{ return ToString(); }
+
+inline void Version::Check(void) const
+{
+    if (major_ < 0 || minor_ < 0 || revision_ < 0)
+        throw std::runtime_error("version cannot contain negative numbers");
+}
+
+inline int Version::Major(void) const
+{ return major_; }
+
+inline Version& Version::Major(int major)
+{
+    major_ = major;
+    Check();
+    return *this;
+}
+
+inline int Version::Minor(void) const
+{ return minor_; }
+
+inline Version& Version::Minor(int minor)
+{
+    minor_ = minor;
+    Check();
+    return *this;
+}
+
+inline int Version::Revision(void) const
+{ return revision_; }
+
+inline Version& Version::Revision(int revision)
+{
+    revision_ = revision;
+    Check();
+    return *this;
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // PACBIOBAM_VERSION_H
diff --git a/src/VirtualRegionTypeMap.cpp b/src/VirtualRegionTypeMap.cpp

new file mode 100644 (file)

index 0000000..8c6c757
--- /dev/null
+++ b/src/VirtualRegionTypeMap.cpp
@@ -0,0 +1,54 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file VirtualRegionTypeMap.cpp
+/// \brief Implements the VirtualRegionTypeMap class.
+//
+// Author: Armin Töpfer
+
+#include "pbbam/virtual/VirtualRegionTypeMap.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+std::map<char, VirtualRegionType> VirtualRegionTypeMap::ParseChar
+{
+    { 'A' , VirtualRegionType::ADAPTER },
+    { 'B' , VirtualRegionType::BARCODE },
+    { 'H' , VirtualRegionType::HQREGION },
+    { 'F' , VirtualRegionType::FILTERED },
+    { 'L' , VirtualRegionType::LQREGION }
+};
diff --git a/src/VirtualZmwBamRecord.cpp b/src/VirtualZmwBamRecord.cpp

new file mode 100644 (file)

index 0000000..d494f89
--- /dev/null
+++ b/src/VirtualZmwBamRecord.cpp
@@ -0,0 +1,399 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file VirtualZmwBamRecord.cpp
+/// \brief Implements the VirtualZmwBamRecord class.
+//
+// Author: Armin Töpfer
+
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include "pbbam/virtual/VirtualZmwBamRecord.h"
+#include "pbbam/virtual/VirtualRegionType.h"
+#include "pbbam/virtual/VirtualRegionTypeMap.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// \brief Appends content of src vector to dst vector using move semantics.
+///
+/// \param[in]     src  Input vector that will be empty after execution
+/// \param[in,out] dst  Output vector that will be appended to
+///
+template <typename T>
+inline void MoveAppend(std::vector<T>& src, std::vector<T>& dst) noexcept
+{
+    if (dst.empty())
+    {
+        dst = std::move(src);
+    }
+    else
+    {
+        dst.reserve(dst.size() + src.size());
+        std::move(src.begin(), src.end(), std::back_inserter(dst));
+        src.clear();
+    }
+}
+
+/// \brief Appends content of src vector to dst vector using move semantics.
+///
+/// \param[in]     src  Input vector via perfect forwarding
+/// \param[in,out] dst  Output vector that will be appended to
+///
+template <typename T>
+inline void MoveAppend(std::vector<T>&& src, std::vector<T>& dst) noexcept
+{
+    if (dst.empty())
+    {
+        dst = std::move(src);
+    }
+    else
+    {
+        dst.reserve(dst.size() + src.size());
+        std::move(src.begin(), src.end(), std::back_inserter(dst));
+        src.clear();
+    }
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+VirtualZmwBamRecord::VirtualZmwBamRecord(
+    std::vector<BamRecord>&& unorderedSources, const BamHeader& header)
+    : BamRecord(header)
+    , sources_(std::move(unorderedSources))
+{
+    // Sort sources by queryStart
+    std::sort(sources_.begin(), sources_.end(),
+              [](const BamRecord& l1, const BamRecord& l2)
+              { return l1.QueryStart() < l2.QueryStart(); });
+    StitchSources();
+}
+
+bool VirtualZmwBamRecord::HasVirtualRegionType(const VirtualRegionType regionType) const
+{ return virtualRegionsMap_.find(regionType) != virtualRegionsMap_.end(); }
+
+Frames VirtualZmwBamRecord::IPDV1Frames(Orientation orientation) const
+{
+    const auto rawFrames = this->IPDRaw(orientation);
+    const std::vector<uint8_t> rawData(rawFrames.Data().begin(), rawFrames.Data().end());
+    return Frames::Decode(rawData);
+}
+
+
+void VirtualZmwBamRecord::StitchSources(void)
+{
+    const auto& firstRecord = sources_[0];
+    const auto& lastRecord = sources_[sources_.size() - 1];
+
+    std::string   sequence;
+    std::string   deletionTag;
+    std::string   substitutionTag;
+    std::string   alternativeLabelTag;
+    std::string   pulseCall;
+
+    QualityValues qualities;
+    QualityValues deletionQv;
+    QualityValues insertionQv;
+    QualityValues mergeQv;
+    QualityValues pulseMergeQv;
+    QualityValues substitutionQv;
+    QualityValues labelQv;
+    QualityValues alternativeLabelQv;
+
+    Frames                ipd;
+    Frames                pw;
+    Frames                pd;
+    Frames                px;
+    std::vector<float>    pa;
+    std::vector<float>    pm;
+    std::vector<uint32_t> sf;
+
+    // initialize capacity
+    const auto stitchedSize = lastRecord.QueryEnd() - firstRecord.QueryStart();
+    sequence.reserve(stitchedSize);
+    deletionTag.reserve(stitchedSize);
+    substitutionTag.reserve(stitchedSize);
+    alternativeLabelTag.reserve(stitchedSize);
+    pulseCall.reserve(stitchedSize);
+    qualities.reserve(stitchedSize);
+    deletionQv.reserve(stitchedSize);
+    insertionQv.reserve(stitchedSize);
+    mergeQv.reserve(stitchedSize);
+    pulseMergeQv.reserve(stitchedSize);
+    substitutionQv.reserve(stitchedSize);
+    labelQv.reserve(stitchedSize);
+    alternativeLabelQv.reserve(stitchedSize);
+    ipd.DataRaw().reserve(stitchedSize);
+    pw.DataRaw().reserve(stitchedSize);
+    pd.DataRaw().reserve(stitchedSize);
+    px.DataRaw().reserve(stitchedSize);
+    pa.reserve(stitchedSize);
+    pm.reserve(stitchedSize);
+    sf.reserve(stitchedSize);
+
+    // Stitch using tmp vars
+    for(auto& b : sources_)
+    {
+        sequence.append(b.Sequence());
+
+        MoveAppend(b.Qualities(), qualities);
+
+        if (b.HasDeletionQV())
+            MoveAppend(std::move(b.DeletionQV()), deletionQv);
+
+        if (b.HasInsertionQV())
+            MoveAppend(std::move(b.InsertionQV()), insertionQv);
+
+        if (b.HasMergeQV())
+            MoveAppend(std::move(b.MergeQV()), mergeQv);
+
+        if (b.HasPulseMergeQV())
+            MoveAppend(std::move(b.PulseMergeQV()), pulseMergeQv);
+
+        if (b.HasSubstitutionQV())
+            MoveAppend(std::move(b.SubstitutionQV()), substitutionQv);
+
+        if (b.HasLabelQV())
+            MoveAppend(std::move(b.LabelQV()), labelQv);
+
+        if (b.HasAltLabelQV())
+            MoveAppend(std::move(b.AltLabelQV()), alternativeLabelQv);
+
+        if (b.HasDeletionTag())
+            deletionTag.append(std::move(b.DeletionTag()));
+
+        if (b.HasSubstitutionTag())
+            substitutionTag.append(std::move(b.SubstitutionTag()));
+
+        if (b.HasAltLabelTag())
+            alternativeLabelTag.append(std::move(b.AltLabelTag()));
+
+        if (b.HasPulseCall())
+            pulseCall.append(std::move(b.PulseCall()));
+
+        if (b.HasIPD())
+            MoveAppend(b.IPDRaw().DataRaw(), ipd.DataRaw());
+
+        if (b.HasPulseWidth())
+            MoveAppend(b.PulseWidthRaw().DataRaw(), pw.DataRaw());
+
+        if (b.HasPulseCallWidth())
+            MoveAppend(b.PulseCallWidth().DataRaw(), px.DataRaw());
+
+        if (b.HasPrePulseFrames())
+            MoveAppend(b.PrePulseFrames().DataRaw(), pd.DataRaw());
+
+        if (b.HasPkmid())
+            MoveAppend(b.Pkmid(), pm);
+
+        if (b.HasPkmean())
+            MoveAppend(b.Pkmean(), pa);
+
+        if (b.HasPkmid2())
+            MoveAppend(b.Pkmid2(), pm);
+
+        if (b.HasPkmean2())
+            MoveAppend(b.Pkmean2(), pa);
+
+        if (b.HasStartFrame())
+            MoveAppend(b.StartFrame(), sf);
+
+        if (b.HasScrapRegionType())
+        {
+            const VirtualRegionType regionType = b.ScrapRegionType();
+
+            if (!HasVirtualRegionType(regionType))
+                virtualRegionsMap_[regionType] = std::vector<VirtualRegion>();
+
+            virtualRegionsMap_[regionType].emplace_back(
+                regionType, b.QueryStart(), b.QueryEnd());
+        }
+
+        if (b.HasLocalContextFlags())
+        {
+            std::pair<int, int> barcodes{-1, -1};
+            if (b.HasBarcodes())
+                barcodes = b.Barcodes();
+
+            constexpr auto regionType = VirtualRegionType::SUBREAD;
+            if (!HasVirtualRegionType(regionType))
+                virtualRegionsMap_[regionType] = std::vector<VirtualRegion>();
+
+            virtualRegionsMap_[regionType].emplace_back(
+                regionType, b.QueryStart(), b.QueryEnd(), b.LocalContextFlags(),
+                barcodes.first, barcodes.second);
+        }
+
+        if (b.HasBarcodes() && !this->HasBarcodes())
+            this->Barcodes(b.Barcodes());
+
+        if (b.HasBarcodeQuality() && !this->HasBarcodeQuality())
+            this->BarcodeQuality(b.BarcodeQuality());
+
+        if (b.HasReadAccuracy() && !this->HasReadAccuracy())
+            this->ReadAccuracy(b.ReadAccuracy());
+
+        if (b.HasScrapZmwType())
+        {
+            if (!this->HasScrapZmwType())
+                this->ScrapZmwType(b.ScrapZmwType());
+            else if (this->ScrapZmwType() != b.ScrapZmwType())
+                throw std::runtime_error("ScrapZmwTypes do not match");
+        }
+    }
+
+    // ReadGroup
+    this->ReadGroup(this->header_.ReadGroups()[0]);
+
+    this->NumPasses(1);
+
+    // All records should contain the same SNR and hole number
+    if (firstRecord.HasSignalToNoise())
+        this->SignalToNoise(firstRecord.SignalToNoise());
+    this->HoleNumber(firstRecord.HoleNumber());
+
+    // QueryStart
+    this->QueryStart(firstRecord.QueryStart());
+    this->QueryEnd(lastRecord.QueryEnd());
+    this->UpdateName();
+
+    std::string qualitiesStr = qualities.Fastq();
+    if (sequence.size() == qualitiesStr.size())
+        this->Impl().SetSequenceAndQualities(sequence, qualitiesStr);
+    else
+        this->Impl().SetSequenceAndQualities(sequence);
+
+    // Tags as strings
+    if (!deletionTag.empty())
+        this->DeletionTag(deletionTag);
+    if (!substitutionTag.empty())
+        this->SubstitutionTag(substitutionTag);
+    if (!alternativeLabelTag.empty())
+        this->AltLabelTag(alternativeLabelTag);
+    if (!pulseCall.empty())
+        this->PulseCall(pulseCall);
+
+    // QVs
+    if (!deletionQv.empty())
+        this->DeletionQV(deletionQv);
+    if (!insertionQv.empty())
+        this->InsertionQV(insertionQv);
+    if (!mergeQv.empty())
+        this->MergeQV(mergeQv);
+    if (!pulseMergeQv.empty())
+        this->PulseMergeQV(pulseMergeQv);
+    if (!substitutionQv.empty())
+        this->SubstitutionQV(substitutionQv);
+    if (!labelQv.empty())
+        this->LabelQV(labelQv);
+    if (!alternativeLabelQv.empty())
+        this->AltLabelQV(alternativeLabelQv);
+
+    // 16 bit arrays
+    if (!ipd.Data().empty())
+        this->IPD(ipd, FrameEncodingType::LOSSLESS);
+    if (!pw.Data().empty())
+        this->PulseWidth(pw, FrameEncodingType::LOSSLESS);
+    if (!pa.empty())
+        this->Pkmean(pa);
+    if (!pm.empty())
+        this->Pkmid(pm);
+    if (!pd.Data().empty())
+        this->PrePulseFrames(pd, FrameEncodingType::LOSSLESS);
+    if (!px.Data().empty())
+        this->PulseCallWidth(px, FrameEncodingType::LOSSLESS);
+
+    // 32 bit arrays
+    if (!sf.empty())
+        this->StartFrame(sf);
+
+    // Determine HQREGION bases on LQREGIONS
+    if (HasVirtualRegionType(VirtualRegionType::LQREGION))
+    {
+        if (virtualRegionsMap_[VirtualRegionType::LQREGION].size() == 1)
+        {
+            const auto lq = virtualRegionsMap_[VirtualRegionType::LQREGION][0];
+            if (lq.beginPos == 0)
+                virtualRegionsMap_[VirtualRegionType::HQREGION].emplace_back(
+                    VirtualRegionType::HQREGION, lq.endPos, sequence.size());
+            else if (lq.endPos == static_cast<int>(sequence.size()))
+                virtualRegionsMap_[VirtualRegionType::HQREGION].emplace_back(
+                    VirtualRegionType::HQREGION, 0, lq.beginPos);
+            else
+                throw std::runtime_error("Unknown HQREGION");
+        }
+        else
+        {
+            int beginPos = 0;
+            for (const auto& lqregion : virtualRegionsMap_[VirtualRegionType::LQREGION])
+            {
+                if (lqregion.beginPos - beginPos > 0)
+                    virtualRegionsMap_[VirtualRegionType::HQREGION].emplace_back(
+                        VirtualRegionType::HQREGION, beginPos, lqregion.beginPos);
+                beginPos = lqregion.endPos;
+            }
+        }
+    }
+    else
+    {
+        virtualRegionsMap_[VirtualRegionType::HQREGION].emplace_back(
+            VirtualRegionType::HQREGION, 0, sequence.size());
+    }
+}
+
+
+std::map<VirtualRegionType, std::vector<VirtualRegion>>
+VirtualZmwBamRecord::VirtualRegionsMap(void) const
+{ return virtualRegionsMap_; }
+
+std::vector<VirtualRegion>
+VirtualZmwBamRecord::VirtualRegionsTable(const VirtualRegionType regionType) const
+{
+   const auto iter = virtualRegionsMap_.find(regionType);
+   if (iter != virtualRegionsMap_.cend())
+       return iter->second;
+   return std::vector<VirtualRegion>();
+}
diff --git a/src/VirtualZmwCompositeReader.cpp b/src/VirtualZmwCompositeReader.cpp

new file mode 100644 (file)

index 0000000..686aaae
--- /dev/null
+++ b/src/VirtualZmwCompositeReader.cpp
@@ -0,0 +1,147 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file VirtualZmwCompositeReader.cpp
+/// \brief Implements the VirtualZmwCompositeReader class.
+//
+// Author: Derek Barnett
+
+#include "VirtualZmwCompositeReader.h"
+#include <boost/algorithm/string.hpp>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+VirtualZmwCompositeReader::VirtualZmwCompositeReader(const DataSet& dataset)
+    : currentReader_(nullptr)
+    , filter_(PbiFilter::FromDataSet(dataset))
+{
+    // set up source queue
+    string primaryFn;
+    string scrapsFn;
+    const ExternalResources& resources = dataset.ExternalResources();
+    for (const ExternalResource& resource : resources) {
+
+        primaryFn.clear();
+        scrapsFn.clear();
+
+        // if resource is possible "primary" BAM
+        const auto& metatype = resource.MetaType();
+        if (metatype == "PacBio.SubreadFile.SubreadBamFile" ||
+            metatype == "PacBio.SubreadFile.HqRegionBamFile")
+        {
+            // possible resolve relative path
+            primaryFn = dataset.ResolvePath(resource.ResourceId());
+
+            // check for associated scraps file
+            const ExternalResources& childResources = resource.ExternalResources();
+            for (const ExternalResource& childResource : childResources) {
+                const auto& childMetatype = childResource.MetaType();
+                if (childMetatype == "PacBio.SubreadFile.ScrapsBamFile" ||
+                    childMetatype == "PacBio.SubreadFile.HqScrapsBamFile")
+                {
+                    // possible resolve relative path
+                    scrapsFn = dataset.ResolvePath(childResource.ResourceId());
+                    break;
+                }
+            }
+        }
+
+        // queue up source for later
+        if (!primaryFn.empty() && !scrapsFn.empty())
+            sources_.push_back(make_pair(primaryFn,scrapsFn));
+    }
+
+    // open first available source
+    OpenNextReader();
+}
+
+bool VirtualZmwCompositeReader::HasNext(void)
+{
+    return (currentReader_ && currentReader_->HasNext());
+}
+
+VirtualZmwBamRecord VirtualZmwCompositeReader::Next(void)
+{
+    if (currentReader_) {
+        const auto result = currentReader_->Next();
+        if (!currentReader_->HasNext())
+            OpenNextReader();
+        return result;
+    }
+
+    // no reader active
+    const string msg = { "no readers active, make sure you use "
+                         "VirtualZmwCompositeReader::HasNext before "
+                         "requesting next record"
+                      };
+    throw std::runtime_error(msg);
+}
+
+vector<BamRecord> VirtualZmwCompositeReader::NextRaw(void)
+{
+    if (currentReader_) {
+        const auto result = currentReader_->NextRaw();
+        if (!currentReader_->HasNext())
+            OpenNextReader();
+        return result;
+    }
+
+    // no reader active
+    const string msg = { "no readers active, make sure you use "
+                         "VirtualZmwCompositeReader::HasNext before "
+                         "requesting next group of records"
+                      };
+    throw std::runtime_error(msg);
+}
+
+void VirtualZmwCompositeReader::OpenNextReader(void)
+{
+    currentReader_.reset(nullptr);
+
+    // find next source pair with data
+    while(!sources_.empty()) {
+        const auto nextSource = sources_.front();
+        sources_.pop_front();
+
+        currentReader_.reset(new VirtualZmwReader(nextSource.first,
+                                                  nextSource.second,
+                                                  filter_));
+        if (currentReader_->HasNext())
+            return;
+    }
+}
diff --git a/src/VirtualZmwCompositeReader.h b/src/VirtualZmwCompositeReader.h

new file mode 100644 (file)

index 0000000..7c920da
--- /dev/null
+++ b/src/VirtualZmwCompositeReader.h
@@ -0,0 +1,113 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file VirtualZmwCompositeReader.h
+/// \brief Defines the VirtualZmwCompositeReader class.
+//
+// Author: Derek Barnett
+
+#ifndef VIRTUALZMWCOMPOSITEREADER_H
+#define VIRTUALZMWCOMPOSITEREADER_H
+
+#include "pbbam/DataSet.h"
+#include "pbbam/PbiFilter.h"
+#include "VirtualZmwReader.h"
+#include <deque>
+#include <memory>
+#include <string>
+#include <utility>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// \brief The VirtualZmwCompositeReader provides an interface for
+///        re-stitching "virtual" polymerase reads from their constituent parts,
+///        across multiple %BAM resources from a DataSet.
+///
+/// This class is essentially a DataSet-aware wrapper around
+/// VirtualZmwReader, enabling multiple resources as input. See that
+/// class's documentation for more info.
+///
+class PBBAM_EXPORT VirtualZmwCompositeReader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    VirtualZmwCompositeReader(const DataSet& dataset);
+
+    VirtualZmwCompositeReader(void) = delete;
+    VirtualZmwCompositeReader(const VirtualZmwCompositeReader&) = delete;
+    VirtualZmwCompositeReader(VirtualZmwCompositeReader&&) = delete;
+    VirtualZmwCompositeReader& operator=(const VirtualZmwCompositeReader&) = delete;
+    VirtualZmwCompositeReader& operator=(VirtualZmwCompositeReader&&) = delete;
+    ~VirtualZmwCompositeReader(void) = default;
+
+    /// \}
+
+public:
+    /// \name Stitched Record Reading
+    ///
+
+    /// \returns true if more ZMWs/files are available for reading.
+    bool HasNext(void);
+
+    /// \returns the next stitched polymerase read
+    VirtualZmwBamRecord Next(void);
+
+    /// \returns the next set of reads that belong to one ZMW from one %BAM
+    ///          resource (a primary %BAM and/or its scraps file). This enables
+    ///          stitching records in a distinct thread.
+    ///
+    std::vector<BamRecord> NextRaw(void);
+
+    /// \}
+
+private:
+    std::deque< std::pair<std::string, std::string> > sources_;
+    std::unique_ptr<VirtualZmwReader> currentReader_;
+    PbiFilter filter_;
+
+private:
+    void OpenNextReader(void);
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // VIRTUALCOMPOSITEREADER_H
diff --git a/src/VirtualZmwReader.cpp b/src/VirtualZmwReader.cpp

new file mode 100644 (file)

index 0000000..239135d
--- /dev/null
+++ b/src/VirtualZmwReader.cpp
@@ -0,0 +1,143 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file VirtualZmwReader.cpp
+/// \brief Implements the VirtualZmwReader class.
+//
+// Author: Armin Töpfer
+
+#include <stdexcept>
+
+#include "VirtualZmwReader.h"
+#include "pbbam/ReadGroupInfo.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+VirtualZmwReader::VirtualZmwReader(const std::string& primaryBamFilepath,
+                                   const std::string& scrapsBamFilepath)
+    : VirtualZmwReader(primaryBamFilepath, scrapsBamFilepath, PbiFilter{})
+{ }
+
+VirtualZmwReader::VirtualZmwReader(const std::string& primaryBamFilepath,
+                                   const std::string& scrapsBamFilepath,
+                                   const PbiFilter& filter)
+{
+    primaryBamFile_.reset(new BamFile{ primaryBamFilepath });
+    scrapsBamFile_.reset(new BamFile{ scrapsBamFilepath });
+
+    if (filter.IsEmpty()) {
+        primaryQuery_.reset(new EntireFileQuery(*primaryBamFile_));
+        scrapsQuery_.reset(new EntireFileQuery(*scrapsBamFile_));
+    }
+    else {
+        primaryQuery_.reset(new PbiFilterQuery{ filter, *primaryBamFile_ });
+        scrapsQuery_.reset(new PbiFilterQuery{ filter, *scrapsBamFile_ });
+    }
+
+    primaryIt_ = (primaryQuery_->begin());
+    scrapsIt_ = (scrapsQuery_->begin());
+
+    stitchedHeader_.reset(new BamHeader{ primaryBamFile_->Header().ToSam() });
+
+    // update stitched read group in header
+    auto readGroups = stitchedHeader_->ReadGroups();
+    if (readGroups.empty())
+        throw std::runtime_error("Bam header of the primary bam has no read groups.");
+    readGroups[0].ReadType("POLYMERASE");
+    readGroups[0].Id(readGroups[0].MovieName(), "POLYMERASE");
+    if (readGroups.size() > 1)
+    {
+        std::vector<ReadGroupInfo> singleGroup;
+        singleGroup.emplace_back(std::move(readGroups[0]));
+        readGroups = std::move(singleGroup);
+        stitchedHeader_->ClearReadGroups();
+    }
+    stitchedHeader_->ReadGroups(readGroups);
+}
+
+VirtualZmwReader::~VirtualZmwReader(void) { }
+
+bool VirtualZmwReader::HasNext(void)
+{
+    // Return true until both iterators are at the end of the query
+    return primaryIt_ != primaryQuery_->end() ||
+            scrapsIt_ != scrapsQuery_->end();
+}
+
+// This method is not thread safe
+VirtualZmwBamRecord VirtualZmwReader::Next(void)
+{ return VirtualZmwBamRecord{ NextRaw(), *stitchedHeader_ }; }
+
+std::vector<BamRecord> VirtualZmwReader::NextRaw(void)
+{
+    std::vector<BamRecord> bamRecordVec;
+
+    // Current hole number, the smallest of scraps and primary.
+    // It can be that the next ZMW is scrap only.
+    int currentHoleNumber;
+    if (primaryIt_ == primaryQuery_->end())
+        currentHoleNumber = (*scrapsIt_).HoleNumber();
+    else if (scrapsIt_ == scrapsQuery_->end())
+        currentHoleNumber = (*primaryIt_).HoleNumber();
+    else
+        currentHoleNumber = std::min((*primaryIt_).HoleNumber(),
+                                     (*scrapsIt_).HoleNumber());
+
+    // collect subreads or hqregions
+    while (primaryIt_ != primaryQuery_->end() &&
+           currentHoleNumber == (*primaryIt_).HoleNumber())
+    {
+        bamRecordVec.push_back(*primaryIt_++);
+    }
+
+    // collect scraps
+    while (scrapsIt_ != scrapsQuery_->end() &&
+           currentHoleNumber == (*scrapsIt_).HoleNumber())
+    {
+        bamRecordVec.push_back(*scrapsIt_++);
+    }
+
+    return bamRecordVec;
+}
+
+BamHeader VirtualZmwReader::PrimaryHeader(void) const
+{ return primaryBamFile_->Header(); }
+
+BamHeader VirtualZmwReader::ScrapsHeader(void) const
+{ return scrapsBamFile_->Header(); }
diff --git a/src/VirtualZmwReader.h b/src/VirtualZmwReader.h

new file mode 100644 (file)

index 0000000..aaa9797
--- /dev/null
+++ b/src/VirtualZmwReader.h
@@ -0,0 +1,129 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file VirtualZmwReader.h
+/// \brief Defines the VirtualZmwReader class.
+//
+// Author: Armin Töpfer
+
+#ifndef VIRTUALZMWREADER_H
+#define VIRTUALZMWREADER_H
+
+#include <memory>
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/Config.h"
+#include "pbbam/EntireFileQuery.h"
+#include "pbbam/PbiFilter.h"
+#include "pbbam/PbiFilterQuery.h"
+#include "pbbam/virtual/VirtualZmwBamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+class VirtualZmwReader
+{
+public:
+    /// \brief Creates a reader that will operate on a primary %BAM file (e.g.
+    ///        subread data) and a scraps file, consuming all reads.
+    ///
+    /// \param[in] primaryBamFilepath hqregion.bam or subreads.bam file path
+    /// \param[in] scrapsBamFilepath  scraps.bam file path
+    ///
+    VirtualZmwReader(const std::string& primaryBamFilepath,
+                     const std::string& scrapsBamFilepath);
+
+    /// \brief Creates a reader that will operate on a primary %BAM file (e.g.
+    ///        subread data) and a scraps file, respecting the provided PBI
+    ///        filter.
+    ///
+    /// \note All %BAM files must have a corresponding ".pbi" index file to use
+    ///       the filter. You may need to call BamFile::EnsurePacBioIndexExists
+    ///       before constructing the reader.
+    ///
+    /// \param[in] primaryBamFilepath hqregion.bam or subreads.bam file path
+    /// \param[in] scrapsBamFilepath  scraps.bam file path
+    /// \param[in] filter PBI filter criteria
+    ///
+    VirtualZmwReader(const std::string& primaryBamFilepath,
+                     const std::string& scrapsBamFilepath,
+                     const PbiFilter& filter);
+
+    VirtualZmwReader(void) = delete;
+    VirtualZmwReader(const VirtualZmwReader&) = delete;
+    VirtualZmwReader(VirtualZmwReader&&) = delete;
+    VirtualZmwReader& operator=(const VirtualZmwReader&) = delete;
+    VirtualZmwReader& operator=(VirtualZmwReader&&) = delete;
+    ~VirtualZmwReader(void);
+
+public:
+
+    /// \returns the BamHeader associated with this reader's "primary" %BAM file
+    BamHeader PrimaryHeader(void) const;
+
+    /// \returns the BamHeader associated with this reader's "scraps" %BAM file
+    BamHeader ScrapsHeader(void) const;
+
+public:
+
+    /// \returns true if more ZMWs are available for reading.
+    bool HasNext(void);
+
+    /// \returns the next stitched polymerase read
+    VirtualZmwBamRecord Next(void);
+
+    /// \returns the next set of reads that belong to one ZMW.
+    ///          This enables stitching records in a distinct thread.
+    ///
+    std::vector<BamRecord> NextRaw(void);
+
+private:
+    std::unique_ptr<BamFile>          primaryBamFile_;
+    std::unique_ptr<BamFile>          scrapsBamFile_;
+    std::unique_ptr<internal::IQuery> primaryQuery_;
+    std::unique_ptr<internal::IQuery> scrapsQuery_;
+    internal::IQuery::iterator        primaryIt_;
+    internal::IQuery::iterator        scrapsIt_;
+    std::unique_ptr<BamHeader>        stitchedHeader_;
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // VirtualZmwREADER_H
diff --git a/src/WhitelistedZmwReadStitcher.cpp b/src/WhitelistedZmwReadStitcher.cpp

new file mode 100644 (file)

index 0000000..621305b
--- /dev/null
+++ b/src/WhitelistedZmwReadStitcher.cpp
@@ -0,0 +1,186 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file WhitelistedZmwReadStitcher.cpp
+/// \brief Implements the WhitelistedZmwReadStitcher class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/virtual/WhitelistedZmwReadStitcher.h"
+#include "pbbam/PbiIndexedBamReader.h"
+#include "VirtualZmwReader.h"
+#include <cassert>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+
+struct WhitelistedZmwReadStitcher::WhitelistedZmwReadStitcherPrivate
+{
+public:
+    WhitelistedZmwReadStitcherPrivate(const vector<int32_t>& zmwWhitelist,
+                                      const string& primaryBamFilePath,
+                                      const string& scrapsBamFilePath)
+        : primaryBamFile_(new BamFile{ primaryBamFilePath })
+        , scrapsBamFile_(new BamFile{ scrapsBamFilePath })
+        , primaryReader_(new PbiIndexedBamReader{ *primaryBamFile_ })
+        , scrapsReader_(new PbiIndexedBamReader{ *scrapsBamFile_ })
+    {
+        // setup new header for stitched data
+        polyHeader_ = unique_ptr<BamHeader>(new BamHeader(primaryBamFile_->Header().ToSam()));
+        auto readGroups = polyHeader_->ReadGroups();
+        if (readGroups.empty())
+            throw runtime_error("Bam header of the primary bam has no read groups.");
+        readGroups[0].ReadType("POLYMERASE");
+        readGroups[0].Id(readGroups[0].MovieName(), "POLYMERASE");
+        if (readGroups.size() > 1)
+        {
+            vector<ReadGroupInfo> singleGroup;
+            singleGroup.emplace_back(move(readGroups[0]));
+            readGroups = move(singleGroup);
+            polyHeader_->ClearReadGroups();
+        }
+        polyHeader_->ReadGroups(readGroups);
+
+        // remove ZMWs up front, that are not found in either file
+        PreFilterZmws(zmwWhitelist);
+    }
+
+    bool HasNext(void) const
+    {
+        return !zmwWhitelist_.empty();
+    }
+
+    VirtualZmwBamRecord Next(void)
+    {
+        auto bamRecordVec = NextRaw();
+        VirtualZmwBamRecord stitched(move(bamRecordVec), *polyHeader_);
+        return stitched;
+    }
+
+    vector<BamRecord> NextRaw(void)
+    {
+        auto result = vector<BamRecord>{ };
+        if (!HasNext())
+            return result;
+
+        const auto& zmw = zmwWhitelist_.front();
+        primaryReader_->Filter(PbiZmwFilter{zmw});
+        scrapsReader_->Filter(PbiZmwFilter{zmw});
+
+        auto record = BamRecord{ };
+        while (primaryReader_->GetNext(record))
+            result.push_back(record);
+        while (scrapsReader_->GetNext(record))
+            result.push_back(record);
+
+        zmwWhitelist_.pop_front();
+        return result;
+    }
+
+    BamHeader PrimaryHeader(void) const
+    { return primaryBamFile_->Header(); }
+
+    BamHeader ScrapsHeader(void) const
+    { return scrapsBamFile_->Header(); }
+
+private:
+    unique_ptr<BamFile> primaryBamFile_;
+    unique_ptr<BamFile> scrapsBamFile_;
+    unique_ptr<PbiIndexedBamReader> primaryReader_;
+    unique_ptr<PbiIndexedBamReader> scrapsReader_;
+    unique_ptr<BamHeader> polyHeader_;
+    deque<int32_t>        zmwWhitelist_;
+
+private:
+    void PreFilterZmws(const vector<int32_t>& zmwWhitelist)
+    {
+        // fetch input ZMWs
+        const PbiRawData primaryIndex(primaryBamFile_->PacBioIndexFilename());
+        const PbiRawData scrapsIndex(scrapsBamFile_->PacBioIndexFilename());
+        const auto& primaryZmws = primaryIndex.BasicData().holeNumber_;
+        const auto& scrapsZmws = scrapsIndex.BasicData().holeNumber_;
+
+        // toss them all into a set (for uniqueness & lookup here soon)
+        set<int32_t> inputZmws;
+        for (const auto& zmw : primaryZmws)
+            inputZmws.insert(zmw);
+        for (const auto& zmw : scrapsZmws)
+            inputZmws.insert(zmw);
+
+        // check our requested whitelist against files' ZMWs, keep if found
+        const auto inputEnd = inputZmws.cend();
+        for (const int32_t zmw : zmwWhitelist) {
+            if (inputZmws.find(zmw) != inputEnd)
+                zmwWhitelist_.push_back(zmw);
+        }
+    }
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+// --------------------------------
+// ZmwReadStitcher implementation
+// --------------------------------
+
+WhitelistedZmwReadStitcher::WhitelistedZmwReadStitcher(const vector<int32_t>& zmwWhitelist,
+                                                       const string& primaryBamFilePath,
+                                                       const string& scrapsBamFilePath)
+    : d_(new WhitelistedZmwReadStitcherPrivate(zmwWhitelist,
+                                               primaryBamFilePath,
+                                               scrapsBamFilePath))
+{ }
+
+WhitelistedZmwReadStitcher::~WhitelistedZmwReadStitcher(void) { }
+
+bool WhitelistedZmwReadStitcher::HasNext(void) const
+{ return d_->HasNext(); }
+
+VirtualZmwBamRecord WhitelistedZmwReadStitcher::Next(void)
+{ return d_->Next(); }
+
+vector<BamRecord> WhitelistedZmwReadStitcher::NextRaw(void)
+{ return d_->NextRaw(); }
+
+BamHeader WhitelistedZmwReadStitcher::PrimaryHeader(void) const
+{ return d_->PrimaryHeader(); }
+
+BamHeader WhitelistedZmwReadStitcher::ScrapsHeader(void) const
+{ return d_->ScrapsHeader(); }
diff --git a/src/XmlReader.cpp b/src/XmlReader.cpp

new file mode 100644 (file)

index 0000000..df4e782
--- /dev/null
+++ b/src/XmlReader.cpp
@@ -0,0 +1,154 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "XmlReader.h"
+#include "StringUtils.h"
+#include "pugixml/pugixml.hpp"
+#include <iostream>
+#include <fstream>
+#include <memory>
+#include <vector>
+#include <cassert>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+static
+void UpdateRegistry(const string& attributeName,
+                    const string& attributeValue,
+                    NamespaceRegistry& registry)
+{
+    vector<string> nameParts = Split(attributeName, ':');
+    assert(!nameParts.empty());
+    if (nameParts.size() > 2)
+        throw std::runtime_error("malformed xmlns attribute: " + attributeName);
+
+    const bool isDefault = (nameParts.size() == 1);
+    const XsdType& xsd = registry.XsdForUri(attributeValue);
+
+    if (isDefault)
+        registry.SetDefaultXsd(xsd);
+    else {
+        assert(nameParts.size() == 2);
+        const string& name = nameParts.at(1);
+        const string& uri  = attributeValue;
+        NamespaceInfo namespaceInfo(name, uri);
+        registry.Register(xsd, namespaceInfo);
+    }
+}
+
+static
+void FromXml(const pugi::xml_node& xmlNode, DataSetElement& parent)
+{
+    // ignore non-named XML nodes
+    //
+    // pugi::xml separates XML parts into more node types than we use
+    //
+    const string& label = xmlNode.name();
+    if (label.empty())
+        return;
+
+    // label & text
+    DataSetElement e(xmlNode.name(), FromInputXml());
+    e.Text(xmlNode.text().get());
+
+    // iterate attributes
+    auto attrIter = xmlNode.attributes_begin();
+    auto attrEnd  = xmlNode.attributes_end();
+    for ( ; attrIter != attrEnd; ++attrIter )
+        e.Attribute(attrIter->name(), attrIter->value());
+
+    // iterate children, recursively building up subtree
+    auto childIter = xmlNode.begin();
+    auto childEnd = xmlNode.end();
+    for ( ; childIter != childEnd; ++childIter ) {
+        pugi::xml_node childNode = *childIter;
+        FromXml(childNode, e);
+    }
+
+    // add our element to its parent
+    parent.AddChild(e);
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+std::unique_ptr<DataSetBase> XmlReader::FromStream(istream& in)
+{
+    pugi::xml_document doc;
+    const pugi::xml_parse_result& loadResult = doc.load(in);
+    if (loadResult.status != pugi::status_ok)
+        throw std::runtime_error(string("could not read XML file, error code:") + to_string(loadResult.status) );
+
+    // parse top-level attributes
+    pugi::xml_node rootNode = doc.document_element();
+    if (rootNode == pugi::xml_node())
+        throw std::runtime_error("could not fetch XML root node");
+
+    // create dataset matching type strings
+    std::unique_ptr<DataSetBase> dataset(new DataSetBase);
+    dataset->Label(rootNode.name());
+
+    // iterate attributes, capture namespace info
+    const string xmlnsPrefix("xmlns");
+    auto attrIter = rootNode.attributes_begin();
+    auto attrEnd  = rootNode.attributes_end();
+    for ( ; attrIter != attrEnd; ++attrIter ) {
+        const string& name = attrIter->name();
+        const string& value = attrIter->value();
+        dataset->Attribute(name, value);
+
+        if (name.find(xmlnsPrefix) == 0)
+            UpdateRegistry(name, value, dataset->Namespaces());
+    }
+
+    // iterate children, recursively building up subtree
+    auto childIter = rootNode.begin();
+    auto childEnd = rootNode.end();
+    for ( ; childIter != childEnd; ++childIter ) {
+        pugi::xml_node childNode = *childIter;
+        internal::FromXml(childNode, *dataset.get());
+    }
+
+    return dataset;
+}
diff --git a/src/XmlReader.h b/src/XmlReader.h

new file mode 100644 (file)

index 0000000..f5830a3
--- /dev/null
+++ b/src/XmlReader.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef XMLREADER_H
+#define XMLREADER_H
+
+#include "pbbam/DataSet.h"
+#include <iosfwd>
+#include <memory>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+class XmlReader
+{
+public:
+    static std::unique_ptr<DataSetBase> FromStream(std::istream& in);
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // XMLREADER_H
diff --git a/src/XmlWriter.cpp b/src/XmlWriter.cpp

new file mode 100644 (file)

index 0000000..6c7b7af
--- /dev/null
+++ b/src/XmlWriter.cpp
@@ -0,0 +1,219 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "XmlWriter.h"
+#include "pbbam/DataSet.h"
+#include "pugixml/pugixml.hpp"
+#include <fstream>
+#include <iostream>
+#include <map>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+static
+string Prefix(const string& input)
+{
+    const size_t colonFound = input.find(':');
+    if (colonFound == std::string::npos || colonFound == 0)
+        return string();
+    return input.substr(0, colonFound);
+}
+
+static
+string OutputName(const DataSetElement& node,
+                  const NamespaceRegistry& registry)
+{
+    // if from input XML, respect the namespaces given
+    if (node.IsVerbatimLabel()) 
+        return node.QualifiedNameLabel();
+
+    // otherwise, probably user-generated
+    else {
+        // if no namespace prefix, prepend the appropriate one & return
+        if (node.PrefixLabel().empty()) {
+            static const string colon = ":";
+            XsdType xsdType = node.Xsd();
+            if (xsdType == XsdType::NONE)
+                xsdType = registry.XsdForElement(node.LocalNameLabel().to_string());
+            return registry.Namespace(xsdType).Name() + colon + node.LocalNameLabel().to_string();
+        }
+        // otherwise, has prefix - return full name
+        else
+            return node.QualifiedNameLabel();
+    }
+}
+
+static
+void ToXml(const DataSetElement& node,
+           const NamespaceRegistry& registry,
+           map<XsdType, string>& xsdPrefixesUsed,
+           pugi::xml_node& parentXml)
+{
+    // create child of parent, w/ label & text
+    const string& label = OutputName(node, registry);
+    if (label.empty())
+        return; // error?
+    pugi::xml_node xmlNode = parentXml.append_child(label.c_str());
+
+    if (!node.Text().empty())
+        xmlNode.text().set(node.Text().c_str());
+
+    // store XSD type for later
+    const string prefix = Prefix(label);
+    if (!prefix.empty())
+        xsdPrefixesUsed[node.Xsd()] = prefix;
+
+    // add attributes
+    auto attrIter = node.Attributes().cbegin();
+    auto attrEnd  = node.Attributes().cend();
+    for ( ; attrIter != attrEnd; ++attrIter) {
+        const string& name = attrIter->first;
+        if (name.empty())
+            continue;
+        pugi::xml_attribute attr = xmlNode.append_attribute(name.c_str());
+        attr.set_value(attrIter->second.c_str());
+    }
+
+    // additional stuff later? (e.g. comments)
+
+    // iterate children, recursively building up subtree
+    auto childIter = node.Children().cbegin();
+    auto childEnd  = node.Children().cend();
+    for ( ; childIter != childEnd; ++childIter) {
+        const DataSetElement& child = (*childIter);
+        ToXml(child, registry, xsdPrefixesUsed, xmlNode);
+    }
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+void XmlWriter::ToStream(const DataSetBase& dataset,
+                         ostream& out)
+{
+    pugi::xml_document doc;
+
+    const NamespaceRegistry& registry = dataset.Namespaces();
+
+    // create top-level dataset XML node
+    const string& label = internal::OutputName(dataset, registry);
+    if (label.empty())
+        throw std::runtime_error("could not convert dataset node to XML");
+    pugi::xml_node root = doc.append_child(label.c_str());
+
+    const string& text = dataset.Text();
+    if (!text.empty())
+        root.text().set(text.c_str());
+
+    // add top-level attributes
+    auto attrIter = dataset.Attributes().cbegin();
+    auto attrEnd  = dataset.Attributes().cend();
+    for ( ; attrIter != attrEnd; ++attrIter) {
+        const string name = attrIter->first;
+        const string value = attrIter->second;
+        if (name.empty())
+            continue;
+        pugi::xml_attribute attr = root.append_attribute(name.c_str());
+        attr.set_value(value.c_str());
+    }
+
+    map<XsdType, string> xsdPrefixesUsed;
+    xsdPrefixesUsed[dataset.Xsd()] = Prefix(label);
+
+    // iterate children, recursively building up subtree
+    auto childIter = dataset.Children().cbegin();
+    auto childEnd  = dataset.Children().cend();
+    for ( ; childIter != childEnd; ++childIter) {
+        const DataSetElement& child = (*childIter);
+        ToXml(child, registry, xsdPrefixesUsed, root);
+    }
+
+    // write XML to stream
+    pugi::xml_node decl = doc.prepend_child(pugi::node_declaration);
+    decl.append_attribute("version")  = "1.0";
+    decl.append_attribute("encoding") = "utf-8";
+
+    // add XSD namespace attributes
+    pugi::xml_attribute xmlnsDefaultAttribute = root.attribute("xmlns");
+    if (xmlnsDefaultAttribute.empty()) {
+        xmlnsDefaultAttribute = root.append_attribute("xmlns");
+        xmlnsDefaultAttribute.set_value(registry.DefaultNamespace().Uri().c_str());
+    }
+    pugi::xml_attribute xsiAttribute = root.attribute("xmlns:xsi");
+    if (xsiAttribute.empty()) {
+        xsiAttribute = root.append_attribute("xmlns:xsi");
+        xsiAttribute.set_value("http://www.w3.org/2001/XMLSchema-instance");
+    }
+    pugi::xml_attribute xsiSchemaLocationAttribute = root.attribute("xsi:schemaLocation");
+    if (xsiSchemaLocationAttribute.empty()) {
+        xsiSchemaLocationAttribute = root.append_attribute("xsi:schemaLocation");
+        xsiSchemaLocationAttribute.set_value(registry.DefaultNamespace().Uri().c_str());
+    }
+
+    static const string xmlnsPrefix = "xmlns:";
+    map<XsdType, string>::const_iterator prefixIter = xsdPrefixesUsed.cbegin();
+    map<XsdType, string>::const_iterator prefixEnd  = xsdPrefixesUsed.cend();
+    for ( ; prefixIter != prefixEnd; ++prefixIter ) {
+        const XsdType& xsd = prefixIter->first;
+        const string& prefix = prefixIter->second;
+        if (xsd == XsdType::NONE || prefix.empty())
+            continue;
+        const NamespaceInfo& nsInfo = registry.Namespace(xsd);
+        assert(nsInfo.Name() == prefix);
+        const string xmlnsName = xmlnsPrefix + prefix;
+        pugi::xml_attribute xmlnsAttribute = root.attribute(xmlnsName.c_str());
+        if (xmlnsAttribute.empty()) {
+            xmlnsAttribute = root.append_attribute(xmlnsName.c_str());
+            xmlnsAttribute.set_value(nsInfo.Uri().c_str());
+        }
+    }
+
+    // "no escapes" to allow explicit ">" "<" comparison operators in filter parameters
+    // we may remove this if/when comparison is separated from the value
+    doc.save(out, "\t", pugi::format_default | pugi::format_no_escapes, pugi::encoding_utf8);
+}
+
+void XmlWriter::ToStream(const unique_ptr<DataSetBase>& dataset,
+                         ostream& out)
+{ ToStream(*dataset.get(), out); }
diff --git a/src/XmlWriter.h b/src/XmlWriter.h

new file mode 100644 (file)

index 0000000..7fc457d
--- /dev/null
+++ b/src/XmlWriter.h
@@ -0,0 +1,62 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef XMLWRITER_H
+#define XMLWRITER_H
+
+#include <iosfwd>
+#include <memory>
+
+namespace PacBio {
+namespace BAM {
+
+class DataSetBase;
+
+namespace internal {
+
+class XmlWriter
+{
+public:
+    static void ToStream(const DataSetBase& dataset, std::ostream& out);
+    static void ToStream(const std::unique_ptr<DataSetBase>& dataset, std::ostream& out);
+};
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
+
+#endif // XMLWRITER_H
diff --git a/src/ZmwGroupQuery.cpp b/src/ZmwGroupQuery.cpp

new file mode 100644 (file)

index 0000000..d33b34a
--- /dev/null
+++ b/src/ZmwGroupQuery.cpp
@@ -0,0 +1,112 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ZmwQuery.cpp
+/// \brief Implements the ZmwQuery class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/ZmwGroupQuery.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/CompositeBamReader.h"
+#include "pbbam/PbiFilterTypes.h"
+#include "MemoryUtils.h"
+#include <algorithm>
+#include <deque>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+struct ZmwGroupQuery::ZmwGroupQueryPrivate
+{
+    typedef PbiFilterCompositeBamReader<Compare::Zmw> ReaderType;
+    typedef std::unique_ptr<ReaderType> ReaderPtr;
+
+    ZmwGroupQueryPrivate(const std::vector<int32_t>& zmwWhitelist,
+                         const DataSet& dataset)
+        : whitelist_(zmwWhitelist.cbegin(), zmwWhitelist.cend())
+        , reader_(nullptr)
+    {
+        std::sort(whitelist_.begin(), whitelist_.end());
+        whitelist_.erase(std::unique(whitelist_.begin(),
+                                     whitelist_.end()),
+                         whitelist_.end());
+
+        if (!whitelist_.empty()) {
+            reader_ = ReaderPtr(new ReaderType(PbiZmwFilter{whitelist_.front()}, dataset));
+            whitelist_.pop_front();
+        }
+    }
+
+    bool GetNext(std::vector<BamRecord>& records)
+    {
+        records.clear();
+        if (!reader_)
+            return false;
+
+        // get all records matching ZMW
+        BamRecord r;
+        while (reader_->GetNext(r))
+            records.push_back(r);
+
+        // set next ZMW (if any left)
+        if (!whitelist_.empty()) {
+            reader_->Filter(PbiZmwFilter{whitelist_.front()});
+            whitelist_.pop_front();
+        }
+
+        // otherwise destroy reader, next iteration will return false
+        else
+            reader_.reset(nullptr);
+
+        return true;
+    }
+
+    std::deque<int32_t> whitelist_;
+    ReaderPtr reader_;
+};
+
+ZmwGroupQuery::ZmwGroupQuery(const std::vector<int32_t>& zmwWhitelist,
+                             const DataSet& dataset)
+    : internal::IGroupQuery()
+    , d_(new ZmwGroupQueryPrivate(zmwWhitelist, dataset))
+{ }
+
+ZmwGroupQuery::~ZmwGroupQuery(void) { }
+
+bool ZmwGroupQuery::GetNext(std::vector<BamRecord>& records)
+{ return d_->GetNext(records); }
diff --git a/src/ZmwQuery.cpp b/src/ZmwQuery.cpp

new file mode 100644 (file)

index 0000000..7a45541
--- /dev/null
+++ b/src/ZmwQuery.cpp
@@ -0,0 +1,69 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ZmwQuery.cpp
+/// \brief Implements the ZmwQuery class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/ZmwQuery.h"
+#include "pbbam/PbiFilterTypes.h"
+#include "pbbam/CompositeBamReader.h"
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+struct ZmwQuery::ZmwQueryPrivate
+{
+    ZmwQueryPrivate(const std::vector<int32_t>& zmwWhitelist,
+                    const DataSet& dataset)
+        : reader_(PbiZmwFilter(zmwWhitelist), dataset)
+    { }
+
+    PbiFilterCompositeBamReader<Compare::Zmw> reader_;
+};
+
+ZmwQuery::ZmwQuery(const std::vector<int32_t>& zmwWhitelist,
+                   const DataSet& dataset)
+    : internal::IQuery()
+    , d_(new ZmwQueryPrivate(zmwWhitelist, dataset))
+{ }
+
+ZmwQuery::~ZmwQuery(void) { }
+
+bool ZmwQuery::GetNext(BamRecord &r)
+{ return d_->reader_.GetNext(r); }
diff --git a/src/ZmwReadStitcher.cpp b/src/ZmwReadStitcher.cpp

new file mode 100644 (file)

index 0000000..98daa48
--- /dev/null
+++ b/src/ZmwReadStitcher.cpp
@@ -0,0 +1,223 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ZmwReadStitcher.cpp
+/// \brief Implements the ZmwReadStitcher class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/virtual/ZmwReadStitcher.h"
+#include "pbbam/DataSet.h"
+#include "pbbam/EntireFileQuery.h"
+#include "pbbam/PbiFilter.h"
+#include "pbbam/PbiFilterQuery.h"
+#include "VirtualZmwReader.h"
+#include <deque>
+#include <stdexcept>
+#include <utility>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+
+struct ZmwReadStitcher::ZmwReadStitcherPrivate
+{
+public:
+    ZmwReadStitcherPrivate(const string& primaryBamFilePath,
+                           const string& scrapsBamFilePath,
+                           const PbiFilter& filter)
+        : filter_(filter)
+    {
+        sources_.push_back(std::make_pair(primaryBamFilePath, scrapsBamFilePath));
+        OpenNextReader();
+    }
+
+    ZmwReadStitcherPrivate(const DataSet& dataset)
+        : filter_(PbiFilter::FromDataSet(dataset))
+    {
+        // set up source queue
+        string primaryFn;
+        string scrapsFn;
+        const ExternalResources& resources = dataset.ExternalResources();
+        for (const ExternalResource& resource : resources) {
+
+            primaryFn.clear();
+            scrapsFn.clear();
+
+            // if resource is possible "primary" BAM
+            const auto& metatype = resource.MetaType();
+            if (metatype == "PacBio.SubreadFile.SubreadBamFile" ||
+                metatype == "PacBio.SubreadFile.HqRegionBamFile")
+            {
+                // possible resolve relative path
+                primaryFn = dataset.ResolvePath(resource.ResourceId());
+
+                // check for associated scraps file
+                const ExternalResources& childResources = resource.ExternalResources();
+                for (const ExternalResource& childResource : childResources) {
+                    const auto& childMetatype = childResource.MetaType();
+                    if (childMetatype == "PacBio.SubreadFile.ScrapsBamFile" ||
+                        childMetatype == "PacBio.SubreadFile.HqScrapsBamFile")
+                    {
+                        // possible resolve relative path
+                        scrapsFn = dataset.ResolvePath(childResource.ResourceId());
+                        break;
+                    }
+                }
+            }
+
+            // queue up source for later
+            if (!primaryFn.empty() && !scrapsFn.empty())
+                sources_.push_back(make_pair(primaryFn,scrapsFn));
+        }
+
+        OpenNextReader();
+    }
+
+public:
+    bool HasNext(void) const
+    { return (currentReader_ && currentReader_->HasNext()); }
+
+    VirtualZmwBamRecord Next(void)
+    {
+        if (currentReader_) {
+            const auto result = currentReader_->Next();
+            if (!currentReader_->HasNext())
+                OpenNextReader();
+            return result;
+        }
+
+        // no reader active
+        const string msg = { "no readers active, make sure you use "
+                             "ZmwReadStitcher::HasNext before "
+                             "requesting next record"
+                          };
+        throw std::runtime_error(msg);
+    }
+
+    vector<BamRecord> NextRaw(void)
+    {
+        if (currentReader_) {
+            const auto result = currentReader_->NextRaw();
+            if (!currentReader_->HasNext())
+                OpenNextReader();
+            return result;
+        }
+
+        // no reader active
+        const string msg = { "no readers active, make sure you use "
+                             "ZmwReadStitcher::HasNext before "
+                             "requesting next group of records"
+                          };
+        throw std::runtime_error(msg);
+    }
+
+    BamHeader PrimaryHeader(void) const
+    { return currentReader_->PrimaryHeader(); }
+
+    BamHeader ScrapsHeader(void) const
+    { return currentReader_->ScrapsHeader(); }
+
+private:
+    std::deque< std::pair<std::string, std::string> > sources_;
+    std::unique_ptr<VirtualZmwReader> currentReader_;
+    PbiFilter filter_;
+
+private:
+    void OpenNextReader(void)
+    {
+        currentReader_.reset(nullptr);
+
+        // find next source pair with data
+        while(!sources_.empty()) {
+            const auto nextSource = sources_.front();
+            sources_.pop_front();
+
+            currentReader_.reset(new VirtualZmwReader(nextSource.first,
+                                                      nextSource.second,
+                                                      filter_));
+            if (currentReader_->HasNext())
+                return;
+        }
+    }
+};
+
+} // namespace BAM
+} // namespace PacBio
+
+// --------------------------------
+// ZmwReadStitcher implementation
+// --------------------------------
+
+ZmwReadStitcher::ZmwReadStitcher(const string& primaryBamFilePath,
+                                 const string& scrapsBamFilePath)
+    : ZmwReadStitcher(primaryBamFilePath,
+                      scrapsBamFilePath,
+                      PbiFilter{})
+{ }
+
+ZmwReadStitcher::ZmwReadStitcher(const string& primaryBamFilePath,
+                                 const string& scrapsBamFilePath,
+                                 const PbiFilter& filter)
+    : d_(new ZmwReadStitcherPrivate(primaryBamFilePath,
+                                    scrapsBamFilePath,
+                                    filter))
+{ }
+
+ZmwReadStitcher::ZmwReadStitcher(const DataSet& dataset)
+    : d_(new ZmwReadStitcherPrivate(dataset))
+{ }
+
+ZmwReadStitcher::~ZmwReadStitcher(void) { }
+
+bool ZmwReadStitcher::HasNext(void)
+{ return d_->HasNext(); }
+
+VirtualZmwBamRecord ZmwReadStitcher::Next(void)
+{ return d_->Next(); }
+
+vector<BamRecord> ZmwReadStitcher::NextRaw(void)
+{ return d_->NextRaw(); }
+
+BamHeader ZmwReadStitcher::PrimaryHeader(void) const
+{ return d_->PrimaryHeader().DeepCopy(); }
+
+BamHeader ZmwReadStitcher::ScrapsHeader(void) const 
+{ return d_->ScrapsHeader().DeepCopy(); }
+
diff --git a/src/ZmwTypeMap.cpp b/src/ZmwTypeMap.cpp

new file mode 100644 (file)

index 0000000..2eea7b7
--- /dev/null
+++ b/src/ZmwTypeMap.cpp
@@ -0,0 +1,53 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// File Description
+/// \file ZmwTypeMap.cpp
+/// \brief Implements the ZmwTypeMap class.
+//
+// Author: Armin Töpfer
+
+#include "pbbam/ZmwTypeMap.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+std::map<char, ZmwType> ZmwTypeMap::ParseChar
+{
+       { 'C' , ZmwType::CONTROL   },
+       { 'M' , ZmwType::MALFORMED },
+       { 'N' , ZmwType::NORMAL    },
+       { 'S' , ZmwType::SENTINEL  }
+};
diff --git a/src/files.cmake b/src/files.cmake

new file mode 100644 (file)

index 0000000..808b585
--- /dev/null
+++ b/src/files.cmake
@@ -0,0 +1,235 @@
+
+# headers
+set( PacBioBAM_H
+
+    # API headers
+    ${PacBioBAM_IncludeDir}/pbbam/Accuracy.h
+    ${PacBioBAM_IncludeDir}/pbbam/AlignmentPrinter.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamFile.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamHeader.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamRecord.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamRecordBuilder.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamRecordImpl.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamRecordTag.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamRecordView.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamTagCodec.h
+    ${PacBioBAM_IncludeDir}/pbbam/BaiIndexedBamReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/CompositeBamReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamWriter.h
+    ${PacBioBAM_IncludeDir}/pbbam/BarcodeQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/Cigar.h
+    ${PacBioBAM_IncludeDir}/pbbam/CigarOperation.h
+    ${PacBioBAM_IncludeDir}/pbbam/ClipType.h
+    ${PacBioBAM_IncludeDir}/pbbam/Compare.h
+    ${PacBioBAM_IncludeDir}/pbbam/Config.h
+    ${PacBioBAM_IncludeDir}/pbbam/DataSet.h
+    ${PacBioBAM_IncludeDir}/pbbam/DataSetTypes.h
+    ${PacBioBAM_IncludeDir}/pbbam/DataSetXsd.h
+    ${PacBioBAM_IncludeDir}/pbbam/EntireFileQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/FastaReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/FastaSequence.h
+    ${PacBioBAM_IncludeDir}/pbbam/FrameEncodingType.h
+    ${PacBioBAM_IncludeDir}/pbbam/Frames.h
+    ${PacBioBAM_IncludeDir}/pbbam/GenomicInterval.h
+    ${PacBioBAM_IncludeDir}/pbbam/GenomicIntervalQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/IndexedFastaReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/Interval.h
+    ${PacBioBAM_IncludeDir}/pbbam/IRecordWriter.h
+    ${PacBioBAM_IncludeDir}/pbbam/LocalContextFlags.h
+    ${PacBioBAM_IncludeDir}/pbbam/MD5.h
+    ${PacBioBAM_IncludeDir}/pbbam/Orientation.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiBasicTypes.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiBuilder.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiFile.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiFilter.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiFilterQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiFilterTypes.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiIndex.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiIndexedBamReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiLookupData.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiRawData.h
+    ${PacBioBAM_IncludeDir}/pbbam/Position.h
+    ${PacBioBAM_IncludeDir}/pbbam/ProgramInfo.h
+    ${PacBioBAM_IncludeDir}/pbbam/PulseBehavior.h
+    ${PacBioBAM_IncludeDir}/pbbam/QNameQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/QualityValue.h
+    ${PacBioBAM_IncludeDir}/pbbam/QualityValues.h
+    ${PacBioBAM_IncludeDir}/pbbam/ReadAccuracyQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/ReadGroupInfo.h
+    ${PacBioBAM_IncludeDir}/pbbam/RecordType.h
+    ${PacBioBAM_IncludeDir}/pbbam/SamTagCodec.h
+    ${PacBioBAM_IncludeDir}/pbbam/SamWriter.h
+    ${PacBioBAM_IncludeDir}/pbbam/SequenceInfo.h
+    ${PacBioBAM_IncludeDir}/pbbam/Strand.h  
+    ${PacBioBAM_IncludeDir}/pbbam/SubreadLengthQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/Tag.h
+    ${PacBioBAM_IncludeDir}/pbbam/TagCollection.h
+#    ${PacBioBAM_IncludeDir}/pbbam/UnmappedReadsQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/Validator.h
+    ${PacBioBAM_IncludeDir}/pbbam/ZmwGroupQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/ZmwQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/ZmwType.h
+    ${PacBioBAM_IncludeDir}/pbbam/ZmwTypeMap.h
+
+    # exception headers
+    ${PacBioBAM_IncludeDir}/pbbam/exception/InvalidSequencingChemistryException.h
+    ${PacBioBAM_IncludeDir}/pbbam/exception/ValidationException.h
+
+    # API-internal headers & inline files
+    ${PacBioBAM_IncludeDir}/pbbam/internal/Accuracy.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/BamHeader.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/BamRecord.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/BamRecordBuilder.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/BamRecordImpl.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/BamRecordView.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/Cigar.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/CigarOperation.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/Compare.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/CompositeBamReader.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/DataSet.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetBaseTypes.h
+    ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetBaseTypes.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetElement.h
+    ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetElement.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetListElement.h
+    ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetListElement.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetTypes.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/FastaSequence.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/Frames.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/GenomicInterval.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/Interval.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/PbiBasicTypes.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/PbiFilter.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/PbiFilterTypes.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/PbiIndex.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/PbiLookupData.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/PbiRawData.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/ProgramInfo.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/QualityValue.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/QualityValues.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/QueryBase.h
+    ${PacBioBAM_IncludeDir}/pbbam/internal/QueryBase.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/ReadGroupInfo.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/SequenceInfo.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/Tag.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/Validator.inl
+
+    # virtual headers
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualPolymeraseBamRecord.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualPolymeraseCompositeReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualPolymeraseReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualRegion.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualRegionType.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualRegionTypeMap.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualZmwBamRecord.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/WhitelistedZmwReadStitcher.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/ZmwReadStitcher.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/ZmwWhitelistVirtualReader.h
+
+    # library-internal headers
+    ${PacBioBAM_SourceDir}/AssertUtils.h
+    ${PacBioBAM_SourceDir}/BamRecordTags.h
+    ${PacBioBAM_SourceDir}/ChemistryTable.h
+    ${PacBioBAM_SourceDir}/DataSetIO.h
+    ${PacBioBAM_SourceDir}/DataSetUtils.h
+    ${PacBioBAM_SourceDir}/EnumClassHash.h
+    ${PacBioBAM_SourceDir}/FileProducer.h
+    ${PacBioBAM_SourceDir}/FileUtils.h
+    ${PacBioBAM_SourceDir}/FofnReader.h
+    ${PacBioBAM_SourceDir}/MemoryUtils.h
+    ${PacBioBAM_SourceDir}/PbiIndexIO.h
+    ${PacBioBAM_SourceDir}/Pulse2BaseCache.h
+    ${PacBioBAM_SourceDir}/SequenceUtils.h
+    ${PacBioBAM_SourceDir}/StringUtils.h
+    ${PacBioBAM_SourceDir}/TimeUtils.h
+    ${PacBioBAM_SourceDir}/ValidationErrors.h
+    ${PacBioBAM_SourceDir}/Version.h
+    ${PacBioBAM_SourceDir}/VirtualZmwCompositeReader.h
+    ${PacBioBAM_SourceDir}/VirtualZmwReader.h
+    ${PacBioBAM_SourceDir}/XmlReader.h
+    ${PacBioBAM_SourceDir}/XmlWriter.h
+    ${PacBioBAM_SourceDir}/pugixml/pugiconfig.hpp
+    ${PacBioBAM_SourceDir}/pugixml/pugixml.hpp
+)
+
+# sources
+set( PacBioBAM_CPP
+
+    ${PacBioBAM_SourceDir}/Accuracy.cpp
+    ${PacBioBAM_SourceDir}/AlignmentPrinter.cpp
+    ${PacBioBAM_SourceDir}/AssertUtils.cpp
+    ${PacBioBAM_SourceDir}/BaiIndexedBamReader.cpp
+    ${PacBioBAM_SourceDir}/BamFile.cpp
+    ${PacBioBAM_SourceDir}/BamHeader.cpp
+    ${PacBioBAM_SourceDir}/BamReader.cpp
+    ${PacBioBAM_SourceDir}/BamRecord.cpp
+    ${PacBioBAM_SourceDir}/BamRecordBuilder.cpp
+    ${PacBioBAM_SourceDir}/BamRecordImpl.cpp
+    ${PacBioBAM_SourceDir}/BamRecordTags.cpp
+    ${PacBioBAM_SourceDir}/BamTagCodec.cpp
+    ${PacBioBAM_SourceDir}/BamWriter.cpp
+    ${PacBioBAM_SourceDir}/BarcodeQuery.cpp
+    ${PacBioBAM_SourceDir}/ChemistryTable.cpp
+    ${PacBioBAM_SourceDir}/Cigar.cpp
+    ${PacBioBAM_SourceDir}/CigarOperation.cpp
+    ${PacBioBAM_SourceDir}/Compare.cpp
+    ${PacBioBAM_SourceDir}/Config.cpp
+    ${PacBioBAM_SourceDir}/DataSet.cpp
+    ${PacBioBAM_SourceDir}/DataSetBaseTypes.cpp
+    ${PacBioBAM_SourceDir}/DataSetElement.cpp
+    ${PacBioBAM_SourceDir}/DataSetIO.cpp
+    ${PacBioBAM_SourceDir}/DataSetTypes.cpp
+    ${PacBioBAM_SourceDir}/DataSetXsd.cpp
+    ${PacBioBAM_SourceDir}/EntireFileQuery.cpp
+    ${PacBioBAM_SourceDir}/FastaReader.cpp
+    ${PacBioBAM_SourceDir}/FileProducer.cpp
+    ${PacBioBAM_SourceDir}/FileUtils.cpp
+    ${PacBioBAM_SourceDir}/FofnReader.cpp
+    ${PacBioBAM_SourceDir}/Frames.cpp
+    ${PacBioBAM_SourceDir}/GenomicInterval.cpp
+    ${PacBioBAM_SourceDir}/GenomicIntervalQuery.cpp
+    ${PacBioBAM_SourceDir}/IndexedFastaReader.cpp
+    ${PacBioBAM_SourceDir}/IRecordWriter.cpp
+    ${PacBioBAM_SourceDir}/MD5.cpp
+    ${PacBioBAM_SourceDir}/MemoryUtils.cpp
+    ${PacBioBAM_SourceDir}/PbiBuilder.cpp
+    ${PacBioBAM_SourceDir}/PbiFile.cpp
+    ${PacBioBAM_SourceDir}/PbiFilter.cpp
+    ${PacBioBAM_SourceDir}/PbiFilterQuery.cpp
+    ${PacBioBAM_SourceDir}/PbiFilterTypes.cpp
+    ${PacBioBAM_SourceDir}/PbiIndex.cpp
+    ${PacBioBAM_SourceDir}/PbiIndexedBamReader.cpp
+    ${PacBioBAM_SourceDir}/PbiIndexIO.cpp
+    ${PacBioBAM_SourceDir}/PbiRawData.cpp
+    ${PacBioBAM_SourceDir}/ProgramInfo.cpp
+    ${PacBioBAM_SourceDir}/QNameQuery.cpp
+    ${PacBioBAM_SourceDir}/QualityValue.cpp
+    ${PacBioBAM_SourceDir}/ReadAccuracyQuery.cpp
+    ${PacBioBAM_SourceDir}/ReadGroupInfo.cpp
+    ${PacBioBAM_SourceDir}/SamTagCodec.cpp
+    ${PacBioBAM_SourceDir}/SamWriter.cpp
+    ${PacBioBAM_SourceDir}/SequenceInfo.cpp
+    ${PacBioBAM_SourceDir}/SubreadLengthQuery.cpp
+    ${PacBioBAM_SourceDir}/Tag.cpp
+    ${PacBioBAM_SourceDir}/TagCollection.cpp
+#    ${PacBioBAM_SourceDir}/UnmappedReadsQuery.cpp
+    ${PacBioBAM_SourceDir}/Validator.cpp
+    ${PacBioBAM_SourceDir}/ValidationErrors.cpp
+    ${PacBioBAM_SourceDir}/ValidationException.cpp
+    ${PacBioBAM_SourceDir}/Version.cpp
+    ${PacBioBAM_SourceDir}/VirtualZmwBamRecord.cpp
+    ${PacBioBAM_SourceDir}/VirtualZmwCompositeReader.cpp
+    ${PacBioBAM_SourceDir}/VirtualZmwReader.cpp
+    ${PacBioBAM_SourceDir}/VirtualRegionTypeMap.cpp
+    ${PacBioBAM_SourceDir}/XmlReader.cpp
+    ${PacBioBAM_SourceDir}/XmlWriter.cpp
+    ${PacBioBAM_SourceDir}/WhitelistedZmwReadStitcher.cpp
+    ${PacBioBAM_SourceDir}/ZmwGroupQuery.cpp
+    ${PacBioBAM_SourceDir}/ZmwReadStitcher.cpp
+    ${PacBioBAM_SourceDir}/ZmwQuery.cpp
+    ${PacBioBAM_SourceDir}/ZmwTypeMap.cpp
+
+    # XML I/O
+    ${PacBioBAM_SourceDir}/pugixml/pugixml.cpp
+)
diff --git a/src/pugixml/pugiconfig.hpp b/src/pugixml/pugiconfig.hpp

new file mode 100644 (file)

index 0000000..6219dbe
--- /dev/null
+++ b/src/pugixml/pugiconfig.hpp
@@ -0,0 +1,71 @@
+/**
+ * pugixml parser - version 1.5
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
+ */
+
+#ifndef HEADER_PUGICONFIG_HPP
+#define HEADER_PUGICONFIG_HPP
+
+// Uncomment this to enable wchar_t mode
+// #define PUGIXML_WCHAR_MODE
+
+// Uncomment this to disable XPath
+// #define PUGIXML_NO_XPATH
+
+// Uncomment this to disable STL
+// #define PUGIXML_NO_STL
+
+// Uncomment this to disable exceptions
+// #define PUGIXML_NO_EXCEPTIONS
+
+// Set this to control attributes for public classes/functions, i.e.:
+// #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL
+// #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL
+// #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall
+// In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead
+
+// Tune these constants to adjust memory-related behavior
+// #define PUGIXML_MEMORY_PAGE_SIZE 32768
+// #define PUGIXML_MEMORY_OUTPUT_STACK 10240
+// #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096
+
+// Uncomment this to switch to header-only version
+// #define PUGIXML_HEADER_ONLY
+
+// Uncomment this to enable long long support
+// #define PUGIXML_HAS_LONG_LONG
+
+#endif
+
+/**
+ * Copyright (c) 2006-2014 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/src/pugixml/pugixml.cpp b/src/pugixml/pugixml.cpp

new file mode 100644 (file)

index 0000000..0f696ab
--- /dev/null
+++ b/src/pugixml/pugixml.cpp
@@ -0,0 +1,11525 @@
+/**
+ * pugixml parser - version 1.5
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
+ */
+
+#ifndef SOURCE_PUGIXML_CPP
+#define SOURCE_PUGIXML_CPP
+
+#include "pugixml.hpp"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#ifdef PUGIXML_WCHAR_MODE
+#      include <wchar.h>
+#endif
+
+#ifndef PUGIXML_NO_XPATH
+#      include <math.h>
+#      include <float.h>
+#      ifdef PUGIXML_NO_EXCEPTIONS
+#              include <setjmp.h>
+#      endif
+#endif
+
+#ifndef PUGIXML_NO_STL
+#      include <istream>
+#      include <ostream>
+#      include <string>
+#endif
+
+// For placement new
+#include <new>
+
+#ifdef _MSC_VER
+#      pragma warning(push)
+#      pragma warning(disable: 4127) // conditional expression is constant
+#      pragma warning(disable: 4324) // structure was padded due to __declspec(align())
+#      pragma warning(disable: 4611) // interaction between '_setjmp' and C++ object destruction is non-portable
+#      pragma warning(disable: 4702) // unreachable code
+#      pragma warning(disable: 4996) // this function or variable may be unsafe
+#      pragma warning(disable: 4793) // function compiled as native: presence of '_setjmp' makes a function unmanaged
+#endif
+
+#ifdef __INTEL_COMPILER
+#      pragma warning(disable: 177) // function was declared but never referenced 
+#      pragma warning(disable: 279) // controlling expression is constant
+#      pragma warning(disable: 1478 1786) // function was declared "deprecated"
+#      pragma warning(disable: 1684) // conversion from pointer to same-sized integral type
+#endif
+
+#if defined(__BORLANDC__) && defined(PUGIXML_HEADER_ONLY)
+#      pragma warn -8080 // symbol is declared but never used; disabling this inside push/pop bracket does not make the warning go away
+#endif
+
+#ifdef __BORLANDC__
+#      pragma option push
+#      pragma warn -8008 // condition is always false
+#      pragma warn -8066 // unreachable code
+#endif
+
+#ifdef __SNC__
+// Using diag_push/diag_pop does not disable the warnings inside templates due to a compiler bug
+#      pragma diag_suppress=178 // function was declared but never referenced
+#      pragma diag_suppress=237 // controlling expression is constant
+#endif
+
+// Inlining controls
+#if defined(_MSC_VER) && _MSC_VER >= 1300
+#      define PUGI__NO_INLINE __declspec(noinline)
+#elif defined(__GNUC__)
+#      define PUGI__NO_INLINE __attribute__((noinline))
+#else
+#      define PUGI__NO_INLINE 
+#endif
+
+// Branch weight controls
+#if defined(__GNUC__)
+#      define PUGI__UNLIKELY(cond) __builtin_expect(cond, 0)
+#else
+#      define PUGI__UNLIKELY(cond) (cond)
+#endif
+
+// Simple static assertion
+#define PUGI__STATIC_ASSERT(cond) { static const char condition_failed[(cond) ? 1 : -1] = {0}; (void)condition_failed[0]; }
+
+// Digital Mars C++ bug workaround for passing char loaded from memory via stack
+#ifdef __DMC__
+#      define PUGI__DMC_VOLATILE volatile
+#else
+#      define PUGI__DMC_VOLATILE
+#endif
+
+// Borland C++ bug workaround for not defining ::memcpy depending on header include order (can't always use std::memcpy because some compilers don't have it at all)
+#if defined(__BORLANDC__) && !defined(__MEM_H_USING_LIST)
+using std::memcpy;
+using std::memmove;
+#endif
+
+// In some environments MSVC is a compiler but the CRT lacks certain MSVC-specific features
+#if defined(_MSC_VER) && !defined(__S3E__)
+#      define PUGI__MSVC_CRT_VERSION _MSC_VER
+#endif
+
+#ifdef PUGIXML_HEADER_ONLY
+#      define PUGI__NS_BEGIN namespace pugi { namespace impl {
+#      define PUGI__NS_END } }
+#      define PUGI__FN inline
+#      define PUGI__FN_NO_INLINE inline
+#else
+#      if defined(_MSC_VER) && _MSC_VER < 1300 // MSVC6 seems to have an amusing bug with anonymous namespaces inside namespaces
+#              define PUGI__NS_BEGIN namespace pugi { namespace impl {
+#              define PUGI__NS_END } }
+#      else
+#              define PUGI__NS_BEGIN namespace pugi { namespace impl { namespace {
+#              define PUGI__NS_END } } }
+#      endif
+#      define PUGI__FN
+#      define PUGI__FN_NO_INLINE PUGI__NO_INLINE
+#endif
+
+// uintptr_t
+#if !defined(_MSC_VER) || _MSC_VER >= 1600
+#      include <stdint.h>
+#else
+#      ifndef _UINTPTR_T_DEFINED
+// No native uintptr_t in MSVC6 and in some WinCE versions
+typedef size_t uintptr_t;
+#define _UINTPTR_T_DEFINED
+#      endif
+PUGI__NS_BEGIN
+       typedef unsigned __int8 uint8_t;
+       typedef unsigned __int16 uint16_t;
+       typedef unsigned __int32 uint32_t;
+PUGI__NS_END
+#endif
+
+// Memory allocation
+PUGI__NS_BEGIN
+       PUGI__FN void* default_allocate(size_t size)
+       {
+               return malloc(size);
+       }
+
+       PUGI__FN void default_deallocate(void* ptr)
+       {
+               free(ptr);
+       }
+
+       template <typename T>
+       struct xml_memory_management_function_storage
+       {
+               static allocation_function allocate;
+               static deallocation_function deallocate;
+       };
+
+       // Global allocation functions are stored in class statics so that in header mode linker deduplicates them
+       // Without a template<> we'll get multiple definitions of the same static
+       template <typename T> allocation_function xml_memory_management_function_storage<T>::allocate = default_allocate;
+       template <typename T> deallocation_function xml_memory_management_function_storage<T>::deallocate = default_deallocate;
+
+       typedef xml_memory_management_function_storage<int> xml_memory;
+PUGI__NS_END
+
+// String utilities
+PUGI__NS_BEGIN
+       // Get string length
+       PUGI__FN size_t strlength(const char_t* s)
+       {
+               assert(s);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return wcslen(s);
+       #else
+               return strlen(s);
+       #endif
+       }
+
+       // Compare two strings
+       PUGI__FN bool strequal(const char_t* src, const char_t* dst)
+       {
+               assert(src && dst);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return wcscmp(src, dst) == 0;
+       #else
+               return strcmp(src, dst) == 0;
+       #endif
+       }
+
+       // Compare lhs with [rhs_begin, rhs_end)
+       PUGI__FN bool strequalrange(const char_t* lhs, const char_t* rhs, size_t count)
+       {
+               for (size_t i = 0; i < count; ++i)
+                       if (lhs[i] != rhs[i])
+                               return false;
+       
+               return lhs[count] == 0;
+       }
+
+       // Get length of wide string, even if CRT lacks wide character support
+       PUGI__FN size_t strlength_wide(const wchar_t* s)
+       {
+               assert(s);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return wcslen(s);
+       #else
+               const wchar_t* end = s;
+               while (*end) end++;
+               return static_cast<size_t>(end - s);
+       #endif
+       }
+
+#ifdef PUGIXML_WCHAR_MODE
+       // Convert string to wide string, assuming all symbols are ASCII
+       PUGI__FN void widen_ascii(wchar_t* dest, const char* source)
+       {
+               for (const char* i = source; *i; ++i) *dest++ = *i;
+               *dest = 0;
+       }
+#endif
+PUGI__NS_END
+
+#if !defined(PUGIXML_NO_STL) || !defined(PUGIXML_NO_XPATH)
+// auto_ptr-like buffer holder for exception recovery
+PUGI__NS_BEGIN
+       struct buffer_holder
+       {
+               void* data;
+               void (*deleter)(void*);
+
+               buffer_holder(void* data_, void (*deleter_)(void*)): data(data_), deleter(deleter_)
+               {
+               }
+
+               ~buffer_holder()
+               {
+                       if (data) deleter(data);
+               }
+
+               void* release()
+               {
+                       void* result = data;
+                       data = 0;
+                       return result;
+               }
+       };
+PUGI__NS_END
+#endif
+
+PUGI__NS_BEGIN
+       static const size_t xml_memory_page_size =
+       #ifdef PUGIXML_MEMORY_PAGE_SIZE
+               PUGIXML_MEMORY_PAGE_SIZE
+       #else
+               32768
+       #endif
+               ;
+
+       static const uintptr_t xml_memory_page_alignment = 64;
+       static const uintptr_t xml_memory_page_pointer_mask = ~(xml_memory_page_alignment - 1);
+       static const uintptr_t xml_memory_page_contents_shared_mask = 32;
+       static const uintptr_t xml_memory_page_name_allocated_mask = 16;
+       static const uintptr_t xml_memory_page_value_allocated_mask = 8;
+       static const uintptr_t xml_memory_page_type_mask = 7;
+       static const uintptr_t xml_memory_page_name_allocated_or_shared_mask = xml_memory_page_name_allocated_mask | xml_memory_page_contents_shared_mask;
+       static const uintptr_t xml_memory_page_value_allocated_or_shared_mask = xml_memory_page_value_allocated_mask | xml_memory_page_contents_shared_mask;
+
+       #define PUGI__NODETYPE(n) static_cast<xml_node_type>(((n)->header & impl::xml_memory_page_type_mask) + 1)
+
+       struct xml_allocator;
+
+       struct xml_memory_page
+       {
+               static xml_memory_page* construct(void* memory)
+               {
+                       xml_memory_page* result = static_cast<xml_memory_page*>(memory);
+
+                       result->allocator = 0;
+                       result->prev = 0;
+                       result->next = 0;
+                       result->busy_size = 0;
+                       result->freed_size = 0;
+
+                       return result;
+               }
+
+               xml_allocator* allocator;
+
+               xml_memory_page* prev;
+               xml_memory_page* next;
+
+               size_t busy_size;
+               size_t freed_size;
+       };
+
+       struct xml_memory_string_header
+       {
+               uint16_t page_offset; // offset from page->data
+               uint16_t full_size; // 0 if string occupies whole page
+       };
+
+       struct xml_allocator
+       {
+               xml_allocator(xml_memory_page* root): _root(root), _busy_size(root->busy_size)
+               {
+               }
+
+               xml_memory_page* allocate_page(size_t data_size)
+               {
+                       size_t size = sizeof(xml_memory_page) + data_size;
+
+                       // allocate block with some alignment, leaving memory for worst-case padding
+                       void* memory = xml_memory::allocate(size + xml_memory_page_alignment);
+                       if (!memory) return 0;
+
+                       // align to next page boundary (note: this guarantees at least 1 usable byte before the page)
+                       char* page_memory = reinterpret_cast<char*>((reinterpret_cast<uintptr_t>(memory) + xml_memory_page_alignment) & ~(xml_memory_page_alignment - 1));
+
+                       // prepare page structure
+                       xml_memory_page* page = xml_memory_page::construct(page_memory);
+                       assert(page);
+
+                       page->allocator = _root->allocator;
+
+                       // record the offset for freeing the memory block
+                       assert(page_memory > memory && page_memory - static_cast<char*>(memory) <= 127);
+                       page_memory[-1] = static_cast<char>(page_memory - static_cast<char*>(memory));
+
+                       return page;
+               }
+
+               static void deallocate_page(xml_memory_page* page)
+               {
+                       char* page_memory = reinterpret_cast<char*>(page);
+
+                       xml_memory::deallocate(page_memory - page_memory[-1]);
+               }
+
+               void* allocate_memory_oob(size_t size, xml_memory_page*& out_page);
+
+               void* allocate_memory(size_t size, xml_memory_page*& out_page)
+               {
+                       if (_busy_size + size > xml_memory_page_size) return allocate_memory_oob(size, out_page);
+
+                       void* buf = reinterpret_cast<char*>(_root) + sizeof(xml_memory_page) + _busy_size;
+
+                       _busy_size += size;
+
+                       out_page = _root;
+
+                       return buf;
+               }
+
+               void deallocate_memory(void* ptr, size_t size, xml_memory_page* page)
+               {
+                       if (page == _root) page->busy_size = _busy_size;
+
+                       assert(ptr >= reinterpret_cast<char*>(page) + sizeof(xml_memory_page) && ptr < reinterpret_cast<char*>(page) + sizeof(xml_memory_page) + page->busy_size);
+                       (void)!ptr;
+
+                       page->freed_size += size;
+                       assert(page->freed_size <= page->busy_size);
+
+                       if (page->freed_size == page->busy_size)
+                       {
+                               if (page->next == 0)
+                               {
+                                       assert(_root == page);
+
+                                       // top page freed, just reset sizes
+                                       page->busy_size = page->freed_size = 0;
+                                       _busy_size = 0;
+                               }
+                               else
+                               {
+                                       assert(_root != page);
+                                       assert(page->prev);
+
+                                       // remove from the list
+                                       page->prev->next = page->next;
+                                       page->next->prev = page->prev;
+
+                                       // deallocate
+                                       deallocate_page(page);
+                               }
+                       }
+               }
+
+               char_t* allocate_string(size_t length)
+               {
+                       PUGI__STATIC_ASSERT(xml_memory_page_size <= (1 << 16));
+
+                       // allocate memory for string and header block
+                       size_t size = sizeof(xml_memory_string_header) + length * sizeof(char_t);
+                       
+                       // round size up to pointer alignment boundary
+                       size_t full_size = (size + (sizeof(void*) - 1)) & ~(sizeof(void*) - 1);
+
+                       xml_memory_page* page;
+                       xml_memory_string_header* header = static_cast<xml_memory_string_header*>(allocate_memory(full_size, page));
+
+                       if (!header) return 0;
+
+                       // setup header
+                       ptrdiff_t page_offset = reinterpret_cast<char*>(header) - reinterpret_cast<char*>(page) - sizeof(xml_memory_page);
+
+                       assert(page_offset >= 0 && page_offset < (1 << 16));
+                       header->page_offset = static_cast<uint16_t>(page_offset);
+
+                       // full_size == 0 for large strings that occupy the whole page
+                       assert(full_size < (1 << 16) || (page->busy_size == full_size && page_offset == 0));
+                       header->full_size = static_cast<uint16_t>(full_size < (1 << 16) ? full_size : 0);
+
+                       // round-trip through void* to avoid 'cast increases required alignment of target type' warning
+                       // header is guaranteed a pointer-sized alignment, which should be enough for char_t
+                       return static_cast<char_t*>(static_cast<void*>(header + 1));
+               }
+
+               void deallocate_string(char_t* string)
+               {
+                       // this function casts pointers through void* to avoid 'cast increases required alignment of target type' warnings
+                       // we're guaranteed the proper (pointer-sized) alignment on the input string if it was allocated via allocate_string
+
+                       // get header
+                       xml_memory_string_header* header = static_cast<xml_memory_string_header*>(static_cast<void*>(string)) - 1;
+                       assert(header);
+
+                       // deallocate
+                       size_t page_offset = sizeof(xml_memory_page) + header->page_offset;
+                       xml_memory_page* page = reinterpret_cast<xml_memory_page*>(static_cast<void*>(reinterpret_cast<char*>(header) - page_offset));
+
+                       // if full_size == 0 then this string occupies the whole page
+                       size_t full_size = header->full_size == 0 ? page->busy_size : header->full_size;
+
+                       deallocate_memory(header, full_size, page);
+               }
+
+               xml_memory_page* _root;
+               size_t _busy_size;
+       };
+
+       PUGI__FN_NO_INLINE void* xml_allocator::allocate_memory_oob(size_t size, xml_memory_page*& out_page)
+       {
+               const size_t large_allocation_threshold = xml_memory_page_size / 4;
+
+               xml_memory_page* page = allocate_page(size <= large_allocation_threshold ? xml_memory_page_size : size);
+               out_page = page;
+
+               if (!page) return 0;
+
+               if (size <= large_allocation_threshold)
+               {
+                       _root->busy_size = _busy_size;
+
+                       // insert page at the end of linked list
+                       page->prev = _root;
+                       _root->next = page;
+                       _root = page;
+
+                       _busy_size = size;
+               }
+               else
+               {
+                       // insert page before the end of linked list, so that it is deleted as soon as possible
+                       // the last page is not deleted even if it's empty (see deallocate_memory)
+                       assert(_root->prev);
+
+                       page->prev = _root->prev;
+                       page->next = _root;
+
+                       _root->prev->next = page;
+                       _root->prev = page;
+               }
+
+               // allocate inside page
+               page->busy_size = size;
+
+               return reinterpret_cast<char*>(page) + sizeof(xml_memory_page);
+       }
+PUGI__NS_END
+
+namespace pugi
+{
+       /// A 'name=value' XML attribute structure.
+       struct xml_attribute_struct
+       {
+               /// Default ctor
+               xml_attribute_struct(impl::xml_memory_page* page): header(reinterpret_cast<uintptr_t>(page)), name(0), value(0), prev_attribute_c(0), next_attribute(0)
+               {
+               }
+
+               uintptr_t header;
+
+               char_t* name;   ///< Pointer to attribute name.
+               char_t* value;  ///< Pointer to attribute value.
+
+               xml_attribute_struct* prev_attribute_c; ///< Previous attribute (cyclic list)
+               xml_attribute_struct* next_attribute;   ///< Next attribute
+       };
+
+       /// An XML document tree node.
+       struct xml_node_struct
+       {
+               /// Default ctor
+               /// \param type - node type
+               xml_node_struct(impl::xml_memory_page* page, xml_node_type type): header(reinterpret_cast<uintptr_t>(page) | (type - 1)), parent(0), name(0), value(0), first_child(0), prev_sibling_c(0), next_sibling(0), first_attribute(0)
+               {
+               }
+
+               uintptr_t header;
+
+               xml_node_struct*                parent;                                 ///< Pointer to parent
+
+               char_t*                                 name;                                   ///< Pointer to element name.
+               char_t*                                 value;                                  ///< Pointer to any associated string data.
+
+               xml_node_struct*                first_child;                    ///< First child
+               
+               xml_node_struct*                prev_sibling_c;                 ///< Left brother (cyclic list)
+               xml_node_struct*                next_sibling;                   ///< Right brother
+               
+               xml_attribute_struct*   first_attribute;                ///< First attribute
+       };
+}
+
+PUGI__NS_BEGIN
+       struct xml_extra_buffer
+       {
+               char_t* buffer;
+               xml_extra_buffer* next;
+       };
+
+       struct xml_document_struct: public xml_node_struct, public xml_allocator
+       {
+               xml_document_struct(xml_memory_page* page): xml_node_struct(page, node_document), xml_allocator(page), buffer(0), extra_buffers(0)
+               {
+               }
+
+               const char_t* buffer;
+
+               xml_extra_buffer* extra_buffers;
+       };
+
+       inline xml_allocator& get_allocator(const xml_node_struct* node)
+       {
+               assert(node);
+
+               return *reinterpret_cast<xml_memory_page*>(node->header & xml_memory_page_pointer_mask)->allocator;
+       }
+
+       template <typename Object> inline xml_document_struct& get_document(const Object* object)
+       {
+               assert(object);
+
+               return *static_cast<xml_document_struct*>(reinterpret_cast<xml_memory_page*>(object->header & xml_memory_page_pointer_mask)->allocator);
+       }
+PUGI__NS_END
+
+// Low-level DOM operations
+PUGI__NS_BEGIN
+       inline xml_attribute_struct* allocate_attribute(xml_allocator& alloc)
+       {
+               xml_memory_page* page;
+               void* memory = alloc.allocate_memory(sizeof(xml_attribute_struct), page);
+
+               return new (memory) xml_attribute_struct(page);
+       }
+
+       inline xml_node_struct* allocate_node(xml_allocator& alloc, xml_node_type type)
+       {
+               xml_memory_page* page;
+               void* memory = alloc.allocate_memory(sizeof(xml_node_struct), page);
+
+               return new (memory) xml_node_struct(page, type);
+       }
+
+       inline void destroy_attribute(xml_attribute_struct* a, xml_allocator& alloc)
+       {
+               uintptr_t header = a->header;
+
+               if (header & impl::xml_memory_page_name_allocated_mask) alloc.deallocate_string(a->name);
+               if (header & impl::xml_memory_page_value_allocated_mask) alloc.deallocate_string(a->value);
+
+               alloc.deallocate_memory(a, sizeof(xml_attribute_struct), reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask));
+       }
+
+       inline void destroy_node(xml_node_struct* n, xml_allocator& alloc)
+       {
+               uintptr_t header = n->header;
+
+               if (header & impl::xml_memory_page_name_allocated_mask) alloc.deallocate_string(n->name);
+               if (header & impl::xml_memory_page_value_allocated_mask) alloc.deallocate_string(n->value);
+
+               for (xml_attribute_struct* attr = n->first_attribute; attr; )
+               {
+                       xml_attribute_struct* next = attr->next_attribute;
+
+                       destroy_attribute(attr, alloc);
+
+                       attr = next;
+               }
+
+               for (xml_node_struct* child = n->first_child; child; )
+               {
+                       xml_node_struct* next = child->next_sibling;
+
+                       destroy_node(child, alloc);
+
+                       child = next;
+               }
+
+               alloc.deallocate_memory(n, sizeof(xml_node_struct), reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask));
+       }
+
+       inline void append_node(xml_node_struct* child, xml_node_struct* node)
+       {
+               child->parent = node;
+
+               xml_node_struct* head = node->first_child;
+
+               if (head)
+               {
+                       xml_node_struct* tail = head->prev_sibling_c;
+
+                       tail->next_sibling = child;
+                       child->prev_sibling_c = tail;
+                       head->prev_sibling_c = child;
+               }
+               else
+               {
+                       node->first_child = child;
+                       child->prev_sibling_c = child;
+               }
+       }
+
+       inline void prepend_node(xml_node_struct* child, xml_node_struct* node)
+       {
+               child->parent = node;
+
+               xml_node_struct* head = node->first_child;
+
+               if (head)
+               {
+                       child->prev_sibling_c = head->prev_sibling_c;
+                       head->prev_sibling_c = child;
+               }
+               else
+                       child->prev_sibling_c = child;
+
+               child->next_sibling = head;
+               node->first_child = child;
+       }
+
+       inline void insert_node_after(xml_node_struct* child, xml_node_struct* node)
+       {
+               xml_node_struct* parent = node->parent;
+
+               child->parent = parent;
+
+               if (node->next_sibling)
+                       node->next_sibling->prev_sibling_c = child;
+               else
+                       parent->first_child->prev_sibling_c = child;
+
+               child->next_sibling = node->next_sibling;
+               child->prev_sibling_c = node;
+
+               node->next_sibling = child;
+       }
+
+       inline void insert_node_before(xml_node_struct* child, xml_node_struct* node)
+       {
+               xml_node_struct* parent = node->parent;
+
+               child->parent = parent;
+
+               if (node->prev_sibling_c->next_sibling)
+                       node->prev_sibling_c->next_sibling = child;
+               else
+                       parent->first_child = child;
+
+               child->prev_sibling_c = node->prev_sibling_c;
+               child->next_sibling = node;
+
+               node->prev_sibling_c = child;
+       }
+
+       inline void remove_node(xml_node_struct* node)
+       {
+               xml_node_struct* parent = node->parent;
+
+               if (node->next_sibling)
+                       node->next_sibling->prev_sibling_c = node->prev_sibling_c;
+               else
+                       parent->first_child->prev_sibling_c = node->prev_sibling_c;
+
+               if (node->prev_sibling_c->next_sibling)
+                       node->prev_sibling_c->next_sibling = node->next_sibling;
+               else
+                       parent->first_child = node->next_sibling;
+
+               node->parent = 0;
+               node->prev_sibling_c = 0;
+               node->next_sibling = 0;
+       }
+
+       inline void append_attribute(xml_attribute_struct* attr, xml_node_struct* node)
+       {
+               xml_attribute_struct* head = node->first_attribute;
+
+               if (head)
+               {
+                       xml_attribute_struct* tail = head->prev_attribute_c;
+
+                       tail->next_attribute = attr;
+                       attr->prev_attribute_c = tail;
+                       head->prev_attribute_c = attr;
+               }
+               else
+               {
+                       node->first_attribute = attr;
+                       attr->prev_attribute_c = attr;
+               }
+       }
+
+       inline void prepend_attribute(xml_attribute_struct* attr, xml_node_struct* node)
+       {
+               xml_attribute_struct* head = node->first_attribute;
+
+               if (head)
+               {
+                       attr->prev_attribute_c = head->prev_attribute_c;
+                       head->prev_attribute_c = attr;
+               }
+               else
+                       attr->prev_attribute_c = attr;
+
+               attr->next_attribute = head;
+               node->first_attribute = attr;
+       }
+
+       inline void insert_attribute_after(xml_attribute_struct* attr, xml_attribute_struct* place, xml_node_struct* node)
+       {
+               if (place->next_attribute)
+                       place->next_attribute->prev_attribute_c = attr;
+               else
+                       node->first_attribute->prev_attribute_c = attr;
+
+               attr->next_attribute = place->next_attribute;
+               attr->prev_attribute_c = place;
+               place->next_attribute = attr;
+       }
+
+       inline void insert_attribute_before(xml_attribute_struct* attr, xml_attribute_struct* place, xml_node_struct* node)
+       {
+               if (place->prev_attribute_c->next_attribute)
+                       place->prev_attribute_c->next_attribute = attr;
+               else
+                       node->first_attribute = attr;
+
+               attr->prev_attribute_c = place->prev_attribute_c;
+               attr->next_attribute = place;
+               place->prev_attribute_c = attr;
+       }
+
+       inline void remove_attribute(xml_attribute_struct* attr, xml_node_struct* node)
+       {
+               if (attr->next_attribute)
+                       attr->next_attribute->prev_attribute_c = attr->prev_attribute_c;
+               else
+                       node->first_attribute->prev_attribute_c = attr->prev_attribute_c;
+
+               if (attr->prev_attribute_c->next_attribute)
+                       attr->prev_attribute_c->next_attribute = attr->next_attribute;
+               else
+                       node->first_attribute = attr->next_attribute;
+
+               attr->prev_attribute_c = 0;
+               attr->next_attribute = 0;
+       }
+
+       PUGI__FN_NO_INLINE xml_node_struct* append_new_node(xml_node_struct* node, xml_allocator& alloc, xml_node_type type = node_element)
+       {
+               xml_node_struct* child = allocate_node(alloc, type);
+               if (!child) return 0;
+
+               append_node(child, node);
+
+               return child;
+       }
+
+       PUGI__FN_NO_INLINE xml_attribute_struct* append_new_attribute(xml_node_struct* node, xml_allocator& alloc)
+       {
+               xml_attribute_struct* attr = allocate_attribute(alloc);
+               if (!attr) return 0;
+
+               append_attribute(attr, node);
+
+               return attr;
+       }
+PUGI__NS_END
+
+// Helper classes for code generation
+PUGI__NS_BEGIN
+       struct opt_false
+       {
+               enum { value = 0 };
+       };
+
+       struct opt_true
+       {
+               enum { value = 1 };
+       };
+PUGI__NS_END
+
+// Unicode utilities
+PUGI__NS_BEGIN
+       inline uint16_t endian_swap(uint16_t value)
+       {
+               return static_cast<uint16_t>(((value & 0xff) << 8) | (value >> 8));
+       }
+
+       inline uint32_t endian_swap(uint32_t value)
+       {
+               return ((value & 0xff) << 24) | ((value & 0xff00) << 8) | ((value & 0xff0000) >> 8) | (value >> 24);
+       }
+
+       struct utf8_counter
+       {
+               typedef size_t value_type;
+
+               static value_type low(value_type result, uint32_t ch)
+               {
+                       // U+0000..U+007F
+                       if (ch < 0x80) return result + 1;
+                       // U+0080..U+07FF
+                       else if (ch < 0x800) return result + 2;
+                       // U+0800..U+FFFF
+                       else return result + 3;
+               }
+
+               static value_type high(value_type result, uint32_t)
+               {
+                       // U+10000..U+10FFFF
+                       return result + 4;
+               }
+       };
+
+       struct utf8_writer
+       {
+               typedef uint8_t* value_type;
+
+               static value_type low(value_type result, uint32_t ch)
+               {
+                       // U+0000..U+007F
+                       if (ch < 0x80)
+                       {
+                               *result = static_cast<uint8_t>(ch);
+                               return result + 1;
+                       }
+                       // U+0080..U+07FF
+                       else if (ch < 0x800)
+                       {
+                               result[0] = static_cast<uint8_t>(0xC0 | (ch >> 6));
+                               result[1] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+                               return result + 2;
+                       }
+                       // U+0800..U+FFFF
+                       else
+                       {
+                               result[0] = static_cast<uint8_t>(0xE0 | (ch >> 12));
+                               result[1] = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
+                               result[2] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+                               return result + 3;
+                       }
+               }
+
+               static value_type high(value_type result, uint32_t ch)
+               {
+                       // U+10000..U+10FFFF
+                       result[0] = static_cast<uint8_t>(0xF0 | (ch >> 18));
+                       result[1] = static_cast<uint8_t>(0x80 | ((ch >> 12) & 0x3F));
+                       result[2] = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
+                       result[3] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+                       return result + 4;
+               }
+
+               static value_type any(value_type result, uint32_t ch)
+               {
+                       return (ch < 0x10000) ? low(result, ch) : high(result, ch);
+               }
+       };
+
+       struct utf16_counter
+       {
+               typedef size_t value_type;
+
+               static value_type low(value_type result, uint32_t)
+               {
+                       return result + 1;
+               }
+
+               static value_type high(value_type result, uint32_t)
+               {
+                       return result + 2;
+               }
+       };
+
+       struct utf16_writer
+       {
+               typedef uint16_t* value_type;
+
+               static value_type low(value_type result, uint32_t ch)
+               {
+                       *result = static_cast<uint16_t>(ch);
+
+                       return result + 1;
+               }
+
+               static value_type high(value_type result, uint32_t ch)
+               {
+                       uint32_t msh = static_cast<uint32_t>(ch - 0x10000) >> 10;
+                       uint32_t lsh = static_cast<uint32_t>(ch - 0x10000) & 0x3ff;
+
+                       result[0] = static_cast<uint16_t>(0xD800 + msh);
+                       result[1] = static_cast<uint16_t>(0xDC00 + lsh);
+
+                       return result + 2;
+               }
+
+               static value_type any(value_type result, uint32_t ch)
+               {
+                       return (ch < 0x10000) ? low(result, ch) : high(result, ch);
+               }
+       };
+
+       struct utf32_counter
+       {
+               typedef size_t value_type;
+
+               static value_type low(value_type result, uint32_t)
+               {
+                       return result + 1;
+               }
+
+               static value_type high(value_type result, uint32_t)
+               {
+                       return result + 1;
+               }
+       };
+
+       struct utf32_writer
+       {
+               typedef uint32_t* value_type;
+
+               static value_type low(value_type result, uint32_t ch)
+               {
+                       *result = ch;
+
+                       return result + 1;
+               }
+
+               static value_type high(value_type result, uint32_t ch)
+               {
+                       *result = ch;
+
+                       return result + 1;
+               }
+
+               static value_type any(value_type result, uint32_t ch)
+               {
+                       *result = ch;
+
+                       return result + 1;
+               }
+       };
+
+       struct latin1_writer
+       {
+               typedef uint8_t* value_type;
+
+               static value_type low(value_type result, uint32_t ch)
+               {
+                       *result = static_cast<uint8_t>(ch > 255 ? '?' : ch);
+
+                       return result + 1;
+               }
+
+               static value_type high(value_type result, uint32_t ch)
+               {
+                       (void)ch;
+
+                       *result = '?';
+
+                       return result + 1;
+               }
+       };
+
+       template <size_t size> struct wchar_selector;
+
+       template <> struct wchar_selector<2>
+       {
+               typedef uint16_t type;
+               typedef utf16_counter counter;
+               typedef utf16_writer writer;
+       };
+
+       template <> struct wchar_selector<4>
+       {
+               typedef uint32_t type;
+               typedef utf32_counter counter;
+               typedef utf32_writer writer;
+       };
+
+       typedef wchar_selector<sizeof(wchar_t)>::counter wchar_counter;
+       typedef wchar_selector<sizeof(wchar_t)>::writer wchar_writer;
+
+       template <typename Traits, typename opt_swap = opt_false> struct utf_decoder
+       {
+               static inline typename Traits::value_type decode_utf8_block(const uint8_t* data, size_t size, typename Traits::value_type result)
+               {
+                       const uint8_t utf8_byte_mask = 0x3f;
+
+                       while (size)
+                       {
+                               uint8_t lead = *data;
+
+                               // 0xxxxxxx -> U+0000..U+007F
+                               if (lead < 0x80)
+                               {
+                                       result = Traits::low(result, lead);
+                                       data += 1;
+                                       size -= 1;
+
+                                       // process aligned single-byte (ascii) blocks
+                                       if ((reinterpret_cast<uintptr_t>(data) & 3) == 0)
+                                       {
+                                               // round-trip through void* to silence 'cast increases required alignment of target type' warnings
+                                               while (size >= 4 && (*static_cast<const uint32_t*>(static_cast<const void*>(data)) & 0x80808080) == 0)
+                                               {
+                                                       result = Traits::low(result, data[0]);
+                                                       result = Traits::low(result, data[1]);
+                                                       result = Traits::low(result, data[2]);
+                                                       result = Traits::low(result, data[3]);
+                                                       data += 4;
+                                                       size -= 4;
+                                               }
+                                       }
+                               }
+                               // 110xxxxx -> U+0080..U+07FF
+                               else if (static_cast<unsigned int>(lead - 0xC0) < 0x20 && size >= 2 && (data[1] & 0xc0) == 0x80)
+                               {
+                                       result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask));
+                                       data += 2;
+                                       size -= 2;
+                               }
+                               // 1110xxxx -> U+0800-U+FFFF
+                               else if (static_cast<unsigned int>(lead - 0xE0) < 0x10 && size >= 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80)
+                               {
+                                       result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask));
+                                       data += 3;
+                                       size -= 3;
+                               }
+                               // 11110xxx -> U+10000..U+10FFFF
+                               else if (static_cast<unsigned int>(lead - 0xF0) < 0x08 && size >= 4 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80)
+                               {
+                                       result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask));
+                                       data += 4;
+                                       size -= 4;
+                               }
+                               // 10xxxxxx or 11111xxx -> invalid
+                               else
+                               {
+                                       data += 1;
+                                       size -= 1;
+                               }
+                       }
+
+                       return result;
+               }
+
+               static inline typename Traits::value_type decode_utf16_block(const uint16_t* data, size_t size, typename Traits::value_type result)
+               {
+                       const uint16_t* end = data + size;
+
+                       while (data < end)
+                       {
+                               unsigned int lead = opt_swap::value ? endian_swap(*data) : *data;
+
+                               // U+0000..U+D7FF
+                               if (lead < 0xD800)
+                               {
+                                       result = Traits::low(result, lead);
+                                       data += 1;
+                               }
+                               // U+E000..U+FFFF
+                               else if (static_cast<unsigned int>(lead - 0xE000) < 0x2000)
+                               {
+                                       result = Traits::low(result, lead);
+                                       data += 1;
+                               }
+                               // surrogate pair lead
+                               else if (static_cast<unsigned int>(lead - 0xD800) < 0x400 && data + 1 < end)
+                               {
+                                       uint16_t next = opt_swap::value ? endian_swap(data[1]) : data[1];
+
+                                       if (static_cast<unsigned int>(next - 0xDC00) < 0x400)
+                                       {
+                                               result = Traits::high(result, 0x10000 + ((lead & 0x3ff) << 10) + (next & 0x3ff));
+                                               data += 2;
+                                       }
+                                       else
+                                       {
+                                               data += 1;
+                                       }
+                               }
+                               else
+                               {
+                                       data += 1;
+                               }
+                       }
+
+                       return result;
+               }
+
+               static inline typename Traits::value_type decode_utf32_block(const uint32_t* data, size_t size, typename Traits::value_type result)
+               {
+                       const uint32_t* end = data + size;
+
+                       while (data < end)
+                       {
+                               uint32_t lead = opt_swap::value ? endian_swap(*data) : *data;
+
+                               // U+0000..U+FFFF
+                               if (lead < 0x10000)
+                               {
+                                       result = Traits::low(result, lead);
+                                       data += 1;
+                               }
+                               // U+10000..U+10FFFF
+                               else
+                               {
+                                       result = Traits::high(result, lead);
+                                       data += 1;
+                               }
+                       }
+
+                       return result;
+               }
+
+               static inline typename Traits::value_type decode_latin1_block(const uint8_t* data, size_t size, typename Traits::value_type result)
+               {
+                       for (size_t i = 0; i < size; ++i)
+                       {
+                               result = Traits::low(result, data[i]);
+                       }
+
+                       return result;
+               }
+
+               static inline typename Traits::value_type decode_wchar_block_impl(const uint16_t* data, size_t size, typename Traits::value_type result)
+               {
+                       return decode_utf16_block(data, size, result);
+               }
+
+               static inline typename Traits::value_type decode_wchar_block_impl(const uint32_t* data, size_t size, typename Traits::value_type result)
+               {
+                       return decode_utf32_block(data, size, result);
+               }
+
+               static inline typename Traits::value_type decode_wchar_block(const wchar_t* data, size_t size, typename Traits::value_type result)
+               {
+                       return decode_wchar_block_impl(reinterpret_cast<const wchar_selector<sizeof(wchar_t)>::type*>(data), size, result);
+               }
+       };
+
+       template <typename T> PUGI__FN void convert_utf_endian_swap(T* result, const T* data, size_t length)
+       {
+               for (size_t i = 0; i < length; ++i) result[i] = endian_swap(data[i]);
+       }
+
+#ifdef PUGIXML_WCHAR_MODE
+       PUGI__FN void convert_wchar_endian_swap(wchar_t* result, const wchar_t* data, size_t length)
+       {
+               for (size_t i = 0; i < length; ++i) result[i] = static_cast<wchar_t>(endian_swap(static_cast<wchar_selector<sizeof(wchar_t)>::type>(data[i])));
+       }
+#endif
+PUGI__NS_END
+
+PUGI__NS_BEGIN
+       enum chartype_t
+       {
+               ct_parse_pcdata = 1,    // \0, &, \r, <
+               ct_parse_attr = 2,              // \0, &, \r, ', "
+               ct_parse_attr_ws = 4,   // \0, &, \r, ', ", \n, tab
+               ct_space = 8,                   // \r, \n, space, tab
+               ct_parse_cdata = 16,    // \0, ], >, \r
+               ct_parse_comment = 32,  // \0, -, >, \r
+               ct_symbol = 64,                 // Any symbol > 127, a-z, A-Z, 0-9, _, :, -, .
+               ct_start_symbol = 128   // Any symbol > 127, a-z, A-Z, _, :
+       };
+
+       static const unsigned char chartype_table[256] =
+       {
+               55,  0,   0,   0,   0,   0,   0,   0,      0,   12,  12,  0,   0,   63,  0,   0,   // 0-15
+               0,   0,   0,   0,   0,   0,   0,   0,      0,   0,   0,   0,   0,   0,   0,   0,   // 16-31
+               8,   0,   6,   0,   0,   0,   7,   6,      0,   0,   0,   0,   0,   96,  64,  0,   // 32-47
+               64,  64,  64,  64,  64,  64,  64,  64,     64,  64,  192, 0,   1,   0,   48,  0,   // 48-63
+               0,   192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 64-79
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 0,   0,   16,  0,   192, // 80-95
+               0,   192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 96-111
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 0, 0, 0, 0, 0,           // 112-127
+
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 128+
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192
+       };
+
+       enum chartypex_t
+       {
+               ctx_special_pcdata = 1,   // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, >
+               ctx_special_attr = 2,     // Any symbol >= 0 and < 32 (except \t), &, <, >, "
+               ctx_start_symbol = 4,     // Any symbol > 127, a-z, A-Z, _
+               ctx_digit = 8,                    // 0-9
+               ctx_symbol = 16                   // Any symbol > 127, a-z, A-Z, 0-9, _, -, .
+       };
+       
+       static const unsigned char chartypex_table[256] =
+       {
+               3,  3,  3,  3,  3,  3,  3,  3,     3,  0,  2,  3,  3,  2,  3,  3,     // 0-15
+               3,  3,  3,  3,  3,  3,  3,  3,     3,  3,  3,  3,  3,  3,  3,  3,     // 16-31
+               0,  0,  2,  0,  0,  0,  3,  0,     0,  0,  0,  0,  0, 16, 16,  0,     // 32-47
+               24, 24, 24, 24, 24, 24, 24, 24,    24, 24, 0,  0,  3,  0,  3,  0,     // 48-63
+
+               0,  20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 64-79
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 0,  0,  0,  0,  20,    // 80-95
+               0,  20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 96-111
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 0,  0,  0,  0,  0,     // 112-127
+
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 128+
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20
+       };
+       
+#ifdef PUGIXML_WCHAR_MODE
+       #define PUGI__IS_CHARTYPE_IMPL(c, ct, table) ((static_cast<unsigned int>(c) < 128 ? table[static_cast<unsigned int>(c)] : table[128]) & (ct))
+#else
+       #define PUGI__IS_CHARTYPE_IMPL(c, ct, table) (table[static_cast<unsigned char>(c)] & (ct))
+#endif
+
+       #define PUGI__IS_CHARTYPE(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartype_table)
+       #define PUGI__IS_CHARTYPEX(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartypex_table)
+
+       PUGI__FN bool is_little_endian()
+       {
+               unsigned int ui = 1;
+
+               return *reinterpret_cast<unsigned char*>(&ui) == 1;
+       }
+
+       PUGI__FN xml_encoding get_wchar_encoding()
+       {
+               PUGI__STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4);
+
+               if (sizeof(wchar_t) == 2)
+                       return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+               else 
+                       return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+       }
+
+       PUGI__FN xml_encoding guess_buffer_encoding(uint8_t d0, uint8_t d1, uint8_t d2, uint8_t d3)
+       {
+               // look for BOM in first few bytes
+               if (d0 == 0 && d1 == 0 && d2 == 0xfe && d3 == 0xff) return encoding_utf32_be;
+               if (d0 == 0xff && d1 == 0xfe && d2 == 0 && d3 == 0) return encoding_utf32_le;
+               if (d0 == 0xfe && d1 == 0xff) return encoding_utf16_be;
+               if (d0 == 0xff && d1 == 0xfe) return encoding_utf16_le;
+               if (d0 == 0xef && d1 == 0xbb && d2 == 0xbf) return encoding_utf8;
+
+               // look for <, <? or <?xm in various encodings
+               if (d0 == 0 && d1 == 0 && d2 == 0 && d3 == 0x3c) return encoding_utf32_be;
+               if (d0 == 0x3c && d1 == 0 && d2 == 0 && d3 == 0) return encoding_utf32_le;
+               if (d0 == 0 && d1 == 0x3c && d2 == 0 && d3 == 0x3f) return encoding_utf16_be;
+               if (d0 == 0x3c && d1 == 0 && d2 == 0x3f && d3 == 0) return encoding_utf16_le;
+               if (d0 == 0x3c && d1 == 0x3f && d2 == 0x78 && d3 == 0x6d) return encoding_utf8;
+
+               // look for utf16 < followed by node name (this may fail, but is better than utf8 since it's zero terminated so early)
+               if (d0 == 0 && d1 == 0x3c) return encoding_utf16_be;
+               if (d0 == 0x3c && d1 == 0) return encoding_utf16_le;
+
+               // no known BOM detected, assume utf8
+               return encoding_utf8;
+       }
+
+       PUGI__FN xml_encoding get_buffer_encoding(xml_encoding encoding, const void* contents, size_t size)
+       {
+               // replace wchar encoding with utf implementation
+               if (encoding == encoding_wchar) return get_wchar_encoding();
+
+               // replace utf16 encoding with utf16 with specific endianness
+               if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+               // replace utf32 encoding with utf32 with specific endianness
+               if (encoding == encoding_utf32) return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+               // only do autodetection if no explicit encoding is requested
+               if (encoding != encoding_auto) return encoding;
+
+               // skip encoding autodetection if input buffer is too small
+               if (size < 4) return encoding_utf8;
+
+               // try to guess encoding (based on XML specification, Appendix F.1)
+               const uint8_t* data = static_cast<const uint8_t*>(contents);
+
+               PUGI__DMC_VOLATILE uint8_t d0 = data[0], d1 = data[1], d2 = data[2], d3 = data[3];
+
+               return guess_buffer_encoding(d0, d1, d2, d3);
+       }
+
+       PUGI__FN bool get_mutable_buffer(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
+       {
+               size_t length = size / sizeof(char_t);
+
+               if (is_mutable)
+               {
+                       out_buffer = static_cast<char_t*>(const_cast<void*>(contents));
+                       out_length = length;
+               }
+               else
+               {
+                       char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+                       if (!buffer) return false;
+
+                       if (contents)
+                               memcpy(buffer, contents, length * sizeof(char_t));
+                       else
+                               assert(length == 0);
+
+                       buffer[length] = 0;
+
+                       out_buffer = buffer;
+                       out_length = length + 1;
+               }
+
+               return true;
+       }
+
+#ifdef PUGIXML_WCHAR_MODE
+       PUGI__FN bool need_endian_swap_utf(xml_encoding le, xml_encoding re)
+       {
+               return (le == encoding_utf16_be && re == encoding_utf16_le) || (le == encoding_utf16_le && re == encoding_utf16_be) ||
+                          (le == encoding_utf32_be && re == encoding_utf32_le) || (le == encoding_utf32_le && re == encoding_utf32_be);
+       }
+
+       PUGI__FN bool convert_buffer_endian_swap(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
+       {
+               const char_t* data = static_cast<const char_t*>(contents);
+               size_t length = size / sizeof(char_t);
+
+               if (is_mutable)
+               {
+                       char_t* buffer = const_cast<char_t*>(data);
+
+                       convert_wchar_endian_swap(buffer, data, length);
+
+                       out_buffer = buffer;
+                       out_length = length;
+               }
+               else
+               {
+                       char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+                       if (!buffer) return false;
+
+                       convert_wchar_endian_swap(buffer, data, length);
+                       buffer[length] = 0;
+
+                       out_buffer = buffer;
+                       out_length = length + 1;
+               }
+
+               return true;
+       }
+
+       PUGI__FN bool convert_buffer_utf8(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size)
+       {
+               const uint8_t* data = static_cast<const uint8_t*>(contents);
+               size_t data_length = size;
+
+               // first pass: get length in wchar_t units
+               size_t length = utf_decoder<wchar_counter>::decode_utf8_block(data, data_length, 0);
+
+               // allocate buffer of suitable length
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false;
+
+               // second pass: convert utf8 input to wchar_t
+               wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer);
+               wchar_writer::value_type oend = utf_decoder<wchar_writer>::decode_utf8_block(data, data_length, obegin);
+
+               assert(oend == obegin + length);
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1;
+
+               return true;
+       }
+
+       template <typename opt_swap> PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+       {
+               const uint16_t* data = static_cast<const uint16_t*>(contents);
+               size_t data_length = size / sizeof(uint16_t);
+
+               // first pass: get length in wchar_t units
+               size_t length = utf_decoder<wchar_counter, opt_swap>::decode_utf16_block(data, data_length, 0);
+
+               // allocate buffer of suitable length
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false;
+
+               // second pass: convert utf16 input to wchar_t
+               wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer);
+               wchar_writer::value_type oend = utf_decoder<wchar_writer, opt_swap>::decode_utf16_block(data, data_length, obegin);
+
+               assert(oend == obegin + length);
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1;
+
+               return true;
+       }
+
+       template <typename opt_swap> PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+       {
+               const uint32_t* data = static_cast<const uint32_t*>(contents);
+               size_t data_length = size / sizeof(uint32_t);
+
+               // first pass: get length in wchar_t units
+               size_t length = utf_decoder<wchar_counter, opt_swap>::decode_utf32_block(data, data_length, 0);
+
+               // allocate buffer of suitable length
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false;
+
+               // second pass: convert utf32 input to wchar_t
+               wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer);
+               wchar_writer::value_type oend = utf_decoder<wchar_writer, opt_swap>::decode_utf32_block(data, data_length, obegin);
+
+               assert(oend == obegin + length);
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1;
+
+               return true;
+       }
+
+       PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size)
+       {
+               const uint8_t* data = static_cast<const uint8_t*>(contents);
+               size_t data_length = size;
+
+               // get length in wchar_t units
+               size_t length = data_length;
+
+               // allocate buffer of suitable length
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false;
+
+               // convert latin1 input to wchar_t
+               wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer);
+               wchar_writer::value_type oend = utf_decoder<wchar_writer>::decode_latin1_block(data, data_length, obegin);
+
+               assert(oend == obegin + length);
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1;
+
+               return true;
+       }
+
+       PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
+       {
+               // get native encoding
+               xml_encoding wchar_encoding = get_wchar_encoding();
+
+               // fast path: no conversion required
+               if (encoding == wchar_encoding) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+               // only endian-swapping is required
+               if (need_endian_swap_utf(encoding, wchar_encoding)) return convert_buffer_endian_swap(out_buffer, out_length, contents, size, is_mutable);
+
+               // source encoding is utf8
+               if (encoding == encoding_utf8) return convert_buffer_utf8(out_buffer, out_length, contents, size);
+
+               // source encoding is utf16
+               if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+               {
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+                       return (native_encoding == encoding) ?
+                               convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) :
+                               convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
+               }
+
+               // source encoding is utf32
+               if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+               {
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+                       return (native_encoding == encoding) ?
+                               convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) :
+                               convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
+               }
+
+               // source encoding is latin1
+               if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size);
+
+               assert(!"Invalid encoding");
+               return false;
+       }
+#else
+       template <typename opt_swap> PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+       {
+               const uint16_t* data = static_cast<const uint16_t*>(contents);
+               size_t data_length = size / sizeof(uint16_t);
+
+               // first pass: get length in utf8 units
+               size_t length = utf_decoder<utf8_counter, opt_swap>::decode_utf16_block(data, data_length, 0);
+
+               // allocate buffer of suitable length
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false;
+
+               // second pass: convert utf16 input to utf8
+               uint8_t* obegin = reinterpret_cast<uint8_t*>(buffer);
+               uint8_t* oend = utf_decoder<utf8_writer, opt_swap>::decode_utf16_block(data, data_length, obegin);
+
+               assert(oend == obegin + length);
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1;
+
+               return true;
+       }
+
+       template <typename opt_swap> PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
+       {
+               const uint32_t* data = static_cast<const uint32_t*>(contents);
+               size_t data_length = size / sizeof(uint32_t);
+
+               // first pass: get length in utf8 units
+               size_t length = utf_decoder<utf8_counter, opt_swap>::decode_utf32_block(data, data_length, 0);
+
+               // allocate buffer of suitable length
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false;
+
+               // second pass: convert utf32 input to utf8
+               uint8_t* obegin = reinterpret_cast<uint8_t*>(buffer);
+               uint8_t* oend = utf_decoder<utf8_writer, opt_swap>::decode_utf32_block(data, data_length, obegin);
+
+               assert(oend == obegin + length);
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1;
+
+               return true;
+       }
+
+       PUGI__FN size_t get_latin1_7bit_prefix_length(const uint8_t* data, size_t size)
+       {
+               for (size_t i = 0; i < size; ++i)
+                       if (data[i] > 127)
+                               return i;
+
+               return size;
+       }
+
+       PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
+       {
+               const uint8_t* data = static_cast<const uint8_t*>(contents);
+               size_t data_length = size;
+
+               // get size of prefix that does not need utf8 conversion
+               size_t prefix_length = get_latin1_7bit_prefix_length(data, data_length);
+               assert(prefix_length <= data_length);
+
+               const uint8_t* postfix = data + prefix_length;
+               size_t postfix_length = data_length - prefix_length;
+
+               // if no conversion is needed, just return the original buffer
+               if (postfix_length == 0) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+               // first pass: get length in utf8 units
+               size_t length = prefix_length + utf_decoder<utf8_counter>::decode_latin1_block(postfix, postfix_length, 0);
+
+               // allocate buffer of suitable length
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false;
+
+               // second pass: convert latin1 input to utf8
+               memcpy(buffer, data, prefix_length);
+
+               uint8_t* obegin = reinterpret_cast<uint8_t*>(buffer);
+               uint8_t* oend = utf_decoder<utf8_writer>::decode_latin1_block(postfix, postfix_length, obegin + prefix_length);
+
+               assert(oend == obegin + length);
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1;
+
+               return true;
+       }
+
+       PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
+       {
+               // fast path: no conversion required
+               if (encoding == encoding_utf8) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+               // source encoding is utf16
+               if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+               {
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+                       return (native_encoding == encoding) ?
+                               convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) :
+                               convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
+               }
+
+               // source encoding is utf32
+               if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+               {
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+                       return (native_encoding == encoding) ?
+                               convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) :
+                               convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
+               }
+
+               // source encoding is latin1
+               if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size, is_mutable);
+
+               assert(!"Invalid encoding");
+               return false;
+       }
+#endif
+
+       PUGI__FN size_t as_utf8_begin(const wchar_t* str, size_t length)
+       {
+               // get length in utf8 characters
+               return utf_decoder<utf8_counter>::decode_wchar_block(str, length, 0);
+       }
+
+       PUGI__FN void as_utf8_end(char* buffer, size_t size, const wchar_t* str, size_t length)
+       {
+               // convert to utf8
+               uint8_t* begin = reinterpret_cast<uint8_t*>(buffer);
+               uint8_t* end = utf_decoder<utf8_writer>::decode_wchar_block(str, length, begin);
+       
+               assert(begin + size == end);
+               (void)!end;
+
+               // zero-terminate
+               buffer[size] = 0;
+       }
+       
+#ifndef PUGIXML_NO_STL
+       PUGI__FN std::string as_utf8_impl(const wchar_t* str, size_t length)
+       {
+               // first pass: get length in utf8 characters
+               size_t size = as_utf8_begin(str, length);
+
+               // allocate resulting string
+               std::string result;
+               result.resize(size);
+
+               // second pass: convert to utf8
+               if (size > 0) as_utf8_end(&result[0], size, str, length);
+
+               return result;
+       }
+
+       PUGI__FN std::basic_string<wchar_t> as_wide_impl(const char* str, size_t size)
+       {
+               const uint8_t* data = reinterpret_cast<const uint8_t*>(str);
+
+               // first pass: get length in wchar_t units
+               size_t length = utf_decoder<wchar_counter>::decode_utf8_block(data, size, 0);
+
+               // allocate resulting string
+               std::basic_string<wchar_t> result;
+               result.resize(length);
+
+               // second pass: convert to wchar_t
+               if (length > 0)
+               {
+                       wchar_writer::value_type begin = reinterpret_cast<wchar_writer::value_type>(&result[0]);
+                       wchar_writer::value_type end = utf_decoder<wchar_writer>::decode_utf8_block(data, size, begin);
+
+                       assert(begin + length == end);
+                       (void)!end;
+               }
+
+               return result;
+       }
+#endif
+
+       inline bool strcpy_insitu_allow(size_t length, uintptr_t header, uintptr_t header_mask, char_t* target)
+       {
+               // never reuse shared memory
+               if (header & xml_memory_page_contents_shared_mask) return false;
+
+               size_t target_length = strlength(target);
+
+               // always reuse document buffer memory if possible
+               if ((header & header_mask) == 0) return target_length >= length;
+
+               // reuse heap memory if waste is not too great
+               const size_t reuse_threshold = 32;
+
+               return target_length >= length && (target_length < reuse_threshold || target_length - length < target_length / 2);
+       }
+
+       PUGI__FN bool strcpy_insitu(char_t*& dest, uintptr_t& header, uintptr_t header_mask, const char_t* source)
+       {
+               assert(header);
+
+               size_t source_length = strlength(source);
+
+               if (source_length == 0)
+               {
+                       // empty string and null pointer are equivalent, so just deallocate old memory
+                       xml_allocator* alloc = reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask)->allocator;
+
+                       if (header & header_mask) alloc->deallocate_string(dest);
+                       
+                       // mark the string as not allocated
+                       dest = 0;
+                       header &= ~header_mask;
+
+                       return true;
+               }
+               else if (dest && strcpy_insitu_allow(source_length, header, header_mask, dest))
+               {
+                       // we can reuse old buffer, so just copy the new data (including zero terminator)
+                       memcpy(dest, source, (source_length + 1) * sizeof(char_t));
+                       
+                       return true;
+               }
+               else
+               {
+                       xml_allocator* alloc = reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask)->allocator;
+
+                       // allocate new buffer
+                       char_t* buf = alloc->allocate_string(source_length + 1);
+                       if (!buf) return false;
+
+                       // copy the string (including zero terminator)
+                       memcpy(buf, source, (source_length + 1) * sizeof(char_t));
+
+                       // deallocate old buffer (*after* the above to protect against overlapping memory and/or allocation failures)
+                       if (header & header_mask) alloc->deallocate_string(dest);
+                       
+                       // the string is now allocated, so set the flag
+                       dest = buf;
+                       header |= header_mask;
+
+                       return true;
+               }
+       }
+
+       struct gap
+       {
+               char_t* end;
+               size_t size;
+                       
+               gap(): end(0), size(0)
+               {
+               }
+                       
+               // Push new gap, move s count bytes further (skipping the gap).
+               // Collapse previous gap.
+               void push(char_t*& s, size_t count)
+               {
+                       if (end) // there was a gap already; collapse it
+                       {
+                               // Move [old_gap_end, new_gap_start) to [old_gap_start, ...)
+                               assert(s >= end);
+                               memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
+                       }
+                               
+                       s += count; // end of current gap
+                               
+                       // "merge" two gaps
+                       end = s;
+                       size += count;
+               }
+                       
+               // Collapse all gaps, return past-the-end pointer
+               char_t* flush(char_t* s)
+               {
+                       if (end)
+                       {
+                               // Move [old_gap_end, current_pos) to [old_gap_start, ...)
+                               assert(s >= end);
+                               memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
+
+                               return s - size;
+                       }
+                       else return s;
+               }
+       };
+       
+       PUGI__FN char_t* strconv_escape(char_t* s, gap& g)
+       {
+               char_t* stre = s + 1;
+
+               switch (*stre)
+               {
+                       case '#':       // &#...
+                       {
+                               unsigned int ucsc = 0;
+
+                               if (stre[1] == 'x') // &#x... (hex code)
+                               {
+                                       stre += 2;
+
+                                       char_t ch = *stre;
+
+                                       if (ch == ';') return stre;
+
+                                       for (;;)
+                                       {
+                                               if (static_cast<unsigned int>(ch - '0') <= 9)
+                                                       ucsc = 16 * ucsc + (ch - '0');
+                                               else if (static_cast<unsigned int>((ch | ' ') - 'a') <= 5)
+                                                       ucsc = 16 * ucsc + ((ch | ' ') - 'a' + 10);
+                                               else if (ch == ';')
+                                                       break;
+                                               else // cancel
+                                                       return stre;
+
+                                               ch = *++stre;
+                                       }
+                                       
+                                       ++stre;
+                               }
+                               else    // &#... (dec code)
+                               {
+                                       char_t ch = *++stre;
+
+                                       if (ch == ';') return stre;
+
+                                       for (;;)
+                                       {
+                                               if (static_cast<unsigned int>(static_cast<unsigned int>(ch) - '0') <= 9)
+                                                       ucsc = 10 * ucsc + (ch - '0');
+                                               else if (ch == ';')
+                                                       break;
+                                               else // cancel
+                                                       return stre;
+
+                                               ch = *++stre;
+                                       }
+                                       
+                                       ++stre;
+                               }
+
+                       #ifdef PUGIXML_WCHAR_MODE
+                               s = reinterpret_cast<char_t*>(wchar_writer::any(reinterpret_cast<wchar_writer::value_type>(s), ucsc));
+                       #else
+                               s = reinterpret_cast<char_t*>(utf8_writer::any(reinterpret_cast<uint8_t*>(s), ucsc));
+                       #endif
+                                       
+                               g.push(s, stre - s);
+                               return stre;
+                       }
+
+                       case 'a':       // &a
+                       {
+                               ++stre;
+
+                               if (*stre == 'm') // &am
+                               {
+                                       if (*++stre == 'p' && *++stre == ';') // &amp;
+                                       {
+                                               *s++ = '&';
+                                               ++stre;
+                                                       
+                                               g.push(s, stre - s);
+                                               return stre;
+                                       }
+                               }
+                               else if (*stre == 'p') // &ap
+                               {
+                                       if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // &apos;
+                                       {
+                                               *s++ = '\'';
+                                               ++stre;
+
+                                               g.push(s, stre - s);
+                                               return stre;
+                                       }
+                               }
+                               break;
+                       }
+
+                       case 'g': // &g
+                       {
+                               if (*++stre == 't' && *++stre == ';') // &gt;
+                               {
+                                       *s++ = '>';
+                                       ++stre;
+                                       
+                                       g.push(s, stre - s);
+                                       return stre;
+                               }
+                               break;
+                       }
+
+                       case 'l': // &l
+                       {
+                               if (*++stre == 't' && *++stre == ';') // &lt;
+                               {
+                                       *s++ = '<';
+                                       ++stre;
+                                               
+                                       g.push(s, stre - s);
+                                       return stre;
+                               }
+                               break;
+                       }
+
+                       case 'q': // &q
+                       {
+                               if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // &quot;
+                               {
+                                       *s++ = '"';
+                                       ++stre;
+                                       
+                                       g.push(s, stre - s);
+                                       return stre;
+                               }
+                               break;
+                       }
+
+                       default:
+                               break;
+               }
+               
+               return stre;
+       }
+
+       // Parser utilities
+       #define PUGI__ENDSWITH(c, e)        ((c) == (e) || ((c) == 0 && endch == (e)))
+       #define PUGI__SKIPWS()              { while (PUGI__IS_CHARTYPE(*s, ct_space)) ++s; }
+       #define PUGI__OPTSET(OPT)           ( optmsk & (OPT) )
+       #define PUGI__PUSHNODE(TYPE)        { cursor = append_new_node(cursor, alloc, TYPE); if (!cursor) PUGI__THROW_ERROR(status_out_of_memory, s); }
+       #define PUGI__POPNODE()             { cursor = cursor->parent; }
+       #define PUGI__SCANFOR(X)            { while (*s != 0 && !(X)) ++s; }
+       #define PUGI__SCANWHILE(X)          { while (X) ++s; }
+       #define PUGI__SCANWHILE_UNROLL(X)   { for (;;) { char_t ss = s[0]; if (PUGI__UNLIKELY(!(X))) { break; } ss = s[1]; if (PUGI__UNLIKELY(!(X))) { s += 1; break; } ss = s[2]; if (PUGI__UNLIKELY(!(X))) { s += 2; break; } ss = s[3]; if (PUGI__UNLIKELY(!(X))) { s += 3; break; } s += 4; } }
+       #define PUGI__ENDSEG()              { ch = *s; *s = 0; ++s; }
+       #define PUGI__THROW_ERROR(err, m)   return error_offset = m, error_status = err, static_cast<char_t*>(0)
+       #define PUGI__CHECK_ERROR(err, m)   { if (*s == 0) PUGI__THROW_ERROR(err, m); }
+
+       PUGI__FN char_t* strconv_comment(char_t* s, char_t endch)
+       {
+               gap g;
+               
+               while (true)
+               {
+                       PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_comment));
+               
+                       if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+                       {
+                               *s++ = '\n'; // replace first one with 0x0a
+                               
+                               if (*s == '\n') g.push(s, 1);
+                       }
+                       else if (s[0] == '-' && s[1] == '-' && PUGI__ENDSWITH(s[2], '>')) // comment ends here
+                       {
+                               *g.flush(s) = 0;
+                               
+                               return s + (s[2] == '>' ? 3 : 2);
+                       }
+                       else if (*s == 0)
+                       {
+                               return 0;
+                       }
+                       else ++s;
+               }
+       }
+
+       PUGI__FN char_t* strconv_cdata(char_t* s, char_t endch)
+       {
+               gap g;
+                       
+               while (true)
+               {
+                       PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_cdata));
+                       
+                       if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+                       {
+                               *s++ = '\n'; // replace first one with 0x0a
+                               
+                               if (*s == '\n') g.push(s, 1);
+                       }
+                       else if (s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>')) // CDATA ends here
+                       {
+                               *g.flush(s) = 0;
+                               
+                               return s + 1;
+                       }
+                       else if (*s == 0)
+                       {
+                               return 0;
+                       }
+                       else ++s;
+               }
+       }
+       
+       typedef char_t* (*strconv_pcdata_t)(char_t*);
+               
+       template <typename opt_trim, typename opt_eol, typename opt_escape> struct strconv_pcdata_impl
+       {
+               static char_t* parse(char_t* s)
+               {
+                       gap g;
+
+                       char_t* begin = s;
+
+                       while (true)
+                       {
+                               PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_pcdata));
+
+                               if (*s == '<') // PCDATA ends here
+                               {
+                                       char_t* end = g.flush(s);
+
+                                       if (opt_trim::value)
+                                               while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space))
+                                                       --end;
+
+                                       *end = 0;
+                                       
+                                       return s + 1;
+                               }
+                               else if (opt_eol::value && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+                               {
+                                       *s++ = '\n'; // replace first one with 0x0a
+                                       
+                                       if (*s == '\n') g.push(s, 1);
+                               }
+                               else if (opt_escape::value && *s == '&')
+                               {
+                                       s = strconv_escape(s, g);
+                               }
+                               else if (*s == 0)
+                               {
+                                       char_t* end = g.flush(s);
+
+                                       if (opt_trim::value)
+                                               while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space))
+                                                       --end;
+
+                                       *end = 0;
+
+                                       return s;
+                               }
+                               else ++s;
+                       }
+               }
+       };
+       
+       PUGI__FN strconv_pcdata_t get_strconv_pcdata(unsigned int optmask)
+       {
+               PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_trim_pcdata == 0x0800);
+
+               switch (((optmask >> 4) & 3) | ((optmask >> 9) & 4)) // get bitmask for flags (eol escapes trim)
+               {
+               case 0: return strconv_pcdata_impl<opt_false, opt_false, opt_false>::parse;
+               case 1: return strconv_pcdata_impl<opt_false, opt_false, opt_true>::parse;
+               case 2: return strconv_pcdata_impl<opt_false, opt_true, opt_false>::parse;
+               case 3: return strconv_pcdata_impl<opt_false, opt_true, opt_true>::parse;
+               case 4: return strconv_pcdata_impl<opt_true, opt_false, opt_false>::parse;
+               case 5: return strconv_pcdata_impl<opt_true, opt_false, opt_true>::parse;
+               case 6: return strconv_pcdata_impl<opt_true, opt_true, opt_false>::parse;
+               case 7: return strconv_pcdata_impl<opt_true, opt_true, opt_true>::parse;
+               default: assert(false); return 0; // should not get here
+               }
+       }
+
+       typedef char_t* (*strconv_attribute_t)(char_t*, char_t);
+       
+       template <typename opt_escape> struct strconv_attribute_impl
+       {
+               static char_t* parse_wnorm(char_t* s, char_t end_quote)
+               {
+                       gap g;
+
+                       // trim leading whitespaces
+                       if (PUGI__IS_CHARTYPE(*s, ct_space))
+                       {
+                               char_t* str = s;
+                               
+                               do ++str;
+                               while (PUGI__IS_CHARTYPE(*str, ct_space));
+                               
+                               g.push(s, str - s);
+                       }
+
+                       while (true)
+                       {
+                               PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws | ct_space));
+                               
+                               if (*s == end_quote)
+                               {
+                                       char_t* str = g.flush(s);
+                                       
+                                       do *str-- = 0;
+                                       while (PUGI__IS_CHARTYPE(*str, ct_space));
+                               
+                                       return s + 1;
+                               }
+                               else if (PUGI__IS_CHARTYPE(*s, ct_space))
+                               {
+                                       *s++ = ' ';
+               
+                                       if (PUGI__IS_CHARTYPE(*s, ct_space))
+                                       {
+                                               char_t* str = s + 1;
+                                               while (PUGI__IS_CHARTYPE(*str, ct_space)) ++str;
+                                               
+                                               g.push(s, str - s);
+                                       }
+                               }
+                               else if (opt_escape::value && *s == '&')
+                               {
+                                       s = strconv_escape(s, g);
+                               }
+                               else if (!*s)
+                               {
+                                       return 0;
+                               }
+                               else ++s;
+                       }
+               }
+
+               static char_t* parse_wconv(char_t* s, char_t end_quote)
+               {
+                       gap g;
+
+                       while (true)
+                       {
+                               PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws));
+                               
+                               if (*s == end_quote)
+                               {
+                                       *g.flush(s) = 0;
+                               
+                                       return s + 1;
+                               }
+                               else if (PUGI__IS_CHARTYPE(*s, ct_space))
+                               {
+                                       if (*s == '\r')
+                                       {
+                                               *s++ = ' ';
+                               
+                                               if (*s == '\n') g.push(s, 1);
+                                       }
+                                       else *s++ = ' ';
+                               }
+                               else if (opt_escape::value && *s == '&')
+                               {
+                                       s = strconv_escape(s, g);
+                               }
+                               else if (!*s)
+                               {
+                                       return 0;
+                               }
+                               else ++s;
+                       }
+               }
+
+               static char_t* parse_eol(char_t* s, char_t end_quote)
+               {
+                       gap g;
+
+                       while (true)
+                       {
+                               PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr));
+                               
+                               if (*s == end_quote)
+                               {
+                                       *g.flush(s) = 0;
+                               
+                                       return s + 1;
+                               }
+                               else if (*s == '\r')
+                               {
+                                       *s++ = '\n';
+                                       
+                                       if (*s == '\n') g.push(s, 1);
+                               }
+                               else if (opt_escape::value && *s == '&')
+                               {
+                                       s = strconv_escape(s, g);
+                               }
+                               else if (!*s)
+                               {
+                                       return 0;
+                               }
+                               else ++s;
+                       }
+               }
+
+               static char_t* parse_simple(char_t* s, char_t end_quote)
+               {
+                       gap g;
+
+                       while (true)
+                       {
+                               PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr));
+                               
+                               if (*s == end_quote)
+                               {
+                                       *g.flush(s) = 0;
+                               
+                                       return s + 1;
+                               }
+                               else if (opt_escape::value && *s == '&')
+                               {
+                                       s = strconv_escape(s, g);
+                               }
+                               else if (!*s)
+                               {
+                                       return 0;
+                               }
+                               else ++s;
+                       }
+               }
+       };
+
+       PUGI__FN strconv_attribute_t get_strconv_attribute(unsigned int optmask)
+       {
+               PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80);
+               
+               switch ((optmask >> 4) & 15) // get bitmask for flags (wconv wnorm eol escapes)
+               {
+               case 0:  return strconv_attribute_impl<opt_false>::parse_simple;
+               case 1:  return strconv_attribute_impl<opt_true>::parse_simple;
+               case 2:  return strconv_attribute_impl<opt_false>::parse_eol;
+               case 3:  return strconv_attribute_impl<opt_true>::parse_eol;
+               case 4:  return strconv_attribute_impl<opt_false>::parse_wconv;
+               case 5:  return strconv_attribute_impl<opt_true>::parse_wconv;
+               case 6:  return strconv_attribute_impl<opt_false>::parse_wconv;
+               case 7:  return strconv_attribute_impl<opt_true>::parse_wconv;
+               case 8:  return strconv_attribute_impl<opt_false>::parse_wnorm;
+               case 9:  return strconv_attribute_impl<opt_true>::parse_wnorm;
+               case 10: return strconv_attribute_impl<opt_false>::parse_wnorm;
+               case 11: return strconv_attribute_impl<opt_true>::parse_wnorm;
+               case 12: return strconv_attribute_impl<opt_false>::parse_wnorm;
+               case 13: return strconv_attribute_impl<opt_true>::parse_wnorm;
+               case 14: return strconv_attribute_impl<opt_false>::parse_wnorm;
+               case 15: return strconv_attribute_impl<opt_true>::parse_wnorm;
+               default: assert(false); return 0; // should not get here
+               }
+       }
+
+       inline xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset = 0)
+       {
+               xml_parse_result result;
+               result.status = status;
+               result.offset = offset;
+
+               return result;
+       }
+
+       struct xml_parser
+       {
+               xml_allocator alloc;
+               char_t* error_offset;
+               xml_parse_status error_status;
+               
+               xml_parser(const xml_allocator& alloc_): alloc(alloc_), error_offset(0), error_status(status_ok)
+               {
+               }
+
+               // DOCTYPE consists of nested sections of the following possible types:
+               // <!-- ... -->, <? ... ?>, "...", '...'
+               // <![...]]>
+               // <!...>
+               // First group can not contain nested groups
+               // Second group can contain nested groups of the same type
+               // Third group can contain all other groups
+               char_t* parse_doctype_primitive(char_t* s)
+               {
+                       if (*s == '"' || *s == '\'')
+                       {
+                               // quoted string
+                               char_t ch = *s++;
+                               PUGI__SCANFOR(*s == ch);
+                               if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+                               s++;
+                       }
+                       else if (s[0] == '<' && s[1] == '?')
+                       {
+                               // <? ... ?>
+                               s += 2;
+                               PUGI__SCANFOR(s[0] == '?' && s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype
+                               if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+                               s += 2;
+                       }
+                       else if (s[0] == '<' && s[1] == '!' && s[2] == '-' && s[3] == '-')
+                       {
+                               s += 4;
+                               PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype
+                               if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+                               s += 4;
+                       }
+                       else PUGI__THROW_ERROR(status_bad_doctype, s);
+
+                       return s;
+               }
+
+               char_t* parse_doctype_ignore(char_t* s)
+               {
+                       size_t depth = 0;
+
+                       assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
+                       s += 3;
+
+                       while (*s)
+                       {
+                               if (s[0] == '<' && s[1] == '!' && s[2] == '[')
+                               {
+                                       // nested ignore section
+                                       s += 3;
+                                       depth++;
+                               }
+                               else if (s[0] == ']' && s[1] == ']' && s[2] == '>')
+                               {
+                                       // ignore section end
+                                       s += 3;
+
+                                       if (depth == 0)
+                                               return s;
+
+                                       depth--;
+                               }
+                               else s++;
+                       }
+
+                       PUGI__THROW_ERROR(status_bad_doctype, s);
+               }
+
+               char_t* parse_doctype_group(char_t* s, char_t endch)
+               {
+                       size_t depth = 0;
+
+                       assert((s[0] == '<' || s[0] == 0) && s[1] == '!');
+                       s += 2;
+
+                       while (*s)
+                       {
+                               if (s[0] == '<' && s[1] == '!' && s[2] != '-')
+                               {
+                                       if (s[2] == '[')
+                                       {
+                                               // ignore
+                                               s = parse_doctype_ignore(s);
+                                               if (!s) return s;
+                                       }
+                                       else
+                                       {
+                                               // some control group
+                                               s += 2;
+                                               depth++;
+                                       }
+                               }
+                               else if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
+                               {
+                                       // unknown tag (forbidden), or some primitive group
+                                       s = parse_doctype_primitive(s);
+                                       if (!s) return s;
+                               }
+                               else if (*s == '>')
+                               {
+                                       if (depth == 0)
+                                               return s;
+
+                                       depth--;
+                                       s++;
+                               }
+                               else s++;
+                       }
+
+                       if (depth != 0 || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s);
+
+                       return s;
+               }
+
+               char_t* parse_exclamation(char_t* s, xml_node_struct* cursor, unsigned int optmsk, char_t endch)
+               {
+                       // parse node contents, starting with exclamation mark
+                       ++s;
+
+                       if (*s == '-') // '<!-...'
+                       {
+                               ++s;
+
+                               if (*s == '-') // '<!--...'
+                               {
+                                       ++s;
+
+                                       if (PUGI__OPTSET(parse_comments))
+                                       {
+                                               PUGI__PUSHNODE(node_comment); // Append a new node on the tree.
+                                               cursor->value = s; // Save the offset.
+                                       }
+
+                                       if (PUGI__OPTSET(parse_eol) && PUGI__OPTSET(parse_comments))
+                                       {
+                                               s = strconv_comment(s, endch);
+
+                                               if (!s) PUGI__THROW_ERROR(status_bad_comment, cursor->value);
+                                       }
+                                       else
+                                       {
+                                               // Scan for terminating '-->'.
+                                               PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && PUGI__ENDSWITH(s[2], '>'));
+                                               PUGI__CHECK_ERROR(status_bad_comment, s);
+
+                                               if (PUGI__OPTSET(parse_comments))
+                                                       *s = 0; // Zero-terminate this segment at the first terminating '-'.
+
+                                               s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'.
+                                       }
+                               }
+                               else PUGI__THROW_ERROR(status_bad_comment, s);
+                       }
+                       else if (*s == '[')
+                       {
+                               // '<![CDATA[...'
+                               if (*++s=='C' && *++s=='D' && *++s=='A' && *++s=='T' && *++s=='A' && *++s == '[')
+                               {
+                                       ++s;
+
+                                       if (PUGI__OPTSET(parse_cdata))
+                                       {
+                                               PUGI__PUSHNODE(node_cdata); // Append a new node on the tree.
+                                               cursor->value = s; // Save the offset.
+
+                                               if (PUGI__OPTSET(parse_eol))
+                                               {
+                                                       s = strconv_cdata(s, endch);
+
+                                                       if (!s) PUGI__THROW_ERROR(status_bad_cdata, cursor->value);
+                                               }
+                                               else
+                                               {
+                                                       // Scan for terminating ']]>'.
+                                                       PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>'));
+                                                       PUGI__CHECK_ERROR(status_bad_cdata, s);
+
+                                                       *s++ = 0; // Zero-terminate this segment.
+                                               }
+                                       }
+                                       else // Flagged for discard, but we still have to scan for the terminator.
+                                       {
+                                               // Scan for terminating ']]>'.
+                                               PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>'));
+                                               PUGI__CHECK_ERROR(status_bad_cdata, s);
+
+                                               ++s;
+                                       }
+
+                                       s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'.
+                               }
+                               else PUGI__THROW_ERROR(status_bad_cdata, s);
+                       }
+                       else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && s[5] == 'P' && PUGI__ENDSWITH(s[6], 'E'))
+                       {
+                               s -= 2;
+
+                               if (cursor->parent) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+                               char_t* mark = s + 9;
+
+                               s = parse_doctype_group(s, endch);
+                               if (!s) return s;
+
+                               assert((*s == 0 && endch == '>') || *s == '>');
+                               if (*s) *s++ = 0;
+
+                               if (PUGI__OPTSET(parse_doctype))
+                               {
+                                       while (PUGI__IS_CHARTYPE(*mark, ct_space)) ++mark;
+
+                                       PUGI__PUSHNODE(node_doctype);
+
+                                       cursor->value = mark;
+                               }
+                       }
+                       else if (*s == 0 && endch == '-') PUGI__THROW_ERROR(status_bad_comment, s);
+                       else if (*s == 0 && endch == '[') PUGI__THROW_ERROR(status_bad_cdata, s);
+                       else PUGI__THROW_ERROR(status_unrecognized_tag, s);
+
+                       return s;
+               }
+
+               char_t* parse_question(char_t* s, xml_node_struct*& ref_cursor, unsigned int optmsk, char_t endch)
+               {
+                       // load into registers
+                       xml_node_struct* cursor = ref_cursor;
+                       char_t ch = 0;
+
+                       // parse node contents, starting with question mark
+                       ++s;
+
+                       // read PI target
+                       char_t* target = s;
+
+                       if (!PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_pi, s);
+
+                       PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol));
+                       PUGI__CHECK_ERROR(status_bad_pi, s);
+
+                       // determine node type; stricmp / strcasecmp is not portable
+                       bool declaration = (target[0] | ' ') == 'x' && (target[1] | ' ') == 'm' && (target[2] | ' ') == 'l' && target + 3 == s;
+
+                       if (declaration ? PUGI__OPTSET(parse_declaration) : PUGI__OPTSET(parse_pi))
+                       {
+                               if (declaration)
+                               {
+                                       // disallow non top-level declarations
+                                       if (cursor->parent) PUGI__THROW_ERROR(status_bad_pi, s);
+
+                                       PUGI__PUSHNODE(node_declaration);
+                               }
+                               else
+                               {
+                                       PUGI__PUSHNODE(node_pi);
+                               }
+
+                               cursor->name = target;
+
+                               PUGI__ENDSEG();
+
+                               // parse value/attributes
+                               if (ch == '?')
+                               {
+                                       // empty node
+                                       if (!PUGI__ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_pi, s);
+                                       s += (*s == '>');
+
+                                       PUGI__POPNODE();
+                               }
+                               else if (PUGI__IS_CHARTYPE(ch, ct_space))
+                               {
+                                       PUGI__SKIPWS();
+
+                                       // scan for tag end
+                                       char_t* value = s;
+
+                                       PUGI__SCANFOR(s[0] == '?' && PUGI__ENDSWITH(s[1], '>'));
+                                       PUGI__CHECK_ERROR(status_bad_pi, s);
+
+                                       if (declaration)
+                                       {
+                                               // replace ending ? with / so that 'element' terminates properly
+                                               *s = '/';
+
+                                               // we exit from this function with cursor at node_declaration, which is a signal to parse() to go to LOC_ATTRIBUTES
+                                               s = value;
+                                       }
+                                       else
+                                       {
+                                               // store value and step over >
+                                               cursor->value = value;
+                                               PUGI__POPNODE();
+
+                                               PUGI__ENDSEG();
+
+                                               s += (*s == '>');
+                                       }
+                               }
+                               else PUGI__THROW_ERROR(status_bad_pi, s);
+                       }
+                       else
+                       {
+                               // scan for tag end
+                               PUGI__SCANFOR(s[0] == '?' && PUGI__ENDSWITH(s[1], '>'));
+                               PUGI__CHECK_ERROR(status_bad_pi, s);
+
+                               s += (s[1] == '>' ? 2 : 1);
+                       }
+
+                       // store from registers
+                       ref_cursor = cursor;
+
+                       return s;
+               }
+
+               char_t* parse_tree(char_t* s, xml_node_struct* root, unsigned int optmsk, char_t endch)
+               {
+                       strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk);
+                       strconv_pcdata_t strconv_pcdata = get_strconv_pcdata(optmsk);
+                       
+                       char_t ch = 0;
+                       xml_node_struct* cursor = root;
+                       char_t* mark = s;
+
+                       while (*s != 0)
+                       {
+                               if (*s == '<')
+                               {
+                                       ++s;
+
+                               LOC_TAG:
+                                       if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // '<#...'
+                                       {
+                                               PUGI__PUSHNODE(node_element); // Append a new node to the tree.
+
+                                               cursor->name = s;
+
+                                               PUGI__SCANWHILE_UNROLL(PUGI__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator.
+                                               PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.
+
+                                               if (ch == '>')
+                                               {
+                                                       // end of tag
+                                               }
+                                               else if (PUGI__IS_CHARTYPE(ch, ct_space))
+                                               {
+                                               LOC_ATTRIBUTES:
+                                                       while (true)
+                                                       {
+                                                               PUGI__SKIPWS(); // Eat any whitespace.
+                                               
+                                                               if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // <... #...
+                                                               {
+                                                                       xml_attribute_struct* a = append_new_attribute(cursor, alloc); // Make space for this attribute.
+                                                                       if (!a) PUGI__THROW_ERROR(status_out_of_memory, s);
+
+                                                                       a->name = s; // Save the offset.
+
+                                                                       PUGI__SCANWHILE_UNROLL(PUGI__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator.
+                                                                       PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.
+
+                                                                       if (PUGI__IS_CHARTYPE(ch, ct_space))
+                                                                       {
+                                                                               PUGI__SKIPWS(); // Eat any whitespace.
+
+                                                                               ch = *s;
+                                                                               ++s;
+                                                                       }
+                                                                       
+                                                                       if (ch == '=') // '<... #=...'
+                                                                       {
+                                                                               PUGI__SKIPWS(); // Eat any whitespace.
+
+                                                                               if (*s == '"' || *s == '\'') // '<... #="...'
+                                                                               {
+                                                                                       ch = *s; // Save quote char to avoid breaking on "''" -or- '""'.
+                                                                                       ++s; // Step over the quote.
+                                                                                       a->value = s; // Save the offset.
+
+                                                                                       s = strconv_attribute(s, ch);
+                                                                               
+                                                                                       if (!s) PUGI__THROW_ERROR(status_bad_attribute, a->value);
+
+                                                                                       // After this line the loop continues from the start;
+                                                                                       // Whitespaces, / and > are ok, symbols and EOF are wrong,
+                                                                                       // everything else will be detected
+                                                                                       if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_attribute, s);
+                                                                               }
+                                                                               else PUGI__THROW_ERROR(status_bad_attribute, s);
+                                                                       }
+                                                                       else PUGI__THROW_ERROR(status_bad_attribute, s);
+                                                               }
+                                                               else if (*s == '/')
+                                                               {
+                                                                       ++s;
+                                                                       
+                                                                       if (*s == '>')
+                                                                       {
+                                                                               PUGI__POPNODE();
+                                                                               s++;
+                                                                               break;
+                                                                       }
+                                                                       else if (*s == 0 && endch == '>')
+                                                                       {
+                                                                               PUGI__POPNODE();
+                                                                               break;
+                                                                       }
+                                                                       else PUGI__THROW_ERROR(status_bad_start_element, s);
+                                                               }
+                                                               else if (*s == '>')
+                                                               {
+                                                                       ++s;
+
+                                                                       break;
+                                                               }
+                                                               else if (*s == 0 && endch == '>')
+                                                               {
+                                                                       break;
+                                                               }
+                                                               else PUGI__THROW_ERROR(status_bad_start_element, s);
+                                                       }
+
+                                                       // !!!
+                                               }
+                                               else if (ch == '/') // '<#.../'
+                                               {
+                                                       if (!PUGI__ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_start_element, s);
+
+                                                       PUGI__POPNODE(); // Pop.
+
+                                                       s += (*s == '>');
+                                               }
+                                               else if (ch == 0)
+                                               {
+                                                       // we stepped over null terminator, backtrack & handle closing tag
+                                                       --s;
+                                                       
+                                                       if (endch != '>') PUGI__THROW_ERROR(status_bad_start_element, s);
+                                               }
+                                               else PUGI__THROW_ERROR(status_bad_start_element, s);
+                                       }
+                                       else if (*s == '/')
+                                       {
+                                               ++s;
+
+                                               char_t* name = cursor->name;
+                                               if (!name) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+                                               
+                                               while (PUGI__IS_CHARTYPE(*s, ct_symbol))
+                                               {
+                                                       if (*s++ != *name++) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+                                               }
+
+                                               if (*name)
+                                               {
+                                                       if (*s == 0 && name[0] == endch && name[1] == 0) PUGI__THROW_ERROR(status_bad_end_element, s);
+                                                       else PUGI__THROW_ERROR(status_end_element_mismatch, s);
+                                               }
+                                                       
+                                               PUGI__POPNODE(); // Pop.
+
+                                               PUGI__SKIPWS();
+
+                                               if (*s == 0)
+                                               {
+                                                       if (endch != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
+                                               }
+                                               else
+                                               {
+                                                       if (*s != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
+                                                       ++s;
+                                               }
+                                       }
+                                       else if (*s == '?') // '<?...'
+                                       {
+                                               s = parse_question(s, cursor, optmsk, endch);
+                                               if (!s) return s;
+
+                                               assert(cursor);
+                                               if (PUGI__NODETYPE(cursor) == node_declaration) goto LOC_ATTRIBUTES;
+                                       }
+                                       else if (*s == '!') // '<!...'
+                                       {
+                                               s = parse_exclamation(s, cursor, optmsk, endch);
+                                               if (!s) return s;
+                                       }
+                                       else if (*s == 0 && endch == '?') PUGI__THROW_ERROR(status_bad_pi, s);
+                                       else PUGI__THROW_ERROR(status_unrecognized_tag, s);
+                               }
+                               else
+                               {
+                                       mark = s; // Save this offset while searching for a terminator.
+
+                                       PUGI__SKIPWS(); // Eat whitespace if no genuine PCDATA here.
+
+                                       if (*s == '<' || !*s)
+                                       {
+                                               // We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one
+                                               assert(mark != s);
+
+                                               if (!PUGI__OPTSET(parse_ws_pcdata | parse_ws_pcdata_single) || PUGI__OPTSET(parse_trim_pcdata))
+                                               {
+                                                       continue;
+                                               }
+                                               else if (PUGI__OPTSET(parse_ws_pcdata_single))
+                                               {
+                                                       if (s[0] != '<' || s[1] != '/' || cursor->first_child) continue;
+                                               }
+                                       }
+
+                                       if (!PUGI__OPTSET(parse_trim_pcdata))
+                                               s = mark;
+                                                       
+                                       if (cursor->parent || PUGI__OPTSET(parse_fragment))
+                                       {
+                                               PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.
+                                               cursor->value = s; // Save the offset.
+
+                                               s = strconv_pcdata(s);
+                                                               
+                                               PUGI__POPNODE(); // Pop since this is a standalone.
+                                               
+                                               if (!*s) break;
+                                       }
+                                       else
+                                       {
+                                               PUGI__SCANFOR(*s == '<'); // '...<'
+                                               if (!*s) break;
+                                               
+                                               ++s;
+                                       }
+
+                                       // We're after '<'
+                                       goto LOC_TAG;
+                               }
+                       }
+
+                       // check that last tag is closed
+                       if (cursor != root) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+
+                       return s;
+               }
+
+       #ifdef PUGIXML_WCHAR_MODE
+               static char_t* parse_skip_bom(char_t* s)
+               {
+                       unsigned int bom = 0xfeff;
+                       return (s[0] == static_cast<wchar_t>(bom)) ? s + 1 : s;
+               }
+       #else
+               static char_t* parse_skip_bom(char_t* s)
+               {
+                       return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? s + 3 : s;
+               }
+       #endif
+
+               static bool has_element_node_siblings(xml_node_struct* node)
+               {
+                       while (node)
+                       {
+                               if (PUGI__NODETYPE(node) == node_element) return true;
+
+                               node = node->next_sibling;
+                       }
+
+                       return false;
+               }
+
+               static xml_parse_result parse(char_t* buffer, size_t length, xml_document_struct* xmldoc, xml_node_struct* root, unsigned int optmsk)
+               {
+                       // allocator object is a part of document object
+                       xml_allocator& alloc_ = *static_cast<xml_allocator*>(xmldoc);
+
+                       // early-out for empty documents
+                       if (length == 0)
+                               return make_parse_result(PUGI__OPTSET(parse_fragment) ? status_ok : status_no_document_element);
+
+                       // get last child of the root before parsing
+                       xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c : 0;
+       
+                       // create parser on stack
+                       xml_parser parser(alloc_);
+
+                       // save last character and make buffer zero-terminated (speeds up parsing)
+                       char_t endch = buffer[length - 1];
+                       buffer[length - 1] = 0;
+                       
+                       // skip BOM to make sure it does not end up as part of parse output
+                       char_t* buffer_data = parse_skip_bom(buffer);
+
+                       // perform actual parsing
+                       parser.parse_tree(buffer_data, root, optmsk, endch);
+
+                       // update allocator state
+                       alloc_ = parser.alloc;
+
+                       xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0);
+                       assert(result.offset >= 0 && static_cast<size_t>(result.offset) <= length);
+
+                       if (result)
+                       {
+                               // since we removed last character, we have to handle the only possible false positive (stray <)
+                               if (endch == '<')
+                                       return make_parse_result(status_unrecognized_tag, length - 1);
+
+                               // check if there are any element nodes parsed
+                               xml_node_struct* first_root_child_parsed = last_root_child ? last_root_child->next_sibling : root->first_child;
+
+                               if (!PUGI__OPTSET(parse_fragment) && !has_element_node_siblings(first_root_child_parsed))
+                                       return make_parse_result(status_no_document_element, length - 1);
+                       }
+                       else
+                       {
+                               // roll back offset if it occurs on a null terminator in the source buffer
+                               if (result.offset > 0 && static_cast<size_t>(result.offset) == length - 1 && endch == 0)
+                                       result.offset--;
+                       }
+
+                       return result;
+               }
+       };
+
+       // Output facilities
+       PUGI__FN xml_encoding get_write_native_encoding()
+       {
+       #ifdef PUGIXML_WCHAR_MODE
+               return get_wchar_encoding();
+       #else
+               return encoding_utf8;
+       #endif
+       }
+
+       PUGI__FN xml_encoding get_write_encoding(xml_encoding encoding)
+       {
+               // replace wchar encoding with utf implementation
+               if (encoding == encoding_wchar) return get_wchar_encoding();
+
+               // replace utf16 encoding with utf16 with specific endianness
+               if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+               // replace utf32 encoding with utf32 with specific endianness
+               if (encoding == encoding_utf32) return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+               // only do autodetection if no explicit encoding is requested
+               if (encoding != encoding_auto) return encoding;
+
+               // assume utf8 encoding
+               return encoding_utf8;
+       }
+
+#ifdef PUGIXML_WCHAR_MODE
+       PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
+       {
+               if (length < 1) return 0;
+
+               // discard last character if it's the lead of a surrogate pair 
+               return (sizeof(wchar_t) == 2 && static_cast<unsigned int>(static_cast<uint16_t>(data[length - 1]) - 0xD800) < 0x400) ? length - 1 : length;
+       }
+
+       PUGI__FN size_t convert_buffer_output(char_t* r_char, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding)
+       {
+               // only endian-swapping is required
+               if (need_endian_swap_utf(encoding, get_wchar_encoding()))
+               {
+                       convert_wchar_endian_swap(r_char, data, length);
+
+                       return length * sizeof(char_t);
+               }
+       
+               // convert to utf8
+               if (encoding == encoding_utf8)
+               {
+                       uint8_t* dest = r_u8;
+                       uint8_t* end = utf_decoder<utf8_writer>::decode_wchar_block(data, length, dest);
+
+                       return static_cast<size_t>(end - dest);
+               }
+
+               // convert to utf16
+               if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+               {
+                       uint16_t* dest = r_u16;
+
+                       // convert to native utf16
+                       uint16_t* end = utf_decoder<utf16_writer>::decode_wchar_block(data, length, dest);
+
+                       // swap if necessary
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+                       if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+                       return static_cast<size_t>(end - dest) * sizeof(uint16_t);
+               }
+
+               // convert to utf32
+               if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+               {
+                       uint32_t* dest = r_u32;
+
+                       // convert to native utf32
+                       uint32_t* end = utf_decoder<utf32_writer>::decode_wchar_block(data, length, dest);
+
+                       // swap if necessary
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+                       if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+                       return static_cast<size_t>(end - dest) * sizeof(uint32_t);
+               }
+
+               // convert to latin1
+               if (encoding == encoding_latin1)
+               {
+                       uint8_t* dest = r_u8;
+                       uint8_t* end = utf_decoder<latin1_writer>::decode_wchar_block(data, length, dest);
+
+                       return static_cast<size_t>(end - dest);
+               }
+
+               assert(!"Invalid encoding");
+               return 0;
+       }
+#else
+       PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
+       {
+               if (length < 5) return 0;
+
+               for (size_t i = 1; i <= 4; ++i)
+               {
+                       uint8_t ch = static_cast<uint8_t>(data[length - i]);
+
+                       // either a standalone character or a leading one
+                       if ((ch & 0xc0) != 0x80) return length - i;
+               }
+
+               // there are four non-leading characters at the end, sequence tail is broken so might as well process the whole chunk
+               return length;
+       }
+
+       PUGI__FN size_t convert_buffer_output(char_t* /* r_char */, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding)
+       {
+               if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+               {
+                       uint16_t* dest = r_u16;
+
+                       // convert to native utf16
+                       uint16_t* end = utf_decoder<utf16_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
+
+                       // swap if necessary
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+                       if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+                       return static_cast<size_t>(end - dest) * sizeof(uint16_t);
+               }
+
+               if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+               {
+                       uint32_t* dest = r_u32;
+
+                       // convert to native utf32
+                       uint32_t* end = utf_decoder<utf32_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
+
+                       // swap if necessary
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+                       if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+                       return static_cast<size_t>(end - dest) * sizeof(uint32_t);
+               }
+
+               if (encoding == encoding_latin1)
+               {
+                       uint8_t* dest = r_u8;
+                       uint8_t* end = utf_decoder<latin1_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
+
+                       return static_cast<size_t>(end - dest);
+               }
+
+               assert(!"Invalid encoding");
+               return 0;
+       }
+#endif
+
+       class xml_buffered_writer
+       {
+               xml_buffered_writer(const xml_buffered_writer&);
+               xml_buffered_writer& operator=(const xml_buffered_writer&);
+
+       public:
+               xml_buffered_writer(xml_writer& writer_, xml_encoding user_encoding): writer(writer_), bufsize(0), encoding(get_write_encoding(user_encoding))
+               {
+                       PUGI__STATIC_ASSERT(bufcapacity >= 8);
+               }
+
+               ~xml_buffered_writer()
+               {
+                       flush();
+               }
+
+               size_t flush()
+               {
+                       flush(buffer, bufsize);
+                       bufsize = 0;
+                       return 0;
+               }
+
+               void flush(const char_t* data, size_t size)
+               {
+                       if (size == 0) return;
+
+                       // fast path, just write data
+                       if (encoding == get_write_native_encoding())
+                               writer.write(data, size * sizeof(char_t));
+                       else
+                       {
+                               // convert chunk
+                               size_t result = convert_buffer_output(scratch.data_char, scratch.data_u8, scratch.data_u16, scratch.data_u32, data, size, encoding);
+                               assert(result <= sizeof(scratch));
+
+                               // write data
+                               writer.write(scratch.data_u8, result);
+                       }
+               }
+
+               void write_direct(const char_t* data, size_t length)
+               {
+                       // flush the remaining buffer contents
+                       flush();
+
+                       // handle large chunks
+                       if (length > bufcapacity)
+                       {
+                               if (encoding == get_write_native_encoding())
+                               {
+                                       // fast path, can just write data chunk
+                                       writer.write(data, length * sizeof(char_t));
+                                       return;
+                               }
+
+                               // need to convert in suitable chunks
+                               while (length > bufcapacity)
+                               {
+                                       // get chunk size by selecting such number of characters that are guaranteed to fit into scratch buffer
+                                       // and form a complete codepoint sequence (i.e. discard start of last codepoint if necessary)
+                                       size_t chunk_size = get_valid_length(data, bufcapacity);
+                                       assert(chunk_size);
+
+                                       // convert chunk and write
+                                       flush(data, chunk_size);
+
+                                       // iterate
+                                       data += chunk_size;
+                                       length -= chunk_size;
+                               }
+
+                               // small tail is copied below
+                               bufsize = 0;
+                       }
+
+                       memcpy(buffer + bufsize, data, length * sizeof(char_t));
+                       bufsize += length;
+               }
+
+               void write_buffer(const char_t* data, size_t length)
+               {
+                       size_t offset = bufsize;
+
+                       if (offset + length <= bufcapacity)
+                       {
+                               memcpy(buffer + offset, data, length * sizeof(char_t));
+                               bufsize = offset + length;
+                       }
+                       else
+                       {
+                               write_direct(data, length);
+                       }
+               }
+
+               void write_string(const char_t* data)
+               {
+                       // write the part of the string that fits in the buffer
+                       size_t offset = bufsize;
+
+                       while (*data && offset < bufcapacity)
+                               buffer[offset++] = *data++;
+
+                       // write the rest
+                       if (offset < bufcapacity)
+                       {
+                               bufsize = offset;
+                       }
+                       else
+                       {
+                               // backtrack a bit if we have split the codepoint
+                               size_t length = offset - bufsize;
+                               size_t extra = length - get_valid_length(data - length, length);
+
+                               bufsize = offset - extra;
+
+                               write_direct(data - extra, strlength(data) + extra);
+                       }
+               }
+
+               void write(char_t d0)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 1) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       bufsize = offset + 1;
+               }
+
+               void write(char_t d0, char_t d1)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 2) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       buffer[offset + 1] = d1;
+                       bufsize = offset + 2;
+               }
+
+               void write(char_t d0, char_t d1, char_t d2)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 3) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       buffer[offset + 1] = d1;
+                       buffer[offset + 2] = d2;
+                       bufsize = offset + 3;
+               }
+
+               void write(char_t d0, char_t d1, char_t d2, char_t d3)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 4) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       buffer[offset + 1] = d1;
+                       buffer[offset + 2] = d2;
+                       buffer[offset + 3] = d3;
+                       bufsize = offset + 4;
+               }
+
+               void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 5) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       buffer[offset + 1] = d1;
+                       buffer[offset + 2] = d2;
+                       buffer[offset + 3] = d3;
+                       buffer[offset + 4] = d4;
+                       bufsize = offset + 5;
+               }
+
+               void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4, char_t d5)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 6) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       buffer[offset + 1] = d1;
+                       buffer[offset + 2] = d2;
+                       buffer[offset + 3] = d3;
+                       buffer[offset + 4] = d4;
+                       buffer[offset + 5] = d5;
+                       bufsize = offset + 6;
+               }
+
+               // utf8 maximum expansion: x4 (-> utf32)
+               // utf16 maximum expansion: x2 (-> utf32)
+               // utf32 maximum expansion: x1
+               enum
+               {
+                       bufcapacitybytes =
+                       #ifdef PUGIXML_MEMORY_OUTPUT_STACK
+                               PUGIXML_MEMORY_OUTPUT_STACK
+                       #else
+                               10240
+                       #endif
+                       ,
+                       bufcapacity = bufcapacitybytes / (sizeof(char_t) + 4)
+               };
+
+               char_t buffer[bufcapacity];
+
+               union
+               {
+                       uint8_t data_u8[4 * bufcapacity];
+                       uint16_t data_u16[2 * bufcapacity];
+                       uint32_t data_u32[bufcapacity];
+                       char_t data_char[bufcapacity];
+               } scratch;
+
+               xml_writer& writer;
+               size_t bufsize;
+               xml_encoding encoding;
+       };
+
+       PUGI__FN void text_output_escaped(xml_buffered_writer& writer, const char_t* s, chartypex_t type)
+       {
+               while (*s)
+               {
+                       const char_t* prev = s;
+                       
+                       // While *s is a usual symbol
+                       PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPEX(ss, type));
+               
+                       writer.write_buffer(prev, static_cast<size_t>(s - prev));
+
+                       switch (*s)
+                       {
+                               case 0: break;
+                               case '&':
+                                       writer.write('&', 'a', 'm', 'p', ';');
+                                       ++s;
+                                       break;
+                               case '<':
+                                       writer.write('&', 'l', 't', ';');
+                                       ++s;
+                                       break;
+                               case '>':
+                                       writer.write('&', 'g', 't', ';');
+                                       ++s;
+                                       break;
+                               case '"':
+                                       writer.write('&', 'q', 'u', 'o', 't', ';');
+                                       ++s;
+                                       break;
+                               default: // s is not a usual symbol
+                               {
+                                       unsigned int ch = static_cast<unsigned int>(*s++);
+                                       assert(ch < 32);
+
+                                       writer.write('&', '#', static_cast<char_t>((ch / 10) + '0'), static_cast<char_t>((ch % 10) + '0'), ';');
+                               }
+                       }
+               }
+       }
+
+       PUGI__FN void text_output(xml_buffered_writer& writer, const char_t* s, chartypex_t type, unsigned int flags)
+       {
+               if (flags & format_no_escapes)
+                       writer.write_string(s);
+               else
+                       text_output_escaped(writer, s, type);
+       }
+
+       PUGI__FN void text_output_cdata(xml_buffered_writer& writer, const char_t* s)
+       {
+               do
+               {
+                       writer.write('<', '!', '[', 'C', 'D');
+                       writer.write('A', 'T', 'A', '[');
+
+                       const char_t* prev = s;
+
+                       // look for ]]> sequence - we can't output it as is since it terminates CDATA
+                       while (*s && !(s[0] == ']' && s[1] == ']' && s[2] == '>')) ++s;
+
+                       // skip ]] if we stopped at ]]>, > will go to the next CDATA section
+                       if (*s) s += 2;
+
+                       writer.write_buffer(prev, static_cast<size_t>(s - prev));
+
+                       writer.write(']', ']', '>');
+               }
+               while (*s);
+       }
+
+       PUGI__FN void text_output_indent(xml_buffered_writer& writer, const char_t* indent, size_t indent_length, unsigned int depth)
+       {
+               switch (indent_length)
+               {
+               case 1:
+               {
+                       for (unsigned int i = 0; i < depth; ++i)
+                               writer.write(indent[0]);
+                       break;
+               }
+
+               case 2:
+               {
+                       for (unsigned int i = 0; i < depth; ++i)
+                               writer.write(indent[0], indent[1]);
+                       break;
+               }
+
+               case 3:
+               {
+                       for (unsigned int i = 0; i < depth; ++i)
+                               writer.write(indent[0], indent[1], indent[2]);
+                       break;
+               }
+
+               case 4:
+               {
+                       for (unsigned int i = 0; i < depth; ++i)
+                               writer.write(indent[0], indent[1], indent[2], indent[3]);
+                       break;
+               }
+
+               default:
+               {
+                       for (unsigned int i = 0; i < depth; ++i)
+                               writer.write_buffer(indent, indent_length);
+               }
+               }
+       }
+
+       PUGI__FN void node_output_comment(xml_buffered_writer& writer, const char_t* s)
+       {
+               writer.write('<', '!', '-', '-');
+
+               while (*s)
+               {
+                       const char_t* prev = s;
+
+                       // look for -\0 or -- sequence - we can't output it since -- is illegal in comment body
+                       while (*s && !(s[0] == '-' && (s[1] == '-' || s[1] == 0))) ++s;
+
+                       writer.write_buffer(prev, static_cast<size_t>(s - prev));
+
+                       if (*s)
+                       {
+                               assert(*s == '-');
+
+                               writer.write('-', ' ');
+                               ++s;
+                       }
+               }
+
+               writer.write('-', '-', '>');
+       }
+
+       PUGI__FN void node_output_attributes(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags)
+       {
+               const char_t* default_name = PUGIXML_TEXT(":anonymous");
+
+               for (xml_attribute_struct* a = node->first_attribute; a; a = a->next_attribute)
+               {
+                       writer.write(' ');
+                       writer.write_string(a->name ? a->name : default_name);
+                       writer.write('=', '"');
+
+                       if (a->value)
+                               text_output(writer, a->value, ctx_special_attr, flags);
+
+                       writer.write('"');
+               }
+       }
+
+       PUGI__FN bool node_output_start(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags)
+       {
+               const char_t* default_name = PUGIXML_TEXT(":anonymous");
+               const char_t* name = node->name ? node->name : default_name;
+
+               writer.write('<');
+               writer.write_string(name);
+
+               if (node->first_attribute)
+                       node_output_attributes(writer, node, flags);
+
+               if (flags & format_raw)
+               {
+                       if (!node->first_child)
+                               writer.write(' ', '/', '>');
+                       else
+                       {
+                               writer.write('>');
+
+                               return true;
+                       }
+               }
+               else
+               {
+                       xml_node_struct* first = node->first_child;
+
+                       if (!first)
+                               writer.write(' ', '/', '>', '\n');
+                       else if (!first->next_sibling && (PUGI__NODETYPE(first) == node_pcdata || PUGI__NODETYPE(first) == node_cdata))
+                       {
+                               writer.write('>');
+
+                               const char_t* value = first->value ? first->value : PUGIXML_TEXT("");
+
+                               if (PUGI__NODETYPE(first) == node_pcdata)
+                                       text_output(writer, value, ctx_special_pcdata, flags);
+                               else
+                                       text_output_cdata(writer, value);
+
+                               writer.write('<', '/');
+                               writer.write_string(name);
+                               writer.write('>', '\n');
+                       }
+                       else
+                       {
+                               writer.write('>', '\n');
+
+                               return true;
+                       }
+               }
+
+               return false;
+       }
+
+       PUGI__FN void node_output_end(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags)
+       {
+               const char_t* default_name = PUGIXML_TEXT(":anonymous");
+               const char_t* name = node->name ? node->name : default_name;
+
+               writer.write('<', '/');
+               writer.write_string(name);
+
+               if (flags & format_raw)
+                       writer.write('>');
+               else
+                       writer.write('>', '\n');
+       }
+
+       PUGI__FN void node_output_simple(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags)
+       {
+               const char_t* default_name = PUGIXML_TEXT(":anonymous");
+
+               switch (PUGI__NODETYPE(node))
+               {
+                       case node_pcdata:
+                               text_output(writer, node->value ? node->value : PUGIXML_TEXT(""), ctx_special_pcdata, flags);
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       case node_cdata:
+                               text_output_cdata(writer, node->value ? node->value : PUGIXML_TEXT(""));
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       case node_comment:
+                               node_output_comment(writer, node->value ? node->value : PUGIXML_TEXT(""));
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       case node_pi:
+                               writer.write('<', '?');
+                               writer.write_string(node->name ? node->name : default_name);
+
+                               if (node->value)
+                               {
+                                       writer.write(' ');
+                                       writer.write_string(node->value);
+                               }
+
+                               writer.write('?', '>');
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       case node_declaration:
+                               writer.write('<', '?');
+                               writer.write_string(node->name ? node->name : default_name);
+                               node_output_attributes(writer, node, flags);
+                               writer.write('?', '>');
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       case node_doctype:
+                               writer.write('<', '!', 'D', 'O', 'C');
+                               writer.write('T', 'Y', 'P', 'E');
+
+                               if (node->value)
+                               {
+                                       writer.write(' ');
+                                       writer.write_string(node->value);
+                               }
+
+                               writer.write('>');
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       default:
+                               assert(!"Invalid node type");
+               }
+       }
+
+       PUGI__FN void node_output(xml_buffered_writer& writer, xml_node_struct* root, const char_t* indent, unsigned int flags, unsigned int depth)
+       {
+               size_t indent_length = ((flags & (format_indent | format_raw)) == format_indent) ? strlength(indent) : 0;
+
+               xml_node_struct* node = root;
+
+               do
+               {
+                       assert(node);
+
+                       // begin writing current node
+                       if (indent_length)
+                               text_output_indent(writer, indent, indent_length, depth);
+
+                       if (PUGI__NODETYPE(node) == node_element)
+                       {
+                               if (node_output_start(writer, node, flags))
+                               {
+                                       node = node->first_child;
+                                       depth++;
+                                       continue;
+                               }
+                       }
+                       else if (PUGI__NODETYPE(node) == node_document)
+                       {
+                               if (node->first_child)
+                               {
+                                       node = node->first_child;
+                                       continue;
+                               }
+                       }
+                       else
+                       {
+                               node_output_simple(writer, node, flags);
+                       }
+
+                       // continue to the next node
+                       while (node != root)
+                       {
+                               if (node->next_sibling)
+                               {
+                                       node = node->next_sibling;
+                                       break;
+                               }
+
+                               node = node->parent;
+
+                               // write closing node
+                               if (PUGI__NODETYPE(node) == node_element)
+                               {
+                                       depth--;
+
+                                       if (indent_length)
+                                               text_output_indent(writer, indent, indent_length, depth);
+
+                                       node_output_end(writer, node, flags);
+                               }
+                       }
+               }
+               while (node != root);
+       }
+
+       PUGI__FN bool has_declaration(xml_node_struct* node)
+       {
+               for (xml_node_struct* child = node->first_child; child; child = child->next_sibling)
+               {
+                       xml_node_type type = PUGI__NODETYPE(child);
+
+                       if (type == node_declaration) return true;
+                       if (type == node_element) return false;
+               }
+
+               return false;
+       }
+
+       PUGI__FN bool is_attribute_of(xml_attribute_struct* attr, xml_node_struct* node)
+       {
+               for (xml_attribute_struct* a = node->first_attribute; a; a = a->next_attribute)
+                       if (a == attr)
+                               return true;
+
+               return false;
+       }
+
+       PUGI__FN bool allow_insert_attribute(xml_node_type parent)
+       {
+               return parent == node_element || parent == node_declaration;
+       }
+
+       PUGI__FN bool allow_insert_child(xml_node_type parent, xml_node_type child)
+       {
+               if (parent != node_document && parent != node_element) return false;
+               if (child == node_document || child == node_null) return false;
+               if (parent != node_document && (child == node_declaration || child == node_doctype)) return false;
+
+               return true;
+       }
+
+       PUGI__FN bool allow_move(xml_node parent, xml_node child)
+       {
+               // check that child can be a child of parent
+               if (!allow_insert_child(parent.type(), child.type()))
+                       return false;
+
+               // check that node is not moved between documents
+               if (parent.root() != child.root())
+                       return false;
+
+               // check that new parent is not in the child subtree
+               xml_node cur = parent;
+
+               while (cur)
+               {
+                       if (cur == child)
+                               return false;
+
+                       cur = cur.parent();
+               }
+
+               return true;
+       }
+
+       PUGI__FN void node_copy_string(char_t*& dest, uintptr_t& header, uintptr_t header_mask, char_t* source, uintptr_t& source_header, xml_allocator* alloc)
+       {
+               assert(!dest && (header & header_mask) == 0);
+
+               if (source)
+               {
+                       if (alloc && (source_header & header_mask) == 0)
+                       {
+                               dest = source;
+
+                               // since strcpy_insitu can reuse document buffer memory we need to mark both source and dest as shared
+                               header |= xml_memory_page_contents_shared_mask;
+                               source_header |= xml_memory_page_contents_shared_mask;
+                       }
+                       else
+                               strcpy_insitu(dest, header, header_mask, source);
+               }
+       }
+
+       PUGI__FN void node_copy_contents(xml_node_struct* dn, xml_node_struct* sn, xml_allocator* shared_alloc)
+       {
+               node_copy_string(dn->name, dn->header, xml_memory_page_name_allocated_mask, sn->name, sn->header, shared_alloc);
+               node_copy_string(dn->value, dn->header, xml_memory_page_value_allocated_mask, sn->value, sn->header, shared_alloc);
+
+               for (xml_attribute_struct* sa = sn->first_attribute; sa; sa = sa->next_attribute)
+               {
+                       xml_attribute_struct* da = append_new_attribute(dn, get_allocator(dn));
+
+                       if (da)
+                       {
+                               node_copy_string(da->name, da->header, xml_memory_page_name_allocated_mask, sa->name, sa->header, shared_alloc);
+                               node_copy_string(da->value, da->header, xml_memory_page_value_allocated_mask, sa->value, sa->header, shared_alloc);
+                       }
+               }
+       }
+
+       PUGI__FN void node_copy_tree(xml_node_struct* dn, xml_node_struct* sn)
+       {
+               xml_allocator& alloc = get_allocator(dn);
+               xml_allocator* shared_alloc = (&alloc == &get_allocator(sn)) ? &alloc : 0;
+
+               node_copy_contents(dn, sn, shared_alloc);
+
+               xml_node_struct* dit = dn;
+               xml_node_struct* sit = sn->first_child;
+
+               while (sit && sit != sn)
+               {
+                       if (sit != dn)
+                       {
+                               xml_node_struct* copy = append_new_node(dit, alloc, PUGI__NODETYPE(sit));
+
+                               if (copy)
+                               {
+                                       node_copy_contents(copy, sit, shared_alloc);
+
+                                       if (sit->first_child)
+                                       {
+                                               dit = copy;
+                                               sit = sit->first_child;
+                                               continue;
+                                       }
+                               }
+                       }
+
+                       // continue to the next node
+                       do
+                       {
+                               if (sit->next_sibling)
+                               {
+                                       sit = sit->next_sibling;
+                                       break;
+                               }
+
+                               sit = sit->parent;
+                               dit = dit->parent;
+                       }
+                       while (sit != sn);
+               }
+       }
+
+       inline bool is_text_node(xml_node_struct* node)
+       {
+               xml_node_type type = PUGI__NODETYPE(node);
+
+               return type == node_pcdata || type == node_cdata;
+       }
+
+       // get value with conversion functions
+       PUGI__FN int get_integer_base(const char_t* value)
+       {
+               const char_t* s = value;
+
+               while (PUGI__IS_CHARTYPE(*s, ct_space))
+                       s++;
+
+               if (*s == '-')
+                       s++;
+
+               return (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) ? 16 : 10;
+       }
+
+       PUGI__FN int get_value_int(const char_t* value, int def)
+       {
+               if (!value) return def;
+
+               int base = get_integer_base(value);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return static_cast<int>(wcstol(value, 0, base));
+       #else
+               return static_cast<int>(strtol(value, 0, base));
+       #endif
+       }
+
+       PUGI__FN unsigned int get_value_uint(const char_t* value, unsigned int def)
+       {
+               if (!value) return def;
+
+               int base = get_integer_base(value);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return static_cast<unsigned int>(wcstoul(value, 0, base));
+       #else
+               return static_cast<unsigned int>(strtoul(value, 0, base));
+       #endif
+       }
+
+       PUGI__FN double get_value_double(const char_t* value, double def)
+       {
+               if (!value) return def;
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return wcstod(value, 0);
+       #else
+               return strtod(value, 0);
+       #endif
+       }
+
+       PUGI__FN float get_value_float(const char_t* value, float def)
+       {
+               if (!value) return def;
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return static_cast<float>(wcstod(value, 0));
+       #else
+               return static_cast<float>(strtod(value, 0));
+       #endif
+       }
+
+       PUGI__FN bool get_value_bool(const char_t* value, bool def)
+       {
+               if (!value) return def;
+
+               // only look at first char
+               char_t first = *value;
+
+               // 1*, t* (true), T* (True), y* (yes), Y* (YES)
+               return (first == '1' || first == 't' || first == 'T' || first == 'y' || first == 'Y');
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       PUGI__FN long long get_value_llong(const char_t* value, long long def)
+       {
+               if (!value) return def;
+
+               int base = get_integer_base(value);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               #ifdef PUGI__MSVC_CRT_VERSION
+                       return _wcstoi64(value, 0, base);
+               #else
+                       return wcstoll(value, 0, base);
+               #endif
+       #else
+               #ifdef PUGI__MSVC_CRT_VERSION
+                       return _strtoi64(value, 0, base);
+               #else
+                       return strtoll(value, 0, base);
+               #endif
+       #endif
+       }
+
+       PUGI__FN unsigned long long get_value_ullong(const char_t* value, unsigned long long def)
+       {
+               if (!value) return def;
+
+               int base = get_integer_base(value);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               #ifdef PUGI__MSVC_CRT_VERSION
+                       return _wcstoui64(value, 0, base);
+               #else
+                       return wcstoull(value, 0, base);
+               #endif
+       #else
+               #ifdef PUGI__MSVC_CRT_VERSION
+                       return _strtoui64(value, 0, base);
+               #else
+                       return strtoull(value, 0, base);
+               #endif
+       #endif
+       }
+#endif
+
+       // set value with conversion functions
+       PUGI__FN bool set_value_buffer(char_t*& dest, uintptr_t& header, uintptr_t header_mask, char (&buf)[128])
+       {
+       #ifdef PUGIXML_WCHAR_MODE
+               char_t wbuf[128];
+               impl::widen_ascii(wbuf, buf);
+
+               return strcpy_insitu(dest, header, header_mask, wbuf);
+       #else
+               return strcpy_insitu(dest, header, header_mask, buf);
+       #endif
+       }
+
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, int value)
+       {
+               char buf[128];
+               sprintf(buf, "%d", value);
+       
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, unsigned int value)
+       {
+               char buf[128];
+               sprintf(buf, "%u", value);
+
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, float value)
+       {
+               char buf[128];
+               sprintf(buf, "%.9g", value);
+
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+       
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, double value)
+       {
+               char buf[128];
+               sprintf(buf, "%.17g", value);
+
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+       
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, bool value)
+       {
+               return strcpy_insitu(dest, header, header_mask, value ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false"));
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, long long value)
+       {
+               char buf[128];
+               sprintf(buf, "%lld", value);
+       
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, unsigned long long value)
+       {
+               char buf[128];
+               sprintf(buf, "%llu", value);
+       
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+#endif
+
+       // we need to get length of entire file to load it in memory; the only (relatively) sane way to do it is via seek/tell trick
+       PUGI__FN xml_parse_status get_file_size(FILE* file, size_t& out_result)
+       {
+       #if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE)
+               // there are 64-bit versions of fseek/ftell, let's use them
+               typedef __int64 length_type;
+
+               _fseeki64(file, 0, SEEK_END);
+               length_type length = _ftelli64(file);
+               _fseeki64(file, 0, SEEK_SET);
+       #elif defined(__MINGW32__) && !defined(__NO_MINGW_LFS) && (!defined(__STRICT_ANSI__) || defined(__MINGW64_VERSION_MAJOR))
+               // there are 64-bit versions of fseek/ftell, let's use them
+               typedef off64_t length_type;
+
+               fseeko64(file, 0, SEEK_END);
+               length_type length = ftello64(file);
+               fseeko64(file, 0, SEEK_SET);
+       #else
+               // if this is a 32-bit OS, long is enough; if this is a unix system, long is 64-bit, which is enough; otherwise we can't do anything anyway.
+               typedef long length_type;
+
+               fseek(file, 0, SEEK_END);
+               length_type length = ftell(file);
+               fseek(file, 0, SEEK_SET);
+       #endif
+
+               // check for I/O errors
+               if (length < 0) return status_io_error;
+               
+               // check for overflow
+               size_t result = static_cast<size_t>(length);
+
+               if (static_cast<length_type>(result) != length) return status_out_of_memory;
+
+               // finalize
+               out_result = result;
+
+               return status_ok;
+       }
+
+       PUGI__FN size_t zero_terminate_buffer(void* buffer, size_t size, xml_encoding encoding) 
+       {
+               // We only need to zero-terminate if encoding conversion does not do it for us
+       #ifdef PUGIXML_WCHAR_MODE
+               xml_encoding wchar_encoding = get_wchar_encoding();
+
+               if (encoding == wchar_encoding || need_endian_swap_utf(encoding, wchar_encoding))
+               {
+                       size_t length = size / sizeof(char_t);
+
+                       static_cast<char_t*>(buffer)[length] = 0;
+                       return (length + 1) * sizeof(char_t);
+               }
+       #else
+               if (encoding == encoding_utf8)
+               {
+                       static_cast<char*>(buffer)[size] = 0;
+                       return size + 1;
+               }
+       #endif
+
+               return size;
+       }
+
+       PUGI__FN xml_parse_result load_file_impl(xml_document& doc, FILE* file, unsigned int options, xml_encoding encoding)
+       {
+               if (!file) return make_parse_result(status_file_not_found);
+
+               // get file size (can result in I/O errors)
+               size_t size = 0;
+               xml_parse_status size_status = get_file_size(file, size);
+
+               if (size_status != status_ok)
+               {
+                       fclose(file);
+                       return make_parse_result(size_status);
+               }
+               
+               size_t max_suffix_size = sizeof(char_t);
+
+               // allocate buffer for the whole file
+               char* contents = static_cast<char*>(xml_memory::allocate(size + max_suffix_size));
+
+               if (!contents)
+               {
+                       fclose(file);
+                       return make_parse_result(status_out_of_memory);
+               }
+
+               // read file in memory
+               size_t read_size = fread(contents, 1, size, file);
+               fclose(file);
+
+               if (read_size != size)
+               {
+                       xml_memory::deallocate(contents);
+                       return make_parse_result(status_io_error);
+               }
+
+               xml_encoding real_encoding = get_buffer_encoding(encoding, contents, size);
+               
+               return doc.load_buffer_inplace_own(contents, zero_terminate_buffer(contents, size, real_encoding), options, real_encoding);
+       }
+
+#ifndef PUGIXML_NO_STL
+       template <typename T> struct xml_stream_chunk
+       {
+               static xml_stream_chunk* create()
+               {
+                       void* memory = xml_memory::allocate(sizeof(xml_stream_chunk));
+                       
+                       return new (memory) xml_stream_chunk();
+               }
+
+               static void destroy(void* ptr)
+               {
+                       xml_stream_chunk* chunk = static_cast<xml_stream_chunk*>(ptr);
+
+                       // free chunk chain
+                       while (chunk)
+                       {
+                               xml_stream_chunk* next_ = chunk->next;
+
+                               xml_memory::deallocate(chunk);
+
+                               chunk = next_;
+                       }
+               }
+
+               xml_stream_chunk(): next(0), size(0)
+               {
+               }
+
+               xml_stream_chunk* next;
+               size_t size;
+
+               T data[xml_memory_page_size / sizeof(T)];
+       };
+
+       template <typename T> PUGI__FN xml_parse_status load_stream_data_noseek(std::basic_istream<T>& stream, void** out_buffer, size_t* out_size)
+       {
+               buffer_holder chunks(0, xml_stream_chunk<T>::destroy);
+
+               // read file to a chunk list
+               size_t total = 0;
+               xml_stream_chunk<T>* last = 0;
+
+               while (!stream.eof())
+               {
+                       // allocate new chunk
+                       xml_stream_chunk<T>* chunk = xml_stream_chunk<T>::create();
+                       if (!chunk) return status_out_of_memory;
+
+                       // append chunk to list
+                       if (last) last = last->next = chunk;
+                       else chunks.data = last = chunk;
+
+                       // read data to chunk
+                       stream.read(chunk->data, static_cast<std::streamsize>(sizeof(chunk->data) / sizeof(T)));
+                       chunk->size = static_cast<size_t>(stream.gcount()) * sizeof(T);
+
+                       // read may set failbit | eofbit in case gcount() is less than read length, so check for other I/O errors
+                       if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error;
+
+                       // guard against huge files (chunk size is small enough to make this overflow check work)
+                       if (total + chunk->size < total) return status_out_of_memory;
+                       total += chunk->size;
+               }
+
+               size_t max_suffix_size = sizeof(char_t);
+
+               // copy chunk list to a contiguous buffer
+               char* buffer = static_cast<char*>(xml_memory::allocate(total + max_suffix_size));
+               if (!buffer) return status_out_of_memory;
+
+               char* write = buffer;
+
+               for (xml_stream_chunk<T>* chunk = static_cast<xml_stream_chunk<T>*>(chunks.data); chunk; chunk = chunk->next)
+               {
+                       assert(write + chunk->size <= buffer + total);
+                       memcpy(write, chunk->data, chunk->size);
+                       write += chunk->size;
+               }
+
+               assert(write == buffer + total);
+
+               // return buffer
+               *out_buffer = buffer;
+               *out_size = total;
+
+               return status_ok;
+       }
+
+       template <typename T> PUGI__FN xml_parse_status load_stream_data_seek(std::basic_istream<T>& stream, void** out_buffer, size_t* out_size)
+       {
+               // get length of remaining data in stream
+               typename std::basic_istream<T>::pos_type pos = stream.tellg();
+               stream.seekg(0, std::ios::end);
+               std::streamoff length = stream.tellg() - pos;
+               stream.seekg(pos);
+
+               if (stream.fail() || pos < 0) return status_io_error;
+
+               // guard against huge files
+               size_t read_length = static_cast<size_t>(length);
+
+               if (static_cast<std::streamsize>(read_length) != length || length < 0) return status_out_of_memory;
+
+               size_t max_suffix_size = sizeof(char_t);
+
+               // read stream data into memory (guard against stream exceptions with buffer holder)
+               buffer_holder buffer(xml_memory::allocate(read_length * sizeof(T) + max_suffix_size), xml_memory::deallocate);
+               if (!buffer.data) return status_out_of_memory;
+
+               stream.read(static_cast<T*>(buffer.data), static_cast<std::streamsize>(read_length));
+
+               // read may set failbit | eofbit in case gcount() is less than read_length (i.e. line ending conversion), so check for other I/O errors
+               if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error;
+
+               // return buffer
+               size_t actual_length = static_cast<size_t>(stream.gcount());
+               assert(actual_length <= read_length);
+               
+               *out_buffer = buffer.release();
+               *out_size = actual_length * sizeof(T);
+
+               return status_ok;
+       }
+
+       template <typename T> PUGI__FN xml_parse_result load_stream_impl(xml_document& doc, std::basic_istream<T>& stream, unsigned int options, xml_encoding encoding)
+       {
+               void* buffer = 0;
+               size_t size = 0;
+               xml_parse_status status = status_ok;
+
+               // if stream has an error bit set, bail out (otherwise tellg() can fail and we'll clear error bits)
+               if (stream.fail()) return make_parse_result(status_io_error);
+
+               // load stream to memory (using seek-based implementation if possible, since it's faster and takes less memory)
+               if (stream.tellg() < 0)
+               {
+                       stream.clear(); // clear error flags that could be set by a failing tellg
+                       status = load_stream_data_noseek(stream, &buffer, &size);
+               }
+               else
+                       status = load_stream_data_seek(stream, &buffer, &size);
+
+               if (status != status_ok) return make_parse_result(status);
+
+               xml_encoding real_encoding = get_buffer_encoding(encoding, buffer, size);
+               
+               return doc.load_buffer_inplace_own(buffer, zero_terminate_buffer(buffer, size, real_encoding), options, real_encoding);
+       }
+#endif
+
+#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__) || (defined(__MINGW32__) && (!defined(__STRICT_ANSI__) || defined(__MINGW64_VERSION_MAJOR)))
+       PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode)
+       {
+               return _wfopen(path, mode);
+       }
+#else
+       PUGI__FN char* convert_path_heap(const wchar_t* str)
+       {
+               assert(str);
+
+               // first pass: get length in utf8 characters
+               size_t length = strlength_wide(str);
+               size_t size = as_utf8_begin(str, length);
+
+               // allocate resulting string
+               char* result = static_cast<char*>(xml_memory::allocate(size + 1));
+               if (!result) return 0;
+
+               // second pass: convert to utf8
+               as_utf8_end(result, size, str, length);
+
+               return result;
+       }
+
+       PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode)
+       {
+               // there is no standard function to open wide paths, so our best bet is to try utf8 path
+               char* path_utf8 = convert_path_heap(path);
+               if (!path_utf8) return 0;
+
+               // convert mode to ASCII (we mirror _wfopen interface)
+               char mode_ascii[4] = {0};
+               for (size_t i = 0; mode[i]; ++i) mode_ascii[i] = static_cast<char>(mode[i]);
+
+               // try to open the utf8 path
+               FILE* result = fopen(path_utf8, mode_ascii);
+
+               // free dummy buffer
+               xml_memory::deallocate(path_utf8);
+
+               return result;
+       }
+#endif
+
+       PUGI__FN bool save_file_impl(const xml_document& doc, FILE* file, const char_t* indent, unsigned int flags, xml_encoding encoding)
+       {
+               if (!file) return false;
+
+               xml_writer_file writer(file);
+               doc.save(writer, indent, flags, encoding);
+
+               int result = ferror(file);
+
+               fclose(file);
+
+               return result == 0;
+       }
+
+       PUGI__FN xml_parse_result load_buffer_impl(xml_document_struct* doc, xml_node_struct* root, void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own, char_t** out_buffer)
+       {
+               // check input buffer
+               assert(contents || size == 0);
+
+               // get actual encoding
+               xml_encoding buffer_encoding = impl::get_buffer_encoding(encoding, contents, size);
+
+               // get private buffer
+               char_t* buffer = 0;
+               size_t length = 0;
+
+               if (!impl::convert_buffer(buffer, length, buffer_encoding, contents, size, is_mutable)) return impl::make_parse_result(status_out_of_memory);
+               
+               // delete original buffer if we performed a conversion
+               if (own && buffer != contents && contents) impl::xml_memory::deallocate(contents);
+
+               // store buffer for offset_debug
+               doc->buffer = buffer;
+
+               // parse
+               xml_parse_result res = impl::xml_parser::parse(buffer, length, doc, root, options);
+
+               // remember encoding
+               res.encoding = buffer_encoding;
+
+               // grab onto buffer if it's our buffer, user is responsible for deallocating contents himself
+               if (own || buffer != contents) *out_buffer = buffer;
+
+               return res;
+       }
+PUGI__NS_END
+
+namespace pugi
+{
+       PUGI__FN xml_writer_file::xml_writer_file(void* file_): file(file_)
+       {
+       }
+
+       PUGI__FN void xml_writer_file::write(const void* data, size_t size)
+       {
+               size_t result = fwrite(data, 1, size, static_cast<FILE*>(file));
+               (void)!result; // unfortunately we can't do proper error handling here
+       }
+
+#ifndef PUGIXML_NO_STL
+       PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream): narrow_stream(&stream), wide_stream(0)
+       {
+       }
+
+       PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream): narrow_stream(0), wide_stream(&stream)
+       {
+       }
+
+       PUGI__FN void xml_writer_stream::write(const void* data, size_t size)
+       {
+               if (narrow_stream)
+               {
+                       assert(!wide_stream);
+                       narrow_stream->write(reinterpret_cast<const char*>(data), static_cast<std::streamsize>(size));
+               }
+               else
+               {
+                       assert(wide_stream);
+                       assert(size % sizeof(wchar_t) == 0);
+
+                       wide_stream->write(reinterpret_cast<const wchar_t*>(data), static_cast<std::streamsize>(size / sizeof(wchar_t)));
+               }
+       }
+#endif
+
+       PUGI__FN xml_tree_walker::xml_tree_walker(): _depth(0)
+       {
+       }
+       
+       PUGI__FN xml_tree_walker::~xml_tree_walker()
+       {
+       }
+
+       PUGI__FN int xml_tree_walker::depth() const
+       {
+               return _depth;
+       }
+
+       PUGI__FN bool xml_tree_walker::begin(xml_node&)
+       {
+               return true;
+       }
+
+       PUGI__FN bool xml_tree_walker::end(xml_node&)
+       {
+               return true;
+       }
+
+       PUGI__FN xml_attribute::xml_attribute(): _attr(0)
+       {
+       }
+
+       PUGI__FN xml_attribute::xml_attribute(xml_attribute_struct* attr): _attr(attr)
+       {
+       }
+
+       PUGI__FN static void unspecified_bool_xml_attribute(xml_attribute***)
+       {
+       }
+
+       PUGI__FN xml_attribute::operator xml_attribute::unspecified_bool_type() const
+       {
+               return _attr ? unspecified_bool_xml_attribute : 0;
+       }
+
+       PUGI__FN bool xml_attribute::operator!() const
+       {
+               return !_attr;
+       }
+
+       PUGI__FN bool xml_attribute::operator==(const xml_attribute& r) const
+       {
+               return (_attr == r._attr);
+       }
+       
+       PUGI__FN bool xml_attribute::operator!=(const xml_attribute& r) const
+       {
+               return (_attr != r._attr);
+       }
+
+       PUGI__FN bool xml_attribute::operator<(const xml_attribute& r) const
+       {
+               return (_attr < r._attr);
+       }
+       
+       PUGI__FN bool xml_attribute::operator>(const xml_attribute& r) const
+       {
+               return (_attr > r._attr);
+       }
+       
+       PUGI__FN bool xml_attribute::operator<=(const xml_attribute& r) const
+       {
+               return (_attr <= r._attr);
+       }
+       
+       PUGI__FN bool xml_attribute::operator>=(const xml_attribute& r) const
+       {
+               return (_attr >= r._attr);
+       }
+
+       PUGI__FN xml_attribute xml_attribute::next_attribute() const
+       {
+               return _attr ? xml_attribute(_attr->next_attribute) : xml_attribute();
+       }
+
+       PUGI__FN xml_attribute xml_attribute::previous_attribute() const
+       {
+               return _attr && _attr->prev_attribute_c->next_attribute ? xml_attribute(_attr->prev_attribute_c) : xml_attribute();
+       }
+
+       PUGI__FN const char_t* xml_attribute::as_string(const char_t* def) const
+       {
+               return (_attr && _attr->value) ? _attr->value : def;
+       }
+
+       PUGI__FN int xml_attribute::as_int(int def) const
+       {
+               return impl::get_value_int(_attr ? _attr->value : 0, def);
+       }
+
+       PUGI__FN unsigned int xml_attribute::as_uint(unsigned int def) const
+       {
+               return impl::get_value_uint(_attr ? _attr->value : 0, def);
+       }
+
+       PUGI__FN double xml_attribute::as_double(double def) const
+       {
+               return impl::get_value_double(_attr ? _attr->value : 0, def);
+       }
+
+       PUGI__FN float xml_attribute::as_float(float def) const
+       {
+               return impl::get_value_float(_attr ? _attr->value : 0, def);
+       }
+
+       PUGI__FN bool xml_attribute::as_bool(bool def) const
+       {
+               return impl::get_value_bool(_attr ? _attr->value : 0, def);
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       PUGI__FN long long xml_attribute::as_llong(long long def) const
+       {
+               return impl::get_value_llong(_attr ? _attr->value : 0, def);
+       }
+
+       PUGI__FN unsigned long long xml_attribute::as_ullong(unsigned long long def) const
+       {
+               return impl::get_value_ullong(_attr ? _attr->value : 0, def);
+       }
+#endif
+
+       PUGI__FN bool xml_attribute::empty() const
+       {
+               return !_attr;
+       }
+
+       PUGI__FN const char_t* xml_attribute::name() const
+       {
+               return (_attr && _attr->name) ? _attr->name : PUGIXML_TEXT("");
+       }
+
+       PUGI__FN const char_t* xml_attribute::value() const
+       {
+               return (_attr && _attr->value) ? _attr->value : PUGIXML_TEXT("");
+       }
+
+       PUGI__FN size_t xml_attribute::hash_value() const
+       {
+               return static_cast<size_t>(reinterpret_cast<uintptr_t>(_attr) / sizeof(xml_attribute_struct));
+       }
+
+       PUGI__FN xml_attribute_struct* xml_attribute::internal_object() const
+       {
+               return _attr;
+       }
+
+       PUGI__FN xml_attribute& xml_attribute::operator=(const char_t* rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+       
+       PUGI__FN xml_attribute& xml_attribute::operator=(int rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+
+       PUGI__FN xml_attribute& xml_attribute::operator=(unsigned int rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+
+       PUGI__FN xml_attribute& xml_attribute::operator=(double rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+       
+       PUGI__FN xml_attribute& xml_attribute::operator=(float rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+       
+       PUGI__FN xml_attribute& xml_attribute::operator=(bool rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       PUGI__FN xml_attribute& xml_attribute::operator=(long long rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+
+       PUGI__FN xml_attribute& xml_attribute::operator=(unsigned long long rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+#endif
+
+       PUGI__FN bool xml_attribute::set_name(const char_t* rhs)
+       {
+               if (!_attr) return false;
+               
+               return impl::strcpy_insitu(_attr->name, _attr->header, impl::xml_memory_page_name_allocated_mask, rhs);
+       }
+               
+       PUGI__FN bool xml_attribute::set_value(const char_t* rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::strcpy_insitu(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       PUGI__FN bool xml_attribute::set_value(int rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       PUGI__FN bool xml_attribute::set_value(unsigned int rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       PUGI__FN bool xml_attribute::set_value(double rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+       
+       PUGI__FN bool xml_attribute::set_value(float rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+       
+       PUGI__FN bool xml_attribute::set_value(bool rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       PUGI__FN bool xml_attribute::set_value(long long rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       PUGI__FN bool xml_attribute::set_value(unsigned long long rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+#endif
+
+#ifdef __BORLANDC__
+       PUGI__FN bool operator&&(const xml_attribute& lhs, bool rhs)
+       {
+               return (bool)lhs && rhs;
+       }
+
+       PUGI__FN bool operator||(const xml_attribute& lhs, bool rhs)
+       {
+               return (bool)lhs || rhs;
+       }
+#endif
+
+       PUGI__FN xml_node::xml_node(): _root(0)
+       {
+       }
+
+       PUGI__FN xml_node::xml_node(xml_node_struct* p): _root(p)
+       {
+       }
+       
+       PUGI__FN static void unspecified_bool_xml_node(xml_node***)
+       {
+       }
+
+       PUGI__FN xml_node::operator xml_node::unspecified_bool_type() const
+       {
+               return _root ? unspecified_bool_xml_node : 0;
+       }
+
+       PUGI__FN bool xml_node::operator!() const
+       {
+               return !_root;
+       }
+
+       PUGI__FN xml_node::iterator xml_node::begin() const
+       {
+               return iterator(_root ? _root->first_child : 0, _root);
+       }
+
+       PUGI__FN xml_node::iterator xml_node::end() const
+       {
+               return iterator(0, _root);
+       }
+       
+       PUGI__FN xml_node::attribute_iterator xml_node::attributes_begin() const
+       {
+               return attribute_iterator(_root ? _root->first_attribute : 0, _root);
+       }
+
+       PUGI__FN xml_node::attribute_iterator xml_node::attributes_end() const
+       {
+               return attribute_iterator(0, _root);
+       }
+       
+       PUGI__FN xml_object_range<xml_node_iterator> xml_node::children() const
+       {
+               return xml_object_range<xml_node_iterator>(begin(), end());
+       }
+
+       PUGI__FN xml_object_range<xml_named_node_iterator> xml_node::children(const char_t* name_) const
+       {
+               return xml_object_range<xml_named_node_iterator>(xml_named_node_iterator(child(name_)._root, _root, name_), xml_named_node_iterator(0, _root, name_));
+       }
+
+       PUGI__FN xml_object_range<xml_attribute_iterator> xml_node::attributes() const
+       {
+               return xml_object_range<xml_attribute_iterator>(attributes_begin(), attributes_end());
+       }
+
+       PUGI__FN bool xml_node::operator==(const xml_node& r) const
+       {
+               return (_root == r._root);
+       }
+
+       PUGI__FN bool xml_node::operator!=(const xml_node& r) const
+       {
+               return (_root != r._root);
+       }
+
+       PUGI__FN bool xml_node::operator<(const xml_node& r) const
+       {
+               return (_root < r._root);
+       }
+       
+       PUGI__FN bool xml_node::operator>(const xml_node& r) const
+       {
+               return (_root > r._root);
+       }
+       
+       PUGI__FN bool xml_node::operator<=(const xml_node& r) const
+       {
+               return (_root <= r._root);
+       }
+       
+       PUGI__FN bool xml_node::operator>=(const xml_node& r) const
+       {
+               return (_root >= r._root);
+       }
+
+       PUGI__FN bool xml_node::empty() const
+       {
+               return !_root;
+       }
+       
+       PUGI__FN const char_t* xml_node::name() const
+       {
+               return (_root && _root->name) ? _root->name : PUGIXML_TEXT("");
+       }
+
+       PUGI__FN xml_node_type xml_node::type() const
+       {
+               return _root ? PUGI__NODETYPE(_root) : node_null;
+       }
+       
+       PUGI__FN const char_t* xml_node::value() const
+       {
+               return (_root && _root->value) ? _root->value : PUGIXML_TEXT("");
+       }
+       
+       PUGI__FN xml_node xml_node::child(const char_t* name_) const
+       {
+               if (!_root) return xml_node();
+
+               for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+                       if (i->name && impl::strequal(name_, i->name)) return xml_node(i);
+
+               return xml_node();
+       }
+
+       PUGI__FN xml_attribute xml_node::attribute(const char_t* name_) const
+       {
+               if (!_root) return xml_attribute();
+
+               for (xml_attribute_struct* i = _root->first_attribute; i; i = i->next_attribute)
+                       if (i->name && impl::strequal(name_, i->name))
+                               return xml_attribute(i);
+               
+               return xml_attribute();
+       }
+       
+       PUGI__FN xml_node xml_node::next_sibling(const char_t* name_) const
+       {
+               if (!_root) return xml_node();
+               
+               for (xml_node_struct* i = _root->next_sibling; i; i = i->next_sibling)
+                       if (i->name && impl::strequal(name_, i->name)) return xml_node(i);
+
+               return xml_node();
+       }
+
+       PUGI__FN xml_node xml_node::next_sibling() const
+       {
+               return _root ? xml_node(_root->next_sibling) : xml_node();
+       }
+
+       PUGI__FN xml_node xml_node::previous_sibling(const char_t* name_) const
+       {
+               if (!_root) return xml_node();
+               
+               for (xml_node_struct* i = _root->prev_sibling_c; i->next_sibling; i = i->prev_sibling_c)
+                       if (i->name && impl::strequal(name_, i->name)) return xml_node(i);
+
+               return xml_node();
+       }
+
+       PUGI__FN xml_node xml_node::previous_sibling() const
+       {
+               if (!_root) return xml_node();
+               
+               if (_root->prev_sibling_c->next_sibling) return xml_node(_root->prev_sibling_c);
+               else return xml_node();
+       }
+
+       PUGI__FN xml_node xml_node::parent() const
+       {
+               return _root ? xml_node(_root->parent) : xml_node();
+       }
+
+       PUGI__FN xml_node xml_node::root() const
+       {
+               return _root ? xml_node(&impl::get_document(_root)) : xml_node();
+       }
+
+       PUGI__FN xml_text xml_node::text() const
+       {
+               return xml_text(_root);
+       }
+
+       PUGI__FN const char_t* xml_node::child_value() const
+       {
+               if (!_root) return PUGIXML_TEXT("");
+               
+               for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+                       if (i->value && impl::is_text_node(i))
+                               return i->value;
+
+               return PUGIXML_TEXT("");
+       }
+
+       PUGI__FN const char_t* xml_node::child_value(const char_t* name_) const
+       {
+               return child(name_).child_value();
+       }
+
+       PUGI__FN xml_attribute xml_node::first_attribute() const
+       {
+               return _root ? xml_attribute(_root->first_attribute) : xml_attribute();
+       }
+
+       PUGI__FN xml_attribute xml_node::last_attribute() const
+       {
+               return _root && _root->first_attribute ? xml_attribute(_root->first_attribute->prev_attribute_c) : xml_attribute();
+       }
+
+       PUGI__FN xml_node xml_node::first_child() const
+       {
+               return _root ? xml_node(_root->first_child) : xml_node();
+       }
+
+       PUGI__FN xml_node xml_node::last_child() const
+       {
+               return _root && _root->first_child ? xml_node(_root->first_child->prev_sibling_c) : xml_node();
+       }
+
+       PUGI__FN bool xml_node::set_name(const char_t* rhs)
+       {
+               switch (type())
+               {
+               case node_pi:
+               case node_declaration:
+               case node_element:
+                       return impl::strcpy_insitu(_root->name, _root->header, impl::xml_memory_page_name_allocated_mask, rhs);
+
+               default:
+                       return false;
+               }
+       }
+               
+       PUGI__FN bool xml_node::set_value(const char_t* rhs)
+       {
+               switch (type())
+               {
+               case node_pi:
+               case node_cdata:
+               case node_pcdata:
+               case node_comment:
+               case node_doctype:
+                       return impl::strcpy_insitu(_root->value, _root->header, impl::xml_memory_page_value_allocated_mask, rhs);
+
+               default:
+                       return false;
+               }
+       }
+
+       PUGI__FN xml_attribute xml_node::append_attribute(const char_t* name_)
+       {
+               if (!impl::allow_insert_attribute(type())) return xml_attribute();
+               
+               xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+               if (!a) return xml_attribute();
+
+               impl::append_attribute(a._attr, _root);
+
+               a.set_name(name_);
+               
+               return a;
+       }
+
+       PUGI__FN xml_attribute xml_node::prepend_attribute(const char_t* name_)
+       {
+               if (!impl::allow_insert_attribute(type())) return xml_attribute();
+               
+               xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+               if (!a) return xml_attribute();
+
+               impl::prepend_attribute(a._attr, _root);
+
+               a.set_name(name_);
+
+               return a;
+       }
+
+       PUGI__FN xml_attribute xml_node::insert_attribute_after(const char_t* name_, const xml_attribute& attr)
+       {
+               if (!impl::allow_insert_attribute(type())) return xml_attribute();
+               if (!attr || !impl::is_attribute_of(attr._attr, _root)) return xml_attribute();
+               
+               xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+               if (!a) return xml_attribute();
+
+               impl::insert_attribute_after(a._attr, attr._attr, _root);
+
+               a.set_name(name_);
+
+               return a;
+       }
+
+       PUGI__FN xml_attribute xml_node::insert_attribute_before(const char_t* name_, const xml_attribute& attr)
+       {
+               if (!impl::allow_insert_attribute(type())) return xml_attribute();
+               if (!attr || !impl::is_attribute_of(attr._attr, _root)) return xml_attribute();
+               
+               xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+               if (!a) return xml_attribute();
+
+               impl::insert_attribute_before(a._attr, attr._attr, _root);
+
+               a.set_name(name_);
+
+               return a;
+       }
+
+       PUGI__FN xml_attribute xml_node::append_copy(const xml_attribute& proto)
+       {
+               if (!proto) return xml_attribute();
+
+               xml_attribute result = append_attribute(proto.name());
+               result.set_value(proto.value());
+
+               return result;
+       }
+
+       PUGI__FN xml_attribute xml_node::prepend_copy(const xml_attribute& proto)
+       {
+               if (!proto) return xml_attribute();
+
+               xml_attribute result = prepend_attribute(proto.name());
+               result.set_value(proto.value());
+
+               return result;
+       }
+
+       PUGI__FN xml_attribute xml_node::insert_copy_after(const xml_attribute& proto, const xml_attribute& attr)
+       {
+               if (!proto) return xml_attribute();
+
+               xml_attribute result = insert_attribute_after(proto.name(), attr);
+               result.set_value(proto.value());
+
+               return result;
+       }
+
+       PUGI__FN xml_attribute xml_node::insert_copy_before(const xml_attribute& proto, const xml_attribute& attr)
+       {
+               if (!proto) return xml_attribute();
+
+               xml_attribute result = insert_attribute_before(proto.name(), attr);
+               result.set_value(proto.value());
+
+               return result;
+       }
+
+       PUGI__FN xml_node xml_node::append_child(xml_node_type type_)
+       {
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::append_node(n._root, _root);
+
+               if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::prepend_child(xml_node_type type_)
+       {
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::prepend_node(n._root, _root);
+                               
+               if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::insert_child_before(xml_node_type type_, const xml_node& node)
+       {
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+       
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::insert_node_before(n._root, node._root);
+
+               if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::insert_child_after(xml_node_type type_, const xml_node& node)
+       {
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+       
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::insert_node_after(n._root, node._root);
+
+               if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::append_child(const char_t* name_)
+       {
+               xml_node result = append_child(node_element);
+
+               result.set_name(name_);
+
+               return result;
+       }
+
+       PUGI__FN xml_node xml_node::prepend_child(const char_t* name_)
+       {
+               xml_node result = prepend_child(node_element);
+
+               result.set_name(name_);
+
+               return result;
+       }
+
+       PUGI__FN xml_node xml_node::insert_child_after(const char_t* name_, const xml_node& node)
+       {
+               xml_node result = insert_child_after(node_element, node);
+
+               result.set_name(name_);
+
+               return result;
+       }
+
+       PUGI__FN xml_node xml_node::insert_child_before(const char_t* name_, const xml_node& node)
+       {
+               xml_node result = insert_child_before(node_element, node);
+
+               result.set_name(name_);
+
+               return result;
+       }
+
+       PUGI__FN xml_node xml_node::append_copy(const xml_node& proto)
+       {
+               xml_node_type type_ = proto.type();
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::append_node(n._root, _root);
+               impl::node_copy_tree(n._root, proto._root);
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::prepend_copy(const xml_node& proto)
+       {
+               xml_node_type type_ = proto.type();
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::prepend_node(n._root, _root);
+               impl::node_copy_tree(n._root, proto._root);
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::insert_copy_after(const xml_node& proto, const xml_node& node)
+       {
+               xml_node_type type_ = proto.type();
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::insert_node_after(n._root, node._root);
+               impl::node_copy_tree(n._root, proto._root);
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::insert_copy_before(const xml_node& proto, const xml_node& node)
+       {
+               xml_node_type type_ = proto.type();
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::insert_node_before(n._root, node._root);
+               impl::node_copy_tree(n._root, proto._root);
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::append_move(const xml_node& moved)
+       {
+               if (!impl::allow_move(*this, moved)) return xml_node();
+
+               // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers
+               impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask;
+
+               impl::remove_node(moved._root);
+               impl::append_node(moved._root, _root);
+
+               return moved;
+       }
+
+       PUGI__FN xml_node xml_node::prepend_move(const xml_node& moved)
+       {
+               if (!impl::allow_move(*this, moved)) return xml_node();
+
+               // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers
+               impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask;
+
+               impl::remove_node(moved._root);
+               impl::prepend_node(moved._root, _root);
+
+               return moved;
+       }
+
+       PUGI__FN xml_node xml_node::insert_move_after(const xml_node& moved, const xml_node& node)
+       {
+               if (!impl::allow_move(*this, moved)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+               if (moved._root == node._root) return xml_node();
+
+               // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers
+               impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask;
+
+               impl::remove_node(moved._root);
+               impl::insert_node_after(moved._root, node._root);
+
+               return moved;
+       }
+
+       PUGI__FN xml_node xml_node::insert_move_before(const xml_node& moved, const xml_node& node)
+       {
+               if (!impl::allow_move(*this, moved)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+               if (moved._root == node._root) return xml_node();
+
+               // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers
+               impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask;
+
+               impl::remove_node(moved._root);
+               impl::insert_node_before(moved._root, node._root);
+
+               return moved;
+       }
+
+       PUGI__FN bool xml_node::remove_attribute(const char_t* name_)
+       {
+               return remove_attribute(attribute(name_));
+       }
+
+       PUGI__FN bool xml_node::remove_attribute(const xml_attribute& a)
+       {
+               if (!_root || !a._attr) return false;
+               if (!impl::is_attribute_of(a._attr, _root)) return false;
+
+               impl::remove_attribute(a._attr, _root);
+               impl::destroy_attribute(a._attr, impl::get_allocator(_root));
+
+               return true;
+       }
+
+       PUGI__FN bool xml_node::remove_child(const char_t* name_)
+       {
+               return remove_child(child(name_));
+       }
+
+       PUGI__FN bool xml_node::remove_child(const xml_node& n)
+       {
+               if (!_root || !n._root || n._root->parent != _root) return false;
+
+               impl::remove_node(n._root);
+               impl::destroy_node(n._root, impl::get_allocator(_root));
+
+               return true;
+       }
+
+       PUGI__FN xml_parse_result xml_node::append_buffer(const void* contents, size_t size, unsigned int options, xml_encoding encoding)
+       {
+               // append_buffer is only valid for elements/documents
+               if (!impl::allow_insert_child(type(), node_element)) return impl::make_parse_result(status_append_invalid_root);
+
+               // get document node
+               impl::xml_document_struct* doc = &impl::get_document(_root);
+
+               // disable document_buffer_order optimization since in a document with multiple buffers comparing buffer pointers does not make sense
+               doc->header |= impl::xml_memory_page_contents_shared_mask;
+               
+               // get extra buffer element (we'll store the document fragment buffer there so that we can deallocate it later)
+               impl::xml_memory_page* page = 0;
+               impl::xml_extra_buffer* extra = static_cast<impl::xml_extra_buffer*>(doc->allocate_memory(sizeof(impl::xml_extra_buffer), page));
+               (void)page;
+
+               if (!extra) return impl::make_parse_result(status_out_of_memory);
+
+               // save name; name of the root has to be NULL before parsing - otherwise closing node mismatches will not be detected at the top level
+               char_t* rootname = _root->name;
+               _root->name = 0;
+
+               // parse
+               char_t* buffer = 0;
+               xml_parse_result res = impl::load_buffer_impl(doc, _root, const_cast<void*>(contents), size, options, encoding, false, false, &buffer);
+
+               // restore name
+               _root->name = rootname;
+
+               // add extra buffer to the list
+               extra->buffer = buffer;
+               extra->next = doc->extra_buffers;
+               doc->extra_buffers = extra;
+
+               return res;
+       }
+
+       PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* name_, const char_t* attr_name, const char_t* attr_value) const
+       {
+               if (!_root) return xml_node();
+               
+               for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+                       if (i->name && impl::strequal(name_, i->name))
+                       {
+                               for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute)
+                                       if (a->name && impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value ? a->value : PUGIXML_TEXT("")))
+                                               return xml_node(i);
+                       }
+
+               return xml_node();
+       }
+
+       PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const
+       {
+               if (!_root) return xml_node();
+               
+               for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+                       for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute)
+                               if (a->name && impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value ? a->value : PUGIXML_TEXT("")))
+                                       return xml_node(i);
+
+               return xml_node();
+       }
+
+#ifndef PUGIXML_NO_STL
+       PUGI__FN string_t xml_node::path(char_t delimiter) const
+       {
+               xml_node cursor = *this; // Make a copy.
+               
+               string_t result = cursor.name();
+
+               while (cursor.parent())
+               {
+                       cursor = cursor.parent();
+                       
+                       string_t temp = cursor.name();
+                       temp += delimiter;
+                       temp += result;
+                       result.swap(temp);
+               }
+
+               return result;
+       }
+#endif
+
+       PUGI__FN xml_node xml_node::first_element_by_path(const char_t* path_, char_t delimiter) const
+       {
+               xml_node found = *this; // Current search context.
+
+               if (!_root || !path_ || !path_[0]) return found;
+
+               if (path_[0] == delimiter)
+               {
+                       // Absolute path; e.g. '/foo/bar'
+                       found = found.root();
+                       ++path_;
+               }
+
+               const char_t* path_segment = path_;
+
+               while (*path_segment == delimiter) ++path_segment;
+
+               const char_t* path_segment_end = path_segment;
+
+               while (*path_segment_end && *path_segment_end != delimiter) ++path_segment_end;
+
+               if (path_segment == path_segment_end) return found;
+
+               const char_t* next_segment = path_segment_end;
+
+               while (*next_segment == delimiter) ++next_segment;
+
+               if (*path_segment == '.' && path_segment + 1 == path_segment_end)
+                       return found.first_element_by_path(next_segment, delimiter);
+               else if (*path_segment == '.' && *(path_segment+1) == '.' && path_segment + 2 == path_segment_end)
+                       return found.parent().first_element_by_path(next_segment, delimiter);
+               else
+               {
+                       for (xml_node_struct* j = found._root->first_child; j; j = j->next_sibling)
+                       {
+                               if (j->name && impl::strequalrange(j->name, path_segment, static_cast<size_t>(path_segment_end - path_segment)))
+                               {
+                                       xml_node subsearch = xml_node(j).first_element_by_path(next_segment, delimiter);
+
+                                       if (subsearch) return subsearch;
+                               }
+                       }
+
+                       return xml_node();
+               }
+       }
+
+       PUGI__FN bool xml_node::traverse(xml_tree_walker& walker)
+       {
+               walker._depth = -1;
+               
+               xml_node arg_begin = *this;
+               if (!walker.begin(arg_begin)) return false;
+
+               xml_node cur = first_child();
+                               
+               if (cur)
+               {
+                       ++walker._depth;
+
+                       do 
+                       {
+                               xml_node arg_for_each = cur;
+                               if (!walker.for_each(arg_for_each))
+                                       return false;
+                                               
+                               if (cur.first_child())
+                               {
+                                       ++walker._depth;
+                                       cur = cur.first_child();
+                               }
+                               else if (cur.next_sibling())
+                                       cur = cur.next_sibling();
+                               else
+                               {
+                                       // Borland C++ workaround
+                                       while (!cur.next_sibling() && cur != *this && !cur.parent().empty())
+                                       {
+                                               --walker._depth;
+                                               cur = cur.parent();
+                                       }
+                                               
+                                       if (cur != *this)
+                                               cur = cur.next_sibling();
+                               }
+                       }
+                       while (cur && cur != *this);
+               }
+
+               assert(walker._depth == -1);
+
+               xml_node arg_end = *this;
+               return walker.end(arg_end);
+       }
+
+       PUGI__FN size_t xml_node::hash_value() const
+       {
+               return static_cast<size_t>(reinterpret_cast<uintptr_t>(_root) / sizeof(xml_node_struct));
+       }
+
+       PUGI__FN xml_node_struct* xml_node::internal_object() const
+       {
+               return _root;
+       }
+
+       PUGI__FN void xml_node::print(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const
+       {
+               if (!_root) return;
+
+               impl::xml_buffered_writer buffered_writer(writer, encoding);
+
+               impl::node_output(buffered_writer, _root, indent, flags, depth);
+       }
+
+#ifndef PUGIXML_NO_STL
+       PUGI__FN void xml_node::print(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const
+       {
+               xml_writer_stream writer(stream);
+
+               print(writer, indent, flags, encoding, depth);
+       }
+
+       PUGI__FN void xml_node::print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent, unsigned int flags, unsigned int depth) const
+       {
+               xml_writer_stream writer(stream);
+
+               print(writer, indent, flags, encoding_wchar, depth);
+       }
+#endif
+
+       PUGI__FN ptrdiff_t xml_node::offset_debug() const
+       {
+               if (!_root) return -1;
+
+               impl::xml_document_struct& doc = impl::get_document(_root);
+
+               // we can determine the offset reliably only if there is exactly once parse buffer
+               if (!doc.buffer || doc.extra_buffers) return -1;
+
+               switch (type())
+               {
+               case node_document:
+                       return 0;
+
+               case node_element:
+               case node_declaration:
+               case node_pi:
+                       return _root->name && (_root->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0 ? _root->name - doc.buffer : -1;
+
+               case node_pcdata:
+               case node_cdata:
+               case node_comment:
+               case node_doctype:
+                       return _root->value && (_root->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0 ? _root->value - doc.buffer : -1;
+
+               default:
+                       return -1;
+               }
+       }
+
+#ifdef __BORLANDC__
+       PUGI__FN bool operator&&(const xml_node& lhs, bool rhs)
+       {
+               return (bool)lhs && rhs;
+       }
+
+       PUGI__FN bool operator||(const xml_node& lhs, bool rhs)
+       {
+               return (bool)lhs || rhs;
+       }
+#endif
+
+       PUGI__FN xml_text::xml_text(xml_node_struct* root): _root(root)
+       {
+       }
+
+       PUGI__FN xml_node_struct* xml_text::_data() const
+       {
+               if (!_root || impl::is_text_node(_root)) return _root;
+
+               for (xml_node_struct* node = _root->first_child; node; node = node->next_sibling)
+                       if (impl::is_text_node(node))
+                               return node;
+
+               return 0;
+       }
+
+       PUGI__FN xml_node_struct* xml_text::_data_new()
+       {
+               xml_node_struct* d = _data();
+               if (d) return d;
+
+               return xml_node(_root).append_child(node_pcdata).internal_object();
+       }
+
+       PUGI__FN xml_text::xml_text(): _root(0)
+       {
+       }
+
+       PUGI__FN static void unspecified_bool_xml_text(xml_text***)
+       {
+       }
+
+       PUGI__FN xml_text::operator xml_text::unspecified_bool_type() const
+       {
+               return _data() ? unspecified_bool_xml_text : 0;
+       }
+
+       PUGI__FN bool xml_text::operator!() const
+       {
+               return !_data();
+       }
+
+       PUGI__FN bool xml_text::empty() const
+       {
+               return _data() == 0;
+       }
+
+       PUGI__FN const char_t* xml_text::get() const
+       {
+               xml_node_struct* d = _data();
+
+               return (d && d->value) ? d->value : PUGIXML_TEXT("");
+       }
+
+       PUGI__FN const char_t* xml_text::as_string(const char_t* def) const
+       {
+               xml_node_struct* d = _data();
+
+               return (d && d->value) ? d->value : def;
+       }
+
+       PUGI__FN int xml_text::as_int(int def) const
+       {
+               xml_node_struct* d = _data();
+
+               return impl::get_value_int(d ? d->value : 0, def);
+       }
+
+       PUGI__FN unsigned int xml_text::as_uint(unsigned int def) const
+       {
+               xml_node_struct* d = _data();
+
+               return impl::get_value_uint(d ? d->value : 0, def);
+       }
+
+       PUGI__FN double xml_text::as_double(double def) const
+       {
+               xml_node_struct* d = _data();
+
+               return impl::get_value_double(d ? d->value : 0, def);
+       }
+
+       PUGI__FN float xml_text::as_float(float def) const
+       {
+               xml_node_struct* d = _data();
+
+               return impl::get_value_float(d ? d->value : 0, def);
+       }
+
+       PUGI__FN bool xml_text::as_bool(bool def) const
+       {
+               xml_node_struct* d = _data();
+
+               return impl::get_value_bool(d ? d->value : 0, def);
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       PUGI__FN long long xml_text::as_llong(long long def) const
+       {
+               xml_node_struct* d = _data();
+
+               return impl::get_value_llong(d ? d->value : 0, def);
+       }
+
+       PUGI__FN unsigned long long xml_text::as_ullong(unsigned long long def) const
+       {
+               xml_node_struct* d = _data();
+
+               return impl::get_value_ullong(d ? d->value : 0, def);
+       }
+#endif
+
+       PUGI__FN bool xml_text::set(const char_t* rhs)
+       {
+               xml_node_struct* dn = _data_new();
+
+               return dn ? impl::strcpy_insitu(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+       }
+
+       PUGI__FN bool xml_text::set(int rhs)
+       {
+               xml_node_struct* dn = _data_new();
+
+               return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+       }
+
+       PUGI__FN bool xml_text::set(unsigned int rhs)
+       {
+               xml_node_struct* dn = _data_new();
+
+               return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+       }
+
+       PUGI__FN bool xml_text::set(float rhs)
+       {
+               xml_node_struct* dn = _data_new();
+
+               return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+       }
+
+       PUGI__FN bool xml_text::set(double rhs)
+       {
+               xml_node_struct* dn = _data_new();
+
+               return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+       }
+
+       PUGI__FN bool xml_text::set(bool rhs)
+       {
+               xml_node_struct* dn = _data_new();
+
+               return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       PUGI__FN bool xml_text::set(long long rhs)
+       {
+               xml_node_struct* dn = _data_new();
+
+               return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+       }
+
+       PUGI__FN bool xml_text::set(unsigned long long rhs)
+       {
+               xml_node_struct* dn = _data_new();
+
+               return dn ? impl::set_value_convert(dn->value, dn->header, impl::xml_memory_page_value_allocated_mask, rhs) : false;
+       }
+#endif
+
+       PUGI__FN xml_text& xml_text::operator=(const char_t* rhs)
+       {
+               set(rhs);
+               return *this;
+       }
+
+       PUGI__FN xml_text& xml_text::operator=(int rhs)
+       {
+               set(rhs);
+               return *this;
+       }
+
+       PUGI__FN xml_text& xml_text::operator=(unsigned int rhs)
+       {
+               set(rhs);
+               return *this;
+       }
+
+       PUGI__FN xml_text& xml_text::operator=(double rhs)
+       {
+               set(rhs);
+               return *this;
+       }
+
+       PUGI__FN xml_text& xml_text::operator=(float rhs)
+       {
+               set(rhs);
+               return *this;
+       }
+
+       PUGI__FN xml_text& xml_text::operator=(bool rhs)
+       {
+               set(rhs);
+               return *this;
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       PUGI__FN xml_text& xml_text::operator=(long long rhs)
+       {
+               set(rhs);
+               return *this;
+       }
+
+       PUGI__FN xml_text& xml_text::operator=(unsigned long long rhs)
+       {
+               set(rhs);
+               return *this;
+       }
+#endif
+
+       PUGI__FN xml_node xml_text::data() const
+       {
+               return xml_node(_data());
+       }
+
+#ifdef __BORLANDC__
+       PUGI__FN bool operator&&(const xml_text& lhs, bool rhs)
+       {
+               return (bool)lhs && rhs;
+       }
+
+       PUGI__FN bool operator||(const xml_text& lhs, bool rhs)
+       {
+               return (bool)lhs || rhs;
+       }
+#endif
+
+       PUGI__FN xml_node_iterator::xml_node_iterator()
+       {
+       }
+
+       PUGI__FN xml_node_iterator::xml_node_iterator(const xml_node& node): _wrap(node), _parent(node.parent())
+       {
+       }
+
+       PUGI__FN xml_node_iterator::xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent)
+       {
+       }
+
+       PUGI__FN bool xml_node_iterator::operator==(const xml_node_iterator& rhs) const
+       {
+               return _wrap._root == rhs._wrap._root && _parent._root == rhs._parent._root;
+       }
+       
+       PUGI__FN bool xml_node_iterator::operator!=(const xml_node_iterator& rhs) const
+       {
+               return _wrap._root != rhs._wrap._root || _parent._root != rhs._parent._root;
+       }
+
+       PUGI__FN xml_node& xml_node_iterator::operator*() const
+       {
+               assert(_wrap._root);
+               return _wrap;
+       }
+
+       PUGI__FN xml_node* xml_node_iterator::operator->() const
+       {
+               assert(_wrap._root);
+               return const_cast<xml_node*>(&_wrap); // BCC32 workaround
+       }
+
+       PUGI__FN const xml_node_iterator& xml_node_iterator::operator++()
+       {
+               assert(_wrap._root);
+               _wrap._root = _wrap._root->next_sibling;
+               return *this;
+       }
+
+       PUGI__FN xml_node_iterator xml_node_iterator::operator++(int)
+       {
+               xml_node_iterator temp = *this;
+               ++*this;
+               return temp;
+       }
+
+       PUGI__FN const xml_node_iterator& xml_node_iterator::operator--()
+       {
+               _wrap = _wrap._root ? _wrap.previous_sibling() : _parent.last_child();
+               return *this;
+       }
+
+       PUGI__FN xml_node_iterator xml_node_iterator::operator--(int)
+       {
+               xml_node_iterator temp = *this;
+               --*this;
+               return temp;
+       }
+
+       PUGI__FN xml_attribute_iterator::xml_attribute_iterator()
+       {
+       }
+
+       PUGI__FN xml_attribute_iterator::xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent): _wrap(attr), _parent(parent)
+       {
+       }
+
+       PUGI__FN xml_attribute_iterator::xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent): _wrap(ref), _parent(parent)
+       {
+       }
+
+       PUGI__FN bool xml_attribute_iterator::operator==(const xml_attribute_iterator& rhs) const
+       {
+               return _wrap._attr == rhs._wrap._attr && _parent._root == rhs._parent._root;
+       }
+       
+       PUGI__FN bool xml_attribute_iterator::operator!=(const xml_attribute_iterator& rhs) const
+       {
+               return _wrap._attr != rhs._wrap._attr || _parent._root != rhs._parent._root;
+       }
+
+       PUGI__FN xml_attribute& xml_attribute_iterator::operator*() const
+       {
+               assert(_wrap._attr);
+               return _wrap;
+       }
+
+       PUGI__FN xml_attribute* xml_attribute_iterator::operator->() const
+       {
+               assert(_wrap._attr);
+               return const_cast<xml_attribute*>(&_wrap); // BCC32 workaround
+       }
+
+       PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator++()
+       {
+               assert(_wrap._attr);
+               _wrap._attr = _wrap._attr->next_attribute;
+               return *this;
+       }
+
+       PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator++(int)
+       {
+               xml_attribute_iterator temp = *this;
+               ++*this;
+               return temp;
+       }
+
+       PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator--()
+       {
+               _wrap = _wrap._attr ? _wrap.previous_attribute() : _parent.last_attribute();
+               return *this;
+       }
+
+       PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator--(int)
+       {
+               xml_attribute_iterator temp = *this;
+               --*this;
+               return temp;
+       }
+
+       PUGI__FN xml_named_node_iterator::xml_named_node_iterator(): _name(0)
+       {
+       }
+
+       PUGI__FN xml_named_node_iterator::xml_named_node_iterator(const xml_node& node, const char_t* name): _wrap(node), _parent(node.parent()), _name(name)
+       {
+       }
+
+       PUGI__FN xml_named_node_iterator::xml_named_node_iterator(xml_node_struct* ref, xml_node_struct* parent, const char_t* name): _wrap(ref), _parent(parent), _name(name)
+       {
+       }
+
+       PUGI__FN bool xml_named_node_iterator::operator==(const xml_named_node_iterator& rhs) const
+       {
+               return _wrap._root == rhs._wrap._root && _parent._root == rhs._parent._root;
+       }
+
+       PUGI__FN bool xml_named_node_iterator::operator!=(const xml_named_node_iterator& rhs) const
+       {
+               return _wrap._root != rhs._wrap._root || _parent._root != rhs._parent._root;
+       }
+
+       PUGI__FN xml_node& xml_named_node_iterator::operator*() const
+       {
+               assert(_wrap._root);
+               return _wrap;
+       }
+
+       PUGI__FN xml_node* xml_named_node_iterator::operator->() const
+       {
+               assert(_wrap._root);
+               return const_cast<xml_node*>(&_wrap); // BCC32 workaround
+       }
+
+       PUGI__FN const xml_named_node_iterator& xml_named_node_iterator::operator++()
+       {
+               assert(_wrap._root);
+               _wrap = _wrap.next_sibling(_name);
+               return *this;
+       }
+
+       PUGI__FN xml_named_node_iterator xml_named_node_iterator::operator++(int)
+       {
+               xml_named_node_iterator temp = *this;
+               ++*this;
+               return temp;
+       }
+
+       PUGI__FN const xml_named_node_iterator& xml_named_node_iterator::operator--()
+       {
+               if (_wrap._root)
+                       _wrap = _wrap.previous_sibling(_name);
+               else
+               {
+                       _wrap = _parent.last_child();
+
+                       if (!impl::strequal(_wrap.name(), _name))
+                               _wrap = _wrap.previous_sibling(_name);
+               }
+
+               return *this;
+       }
+
+       PUGI__FN xml_named_node_iterator xml_named_node_iterator::operator--(int)
+       {
+               xml_named_node_iterator temp = *this;
+               --*this;
+               return temp;
+       }
+
+       PUGI__FN xml_parse_result::xml_parse_result(): status(status_internal_error), offset(0), encoding(encoding_auto)
+       {
+       }
+
+       PUGI__FN xml_parse_result::operator bool() const
+       {
+               return status == status_ok;
+       }
+
+       PUGI__FN const char* xml_parse_result::description() const
+       {
+               switch (status)
+               {
+               case status_ok: return "No error";
+
+               case status_file_not_found: return "File was not found";
+               case status_io_error: return "Error reading from file/stream";
+               case status_out_of_memory: return "Could not allocate memory";
+               case status_internal_error: return "Internal error occurred";
+
+               case status_unrecognized_tag: return "Could not determine tag type";
+
+               case status_bad_pi: return "Error parsing document declaration/processing instruction";
+               case status_bad_comment: return "Error parsing comment";
+               case status_bad_cdata: return "Error parsing CDATA section";
+               case status_bad_doctype: return "Error parsing document type declaration";
+               case status_bad_pcdata: return "Error parsing PCDATA section";
+               case status_bad_start_element: return "Error parsing start element tag";
+               case status_bad_attribute: return "Error parsing element attribute";
+               case status_bad_end_element: return "Error parsing end element tag";
+               case status_end_element_mismatch: return "Start-end tags mismatch";
+
+               case status_append_invalid_root: return "Unable to append nodes: root is not an element or document";
+
+               case status_no_document_element: return "No document element found";
+
+               default: return "Unknown error";
+               }
+       }
+
+       PUGI__FN xml_document::xml_document(): _buffer(0)
+       {
+               create();
+       }
+
+       PUGI__FN xml_document::~xml_document()
+       {
+               destroy();
+       }
+
+       PUGI__FN void xml_document::reset()
+       {
+               destroy();
+               create();
+       }
+
+       PUGI__FN void xml_document::reset(const xml_document& proto)
+       {
+               reset();
+
+               for (xml_node cur = proto.first_child(); cur; cur = cur.next_sibling())
+                       append_copy(cur);
+       }
+
+       PUGI__FN void xml_document::create()
+       {
+               assert(!_root);
+
+               // initialize sentinel page
+               PUGI__STATIC_ASSERT(sizeof(impl::xml_memory_page) + sizeof(impl::xml_document_struct) + impl::xml_memory_page_alignment - sizeof(void*) <= sizeof(_memory));
+
+               // align upwards to page boundary
+               void* page_memory = reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(_memory) + (impl::xml_memory_page_alignment - 1)) & ~(impl::xml_memory_page_alignment - 1));
+
+               // prepare page structure
+               impl::xml_memory_page* page = impl::xml_memory_page::construct(page_memory);
+               assert(page);
+
+               page->busy_size = impl::xml_memory_page_size;
+
+               // allocate new root
+               _root = new (reinterpret_cast<char*>(page) + sizeof(impl::xml_memory_page)) impl::xml_document_struct(page);
+               _root->prev_sibling_c = _root;
+
+               // setup sentinel page
+               page->allocator = static_cast<impl::xml_document_struct*>(_root);
+
+               // verify the document allocation
+               assert(reinterpret_cast<char*>(_root) + sizeof(impl::xml_document_struct) <= _memory + sizeof(_memory));
+       }
+
+       PUGI__FN void xml_document::destroy()
+       {
+               assert(_root);
+
+               // destroy static storage
+               if (_buffer)
+               {
+                       impl::xml_memory::deallocate(_buffer);
+                       _buffer = 0;
+               }
+
+               // destroy extra buffers (note: no need to destroy linked list nodes, they're allocated using document allocator)
+               for (impl::xml_extra_buffer* extra = static_cast<impl::xml_document_struct*>(_root)->extra_buffers; extra; extra = extra->next)
+               {
+                       if (extra->buffer) impl::xml_memory::deallocate(extra->buffer);
+               }
+
+               // destroy dynamic storage, leave sentinel page (it's in static memory)
+               impl::xml_memory_page* root_page = reinterpret_cast<impl::xml_memory_page*>(_root->header & impl::xml_memory_page_pointer_mask);
+               assert(root_page && !root_page->prev);
+               assert(reinterpret_cast<char*>(root_page) >= _memory && reinterpret_cast<char*>(root_page) < _memory + sizeof(_memory));
+
+               for (impl::xml_memory_page* page = root_page->next; page; )
+               {
+                       impl::xml_memory_page* next = page->next;
+
+                       impl::xml_allocator::deallocate_page(page);
+
+                       page = next;
+               }
+
+               _root = 0;
+       }
+
+#ifndef PUGIXML_NO_STL
+       PUGI__FN xml_parse_result xml_document::load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               return impl::load_stream_impl(*this, stream, options, encoding);
+       }
+
+       PUGI__FN xml_parse_result xml_document::load(std::basic_istream<wchar_t, std::char_traits<wchar_t> >& stream, unsigned int options)
+       {
+               reset();
+
+               return impl::load_stream_impl(*this, stream, options, encoding_wchar);
+       }
+#endif
+
+       PUGI__FN xml_parse_result xml_document::load_string(const char_t* contents, unsigned int options)
+       {
+               // Force native encoding (skip autodetection)
+       #ifdef PUGIXML_WCHAR_MODE
+               xml_encoding encoding = encoding_wchar;
+       #else
+               xml_encoding encoding = encoding_utf8;
+       #endif
+
+               return load_buffer(contents, impl::strlength(contents) * sizeof(char_t), options, encoding);
+       }
+
+       PUGI__FN xml_parse_result xml_document::load(const char_t* contents, unsigned int options)
+       {
+               return load_string(contents, options);
+       }
+
+       PUGI__FN xml_parse_result xml_document::load_file(const char* path_, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               FILE* file = fopen(path_, "rb");
+
+               return impl::load_file_impl(*this, file, options, encoding);
+       }
+
+       PUGI__FN xml_parse_result xml_document::load_file(const wchar_t* path_, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               FILE* file = impl::open_file_wide(path_, L"rb");
+
+               return impl::load_file_impl(*this, file, options, encoding);
+       }
+
+       PUGI__FN xml_parse_result xml_document::load_buffer(const void* contents, size_t size, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               return impl::load_buffer_impl(static_cast<impl::xml_document_struct*>(_root), _root, const_cast<void*>(contents), size, options, encoding, false, false, &_buffer);
+       }
+
+       PUGI__FN xml_parse_result xml_document::load_buffer_inplace(void* contents, size_t size, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               return impl::load_buffer_impl(static_cast<impl::xml_document_struct*>(_root), _root, contents, size, options, encoding, true, false, &_buffer);
+       }
+
+       PUGI__FN xml_parse_result xml_document::load_buffer_inplace_own(void* contents, size_t size, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               return impl::load_buffer_impl(static_cast<impl::xml_document_struct*>(_root), _root, contents, size, options, encoding, true, true, &_buffer);
+       }
+
+       PUGI__FN void xml_document::save(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+       {
+               impl::xml_buffered_writer buffered_writer(writer, encoding);
+
+               if ((flags & format_write_bom) && encoding != encoding_latin1)
+               {
+                       // BOM always represents the codepoint U+FEFF, so just write it in native encoding
+               #ifdef PUGIXML_WCHAR_MODE
+                       unsigned int bom = 0xfeff;
+                       buffered_writer.write(static_cast<wchar_t>(bom));
+               #else
+                       buffered_writer.write('\xef', '\xbb', '\xbf');
+               #endif
+               }
+
+               if (!(flags & format_no_declaration) && !impl::has_declaration(_root))
+               {
+                       buffered_writer.write_string(PUGIXML_TEXT("<?xml version=\"1.0\""));
+                       if (encoding == encoding_latin1) buffered_writer.write_string(PUGIXML_TEXT(" encoding=\"ISO-8859-1\""));
+                       buffered_writer.write('?', '>');
+                       if (!(flags & format_raw)) buffered_writer.write('\n');
+               }
+
+               impl::node_output(buffered_writer, _root, indent, flags, 0);
+       }
+
+#ifndef PUGIXML_NO_STL
+       PUGI__FN void xml_document::save(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+       {
+               xml_writer_stream writer(stream);
+
+               save(writer, indent, flags, encoding);
+       }
+
+       PUGI__FN void xml_document::save(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent, unsigned int flags) const
+       {
+               xml_writer_stream writer(stream);
+
+               save(writer, indent, flags, encoding_wchar);
+       }
+#endif
+
+       PUGI__FN bool xml_document::save_file(const char* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+       {
+               FILE* file = fopen(path_, (flags & format_save_file_text) ? "w" : "wb");
+               return impl::save_file_impl(*this, file, indent, flags, encoding);
+       }
+
+       PUGI__FN bool xml_document::save_file(const wchar_t* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+       {
+               FILE* file = impl::open_file_wide(path_, (flags & format_save_file_text) ? L"w" : L"wb");
+               return impl::save_file_impl(*this, file, indent, flags, encoding);
+       }
+
+       PUGI__FN xml_node xml_document::document_element() const
+       {
+               assert(_root);
+
+               for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+                       if (PUGI__NODETYPE(i) == node_element)
+                               return xml_node(i);
+
+               return xml_node();
+       }
+
+#ifndef PUGIXML_NO_STL
+       PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const wchar_t* str)
+       {
+               assert(str);
+
+               return impl::as_utf8_impl(str, impl::strlength_wide(str));
+       }
+
+       PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t>& str)
+       {
+               return impl::as_utf8_impl(str.c_str(), str.size());
+       }
+       
+       PUGI__FN std::basic_string<wchar_t> PUGIXML_FUNCTION as_wide(const char* str)
+       {
+               assert(str);
+
+               return impl::as_wide_impl(str, strlen(str));
+       }
+       
+       PUGI__FN std::basic_string<wchar_t> PUGIXML_FUNCTION as_wide(const std::string& str)
+       {
+               return impl::as_wide_impl(str.c_str(), str.size());
+       }
+#endif
+
+       PUGI__FN void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate)
+       {
+               impl::xml_memory::allocate = allocate;
+               impl::xml_memory::deallocate = deallocate;
+       }
+
+       PUGI__FN allocation_function PUGIXML_FUNCTION get_memory_allocation_function()
+       {
+               return impl::xml_memory::allocate;
+       }
+
+       PUGI__FN deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function()
+       {
+               return impl::xml_memory::deallocate;
+       }
+}
+
+#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC))
+namespace std
+{
+       // Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier)
+       PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_node_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+
+       PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_attribute_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+
+       PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_named_node_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+}
+#endif
+
+#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC)
+namespace std
+{
+       // Workarounds for (non-standard) iterator category detection
+       PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_node_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+
+       PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_attribute_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+
+       PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_named_node_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+}
+#endif
+
+#ifndef PUGIXML_NO_XPATH
+// STL replacements
+PUGI__NS_BEGIN
+       struct equal_to
+       {
+               template <typename T> bool operator()(const T& lhs, const T& rhs) const
+               {
+                       return lhs == rhs;
+               }
+       };
+
+       struct not_equal_to
+       {
+               template <typename T> bool operator()(const T& lhs, const T& rhs) const
+               {
+                       return lhs != rhs;
+               }
+       };
+
+       struct less
+       {
+               template <typename T> bool operator()(const T& lhs, const T& rhs) const
+               {
+                       return lhs < rhs;
+               }
+       };
+
+       struct less_equal
+       {
+               template <typename T> bool operator()(const T& lhs, const T& rhs) const
+               {
+                       return lhs <= rhs;
+               }
+       };
+
+       template <typename T> void swap(T& lhs, T& rhs)
+       {
+               T temp = lhs;
+               lhs = rhs;
+               rhs = temp;
+       }
+
+       template <typename I, typename Pred> I min_element(I begin, I end, const Pred& pred)
+       {
+               I result = begin;
+
+               for (I it = begin + 1; it != end; ++it)
+                       if (pred(*it, *result))
+                               result = it;
+
+               return result;
+       }
+
+       template <typename I> void reverse(I begin, I end)
+       {
+               while (end - begin > 1) swap(*begin++, *--end);
+       }
+
+       template <typename I> I unique(I begin, I end)
+       {
+               // fast skip head
+               while (end - begin > 1 && *begin != *(begin + 1)) begin++;
+
+               if (begin == end) return begin;
+
+               // last written element
+               I write = begin++; 
+
+               // merge unique elements
+               while (begin != end)
+               {
+                       if (*begin != *write)
+                               *++write = *begin++;
+                       else
+                               begin++;
+               }
+
+               // past-the-end (write points to live element)
+               return write + 1;
+       }
+
+       template <typename I> void copy_backwards(I begin, I end, I target)
+       {
+               while (begin != end) *--target = *--end;
+       }
+
+       template <typename I, typename Pred, typename T> void insertion_sort(I begin, I end, const Pred& pred, T*)
+       {
+               assert(begin != end);
+
+               for (I it = begin + 1; it != end; ++it)
+               {
+                       T val = *it;
+
+                       if (pred(val, *begin))
+                       {
+                               // move to front
+                               copy_backwards(begin, it, it + 1);
+                               *begin = val;
+                       }
+                       else
+                       {
+                               I hole = it;
+
+                               // move hole backwards
+                               while (pred(val, *(hole - 1)))
+                               {
+                                       *hole = *(hole - 1);
+                                       hole--;
+                               }
+
+                               // fill hole with element
+                               *hole = val;
+                       }
+               }
+       }
+
+       // std variant for elements with ==
+       template <typename I, typename Pred> void partition(I begin, I middle, I end, const Pred& pred, I* out_eqbeg, I* out_eqend)
+       {
+               I eqbeg = middle, eqend = middle + 1;
+
+               // expand equal range
+               while (eqbeg != begin && *(eqbeg - 1) == *eqbeg) --eqbeg;
+               while (eqend != end && *eqend == *eqbeg) ++eqend;
+
+               // process outer elements
+               I ltend = eqbeg, gtbeg = eqend;
+
+               for (;;)
+               {
+                       // find the element from the right side that belongs to the left one
+                       for (; gtbeg != end; ++gtbeg)
+                               if (!pred(*eqbeg, *gtbeg))
+                               {
+                                       if (*gtbeg == *eqbeg) swap(*gtbeg, *eqend++);
+                                       else break;
+                               }
+
+                       // find the element from the left side that belongs to the right one
+                       for (; ltend != begin; --ltend)
+                               if (!pred(*(ltend - 1), *eqbeg))
+                               {
+                                       if (*eqbeg == *(ltend - 1)) swap(*(ltend - 1), *--eqbeg);
+                                       else break;
+                               }
+
+                       // scanned all elements
+                       if (gtbeg == end && ltend == begin)
+                       {
+                               *out_eqbeg = eqbeg;
+                               *out_eqend = eqend;
+                               return;
+                       }
+
+                       // make room for elements by moving equal area
+                       if (gtbeg == end)
+                       {
+                               if (--ltend != --eqbeg) swap(*ltend, *eqbeg);
+                               swap(*eqbeg, *--eqend);
+                       }
+                       else if (ltend == begin)
+                       {
+                               if (eqend != gtbeg) swap(*eqbeg, *eqend);
+                               ++eqend;
+                               swap(*gtbeg++, *eqbeg++);
+                       }
+                       else swap(*gtbeg++, *--ltend);
+               }
+       }
+
+       template <typename I, typename Pred> void median3(I first, I middle, I last, const Pred& pred)
+       {
+               if (pred(*middle, *first)) swap(*middle, *first);
+               if (pred(*last, *middle)) swap(*last, *middle);
+               if (pred(*middle, *first)) swap(*middle, *first);
+       }
+
+       template <typename I, typename Pred> void median(I first, I middle, I last, const Pred& pred)
+       {
+               if (last - first <= 40)
+               {
+                       // median of three for small chunks
+                       median3(first, middle, last, pred);
+               }
+               else
+               {
+                       // median of nine
+                       size_t step = (last - first + 1) / 8;
+
+                       median3(first, first + step, first + 2 * step, pred);
+                       median3(middle - step, middle, middle + step, pred);
+                       median3(last - 2 * step, last - step, last, pred);
+                       median3(first + step, middle, last - step, pred);
+               }
+       }
+
+       template <typename I, typename Pred> void sort(I begin, I end, const Pred& pred)
+       {
+               // sort large chunks
+               while (end - begin > 32)
+               {
+                       // find median element
+                       I middle = begin + (end - begin) / 2;
+                       median(begin, middle, end - 1, pred);
+
+                       // partition in three chunks (< = >)
+                       I eqbeg, eqend;
+                       partition(begin, middle, end, pred, &eqbeg, &eqend);
+
+                       // loop on larger half
+                       if (eqbeg - begin > end - eqend)
+                       {
+                               sort(eqend, end, pred);
+                               end = eqbeg;
+                       }
+                       else
+                       {
+                               sort(begin, eqbeg, pred);
+                               begin = eqend;
+                       }
+               }
+
+               // insertion sort small chunk
+               if (begin != end) insertion_sort(begin, end, pred, &*begin);
+       }
+PUGI__NS_END
+
+// Allocator used for AST and evaluation stacks
+PUGI__NS_BEGIN
+       struct xpath_memory_block
+       {       
+               xpath_memory_block* next;
+               size_t capacity;
+
+               char data[
+       #ifdef PUGIXML_MEMORY_XPATH_PAGE_SIZE
+                       PUGIXML_MEMORY_XPATH_PAGE_SIZE
+       #else
+                       4096
+       #endif
+               ];
+       };
+               
+       class xpath_allocator
+       {
+               xpath_memory_block* _root;
+               size_t _root_size;
+
+       public:
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               jmp_buf* error_handler;
+       #endif
+
+               xpath_allocator(xpath_memory_block* root, size_t root_size = 0): _root(root), _root_size(root_size)
+               {
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       error_handler = 0;
+               #endif
+               }
+               
+               void* allocate_nothrow(size_t size)
+               {
+                       // align size so that we're able to store pointers in subsequent blocks
+                       size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+                       if (_root_size + size <= _root->capacity)
+                       {
+                               void* buf = _root->data + _root_size;
+                               _root_size += size;
+                               return buf;
+                       }
+                       else
+                       {
+                               // make sure we have at least 1/4th of the page free after allocation to satisfy subsequent allocation requests
+                               size_t block_capacity_base = sizeof(_root->data);
+                               size_t block_capacity_req = size + block_capacity_base / 4;
+                               size_t block_capacity = (block_capacity_base > block_capacity_req) ? block_capacity_base : block_capacity_req;
+
+                               size_t block_size = block_capacity + offsetof(xpath_memory_block, data);
+
+                               xpath_memory_block* block = static_cast<xpath_memory_block*>(xml_memory::allocate(block_size));
+                               if (!block) return 0;
+                               
+                               block->next = _root;
+                               block->capacity = block_capacity;
+                               
+                               _root = block;
+                               _root_size = size;
+                               
+                               return block->data;
+                       }
+               }
+
+               void* allocate(size_t size)
+               {
+                       void* result = allocate_nothrow(size);
+
+                       if (!result)
+                       {
+                       #ifdef PUGIXML_NO_EXCEPTIONS
+                               assert(error_handler);
+                               longjmp(*error_handler, 1);
+                       #else
+                               throw std::bad_alloc();
+                       #endif
+                       }
+
+                       return result;
+               }
+
+               void* reallocate(void* ptr, size_t old_size, size_t new_size)
+               {
+                       // align size so that we're able to store pointers in subsequent blocks
+                       old_size = (old_size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+                       new_size = (new_size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+                       // we can only reallocate the last object
+                       assert(ptr == 0 || static_cast<char*>(ptr) + old_size == _root->data + _root_size);
+
+                       // adjust root size so that we have not allocated the object at all
+                       bool only_object = (_root_size == old_size);
+
+                       if (ptr) _root_size -= old_size;
+
+                       // allocate a new version (this will obviously reuse the memory if possible)
+                       void* result = allocate(new_size);
+                       assert(result);
+
+                       // we have a new block
+                       if (result != ptr && ptr)
+                       {
+                               // copy old data
+                               assert(new_size >= old_size);
+                               memcpy(result, ptr, old_size);
+
+                               // free the previous page if it had no other objects
+                               if (only_object)
+                               {
+                                       assert(_root->data == result);
+                                       assert(_root->next);
+
+                                       xpath_memory_block* next = _root->next->next;
+
+                                       if (next)
+                                       {
+                                               // deallocate the whole page, unless it was the first one
+                                               xml_memory::deallocate(_root->next);
+                                               _root->next = next;
+                                       }
+                               }
+                       }
+
+                       return result;
+               }
+
+               void revert(const xpath_allocator& state)
+               {
+                       // free all new pages
+                       xpath_memory_block* cur = _root;
+
+                       while (cur != state._root)
+                       {
+                               xpath_memory_block* next = cur->next;
+
+                               xml_memory::deallocate(cur);
+
+                               cur = next;
+                       }
+
+                       // restore state
+                       _root = state._root;
+                       _root_size = state._root_size;
+               }
+
+               void release()
+               {
+                       xpath_memory_block* cur = _root;
+                       assert(cur);
+
+                       while (cur->next)
+                       {
+                               xpath_memory_block* next = cur->next;
+
+                               xml_memory::deallocate(cur);
+
+                               cur = next;
+                       }
+               }
+       };
+
+       struct xpath_allocator_capture
+       {
+               xpath_allocator_capture(xpath_allocator* alloc): _target(alloc), _state(*alloc)
+               {
+               }
+
+               ~xpath_allocator_capture()
+               {
+                       _target->revert(_state);
+               }
+
+               xpath_allocator* _target;
+               xpath_allocator _state;
+       };
+
+       struct xpath_stack
+       {
+               xpath_allocator* result;
+               xpath_allocator* temp;
+       };
+
+       struct xpath_stack_data
+       {
+               xpath_memory_block blocks[2];
+               xpath_allocator result;
+               xpath_allocator temp;
+               xpath_stack stack;
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               jmp_buf error_handler;
+       #endif
+
+               xpath_stack_data(): result(blocks + 0), temp(blocks + 1)
+               {
+                       blocks[0].next = blocks[1].next = 0;
+                       blocks[0].capacity = blocks[1].capacity = sizeof(blocks[0].data);
+
+                       stack.result = &result;
+                       stack.temp = &temp;
+
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       result.error_handler = temp.error_handler = &error_handler;
+               #endif
+               }
+
+               ~xpath_stack_data()
+               {
+                       result.release();
+                       temp.release();
+               }
+       };
+PUGI__NS_END
+
+// String class
+PUGI__NS_BEGIN
+       class xpath_string
+       {
+               const char_t* _buffer;
+               bool _uses_heap;
+               size_t _length_heap;
+
+               static char_t* duplicate_string(const char_t* string, size_t length, xpath_allocator* alloc)
+               {
+                       char_t* result = static_cast<char_t*>(alloc->allocate((length + 1) * sizeof(char_t)));
+                       assert(result);
+
+                       memcpy(result, string, length * sizeof(char_t));
+                       result[length] = 0;
+
+                       return result;
+               }
+
+               xpath_string(const char_t* buffer, bool uses_heap_, size_t length_heap): _buffer(buffer), _uses_heap(uses_heap_), _length_heap(length_heap)
+               {
+               }
+
+       public:
+               static xpath_string from_const(const char_t* str)
+               {
+                       return xpath_string(str, false, 0);
+               }
+
+               static xpath_string from_heap_preallocated(const char_t* begin, const char_t* end)
+               {
+                       assert(begin <= end && *end == 0);
+
+                       return xpath_string(begin, true, static_cast<size_t>(end - begin));
+               }
+
+               static xpath_string from_heap(const char_t* begin, const char_t* end, xpath_allocator* alloc)
+               {
+                       assert(begin <= end);
+
+                       size_t length = static_cast<size_t>(end - begin);
+
+                       return length == 0 ? xpath_string() : xpath_string(duplicate_string(begin, length, alloc), true, length);
+               }
+
+               xpath_string(): _buffer(PUGIXML_TEXT("")), _uses_heap(false), _length_heap(0)
+               {
+               }
+
+               void append(const xpath_string& o, xpath_allocator* alloc)
+               {
+                       // skip empty sources
+                       if (!*o._buffer) return;
+
+                       // fast append for constant empty target and constant source
+                       if (!*_buffer && !_uses_heap && !o._uses_heap)
+                       {
+                               _buffer = o._buffer;
+                       }
+                       else
+                       {
+                               // need to make heap copy
+                               size_t target_length = length();
+                               size_t source_length = o.length();
+                               size_t result_length = target_length + source_length;
+
+                               // allocate new buffer
+                               char_t* result = static_cast<char_t*>(alloc->reallocate(_uses_heap ? const_cast<char_t*>(_buffer) : 0, (target_length + 1) * sizeof(char_t), (result_length + 1) * sizeof(char_t)));
+                               assert(result);
+
+                               // append first string to the new buffer in case there was no reallocation
+                               if (!_uses_heap) memcpy(result, _buffer, target_length * sizeof(char_t));
+
+                               // append second string to the new buffer
+                               memcpy(result + target_length, o._buffer, source_length * sizeof(char_t));
+                               result[result_length] = 0;
+
+                               // finalize
+                               _buffer = result;
+                               _uses_heap = true;
+                               _length_heap = result_length;
+                       }
+               }
+
+               const char_t* c_str() const
+               {
+                       return _buffer;
+               }
+
+               size_t length() const
+               {
+                       return _uses_heap ? _length_heap : strlength(_buffer);
+               }
+               
+               char_t* data(xpath_allocator* alloc)
+               {
+                       // make private heap copy
+                       if (!_uses_heap)
+                       {
+                               size_t length_ = strlength(_buffer);
+
+                               _buffer = duplicate_string(_buffer, length_, alloc);
+                               _uses_heap = true;
+                               _length_heap = length_;
+                       }
+
+                       return const_cast<char_t*>(_buffer);
+               }
+
+               bool empty() const
+               {
+                       return *_buffer == 0;
+               }
+
+               bool operator==(const xpath_string& o) const
+               {
+                       return strequal(_buffer, o._buffer);
+               }
+
+               bool operator!=(const xpath_string& o) const
+               {
+                       return !strequal(_buffer, o._buffer);
+               }
+
+               bool uses_heap() const
+               {
+                       return _uses_heap;
+               }
+       };
+PUGI__NS_END
+
+PUGI__NS_BEGIN
+       PUGI__FN bool starts_with(const char_t* string, const char_t* pattern)
+       {
+               while (*pattern && *string == *pattern)
+               {
+                       string++;
+                       pattern++;
+               }
+
+               return *pattern == 0;
+       }
+
+       PUGI__FN const char_t* find_char(const char_t* s, char_t c)
+       {
+       #ifdef PUGIXML_WCHAR_MODE
+               return wcschr(s, c);
+       #else
+               return strchr(s, c);
+       #endif
+       }
+
+       PUGI__FN const char_t* find_substring(const char_t* s, const char_t* p)
+       {
+       #ifdef PUGIXML_WCHAR_MODE
+               // MSVC6 wcsstr bug workaround (if s is empty it always returns 0)
+               return (*p == 0) ? s : wcsstr(s, p);
+       #else
+               return strstr(s, p);
+       #endif
+       }
+
+       // Converts symbol to lower case, if it is an ASCII one
+       PUGI__FN char_t tolower_ascii(char_t ch)
+       {
+               return static_cast<unsigned int>(ch - 'A') < 26 ? static_cast<char_t>(ch | ' ') : ch;
+       }
+
+       PUGI__FN xpath_string string_value(const xpath_node& na, xpath_allocator* alloc)
+       {
+               if (na.attribute())
+                       return xpath_string::from_const(na.attribute().value());
+               else
+               {
+                       xml_node n = na.node();
+
+                       switch (n.type())
+                       {
+                       case node_pcdata:
+                       case node_cdata:
+                       case node_comment:
+                       case node_pi:
+                               return xpath_string::from_const(n.value());
+                       
+                       case node_document:
+                       case node_element:
+                       {
+                               xpath_string result;
+
+                               xml_node cur = n.first_child();
+                               
+                               while (cur && cur != n)
+                               {
+                                       if (cur.type() == node_pcdata || cur.type() == node_cdata)
+                                               result.append(xpath_string::from_const(cur.value()), alloc);
+
+                                       if (cur.first_child())
+                                               cur = cur.first_child();
+                                       else if (cur.next_sibling())
+                                               cur = cur.next_sibling();
+                                       else
+                                       {
+                                               while (!cur.next_sibling() && cur != n)
+                                                       cur = cur.parent();
+
+                                               if (cur != n) cur = cur.next_sibling();
+                                       }
+                               }
+                               
+                               return result;
+                       }
+                       
+                       default:
+                               return xpath_string();
+                       }
+               }
+       }
+       
+       PUGI__FN bool node_is_before_sibling(xml_node_struct* ln, xml_node_struct* rn)
+       {
+               assert(ln->parent == rn->parent);
+
+               // there is no common ancestor (the shared parent is null), nodes are from different documents
+               if (!ln->parent) return ln < rn;
+
+               // determine sibling order
+               xml_node_struct* ls = ln;
+               xml_node_struct* rs = rn;
+
+               while (ls && rs)
+               {
+                       if (ls == rn) return true;
+                       if (rs == ln) return false;
+
+                       ls = ls->next_sibling;
+                       rs = rs->next_sibling;
+               }
+
+               // if rn sibling chain ended ln must be before rn
+               return !rs;
+       }
+       
+       PUGI__FN bool node_is_before(xml_node_struct* ln, xml_node_struct* rn)
+       {
+               // find common ancestor at the same depth, if any
+               xml_node_struct* lp = ln;
+               xml_node_struct* rp = rn;
+
+               while (lp && rp && lp->parent != rp->parent)
+               {
+                       lp = lp->parent;
+                       rp = rp->parent;
+               }
+
+               // parents are the same!
+               if (lp && rp) return node_is_before_sibling(lp, rp);
+
+               // nodes are at different depths, need to normalize heights
+               bool left_higher = !lp;
+
+               while (lp)
+               {
+                       lp = lp->parent;
+                       ln = ln->parent;
+               }
+
+               while (rp)
+               {
+                       rp = rp->parent;
+                       rn = rn->parent;
+               }
+
+               // one node is the ancestor of the other
+               if (ln == rn) return left_higher;
+
+               // find common ancestor... again
+               while (ln->parent != rn->parent)
+               {
+                       ln = ln->parent;
+                       rn = rn->parent;
+               }
+
+               return node_is_before_sibling(ln, rn);
+       }
+
+       PUGI__FN bool node_is_ancestor(xml_node_struct* parent, xml_node_struct* node)
+       {
+               while (node && node != parent) node = node->parent;
+
+               return parent && node == parent;
+       }
+
+       PUGI__FN const void* document_buffer_order(const xpath_node& xnode)
+       {
+               xml_node_struct* node = xnode.node().internal_object();
+
+               if (node)
+               {
+                       if ((get_document(node).header & xml_memory_page_contents_shared_mask) == 0)
+                       {
+                               if (node->name && (node->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0) return node->name;
+                               if (node->value && (node->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0) return node->value;
+                       }
+
+                       return 0;
+               }
+
+               xml_attribute_struct* attr = xnode.attribute().internal_object();
+
+               if (attr)
+               {
+                       if ((get_document(attr).header & xml_memory_page_contents_shared_mask) == 0)
+                       {
+                               if ((attr->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0) return attr->name;
+                               if ((attr->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0) return attr->value;
+                       }
+
+                       return 0;
+               }
+
+               return 0;
+       }
+       
+       struct document_order_comparator
+       {
+               bool operator()(const xpath_node& lhs, const xpath_node& rhs) const
+               {
+                       // optimized document order based check
+                       const void* lo = document_buffer_order(lhs);
+                       const void* ro = document_buffer_order(rhs);
+
+                       if (lo && ro) return lo < ro;
+
+                       // slow comparison
+                       xml_node ln = lhs.node(), rn = rhs.node();
+
+                       // compare attributes
+                       if (lhs.attribute() && rhs.attribute())
+                       {
+                               // shared parent
+                               if (lhs.parent() == rhs.parent())
+                               {
+                                       // determine sibling order
+                                       for (xml_attribute a = lhs.attribute(); a; a = a.next_attribute())
+                                               if (a == rhs.attribute())
+                                                       return true;
+                                       
+                                       return false;
+                               }
+                               
+                               // compare attribute parents
+                               ln = lhs.parent();
+                               rn = rhs.parent();
+                       }
+                       else if (lhs.attribute())
+                       {
+                               // attributes go after the parent element
+                               if (lhs.parent() == rhs.node()) return false;
+                               
+                               ln = lhs.parent();
+                       }
+                       else if (rhs.attribute())
+                       {
+                               // attributes go after the parent element
+                               if (rhs.parent() == lhs.node()) return true;
+                               
+                               rn = rhs.parent();
+                       }
+
+                       if (ln == rn) return false;
+
+                       if (!ln || !rn) return ln < rn;
+                       
+                       return node_is_before(ln.internal_object(), rn.internal_object());
+               }
+       };
+
+       struct duplicate_comparator
+       {
+               bool operator()(const xpath_node& lhs, const xpath_node& rhs) const
+               {
+                       if (lhs.attribute()) return rhs.attribute() ? lhs.attribute() < rhs.attribute() : true;
+                       else return rhs.attribute() ? false : lhs.node() < rhs.node();
+               }
+       };
+       
+       PUGI__FN double gen_nan()
+       {
+       #if defined(__STDC_IEC_559__) || ((FLT_RADIX - 0 == 2) && (FLT_MAX_EXP - 0 == 128) && (FLT_MANT_DIG - 0 == 24))
+               union { float f; uint32_t i; } u[sizeof(float) == sizeof(uint32_t) ? 1 : -1];
+               u[0].i = 0x7fc00000;
+               return u[0].f;
+       #else
+               // fallback
+               const volatile double zero = 0.0;
+               return zero / zero;
+       #endif
+       }
+       
+       PUGI__FN bool is_nan(double value)
+       {
+       #if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__)
+               return !!_isnan(value);
+       #elif defined(fpclassify) && defined(FP_NAN)
+               return fpclassify(value) == FP_NAN;
+       #else
+               // fallback
+               const volatile double v = value;
+               return v != v;
+       #endif
+       }
+       
+       PUGI__FN const char_t* convert_number_to_string_special(double value)
+       {
+       #if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__)
+               if (_finite(value)) return (value == 0) ? PUGIXML_TEXT("0") : 0;
+               if (_isnan(value)) return PUGIXML_TEXT("NaN");
+               return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");
+       #elif defined(fpclassify) && defined(FP_NAN) && defined(FP_INFINITE) && defined(FP_ZERO)
+               switch (fpclassify(value))
+               {
+               case FP_NAN:
+                       return PUGIXML_TEXT("NaN");
+
+               case FP_INFINITE:
+                       return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");
+
+               case FP_ZERO:
+                       return PUGIXML_TEXT("0");
+
+               default:
+                       return 0;
+               }
+       #else
+               // fallback
+               const volatile double v = value;
+
+               if (v == 0) return PUGIXML_TEXT("0");
+               if (v != v) return PUGIXML_TEXT("NaN");
+               if (v * 2 == v) return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");
+               return 0;
+       #endif
+       }
+       
+       PUGI__FN bool convert_number_to_boolean(double value)
+       {
+               return (value != 0 && !is_nan(value));
+       }
+       
+       PUGI__FN void truncate_zeros(char* begin, char* end)
+       {
+               while (begin != end && end[-1] == '0') end--;
+
+               *end = 0;
+       }
+
+       // gets mantissa digits in the form of 0.xxxxx with 0. implied and the exponent
+#if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE)
+       PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent)
+       {
+               // get base values
+               int sign, exponent;
+               _ecvt_s(buffer, buffer_size, value, DBL_DIG + 1, &exponent, &sign);
+
+               // truncate redundant zeros
+               truncate_zeros(buffer, buffer + strlen(buffer));
+
+               // fill results
+               *out_mantissa = buffer;
+               *out_exponent = exponent;
+       }
+#else
+       PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent)
+       {
+               // get a scientific notation value with IEEE DBL_DIG decimals
+               sprintf(buffer, "%.*e", DBL_DIG, value);
+               assert(strlen(buffer) < buffer_size);
+               (void)!buffer_size;
+
+               // get the exponent (possibly negative)
+               char* exponent_string = strchr(buffer, 'e');
+               assert(exponent_string);
+
+               int exponent = atoi(exponent_string + 1);
+
+               // extract mantissa string: skip sign
+               char* mantissa = buffer[0] == '-' ? buffer + 1 : buffer;
+               assert(mantissa[0] != '0' && mantissa[1] == '.');
+
+               // divide mantissa by 10 to eliminate integer part
+               mantissa[1] = mantissa[0];
+               mantissa++;
+               exponent++;
+
+               // remove extra mantissa digits and zero-terminate mantissa
+               truncate_zeros(mantissa, exponent_string);
+
+               // fill results
+               *out_mantissa = mantissa;
+               *out_exponent = exponent;
+       }
+#endif
+
+       PUGI__FN xpath_string convert_number_to_string(double value, xpath_allocator* alloc)
+       {
+               // try special number conversion
+               const char_t* special = convert_number_to_string_special(value);
+               if (special) return xpath_string::from_const(special);
+
+               // get mantissa + exponent form
+               char mantissa_buffer[32];
+
+               char* mantissa;
+               int exponent;
+               convert_number_to_mantissa_exponent(value, mantissa_buffer, sizeof(mantissa_buffer), &mantissa, &exponent);
+
+               // allocate a buffer of suitable length for the number
+               size_t result_size = strlen(mantissa_buffer) + (exponent > 0 ? exponent : -exponent) + 4;
+               char_t* result = static_cast<char_t*>(alloc->allocate(sizeof(char_t) * result_size));
+               assert(result);
+
+               // make the number!
+               char_t* s = result;
+
+               // sign
+               if (value < 0) *s++ = '-';
+
+               // integer part
+               if (exponent <= 0)
+               {
+                       *s++ = '0';
+               }
+               else
+               {
+                       while (exponent > 0)
+                       {
+                               assert(*mantissa == 0 || static_cast<unsigned int>(static_cast<unsigned int>(*mantissa) - '0') <= 9);
+                               *s++ = *mantissa ? *mantissa++ : '0';
+                               exponent--;
+                       }
+               }
+
+               // fractional part
+               if (*mantissa)
+               {
+                       // decimal point
+                       *s++ = '.';
+
+                       // extra zeroes from negative exponent
+                       while (exponent < 0)
+                       {
+                               *s++ = '0';
+                               exponent++;
+                       }
+
+                       // extra mantissa digits
+                       while (*mantissa)
+                       {
+                               assert(static_cast<unsigned int>(*mantissa - '0') <= 9);
+                               *s++ = *mantissa++;
+                       }
+               }
+
+               // zero-terminate
+               assert(s < result + result_size);
+               *s = 0;
+
+               return xpath_string::from_heap_preallocated(result, s);
+       }
+       
+       PUGI__FN bool check_string_to_number_format(const char_t* string)
+       {
+               // parse leading whitespace
+               while (PUGI__IS_CHARTYPE(*string, ct_space)) ++string;
+
+               // parse sign
+               if (*string == '-') ++string;
+
+               if (!*string) return false;
+
+               // if there is no integer part, there should be a decimal part with at least one digit
+               if (!PUGI__IS_CHARTYPEX(string[0], ctx_digit) && (string[0] != '.' || !PUGI__IS_CHARTYPEX(string[1], ctx_digit))) return false;
+
+               // parse integer part
+               while (PUGI__IS_CHARTYPEX(*string, ctx_digit)) ++string;
+
+               // parse decimal part
+               if (*string == '.')
+               {
+                       ++string;
+
+                       while (PUGI__IS_CHARTYPEX(*string, ctx_digit)) ++string;
+               }
+
+               // parse trailing whitespace
+               while (PUGI__IS_CHARTYPE(*string, ct_space)) ++string;
+
+               return *string == 0;
+       }
+
+       PUGI__FN double convert_string_to_number(const char_t* string)
+       {
+               // check string format
+               if (!check_string_to_number_format(string)) return gen_nan();
+
+               // parse string
+       #ifdef PUGIXML_WCHAR_MODE
+               return wcstod(string, 0);
+       #else
+               return atof(string);
+       #endif
+       }
+
+       PUGI__FN bool convert_string_to_number_scratch(char_t (&buffer)[32], const char_t* begin, const char_t* end, double* out_result)
+       {
+               size_t length = static_cast<size_t>(end - begin);
+               char_t* scratch = buffer;
+
+               if (length >= sizeof(buffer) / sizeof(buffer[0]))
+               {
+                       // need to make dummy on-heap copy
+                       scratch = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+                       if (!scratch) return false;
+               }
+
+               // copy string to zero-terminated buffer and perform conversion
+               memcpy(scratch, begin, length * sizeof(char_t));
+               scratch[length] = 0;
+
+               *out_result = convert_string_to_number(scratch);
+
+               // free dummy buffer
+               if (scratch != buffer) xml_memory::deallocate(scratch);
+
+               return true;
+       }
+       
+       PUGI__FN double round_nearest(double value)
+       {
+               return floor(value + 0.5);
+       }
+
+       PUGI__FN double round_nearest_nzero(double value)
+       {
+               // same as round_nearest, but returns -0 for [-0.5, -0]
+               // ceil is used to differentiate between +0 and -0 (we return -0 for [-0.5, -0] and +0 for +0)
+               return (value >= -0.5 && value <= 0) ? ceil(value) : floor(value + 0.5);
+       }
+       
+       PUGI__FN const char_t* qualified_name(const xpath_node& node)
+       {
+               return node.attribute() ? node.attribute().name() : node.node().name();
+       }
+       
+       PUGI__FN const char_t* local_name(const xpath_node& node)
+       {
+               const char_t* name = qualified_name(node);
+               const char_t* p = find_char(name, ':');
+               
+               return p ? p + 1 : name;
+       }
+
+       struct namespace_uri_predicate
+       {
+               const char_t* prefix;
+               size_t prefix_length;
+
+               namespace_uri_predicate(const char_t* name)
+               {
+                       const char_t* pos = find_char(name, ':');
+
+                       prefix = pos ? name : 0;
+                       prefix_length = pos ? static_cast<size_t>(pos - name) : 0;
+               }
+
+               bool operator()(xml_attribute a) const
+               {
+                       const char_t* name = a.name();
+
+                       if (!starts_with(name, PUGIXML_TEXT("xmlns"))) return false;
+
+                       return prefix ? name[5] == ':' && strequalrange(name + 6, prefix, prefix_length) : name[5] == 0;
+               }
+       };
+
+       PUGI__FN const char_t* namespace_uri(xml_node node)
+       {
+               namespace_uri_predicate pred = node.name();
+               
+               xml_node p = node;
+               
+               while (p)
+               {
+                       xml_attribute a = p.find_attribute(pred);
+                       
+                       if (a) return a.value();
+                       
+                       p = p.parent();
+               }
+               
+               return PUGIXML_TEXT("");
+       }
+
+       PUGI__FN const char_t* namespace_uri(xml_attribute attr, xml_node parent)
+       {
+               namespace_uri_predicate pred = attr.name();
+               
+               // Default namespace does not apply to attributes
+               if (!pred.prefix) return PUGIXML_TEXT("");
+               
+               xml_node p = parent;
+               
+               while (p)
+               {
+                       xml_attribute a = p.find_attribute(pred);
+                       
+                       if (a) return a.value();
+                       
+                       p = p.parent();
+               }
+               
+               return PUGIXML_TEXT("");
+       }
+
+       PUGI__FN const char_t* namespace_uri(const xpath_node& node)
+       {
+               return node.attribute() ? namespace_uri(node.attribute(), node.parent()) : namespace_uri(node.node());
+       }
+
+       PUGI__FN void normalize_space(char_t* buffer)
+       {
+               char_t* write = buffer;
+
+               for (char_t* it = buffer; *it; )
+               {
+                       char_t ch = *it++;
+
+                       if (PUGI__IS_CHARTYPE(ch, ct_space))
+                       {
+                               // replace whitespace sequence with single space
+                               while (PUGI__IS_CHARTYPE(*it, ct_space)) it++;
+
+                               // avoid leading spaces
+                               if (write != buffer) *write++ = ' ';
+                       }
+                       else *write++ = ch;
+               }
+
+               // remove trailing space
+               if (write != buffer && PUGI__IS_CHARTYPE(write[-1], ct_space)) write--;
+
+               // zero-terminate
+               *write = 0;
+       }
+
+       PUGI__FN void translate(char_t* buffer, const char_t* from, const char_t* to, size_t to_length)
+       {
+               char_t* write = buffer;
+
+               while (*buffer)
+               {
+                       PUGI__DMC_VOLATILE char_t ch = *buffer++;
+
+                       const char_t* pos = find_char(from, ch);
+
+                       if (!pos)
+                               *write++ = ch; // do not process
+                       else if (static_cast<size_t>(pos - from) < to_length)
+                               *write++ = to[pos - from]; // replace
+               }
+
+               // zero-terminate
+               *write = 0;
+       }
+
+       PUGI__FN unsigned char* translate_table_generate(xpath_allocator* alloc, const char_t* from, const char_t* to)
+       {
+               unsigned char table[128] = {0};
+
+               while (*from)
+               {
+                       unsigned int fc = static_cast<unsigned int>(*from);
+                       unsigned int tc = static_cast<unsigned int>(*to);
+
+                       if (fc >= 128 || tc >= 128)
+                               return 0;
+
+                       // code=128 means "skip character"
+                       if (!table[fc])
+                               table[fc] = static_cast<unsigned char>(tc ? tc : 128);
+
+                       from++;
+                       if (tc) to++;
+               }
+
+               for (int i = 0; i < 128; ++i)
+                       if (!table[i])
+                               table[i] = static_cast<unsigned char>(i);
+
+               void* result = alloc->allocate_nothrow(sizeof(table));
+
+               if (result)
+               {
+                       memcpy(result, table, sizeof(table));
+               }
+
+               return static_cast<unsigned char*>(result);
+       }
+
+       PUGI__FN void translate_table(char_t* buffer, const unsigned char* table)
+       {
+               char_t* write = buffer;
+
+               while (*buffer)
+               {
+                       char_t ch = *buffer++;
+                       unsigned int index = static_cast<unsigned int>(ch);
+
+                       if (index < 128)
+                       {
+                               unsigned char code = table[index];
+
+                               // code=128 means "skip character" (table size is 128 so 128 can be a special value)
+                               // this code skips these characters without extra branches
+                               *write = static_cast<char_t>(code);
+                               write += 1 - (code >> 7);
+                       }
+                       else
+                       {
+                               *write++ = ch;
+                       }
+               }
+
+               // zero-terminate
+               *write = 0;
+       }
+
+       inline bool is_xpath_attribute(const char_t* name)
+       {
+               return !(starts_with(name, PUGIXML_TEXT("xmlns")) && (name[5] == 0 || name[5] == ':'));
+       }
+
+       struct xpath_variable_boolean: xpath_variable
+       {
+               xpath_variable_boolean(): value(false)
+               {
+               }
+
+               bool value;
+               char_t name[1];
+       };
+
+       struct xpath_variable_number: xpath_variable
+       {
+               xpath_variable_number(): value(0)
+               {
+               }
+
+               double value;
+               char_t name[1];
+       };
+
+       struct xpath_variable_string: xpath_variable
+       {
+               xpath_variable_string(): value(0)
+               {
+               }
+
+               ~xpath_variable_string()
+               {
+                       if (value) xml_memory::deallocate(value);
+               }
+
+               char_t* value;
+               char_t name[1];
+       };
+
+       struct xpath_variable_node_set: xpath_variable
+       {
+               xpath_node_set value;
+               char_t name[1];
+       };
+
+       static const xpath_node_set dummy_node_set;
+
+       PUGI__FN unsigned int hash_string(const char_t* str)
+       {
+               // Jenkins one-at-a-time hash (http://en.wikipedia.org/wiki/Jenkins_hash_function#one-at-a-time)
+               unsigned int result = 0;
+
+               while (*str)
+               {
+                       result += static_cast<unsigned int>(*str++);
+                       result += result << 10;
+                       result ^= result >> 6;
+               }
+       
+               result += result << 3;
+               result ^= result >> 11;
+               result += result << 15;
+       
+               return result;
+       }
+
+       template <typename T> PUGI__FN T* new_xpath_variable(const char_t* name)
+       {
+               size_t length = strlength(name);
+               if (length == 0) return 0; // empty variable names are invalid
+
+               // $$ we can't use offsetof(T, name) because T is non-POD, so we just allocate additional length characters
+               void* memory = xml_memory::allocate(sizeof(T) + length * sizeof(char_t));
+               if (!memory) return 0;
+
+               T* result = new (memory) T();
+
+               memcpy(result->name, name, (length + 1) * sizeof(char_t));
+
+               return result;
+       }
+
+       PUGI__FN xpath_variable* new_xpath_variable(xpath_value_type type, const char_t* name)
+       {
+               switch (type)
+               {
+               case xpath_type_node_set:
+                       return new_xpath_variable<xpath_variable_node_set>(name);
+
+               case xpath_type_number:
+                       return new_xpath_variable<xpath_variable_number>(name);
+
+               case xpath_type_string:
+                       return new_xpath_variable<xpath_variable_string>(name);
+
+               case xpath_type_boolean:
+                       return new_xpath_variable<xpath_variable_boolean>(name);
+
+               default:
+                       return 0;
+               }
+       }
+
+       template <typename T> PUGI__FN void delete_xpath_variable(T* var)
+       {
+               var->~T();
+               xml_memory::deallocate(var);
+       }
+
+       PUGI__FN void delete_xpath_variable(xpath_value_type type, xpath_variable* var)
+       {
+               switch (type)
+               {
+               case xpath_type_node_set:
+                       delete_xpath_variable(static_cast<xpath_variable_node_set*>(var));
+                       break;
+
+               case xpath_type_number:
+                       delete_xpath_variable(static_cast<xpath_variable_number*>(var));
+                       break;
+
+               case xpath_type_string:
+                       delete_xpath_variable(static_cast<xpath_variable_string*>(var));
+                       break;
+
+               case xpath_type_boolean:
+                       delete_xpath_variable(static_cast<xpath_variable_boolean*>(var));
+                       break;
+
+               default:
+                       assert(!"Invalid variable type");
+               }
+       }
+
+       PUGI__FN xpath_variable* get_variable_scratch(char_t (&buffer)[32], xpath_variable_set* set, const char_t* begin, const char_t* end)
+       {
+               size_t length = static_cast<size_t>(end - begin);
+               char_t* scratch = buffer;
+
+               if (length >= sizeof(buffer) / sizeof(buffer[0]))
+               {
+                       // need to make dummy on-heap copy
+                       scratch = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+                       if (!scratch) return 0;
+               }
+
+               // copy string to zero-terminated buffer and perform lookup
+               memcpy(scratch, begin, length * sizeof(char_t));
+               scratch[length] = 0;
+
+               xpath_variable* result = set->get(scratch);
+
+               // free dummy buffer
+               if (scratch != buffer) xml_memory::deallocate(scratch);
+
+               return result;
+       }
+PUGI__NS_END
+
+// Internal node set class
+PUGI__NS_BEGIN
+       PUGI__FN xpath_node_set::type_t xpath_get_order(const xpath_node* begin, const xpath_node* end)
+       {
+               if (end - begin < 2)
+                       return xpath_node_set::type_sorted;
+
+               document_order_comparator cmp;
+
+               bool first = cmp(begin[0], begin[1]);
+
+               for (const xpath_node* it = begin + 1; it + 1 < end; ++it)
+                       if (cmp(it[0], it[1]) != first)
+                               return xpath_node_set::type_unsorted;
+
+               return first ? xpath_node_set::type_sorted : xpath_node_set::type_sorted_reverse;
+       }
+
+       PUGI__FN xpath_node_set::type_t xpath_sort(xpath_node* begin, xpath_node* end, xpath_node_set::type_t type, bool rev)
+       {
+               xpath_node_set::type_t order = rev ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted;
+
+               if (type == xpath_node_set::type_unsorted)
+               {
+                       xpath_node_set::type_t sorted = xpath_get_order(begin, end);
+
+                       if (sorted == xpath_node_set::type_unsorted)
+                       {
+                               sort(begin, end, document_order_comparator());
+
+                               type = xpath_node_set::type_sorted;
+                       }
+                       else
+                               type = sorted;
+               }
+               
+               if (type != order) reverse(begin, end);
+                       
+               return order;
+       }
+
+       PUGI__FN xpath_node xpath_first(const xpath_node* begin, const xpath_node* end, xpath_node_set::type_t type)
+       {
+               if (begin == end) return xpath_node();
+
+               switch (type)
+               {
+               case xpath_node_set::type_sorted:
+                       return *begin;
+
+               case xpath_node_set::type_sorted_reverse:
+                       return *(end - 1);
+
+               case xpath_node_set::type_unsorted:
+                       return *min_element(begin, end, document_order_comparator());
+
+               default:
+                       assert(!"Invalid node set type");
+                       return xpath_node();
+               }
+       }
+
+       class xpath_node_set_raw
+       {
+               xpath_node_set::type_t _type;
+
+               xpath_node* _begin;
+               xpath_node* _end;
+               xpath_node* _eos;
+
+       public:
+               xpath_node_set_raw(): _type(xpath_node_set::type_unsorted), _begin(0), _end(0), _eos(0)
+               {
+               }
+
+               xpath_node* begin() const
+               {
+                       return _begin;
+               }
+
+               xpath_node* end() const
+               {
+                       return _end;
+               }
+
+               bool empty() const
+               {
+                       return _begin == _end;
+               }
+
+               size_t size() const
+               {
+                       return static_cast<size_t>(_end - _begin);
+               }
+
+               xpath_node first() const
+               {
+                       return xpath_first(_begin, _end, _type);
+               }
+
+               void push_back_grow(const xpath_node& node, xpath_allocator* alloc);
+
+               void push_back(const xpath_node& node, xpath_allocator* alloc)
+               {
+                       if (_end != _eos)
+                               *_end++ = node;
+                       else
+                               push_back_grow(node, alloc);
+               }
+
+               void append(const xpath_node* begin_, const xpath_node* end_, xpath_allocator* alloc)
+               {
+                       if (begin_ == end_) return;
+
+                       size_t size_ = static_cast<size_t>(_end - _begin);
+                       size_t capacity = static_cast<size_t>(_eos - _begin);
+                       size_t count = static_cast<size_t>(end_ - begin_);
+
+                       if (size_ + count > capacity)
+                       {
+                               // reallocate the old array or allocate a new one
+                               xpath_node* data = static_cast<xpath_node*>(alloc->reallocate(_begin, capacity * sizeof(xpath_node), (size_ + count) * sizeof(xpath_node)));
+                               assert(data);
+
+                               // finalize
+                               _begin = data;
+                               _end = data + size_;
+                               _eos = data + size_ + count;
+                       }
+
+                       memcpy(_end, begin_, count * sizeof(xpath_node));
+                       _end += count;
+               }
+
+               void sort_do()
+               {
+                       _type = xpath_sort(_begin, _end, _type, false);
+               }
+
+               void truncate(xpath_node* pos)
+               {
+                       assert(_begin <= pos && pos <= _end);
+
+                       _end = pos;
+               }
+
+               void remove_duplicates()
+               {
+                       if (_type == xpath_node_set::type_unsorted)
+                               sort(_begin, _end, duplicate_comparator());
+               
+                       _end = unique(_begin, _end);
+               }
+
+               xpath_node_set::type_t type() const
+               {
+                       return _type;
+               }
+
+               void set_type(xpath_node_set::type_t value)
+               {
+                       _type = value;
+               }
+       };
+
+       PUGI__FN_NO_INLINE void xpath_node_set_raw::push_back_grow(const xpath_node& node, xpath_allocator* alloc)
+       {
+               size_t capacity = static_cast<size_t>(_eos - _begin);
+
+               // get new capacity (1.5x rule)
+               size_t new_capacity = capacity + capacity / 2 + 1;
+
+               // reallocate the old array or allocate a new one
+               xpath_node* data = static_cast<xpath_node*>(alloc->reallocate(_begin, capacity * sizeof(xpath_node), new_capacity * sizeof(xpath_node)));
+               assert(data);
+
+               // finalize
+               _begin = data;
+               _end = data + capacity;
+               _eos = data + new_capacity;
+
+               // push
+               *_end++ = node;
+       }
+PUGI__NS_END
+
+PUGI__NS_BEGIN
+       struct xpath_context
+       {
+               xpath_node n;
+               size_t position, size;
+
+               xpath_context(const xpath_node& n_, size_t position_, size_t size_): n(n_), position(position_), size(size_)
+               {
+               }
+       };
+
+       enum lexeme_t
+       {
+               lex_none = 0,
+               lex_equal,
+               lex_not_equal,
+               lex_less,
+               lex_greater,
+               lex_less_or_equal,
+               lex_greater_or_equal,
+               lex_plus,
+               lex_minus,
+               lex_multiply,
+               lex_union,
+               lex_var_ref,
+               lex_open_brace,
+               lex_close_brace,
+               lex_quoted_string,
+               lex_number,
+               lex_slash,
+               lex_double_slash,
+               lex_open_square_brace,
+               lex_close_square_brace,
+               lex_string,
+               lex_comma,
+               lex_axis_attribute,
+               lex_dot,
+               lex_double_dot,
+               lex_double_colon,
+               lex_eof
+       };
+
+       struct xpath_lexer_string
+       {
+               const char_t* begin;
+               const char_t* end;
+
+               xpath_lexer_string(): begin(0), end(0)
+               {
+               }
+
+               bool operator==(const char_t* other) const
+               {
+                       size_t length = static_cast<size_t>(end - begin);
+
+                       return strequalrange(other, begin, length);
+               }
+       };
+
+       class xpath_lexer
+       {
+               const char_t* _cur;
+               const char_t* _cur_lexeme_pos;
+               xpath_lexer_string _cur_lexeme_contents;
+
+               lexeme_t _cur_lexeme;
+
+       public:
+               explicit xpath_lexer(const char_t* query): _cur(query)
+               {
+                       next();
+               }
+               
+               const char_t* state() const
+               {
+                       return _cur;
+               }
+               
+               void next()
+               {
+                       const char_t* cur = _cur;
+
+                       while (PUGI__IS_CHARTYPE(*cur, ct_space)) ++cur;
+
+                       // save lexeme position for error reporting
+                       _cur_lexeme_pos = cur;
+
+                       switch (*cur)
+                       {
+                       case 0:
+                               _cur_lexeme = lex_eof;
+                               break;
+                       
+                       case '>':
+                               if (*(cur+1) == '=')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_greater_or_equal;
+                               }
+                               else
+                               {
+                                       cur += 1;
+                                       _cur_lexeme = lex_greater;
+                               }
+                               break;
+
+                       case '<':
+                               if (*(cur+1) == '=')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_less_or_equal;
+                               }
+                               else
+                               {
+                                       cur += 1;
+                                       _cur_lexeme = lex_less;
+                               }
+                               break;
+
+                       case '!':
+                               if (*(cur+1) == '=')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_not_equal;
+                               }
+                               else
+                               {
+                                       _cur_lexeme = lex_none;
+                               }
+                               break;
+
+                       case '=':
+                               cur += 1;
+                               _cur_lexeme = lex_equal;
+
+                               break;
+                       
+                       case '+':
+                               cur += 1;
+                               _cur_lexeme = lex_plus;
+
+                               break;
+
+                       case '-':
+                               cur += 1;
+                               _cur_lexeme = lex_minus;
+
+                               break;
+
+                       case '*':
+                               cur += 1;
+                               _cur_lexeme = lex_multiply;
+
+                               break;
+
+                       case '|':
+                               cur += 1;
+                               _cur_lexeme = lex_union;
+
+                               break;
+                       
+                       case '$':
+                               cur += 1;
+
+                               if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol))
+                               {
+                                       _cur_lexeme_contents.begin = cur;
+
+                                       while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+
+                                       if (cur[0] == ':' && PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) // qname
+                                       {
+                                               cur++; // :
+
+                                               while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+                                       }
+
+                                       _cur_lexeme_contents.end = cur;
+                               
+                                       _cur_lexeme = lex_var_ref;
+                               }
+                               else
+                               {
+                                       _cur_lexeme = lex_none;
+                               }
+
+                               break;
+
+                       case '(':
+                               cur += 1;
+                               _cur_lexeme = lex_open_brace;
+
+                               break;
+
+                       case ')':
+                               cur += 1;
+                               _cur_lexeme = lex_close_brace;
+
+                               break;
+                       
+                       case '[':
+                               cur += 1;
+                               _cur_lexeme = lex_open_square_brace;
+
+                               break;
+
+                       case ']':
+                               cur += 1;
+                               _cur_lexeme = lex_close_square_brace;
+
+                               break;
+
+                       case ',':
+                               cur += 1;
+                               _cur_lexeme = lex_comma;
+
+                               break;
+
+                       case '/':
+                               if (*(cur+1) == '/')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_double_slash;
+                               }
+                               else
+                               {
+                                       cur += 1;
+                                       _cur_lexeme = lex_slash;
+                               }
+                               break;
+               
+                       case '.':
+                               if (*(cur+1) == '.')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_double_dot;
+                               }
+                               else if (PUGI__IS_CHARTYPEX(*(cur+1), ctx_digit))
+                               {
+                                       _cur_lexeme_contents.begin = cur; // .
+
+                                       ++cur;
+
+                                       while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;
+
+                                       _cur_lexeme_contents.end = cur;
+                                       
+                                       _cur_lexeme = lex_number;
+                               }
+                               else
+                               {
+                                       cur += 1;
+                                       _cur_lexeme = lex_dot;
+                               }
+                               break;
+
+                       case '@':
+                               cur += 1;
+                               _cur_lexeme = lex_axis_attribute;
+
+                               break;
+
+                       case '"':
+                       case '\'':
+                       {
+                               char_t terminator = *cur;
+
+                               ++cur;
+
+                               _cur_lexeme_contents.begin = cur;
+                               while (*cur && *cur != terminator) cur++;
+                               _cur_lexeme_contents.end = cur;
+                               
+                               if (!*cur)
+                                       _cur_lexeme = lex_none;
+                               else
+                               {
+                                       cur += 1;
+                                       _cur_lexeme = lex_quoted_string;
+                               }
+
+                               break;
+                       }
+
+                       case ':':
+                               if (*(cur+1) == ':')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_double_colon;
+                               }
+                               else
+                               {
+                                       _cur_lexeme = lex_none;
+                               }
+                               break;
+
+                       default:
+                               if (PUGI__IS_CHARTYPEX(*cur, ctx_digit))
+                               {
+                                       _cur_lexeme_contents.begin = cur;
+
+                                       while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;
+                               
+                                       if (*cur == '.')
+                                       {
+                                               cur++;
+
+                                               while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;
+                                       }
+
+                                       _cur_lexeme_contents.end = cur;
+
+                                       _cur_lexeme = lex_number;
+                               }
+                               else if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol))
+                               {
+                                       _cur_lexeme_contents.begin = cur;
+
+                                       while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+
+                                       if (cur[0] == ':')
+                                       {
+                                               if (cur[1] == '*') // namespace test ncname:*
+                                               {
+                                                       cur += 2; // :*
+                                               }
+                                               else if (PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) // namespace test qname
+                                               {
+                                                       cur++; // :
+
+                                                       while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+                                               }
+                                       }
+
+                                       _cur_lexeme_contents.end = cur;
+                               
+                                       _cur_lexeme = lex_string;
+                               }
+                               else
+                               {
+                                       _cur_lexeme = lex_none;
+                               }
+                       }
+
+                       _cur = cur;
+               }
+
+               lexeme_t current() const
+               {
+                       return _cur_lexeme;
+               }
+
+               const char_t* current_pos() const
+               {
+                       return _cur_lexeme_pos;
+               }
+
+               const xpath_lexer_string& contents() const
+               {
+                       assert(_cur_lexeme == lex_var_ref || _cur_lexeme == lex_number || _cur_lexeme == lex_string || _cur_lexeme == lex_quoted_string);
+
+                       return _cur_lexeme_contents;
+               }
+       };
+
+       enum ast_type_t
+       {
+               ast_unknown,
+               ast_op_or,                                              // left or right
+               ast_op_and,                                             // left and right
+               ast_op_equal,                                   // left = right
+               ast_op_not_equal,                               // left != right
+               ast_op_less,                                    // left < right
+               ast_op_greater,                                 // left > right
+               ast_op_less_or_equal,                   // left <= right
+               ast_op_greater_or_equal,                // left >= right
+               ast_op_add,                                             // left + right
+               ast_op_subtract,                                // left - right
+               ast_op_multiply,                                // left * right
+               ast_op_divide,                                  // left / right
+               ast_op_mod,                                             // left % right
+               ast_op_negate,                                  // left - right
+               ast_op_union,                                   // left | right
+               ast_predicate,                                  // apply predicate to set; next points to next predicate
+               ast_filter,                                             // select * from left where right
+               ast_string_constant,                    // string constant
+               ast_number_constant,                    // number constant
+               ast_variable,                                   // variable
+               ast_func_last,                                  // last()
+               ast_func_position,                              // position()
+               ast_func_count,                                 // count(left)
+               ast_func_id,                                    // id(left)
+               ast_func_local_name_0,                  // local-name()
+               ast_func_local_name_1,                  // local-name(left)
+               ast_func_namespace_uri_0,               // namespace-uri()
+               ast_func_namespace_uri_1,               // namespace-uri(left)
+               ast_func_name_0,                                // name()
+               ast_func_name_1,                                // name(left)
+               ast_func_string_0,                              // string()
+               ast_func_string_1,                              // string(left)
+               ast_func_concat,                                // concat(left, right, siblings)
+               ast_func_starts_with,                   // starts_with(left, right)
+               ast_func_contains,                              // contains(left, right)
+               ast_func_substring_before,              // substring-before(left, right)
+               ast_func_substring_after,               // substring-after(left, right)
+               ast_func_substring_2,                   // substring(left, right)
+               ast_func_substring_3,                   // substring(left, right, third)
+               ast_func_string_length_0,               // string-length()
+               ast_func_string_length_1,               // string-length(left)
+               ast_func_normalize_space_0,             // normalize-space()
+               ast_func_normalize_space_1,             // normalize-space(left)
+               ast_func_translate,                             // translate(left, right, third)
+               ast_func_boolean,                               // boolean(left)
+               ast_func_not,                                   // not(left)
+               ast_func_true,                                  // true()
+               ast_func_false,                                 // false()
+               ast_func_lang,                                  // lang(left)
+               ast_func_number_0,                              // number()
+               ast_func_number_1,                              // number(left)
+               ast_func_sum,                                   // sum(left)
+               ast_func_floor,                                 // floor(left)
+               ast_func_ceiling,                               // ceiling(left)
+               ast_func_round,                                 // round(left)
+               ast_step,                                               // process set left with step
+               ast_step_root,                                  // select root node
+
+               ast_opt_translate_table,                // translate(left, right, third) where right/third are constants
+               ast_opt_compare_attribute               // @name = 'string'
+       };
+
+       enum axis_t
+       {
+               axis_ancestor,
+               axis_ancestor_or_self,
+               axis_attribute,
+               axis_child,
+               axis_descendant,
+               axis_descendant_or_self,
+               axis_following,
+               axis_following_sibling,
+               axis_namespace,
+               axis_parent,
+               axis_preceding,
+               axis_preceding_sibling,
+               axis_self
+       };
+       
+       enum nodetest_t
+       {
+               nodetest_none,
+               nodetest_name,
+               nodetest_type_node,
+               nodetest_type_comment,
+               nodetest_type_pi,
+               nodetest_type_text,
+               nodetest_pi,
+               nodetest_all,
+               nodetest_all_in_namespace
+       };
+
+       enum predicate_t
+       {
+               predicate_default,
+               predicate_posinv,
+               predicate_constant,
+               predicate_constant_one
+       };
+
+       enum nodeset_eval_t
+       {
+               nodeset_eval_all,
+               nodeset_eval_any,
+               nodeset_eval_first
+       };
+
+       template <axis_t N> struct axis_to_type
+       {
+               static const axis_t axis;
+       };
+
+       template <axis_t N> const axis_t axis_to_type<N>::axis = N;
+               
+       class xpath_ast_node
+       {
+       private:
+               // node type
+               char _type;
+               char _rettype;
+
+               // for ast_step
+               char _axis;
+
+               // for ast_step/ast_predicate/ast_filter
+               char _test;
+
+               // tree node structure
+               xpath_ast_node* _left;
+               xpath_ast_node* _right;
+               xpath_ast_node* _next;
+
+               union
+               {
+                       // value for ast_string_constant
+                       const char_t* string;
+                       // value for ast_number_constant
+                       double number;
+                       // variable for ast_variable
+                       xpath_variable* variable;
+                       // node test for ast_step (node name/namespace/node type/pi target)
+                       const char_t* nodetest;
+                       // table for ast_opt_translate_table
+                       const unsigned char* table;
+               } _data;
+
+               xpath_ast_node(const xpath_ast_node&);
+               xpath_ast_node& operator=(const xpath_ast_node&);
+
+               template <class Comp> static bool compare_eq(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp)
+               {
+                       xpath_value_type lt = lhs->rettype(), rt = rhs->rettype();
+
+                       if (lt != xpath_type_node_set && rt != xpath_type_node_set)
+                       {
+                               if (lt == xpath_type_boolean || rt == xpath_type_boolean)
+                                       return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack));
+                               else if (lt == xpath_type_number || rt == xpath_type_number)
+                                       return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack));
+                               else if (lt == xpath_type_string || rt == xpath_type_string)
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       xpath_string ls = lhs->eval_string(c, stack);
+                                       xpath_string rs = rhs->eval_string(c, stack);
+
+                                       return comp(ls, rs);
+                               }
+                       }
+                       else if (lt == xpath_type_node_set && rt == xpath_type_node_set)
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all);
+                               xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all);
+
+                               for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
+                                       for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+                                       {
+                                               xpath_allocator_capture cri(stack.result);
+
+                                               if (comp(string_value(*li, stack.result), string_value(*ri, stack.result)))
+                                                       return true;
+                                       }
+
+                               return false;
+                       }
+                       else
+                       {
+                               if (lt == xpath_type_node_set)
+                               {
+                                       swap(lhs, rhs);
+                                       swap(lt, rt);
+                               }
+
+                               if (lt == xpath_type_boolean)
+                                       return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack));
+                               else if (lt == xpath_type_number)
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       double l = lhs->eval_number(c, stack);
+                                       xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all);
+
+                                       for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+                                       {
+                                               xpath_allocator_capture cri(stack.result);
+
+                                               if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
+                                                       return true;
+                                       }
+
+                                       return false;
+                               }
+                               else if (lt == xpath_type_string)
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       xpath_string l = lhs->eval_string(c, stack);
+                                       xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all);
+
+                                       for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+                                       {
+                                               xpath_allocator_capture cri(stack.result);
+
+                                               if (comp(l, string_value(*ri, stack.result)))
+                                                       return true;
+                                       }
+
+                                       return false;
+                               }
+                       }
+
+                       assert(!"Wrong types");
+                       return false;
+               }
+
+               static bool eval_once(xpath_node_set::type_t type, nodeset_eval_t eval)
+               {
+                       return type == xpath_node_set::type_sorted ? eval != nodeset_eval_all : eval == nodeset_eval_any;
+               }
+
+               template <class Comp> static bool compare_rel(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp)
+               {
+                       xpath_value_type lt = lhs->rettype(), rt = rhs->rettype();
+
+                       if (lt != xpath_type_node_set && rt != xpath_type_node_set)
+                               return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack));
+                       else if (lt == xpath_type_node_set && rt == xpath_type_node_set)
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all);
+                               xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all);
+
+                               for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
+                               {
+                                       xpath_allocator_capture cri(stack.result);
+
+                                       double l = convert_string_to_number(string_value(*li, stack.result).c_str());
+
+                                       for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+                                       {
+                                               xpath_allocator_capture crii(stack.result);
+
+                                               if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
+                                                       return true;
+                                       }
+                               }
+
+                               return false;
+                       }
+                       else if (lt != xpath_type_node_set && rt == xpath_type_node_set)
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               double l = lhs->eval_number(c, stack);
+                               xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all);
+
+                               for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+                               {
+                                       xpath_allocator_capture cri(stack.result);
+
+                                       if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
+                                               return true;
+                               }
+
+                               return false;
+                       }
+                       else if (lt == xpath_type_node_set && rt != xpath_type_node_set)
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all);
+                               double r = rhs->eval_number(c, stack);
+
+                               for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
+                               {
+                                       xpath_allocator_capture cri(stack.result);
+
+                                       if (comp(convert_string_to_number(string_value(*li, stack.result).c_str()), r))
+                                               return true;
+                               }
+
+                               return false;
+                       }
+                       else
+                       {
+                               assert(!"Wrong types");
+                               return false;
+                       }
+               }
+
+               static void apply_predicate_boolean(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack, bool once)
+               {
+                       assert(ns.size() >= first);
+                       assert(expr->rettype() != xpath_type_number);
+
+                       size_t i = 1;
+                       size_t size = ns.size() - first;
+
+                       xpath_node* last = ns.begin() + first;
+
+                       // remove_if... or well, sort of
+                       for (xpath_node* it = last; it != ns.end(); ++it, ++i)
+                       {
+                               xpath_context c(*it, i, size);
+
+                               if (expr->eval_boolean(c, stack))
+                               {
+                                       *last++ = *it;
+
+                                       if (once) break;
+                               }
+                       }
+
+                       ns.truncate(last);
+               }
+
+               static void apply_predicate_number(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack, bool once)
+               {
+                       assert(ns.size() >= first);
+                       assert(expr->rettype() == xpath_type_number);
+
+                       size_t i = 1;
+                       size_t size = ns.size() - first;
+
+                       xpath_node* last = ns.begin() + first;
+
+                       // remove_if... or well, sort of
+                       for (xpath_node* it = last; it != ns.end(); ++it, ++i)
+                       {
+                               xpath_context c(*it, i, size);
+
+                               if (expr->eval_number(c, stack) == i)
+                               {
+                                       *last++ = *it;
+
+                                       if (once) break;
+                               }
+                       }
+
+                       ns.truncate(last);
+               }
+
+               static void apply_predicate_number_const(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack)
+               {
+                       assert(ns.size() >= first);
+                       assert(expr->rettype() == xpath_type_number);
+
+                       size_t size = ns.size() - first;
+
+                       xpath_node* last = ns.begin() + first;
+
+                       xpath_context c(xpath_node(), 1, size);
+
+                       double er = expr->eval_number(c, stack);
+
+                       if (er >= 1.0 && er <= size)
+                       {
+                               size_t eri = static_cast<size_t>(er);
+
+                               if (er == eri)
+                               {
+                                       xpath_node r = last[eri - 1];
+
+                                       *last++ = r;
+                               }
+                       }
+
+                       ns.truncate(last);
+               }
+
+               void apply_predicate(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack, bool once)
+               {
+                       if (ns.size() == first) return;
+
+                       assert(_type == ast_filter || _type == ast_predicate);
+
+                       if (_test == predicate_constant || _test == predicate_constant_one)
+                               apply_predicate_number_const(ns, first, _right, stack);
+                       else if (_right->rettype() == xpath_type_number)
+                               apply_predicate_number(ns, first, _right, stack, once);
+                       else
+                               apply_predicate_boolean(ns, first, _right, stack, once);
+               }
+
+               void apply_predicates(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack, nodeset_eval_t eval)
+               {
+                       if (ns.size() == first) return;
+
+                       bool last_once = eval_once(ns.type(), eval);
+
+                       for (xpath_ast_node* pred = _right; pred; pred = pred->_next)
+                               pred->apply_predicate(ns, first, stack, !pred->_next && last_once);
+               }
+
+               bool step_push(xpath_node_set_raw& ns, xml_attribute_struct* a, xml_node_struct* parent, xpath_allocator* alloc)
+               {
+                       assert(a);
+
+                       const char_t* name = a->name ? a->name : PUGIXML_TEXT("");
+
+                       switch (_test)
+                       {
+                       case nodetest_name:
+                               if (strequal(name, _data.nodetest) && is_xpath_attribute(name))
+                               {
+                                       ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_type_node:
+                       case nodetest_all:
+                               if (is_xpath_attribute(name))
+                               {
+                                       ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_all_in_namespace:
+                               if (starts_with(name, _data.nodetest) && is_xpath_attribute(name))
+                               {
+                                       ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc);
+                                       return true;
+                               }
+                               break;
+                       
+                       default:
+                               ;
+                       }
+
+                       return false;
+               }
+               
+               bool step_push(xpath_node_set_raw& ns, xml_node_struct* n, xpath_allocator* alloc)
+               {
+                       assert(n);
+
+                       xml_node_type type = PUGI__NODETYPE(n);
+
+                       switch (_test)
+                       {
+                       case nodetest_name:
+                               if (type == node_element && n->name && strequal(n->name, _data.nodetest))
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_type_node:
+                               ns.push_back(xml_node(n), alloc);
+                               return true;
+                               
+                       case nodetest_type_comment:
+                               if (type == node_comment)
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_type_text:
+                               if (type == node_pcdata || type == node_cdata)
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_type_pi:
+                               if (type == node_pi)
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                                                                       
+                       case nodetest_pi:
+                               if (type == node_pi && n->name && strequal(n->name, _data.nodetest))
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_all:
+                               if (type == node_element)
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_all_in_namespace:
+                               if (type == node_element && n->name && starts_with(n->name, _data.nodetest))
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+
+                       default:
+                               assert(!"Unknown axis");
+                       }
+
+                       return false;
+               }
+
+               template <class T> void step_fill(xpath_node_set_raw& ns, xml_node_struct* n, xpath_allocator* alloc, bool once, T)
+               {
+                       const axis_t axis = T::axis;
+
+                       switch (axis)
+                       {
+                       case axis_attribute:
+                       {
+                               for (xml_attribute_struct* a = n->first_attribute; a; a = a->next_attribute)
+                                       if (step_push(ns, a, n, alloc) & once)
+                                               return;
+                               
+                               break;
+                       }
+                       
+                       case axis_child:
+                       {
+                               for (xml_node_struct* c = n->first_child; c; c = c->next_sibling)
+                                       if (step_push(ns, c, alloc) & once)
+                                               return;
+                                       
+                               break;
+                       }
+                       
+                       case axis_descendant:
+                       case axis_descendant_or_self:
+                       {
+                               if (axis == axis_descendant_or_self)
+                                       if (step_push(ns, n, alloc) & once)
+                                               return;
+                                       
+                               xml_node_struct* cur = n->first_child;
+                               
+                               while (cur)
+                               {
+                                       if (step_push(ns, cur, alloc) & once)
+                                               return;
+                                       
+                                       if (cur->first_child)
+                                               cur = cur->first_child;
+                                       else
+                                       {
+                                               while (!cur->next_sibling)
+                                               {
+                                                       cur = cur->parent;
+
+                                                       if (cur == n) return;
+                                               }
+                                       
+                                               cur = cur->next_sibling;
+                                       }
+                               }
+                               
+                               break;
+                       }
+                       
+                       case axis_following_sibling:
+                       {
+                               for (xml_node_struct* c = n->next_sibling; c; c = c->next_sibling)
+                                       if (step_push(ns, c, alloc) & once)
+                                               return;
+                               
+                               break;
+                       }
+                       
+                       case axis_preceding_sibling:
+                       {
+                               for (xml_node_struct* c = n->prev_sibling_c; c->next_sibling; c = c->prev_sibling_c)
+                                       if (step_push(ns, c, alloc) & once)
+                                               return;
+                               
+                               break;
+                       }
+                       
+                       case axis_following:
+                       {
+                               xml_node_struct* cur = n;
+
+                               // exit from this node so that we don't include descendants
+                               while (!cur->next_sibling)
+                               {
+                                       cur = cur->parent;
+
+                                       if (!cur) return;
+                               }
+
+                               cur = cur->next_sibling;
+
+                               while (cur)
+                               {
+                                       if (step_push(ns, cur, alloc) & once)
+                                               return;
+
+                                       if (cur->first_child)
+                                               cur = cur->first_child;
+                                       else
+                                       {
+                                               while (!cur->next_sibling)
+                                               {
+                                                       cur = cur->parent;
+
+                                                       if (!cur) return;
+                                               }
+
+                                               cur = cur->next_sibling;
+                                       }
+                               }
+
+                               break;
+                       }
+
+                       case axis_preceding:
+                       {
+                               xml_node_struct* cur = n;
+
+                               // exit from this node so that we don't include descendants
+                               while (!cur->prev_sibling_c->next_sibling)
+                               {
+                                       cur = cur->parent;
+
+                                       if (!cur) return;
+                               }
+
+                               cur = cur->prev_sibling_c;
+
+                               while (cur)
+                               {
+                                       if (cur->first_child)
+                                               cur = cur->first_child->prev_sibling_c;
+                                       else
+                                       {
+                                               // leaf node, can't be ancestor
+                                               if (step_push(ns, cur, alloc) & once)
+                                                       return;
+
+                                               while (!cur->prev_sibling_c->next_sibling)
+                                               {
+                                                       cur = cur->parent;
+
+                                                       if (!cur) return;
+
+                                                       if (!node_is_ancestor(cur, n))
+                                                               if (step_push(ns, cur, alloc) & once)
+                                                                       return;
+                                               }
+
+                                               cur = cur->prev_sibling_c;
+                                       }
+                               }
+
+                               break;
+                       }
+                       
+                       case axis_ancestor:
+                       case axis_ancestor_or_self:
+                       {
+                               if (axis == axis_ancestor_or_self)
+                                       if (step_push(ns, n, alloc) & once)
+                                               return;
+
+                               xml_node_struct* cur = n->parent;
+                               
+                               while (cur)
+                               {
+                                       if (step_push(ns, cur, alloc) & once)
+                                               return;
+                                       
+                                       cur = cur->parent;
+                               }
+                               
+                               break;
+                       }
+
+                       case axis_self:
+                       {
+                               step_push(ns, n, alloc);
+
+                               break;
+                       }
+
+                       case axis_parent:
+                       {
+                               if (n->parent)
+                                       step_push(ns, n->parent, alloc);
+
+                               break;
+                       }
+                               
+                       default:
+                               assert(!"Unimplemented axis");
+                       }
+               }
+               
+               template <class T> void step_fill(xpath_node_set_raw& ns, xml_attribute_struct* a, xml_node_struct* p, xpath_allocator* alloc, bool once, T v)
+               {
+                       const axis_t axis = T::axis;
+
+                       switch (axis)
+                       {
+                       case axis_ancestor:
+                       case axis_ancestor_or_self:
+                       {
+                               if (axis == axis_ancestor_or_self && _test == nodetest_type_node) // reject attributes based on principal node type test
+                                       if (step_push(ns, a, p, alloc) & once)
+                                               return;
+
+                               xml_node_struct* cur = p;
+                               
+                               while (cur)
+                               {
+                                       if (step_push(ns, cur, alloc) & once)
+                                               return;
+                                       
+                                       cur = cur->parent;
+                               }
+                               
+                               break;
+                       }
+
+                       case axis_descendant_or_self:
+                       case axis_self:
+                       {
+                               if (_test == nodetest_type_node) // reject attributes based on principal node type test
+                                       step_push(ns, a, p, alloc);
+
+                               break;
+                       }
+
+                       case axis_following:
+                       {
+                               xml_node_struct* cur = p;
+                               
+                               while (cur)
+                               {
+                                       if (cur->first_child)
+                                               cur = cur->first_child;
+                                       else
+                                       {
+                                               while (!cur->next_sibling)
+                                               {
+                                                       cur = cur->parent;
+
+                                                       if (!cur) return;
+                                               }
+
+                                               cur = cur->next_sibling;
+                                       }
+
+                                       if (step_push(ns, cur, alloc) & once)
+                                               return;
+                               }
+
+                               break;
+                       }
+
+                       case axis_parent:
+                       {
+                               step_push(ns, p, alloc);
+
+                               break;
+                       }
+
+                       case axis_preceding:
+                       {
+                               // preceding:: axis does not include attribute nodes and attribute ancestors (they are the same as parent's ancestors), so we can reuse node preceding
+                               step_fill(ns, p, alloc, once, v);
+                               break;
+                       }
+                       
+                       default:
+                               assert(!"Unimplemented axis");
+                       }
+               }
+
+               template <class T> void step_fill(xpath_node_set_raw& ns, const xpath_node& xn, xpath_allocator* alloc, bool once, T v)
+               {
+                       const axis_t axis = T::axis;
+                       const bool axis_has_attributes = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_descendant_or_self || axis == axis_following || axis == axis_parent || axis == axis_preceding || axis == axis_self);
+
+                       if (xn.node())
+                               step_fill(ns, xn.node().internal_object(), alloc, once, v);
+                       else if (axis_has_attributes && xn.attribute() && xn.parent())
+                               step_fill(ns, xn.attribute().internal_object(), xn.parent().internal_object(), alloc, once, v);
+               }
+
+               template <class T> xpath_node_set_raw step_do(const xpath_context& c, const xpath_stack& stack, nodeset_eval_t eval, T v)
+               {
+                       const axis_t axis = T::axis;
+                       const bool axis_reverse = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_preceding || axis == axis_preceding_sibling);
+                       const xpath_node_set::type_t axis_type = axis_reverse ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted;
+
+                       bool once =
+                               (axis == axis_attribute && _test == nodetest_name) ||
+                               (!_right && eval_once(axis_type, eval)) ||
+                               (_right && !_right->_next && _right->_test == predicate_constant_one);
+
+                       xpath_node_set_raw ns;
+                       ns.set_type(axis_type);
+
+                       if (_left)
+                       {
+                               xpath_node_set_raw s = _left->eval_node_set(c, stack, nodeset_eval_all);
+
+                               // self axis preserves the original order
+                               if (axis == axis_self) ns.set_type(s.type());
+
+                               for (const xpath_node* it = s.begin(); it != s.end(); ++it)
+                               {
+                                       size_t size = ns.size();
+
+                                       // in general, all axes generate elements in a particular order, but there is no order guarantee if axis is applied to two nodes
+                                       if (axis != axis_self && size != 0) ns.set_type(xpath_node_set::type_unsorted);
+                                       
+                                       step_fill(ns, *it, stack.result, once, v);
+                                       if (_right) apply_predicates(ns, size, stack, eval);
+                               }
+                       }
+                       else
+                       {
+                               step_fill(ns, c.n, stack.result, once, v);
+                               if (_right) apply_predicates(ns, 0, stack, eval);
+                       }
+
+                       // child, attribute and self axes always generate unique set of nodes
+                       // for other axis, if the set stayed sorted, it stayed unique because the traversal algorithms do not visit the same node twice
+                       if (axis != axis_child && axis != axis_attribute && axis != axis_self && ns.type() == xpath_node_set::type_unsorted)
+                               ns.remove_duplicates();
+
+                       return ns;
+               }
+               
+       public:
+               xpath_ast_node(ast_type_t type, xpath_value_type rettype_, const char_t* value):
+                       _type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0)
+               {
+                       assert(type == ast_string_constant);
+                       _data.string = value;
+               }
+
+               xpath_ast_node(ast_type_t type, xpath_value_type rettype_, double value):
+                       _type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0)
+               {
+                       assert(type == ast_number_constant);
+                       _data.number = value;
+               }
+               
+               xpath_ast_node(ast_type_t type, xpath_value_type rettype_, xpath_variable* value):
+                       _type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0)
+               {
+                       assert(type == ast_variable);
+                       _data.variable = value;
+               }
+               
+               xpath_ast_node(ast_type_t type, xpath_value_type rettype_, xpath_ast_node* left = 0, xpath_ast_node* right = 0):
+                       _type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(left), _right(right), _next(0)
+               {
+               }
+
+               xpath_ast_node(ast_type_t type, xpath_ast_node* left, axis_t axis, nodetest_t test, const char_t* contents):
+                       _type(static_cast<char>(type)), _rettype(xpath_type_node_set), _axis(static_cast<char>(axis)), _test(static_cast<char>(test)), _left(left), _right(0), _next(0)
+               {
+                       assert(type == ast_step);
+                       _data.nodetest = contents;
+               }
+
+               xpath_ast_node(ast_type_t type, xpath_ast_node* left, xpath_ast_node* right, predicate_t test):
+                       _type(static_cast<char>(type)), _rettype(xpath_type_node_set), _axis(0), _test(static_cast<char>(test)), _left(left), _right(right), _next(0)
+               {
+                       assert(type == ast_filter || type == ast_predicate);
+               }
+
+               void set_next(xpath_ast_node* value)
+               {
+                       _next = value;
+               }
+
+               void set_right(xpath_ast_node* value)
+               {
+                       _right = value;
+               }
+
+               bool eval_boolean(const xpath_context& c, const xpath_stack& stack)
+               {
+                       switch (_type)
+                       {
+                       case ast_op_or:
+                               return _left->eval_boolean(c, stack) || _right->eval_boolean(c, stack);
+                               
+                       case ast_op_and:
+                               return _left->eval_boolean(c, stack) && _right->eval_boolean(c, stack);
+                               
+                       case ast_op_equal:
+                               return compare_eq(_left, _right, c, stack, equal_to());
+
+                       case ast_op_not_equal:
+                               return compare_eq(_left, _right, c, stack, not_equal_to());
+       
+                       case ast_op_less:
+                               return compare_rel(_left, _right, c, stack, less());
+                       
+                       case ast_op_greater:
+                               return compare_rel(_right, _left, c, stack, less());
+
+                       case ast_op_less_or_equal:
+                               return compare_rel(_left, _right, c, stack, less_equal());
+                       
+                       case ast_op_greater_or_equal:
+                               return compare_rel(_right, _left, c, stack, less_equal());
+
+                       case ast_func_starts_with:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_string lr = _left->eval_string(c, stack);
+                               xpath_string rr = _right->eval_string(c, stack);
+
+                               return starts_with(lr.c_str(), rr.c_str());
+                       }
+
+                       case ast_func_contains:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_string lr = _left->eval_string(c, stack);
+                               xpath_string rr = _right->eval_string(c, stack);
+
+                               return find_substring(lr.c_str(), rr.c_str()) != 0;
+                       }
+
+                       case ast_func_boolean:
+                               return _left->eval_boolean(c, stack);
+                               
+                       case ast_func_not:
+                               return !_left->eval_boolean(c, stack);
+                               
+                       case ast_func_true:
+                               return true;
+                               
+                       case ast_func_false:
+                               return false;
+
+                       case ast_func_lang:
+                       {
+                               if (c.n.attribute()) return false;
+                               
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_string lang = _left->eval_string(c, stack);
+                               
+                               for (xml_node n = c.n.node(); n; n = n.parent())
+                               {
+                                       xml_attribute a = n.attribute(PUGIXML_TEXT("xml:lang"));
+                                       
+                                       if (a)
+                                       {
+                                               const char_t* value = a.value();
+                                               
+                                               // strnicmp / strncasecmp is not portable
+                                               for (const char_t* lit = lang.c_str(); *lit; ++lit)
+                                               {
+                                                       if (tolower_ascii(*lit) != tolower_ascii(*value)) return false;
+                                                       ++value;
+                                               }
+                                               
+                                               return *value == 0 || *value == '-';
+                                       }
+                               }
+                               
+                               return false;
+                       }
+
+                       case ast_opt_compare_attribute:
+                       {
+                               const char_t* value = (_right->_type == ast_string_constant) ? _right->_data.string : _right->_data.variable->get_string();
+
+                               xml_attribute attr = c.n.node().attribute(_left->_data.nodetest);
+
+                               return attr && strequal(attr.value(), value) && is_xpath_attribute(attr.name());
+                       }
+
+                       case ast_variable:
+                       {
+                               assert(_rettype == _data.variable->type());
+
+                               if (_rettype == xpath_type_boolean)
+                                       return _data.variable->get_boolean();
+
+                               // fallthrough to type conversion
+                       }
+
+                       default:
+                       {
+                               switch (_rettype)
+                               {
+                               case xpath_type_number:
+                                       return convert_number_to_boolean(eval_number(c, stack));
+                                       
+                               case xpath_type_string:
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       return !eval_string(c, stack).empty();
+                               }
+                                       
+                               case xpath_type_node_set:                               
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       return !eval_node_set(c, stack, nodeset_eval_any).empty();
+                               }
+
+                               default:
+                                       assert(!"Wrong expression for return type boolean");
+                                       return false;
+                               }
+                       }
+                       }
+               }
+
+               double eval_number(const xpath_context& c, const xpath_stack& stack)
+               {
+                       switch (_type)
+                       {
+                       case ast_op_add:
+                               return _left->eval_number(c, stack) + _right->eval_number(c, stack);
+                               
+                       case ast_op_subtract:
+                               return _left->eval_number(c, stack) - _right->eval_number(c, stack);
+
+                       case ast_op_multiply:
+                               return _left->eval_number(c, stack) * _right->eval_number(c, stack);
+
+                       case ast_op_divide:
+                               return _left->eval_number(c, stack) / _right->eval_number(c, stack);
+
+                       case ast_op_mod:
+                               return fmod(_left->eval_number(c, stack), _right->eval_number(c, stack));
+
+                       case ast_op_negate:
+                               return -_left->eval_number(c, stack);
+
+                       case ast_number_constant:
+                               return _data.number;
+
+                       case ast_func_last:
+                               return static_cast<double>(c.size);
+                       
+                       case ast_func_position:
+                               return static_cast<double>(c.position);
+
+                       case ast_func_count:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               return static_cast<double>(_left->eval_node_set(c, stack, nodeset_eval_all).size());
+                       }
+                       
+                       case ast_func_string_length_0:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               return static_cast<double>(string_value(c.n, stack.result).length());
+                       }
+                       
+                       case ast_func_string_length_1:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               return static_cast<double>(_left->eval_string(c, stack).length());
+                       }
+                       
+                       case ast_func_number_0:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               return convert_string_to_number(string_value(c.n, stack.result).c_str());
+                       }
+                       
+                       case ast_func_number_1:
+                               return _left->eval_number(c, stack);
+
+                       case ast_func_sum:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               double r = 0;
+                               
+                               xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_all);
+                               
+                               for (const xpath_node* it = ns.begin(); it != ns.end(); ++it)
+                               {
+                                       xpath_allocator_capture cri(stack.result);
+
+                                       r += convert_string_to_number(string_value(*it, stack.result).c_str());
+                               }
+                       
+                               return r;
+                       }
+
+                       case ast_func_floor:
+                       {
+                               double r = _left->eval_number(c, stack);
+                               
+                               return r == r ? floor(r) : r;
+                       }
+
+                       case ast_func_ceiling:
+                       {
+                               double r = _left->eval_number(c, stack);
+                               
+                               return r == r ? ceil(r) : r;
+                       }
+
+                       case ast_func_round:
+                               return round_nearest_nzero(_left->eval_number(c, stack));
+                       
+                       case ast_variable:
+                       {
+                               assert(_rettype == _data.variable->type());
+
+                               if (_rettype == xpath_type_number)
+                                       return _data.variable->get_number();
+
+                               // fallthrough to type conversion
+                       }
+
+                       default:
+                       {
+                               switch (_rettype)
+                               {
+                               case xpath_type_boolean:
+                                       return eval_boolean(c, stack) ? 1 : 0;
+                                       
+                               case xpath_type_string:
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       return convert_string_to_number(eval_string(c, stack).c_str());
+                               }
+                                       
+                               case xpath_type_node_set:
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       return convert_string_to_number(eval_string(c, stack).c_str());
+                               }
+                                       
+                               default:
+                                       assert(!"Wrong expression for return type number");
+                                       return 0;
+                               }
+                               
+                       }
+                       }
+               }
+               
+               xpath_string eval_string_concat(const xpath_context& c, const xpath_stack& stack)
+               {
+                       assert(_type == ast_func_concat);
+
+                       xpath_allocator_capture ct(stack.temp);
+
+                       // count the string number
+                       size_t count = 1;
+                       for (xpath_ast_node* nc = _right; nc; nc = nc->_next) count++;
+
+                       // gather all strings
+                       xpath_string static_buffer[4];
+                       xpath_string* buffer = static_buffer;
+
+                       // allocate on-heap for large concats
+                       if (count > sizeof(static_buffer) / sizeof(static_buffer[0]))
+                       {
+                               buffer = static_cast<xpath_string*>(stack.temp->allocate(count * sizeof(xpath_string)));
+                               assert(buffer);
+                       }
+
+                       // evaluate all strings to temporary stack
+                       xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                       buffer[0] = _left->eval_string(c, swapped_stack);
+
+                       size_t pos = 1;
+                       for (xpath_ast_node* n = _right; n; n = n->_next, ++pos) buffer[pos] = n->eval_string(c, swapped_stack);
+                       assert(pos == count);
+
+                       // get total length
+                       size_t length = 0;
+                       for (size_t i = 0; i < count; ++i) length += buffer[i].length();
+
+                       // create final string
+                       char_t* result = static_cast<char_t*>(stack.result->allocate((length + 1) * sizeof(char_t)));
+                       assert(result);
+
+                       char_t* ri = result;
+
+                       for (size_t j = 0; j < count; ++j)
+                               for (const char_t* bi = buffer[j].c_str(); *bi; ++bi)
+                                       *ri++ = *bi;
+
+                       *ri = 0;
+
+                       return xpath_string::from_heap_preallocated(result, ri);
+               }
+
+               xpath_string eval_string(const xpath_context& c, const xpath_stack& stack)
+               {
+                       switch (_type)
+                       {
+                       case ast_string_constant:
+                               return xpath_string::from_const(_data.string);
+                       
+                       case ast_func_local_name_0:
+                       {
+                               xpath_node na = c.n;
+                               
+                               return xpath_string::from_const(local_name(na));
+                       }
+
+                       case ast_func_local_name_1:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first);
+                               xpath_node na = ns.first();
+                               
+                               return xpath_string::from_const(local_name(na));
+                       }
+
+                       case ast_func_name_0:
+                       {
+                               xpath_node na = c.n;
+                               
+                               return xpath_string::from_const(qualified_name(na));
+                       }
+
+                       case ast_func_name_1:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first);
+                               xpath_node na = ns.first();
+                               
+                               return xpath_string::from_const(qualified_name(na));
+                       }
+
+                       case ast_func_namespace_uri_0:
+                       {
+                               xpath_node na = c.n;
+                               
+                               return xpath_string::from_const(namespace_uri(na));
+                       }
+
+                       case ast_func_namespace_uri_1:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first);
+                               xpath_node na = ns.first();
+                               
+                               return xpath_string::from_const(namespace_uri(na));
+                       }
+
+                       case ast_func_string_0:
+                               return string_value(c.n, stack.result);
+
+                       case ast_func_string_1:
+                               return _left->eval_string(c, stack);
+
+                       case ast_func_concat:
+                               return eval_string_concat(c, stack);
+
+                       case ast_func_substring_before:
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+
+                               xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                               xpath_string s = _left->eval_string(c, swapped_stack);
+                               xpath_string p = _right->eval_string(c, swapped_stack);
+
+                               const char_t* pos = find_substring(s.c_str(), p.c_str());
+                               
+                               return pos ? xpath_string::from_heap(s.c_str(), pos, stack.result) : xpath_string();
+                       }
+                       
+                       case ast_func_substring_after:
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+
+                               xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                               xpath_string s = _left->eval_string(c, swapped_stack);
+                               xpath_string p = _right->eval_string(c, swapped_stack);
+                               
+                               const char_t* pos = find_substring(s.c_str(), p.c_str());
+                               if (!pos) return xpath_string();
+
+                               const char_t* rbegin = pos + p.length();
+                               const char_t* rend = s.c_str() + s.length();
+
+                               return s.uses_heap() ? xpath_string::from_heap(rbegin, rend, stack.result) : xpath_string::from_const(rbegin);
+                       }
+
+                       case ast_func_substring_2:
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+
+                               xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                               xpath_string s = _left->eval_string(c, swapped_stack);
+                               size_t s_length = s.length();
+
+                               double first = round_nearest(_right->eval_number(c, stack));
+                               
+                               if (is_nan(first)) return xpath_string(); // NaN
+                               else if (first >= s_length + 1) return xpath_string();
+                               
+                               size_t pos = first < 1 ? 1 : static_cast<size_t>(first);
+                               assert(1 <= pos && pos <= s_length + 1);
+
+                               const char_t* rbegin = s.c_str() + (pos - 1);
+                               const char_t* rend = s.c_str() + s.length();
+                               
+                               return s.uses_heap() ? xpath_string::from_heap(rbegin, rend, stack.result) : xpath_string::from_const(rbegin);
+                       }
+                       
+                       case ast_func_substring_3:
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+
+                               xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                               xpath_string s = _left->eval_string(c, swapped_stack);
+                               size_t s_length = s.length();
+
+                               double first = round_nearest(_right->eval_number(c, stack));
+                               double last = first + round_nearest(_right->_next->eval_number(c, stack));
+                               
+                               if (is_nan(first) || is_nan(last)) return xpath_string();
+                               else if (first >= s_length + 1) return xpath_string();
+                               else if (first >= last) return xpath_string();
+                               else if (last < 1) return xpath_string();
+                               
+                               size_t pos = first < 1 ? 1 : static_cast<size_t>(first);
+                               size_t end = last >= s_length + 1 ? s_length + 1 : static_cast<size_t>(last);
+
+                               assert(1 <= pos && pos <= end && end <= s_length + 1);
+                               const char_t* rbegin = s.c_str() + (pos - 1);
+                               const char_t* rend = s.c_str() + (end - 1);
+
+                               return (end == s_length + 1 && !s.uses_heap()) ? xpath_string::from_const(rbegin) : xpath_string::from_heap(rbegin, rend, stack.result);
+                       }
+
+                       case ast_func_normalize_space_0:
+                       {
+                               xpath_string s = string_value(c.n, stack.result);
+
+                               normalize_space(s.data(stack.result));
+
+                               return s;
+                       }
+
+                       case ast_func_normalize_space_1:
+                       {
+                               xpath_string s = _left->eval_string(c, stack);
+
+                               normalize_space(s.data(stack.result));
+                       
+                               return s;
+                       }
+
+                       case ast_func_translate:
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+
+                               xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                               xpath_string s = _left->eval_string(c, stack);
+                               xpath_string from = _right->eval_string(c, swapped_stack);
+                               xpath_string to = _right->_next->eval_string(c, swapped_stack);
+
+                               translate(s.data(stack.result), from.c_str(), to.c_str(), to.length());
+
+                               return s;
+                       }
+
+                       case ast_opt_translate_table:
+                       {
+                               xpath_string s = _left->eval_string(c, stack);
+
+                               translate_table(s.data(stack.result), _data.table);
+
+                               return s;
+                       }
+
+                       case ast_variable:
+                       {
+                               assert(_rettype == _data.variable->type());
+
+                               if (_rettype == xpath_type_string)
+                                       return xpath_string::from_const(_data.variable->get_string());
+
+                               // fallthrough to type conversion
+                       }
+
+                       default:
+                       {
+                               switch (_rettype)
+                               {
+                               case xpath_type_boolean:
+                                       return xpath_string::from_const(eval_boolean(c, stack) ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false"));
+                                       
+                               case xpath_type_number:
+                                       return convert_number_to_string(eval_number(c, stack), stack.result);
+                                       
+                               case xpath_type_node_set:
+                               {
+                                       xpath_allocator_capture cr(stack.temp);
+
+                                       xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                                       xpath_node_set_raw ns = eval_node_set(c, swapped_stack, nodeset_eval_first);
+                                       return ns.empty() ? xpath_string() : string_value(ns.first(), stack.result);
+                               }
+                               
+                               default:
+                                       assert(!"Wrong expression for return type string");
+                                       return xpath_string();
+                               }
+                       }
+                       }
+               }
+
+               xpath_node_set_raw eval_node_set(const xpath_context& c, const xpath_stack& stack, nodeset_eval_t eval)
+               {
+                       switch (_type)
+                       {
+                       case ast_op_union:
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+
+                               xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                               xpath_node_set_raw ls = _left->eval_node_set(c, swapped_stack, eval);
+                               xpath_node_set_raw rs = _right->eval_node_set(c, stack, eval);
+
+                               // we can optimize merging two sorted sets, but this is a very rare operation, so don't bother
+                               rs.set_type(xpath_node_set::type_unsorted);
+
+                               rs.append(ls.begin(), ls.end(), stack.result);
+                               rs.remove_duplicates();
+
+                               return rs;
+                       }
+
+                       case ast_filter:
+                       {
+                               xpath_node_set_raw set = _left->eval_node_set(c, stack, _test == predicate_constant_one ? nodeset_eval_first : nodeset_eval_all);
+
+                               // either expression is a number or it contains position() call; sort by document order
+                               if (_test != predicate_posinv) set.sort_do();
+
+                               bool once = eval_once(set.type(), eval);
+
+                               apply_predicate(set, 0, stack, once);
+                       
+                               return set;
+                       }
+                       
+                       case ast_func_id:
+                               return xpath_node_set_raw();
+                       
+                       case ast_step:
+                       {
+                               switch (_axis)
+                               {
+                               case axis_ancestor:
+                                       return step_do(c, stack, eval, axis_to_type<axis_ancestor>());
+                                       
+                               case axis_ancestor_or_self:
+                                       return step_do(c, stack, eval, axis_to_type<axis_ancestor_or_self>());
+
+                               case axis_attribute:
+                                       return step_do(c, stack, eval, axis_to_type<axis_attribute>());
+
+                               case axis_child:
+                                       return step_do(c, stack, eval, axis_to_type<axis_child>());
+                               
+                               case axis_descendant:
+                                       return step_do(c, stack, eval, axis_to_type<axis_descendant>());
+
+                               case axis_descendant_or_self:
+                                       return step_do(c, stack, eval, axis_to_type<axis_descendant_or_self>());
+
+                               case axis_following:
+                                       return step_do(c, stack, eval, axis_to_type<axis_following>());
+                               
+                               case axis_following_sibling:
+                                       return step_do(c, stack, eval, axis_to_type<axis_following_sibling>());
+                               
+                               case axis_namespace:
+                                       // namespaced axis is not supported
+                                       return xpath_node_set_raw();
+                               
+                               case axis_parent:
+                                       return step_do(c, stack, eval, axis_to_type<axis_parent>());
+                               
+                               case axis_preceding:
+                                       return step_do(c, stack, eval, axis_to_type<axis_preceding>());
+
+                               case axis_preceding_sibling:
+                                       return step_do(c, stack, eval, axis_to_type<axis_preceding_sibling>());
+                               
+                               case axis_self:
+                                       return step_do(c, stack, eval, axis_to_type<axis_self>());
+
+                               default:
+                                       assert(!"Unknown axis");
+                                       return xpath_node_set_raw();
+                               }
+                       }
+
+                       case ast_step_root:
+                       {
+                               assert(!_right); // root step can't have any predicates
+
+                               xpath_node_set_raw ns;
+
+                               ns.set_type(xpath_node_set::type_sorted);
+
+                               if (c.n.node()) ns.push_back(c.n.node().root(), stack.result);
+                               else if (c.n.attribute()) ns.push_back(c.n.parent().root(), stack.result);
+
+                               return ns;
+                       }
+
+                       case ast_variable:
+                       {
+                               assert(_rettype == _data.variable->type());
+
+                               if (_rettype == xpath_type_node_set)
+                               {
+                                       const xpath_node_set& s = _data.variable->get_node_set();
+
+                                       xpath_node_set_raw ns;
+
+                                       ns.set_type(s.type());
+                                       ns.append(s.begin(), s.end(), stack.result);
+
+                                       return ns;
+                               }
+
+                               // fallthrough to type conversion
+                       }
+
+                       default:
+                               assert(!"Wrong expression for return type node set");
+                               return xpath_node_set_raw();
+                       }
+               }
+
+               void optimize(xpath_allocator* alloc)
+               {
+                       if (_left) _left->optimize(alloc);
+                       if (_right) _right->optimize(alloc);
+                       if (_next) _next->optimize(alloc);
+
+                       // Rewrite [position()=expr] with [expr]
+                       // Note that this step has to go before classification to recognize [position()=1]
+                       if ((_type == ast_filter || _type == ast_predicate) &&
+                               _right->_type == ast_op_equal && _right->_left->_type == ast_func_position && _right->_right->_rettype == xpath_type_number)
+                       {
+                               _right = _right->_right;
+                       }
+
+                       // Classify filter/predicate ops to perform various optimizations during evaluation
+                       if (_type == ast_filter || _type == ast_predicate)
+                       {
+                               assert(_test == predicate_default);
+
+                               if (_right->_type == ast_number_constant && _right->_data.number == 1.0)
+                                       _test = predicate_constant_one;
+                               else if (_right->_rettype == xpath_type_number && (_right->_type == ast_number_constant || _right->_type == ast_variable || _right->_type == ast_func_last))
+                                       _test = predicate_constant;
+                               else if (_right->_rettype != xpath_type_number && _right->is_posinv_expr())
+                                       _test = predicate_posinv;
+                       }
+
+                       // Rewrite descendant-or-self::node()/child::foo with descendant::foo
+                       // The former is a full form of //foo, the latter is much faster since it executes the node test immediately
+                       // Do a similar kind of rewrite for self/descendant/descendant-or-self axes
+                       // Note that we only rewrite positionally invariant steps (//foo[1] != /descendant::foo[1])
+                       if (_type == ast_step && (_axis == axis_child || _axis == axis_self || _axis == axis_descendant || _axis == axis_descendant_or_self) && _left &&
+                               _left->_type == ast_step && _left->_axis == axis_descendant_or_self && _left->_test == nodetest_type_node && !_left->_right &&
+                               is_posinv_step())
+                       {
+                               if (_axis == axis_child || _axis == axis_descendant)
+                                       _axis = axis_descendant;
+                               else
+                                       _axis = axis_descendant_or_self;
+
+                               _left = _left->_left;
+                       }
+
+                       // Use optimized lookup table implementation for translate() with constant arguments
+                       if (_type == ast_func_translate && _right->_type == ast_string_constant && _right->_next->_type == ast_string_constant)
+                       {
+                               unsigned char* table = translate_table_generate(alloc, _right->_data.string, _right->_next->_data.string);
+
+                               if (table)
+                               {
+                                       _type = ast_opt_translate_table;
+                                       _data.table = table;
+                               }
+                       }
+
+                       // Use optimized path for @attr = 'value' or @attr = $value
+                       if (_type == ast_op_equal &&
+                               _left->_type == ast_step && _left->_axis == axis_attribute && _left->_test == nodetest_name && !_left->_left && !_left->_right &&
+                               (_right->_type == ast_string_constant || (_right->_type == ast_variable && _right->_rettype == xpath_type_string)))
+                       {
+                               _type = ast_opt_compare_attribute;
+                       }
+               }
+               
+               bool is_posinv_expr() const
+               {
+                       switch (_type)
+                       {
+                       case ast_func_position:
+                       case ast_func_last:
+                               return false;
+
+                       case ast_string_constant:
+                       case ast_number_constant:
+                       case ast_variable:
+                               return true;
+
+                       case ast_step:
+                       case ast_step_root:
+                               return true;
+
+                       case ast_predicate:
+                       case ast_filter:
+                               return true;
+
+                       default:
+                               if (_left && !_left->is_posinv_expr()) return false;
+                               
+                               for (xpath_ast_node* n = _right; n; n = n->_next)
+                                       if (!n->is_posinv_expr()) return false;
+                                       
+                               return true;
+                       }
+               }
+
+               bool is_posinv_step() const
+               {
+                       assert(_type == ast_step);
+
+                       for (xpath_ast_node* n = _right; n; n = n->_next)
+                       {
+                               assert(n->_type == ast_predicate);
+
+                               if (n->_test != predicate_posinv)
+                                       return false;
+                       }
+
+                       return true;
+               }
+
+               xpath_value_type rettype() const
+               {
+                       return static_cast<xpath_value_type>(_rettype);
+               }
+       };
+
+       struct xpath_parser
+       {
+               xpath_allocator* _alloc;
+               xpath_lexer _lexer;
+
+               const char_t* _query;
+               xpath_variable_set* _variables;
+
+               xpath_parse_result* _result;
+
+               char_t _scratch[32];
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               jmp_buf _error_handler;
+       #endif
+
+               void throw_error(const char* message)
+               {
+                       _result->error = message;
+                       _result->offset = _lexer.current_pos() - _query;
+
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       longjmp(_error_handler, 1);
+               #else
+                       throw xpath_exception(*_result);
+               #endif
+               }
+
+               void throw_error_oom()
+               {
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       throw_error("Out of memory");
+               #else
+                       throw std::bad_alloc();
+               #endif
+               }
+
+               void* alloc_node()
+               {
+                       void* result = _alloc->allocate_nothrow(sizeof(xpath_ast_node));
+
+                       if (!result) throw_error_oom();
+
+                       return result;
+               }
+
+               const char_t* alloc_string(const xpath_lexer_string& value)
+               {
+                       if (value.begin)
+                       {
+                               size_t length = static_cast<size_t>(value.end - value.begin);
+
+                               char_t* c = static_cast<char_t*>(_alloc->allocate_nothrow((length + 1) * sizeof(char_t)));
+                               if (!c) throw_error_oom();
+                               assert(c); // workaround for clang static analysis
+
+                               memcpy(c, value.begin, length * sizeof(char_t));
+                               c[length] = 0;
+
+                               return c;
+                       }
+                       else return 0;
+               }
+
+               xpath_ast_node* parse_function_helper(ast_type_t type0, ast_type_t type1, size_t argc, xpath_ast_node* args[2])
+               {
+                       assert(argc <= 1);
+
+                       if (argc == 1 && args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
+
+                       return new (alloc_node()) xpath_ast_node(argc == 0 ? type0 : type1, xpath_type_string, args[0]);
+               }
+
+               xpath_ast_node* parse_function(const xpath_lexer_string& name, size_t argc, xpath_ast_node* args[2])
+               {
+                       switch (name.begin[0])
+                       {
+                       case 'b':
+                               if (name == PUGIXML_TEXT("boolean") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_boolean, xpath_type_boolean, args[0]);
+                                       
+                               break;
+                       
+                       case 'c':
+                               if (name == PUGIXML_TEXT("count") && argc == 1)
+                               {
+                                       if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
+                                       return new (alloc_node()) xpath_ast_node(ast_func_count, xpath_type_number, args[0]);
+                               }
+                               else if (name == PUGIXML_TEXT("contains") && argc == 2)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_contains, xpath_type_boolean, args[0], args[1]);
+                               else if (name == PUGIXML_TEXT("concat") && argc >= 2)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_concat, xpath_type_string, args[0], args[1]);
+                               else if (name == PUGIXML_TEXT("ceiling") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_ceiling, xpath_type_number, args[0]);
+                                       
+                               break;
+                       
+                       case 'f':
+                               if (name == PUGIXML_TEXT("false") && argc == 0)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_false, xpath_type_boolean);
+                               else if (name == PUGIXML_TEXT("floor") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_floor, xpath_type_number, args[0]);
+                                       
+                               break;
+                       
+                       case 'i':
+                               if (name == PUGIXML_TEXT("id") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_id, xpath_type_node_set, args[0]);
+                                       
+                               break;
+                       
+                       case 'l':
+                               if (name == PUGIXML_TEXT("last") && argc == 0)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_last, xpath_type_number);
+                               else if (name == PUGIXML_TEXT("lang") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_lang, xpath_type_boolean, args[0]);
+                               else if (name == PUGIXML_TEXT("local-name") && argc <= 1)
+                                       return parse_function_helper(ast_func_local_name_0, ast_func_local_name_1, argc, args);
+                       
+                               break;
+                       
+                       case 'n':
+                               if (name == PUGIXML_TEXT("name") && argc <= 1)
+                                       return parse_function_helper(ast_func_name_0, ast_func_name_1, argc, args);
+                               else if (name == PUGIXML_TEXT("namespace-uri") && argc <= 1)
+                                       return parse_function_helper(ast_func_namespace_uri_0, ast_func_namespace_uri_1, argc, args);
+                               else if (name == PUGIXML_TEXT("normalize-space") && argc <= 1)
+                                       return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_normalize_space_0 : ast_func_normalize_space_1, xpath_type_string, args[0], args[1]);
+                               else if (name == PUGIXML_TEXT("not") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_not, xpath_type_boolean, args[0]);
+                               else if (name == PUGIXML_TEXT("number") && argc <= 1)
+                                       return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_number_0 : ast_func_number_1, xpath_type_number, args[0]);
+                       
+                               break;
+                       
+                       case 'p':
+                               if (name == PUGIXML_TEXT("position") && argc == 0)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_position, xpath_type_number);
+                               
+                               break;
+                       
+                       case 'r':
+                               if (name == PUGIXML_TEXT("round") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_round, xpath_type_number, args[0]);
+
+                               break;
+                       
+                       case 's':
+                               if (name == PUGIXML_TEXT("string") && argc <= 1)
+                                       return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_0 : ast_func_string_1, xpath_type_string, args[0]);
+                               else if (name == PUGIXML_TEXT("string-length") && argc <= 1)
+                                       return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_length_0 : ast_func_string_length_1, xpath_type_number, args[0]);
+                               else if (name == PUGIXML_TEXT("starts-with") && argc == 2)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_starts_with, xpath_type_boolean, args[0], args[1]);
+                               else if (name == PUGIXML_TEXT("substring-before") && argc == 2)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_substring_before, xpath_type_string, args[0], args[1]);
+                               else if (name == PUGIXML_TEXT("substring-after") && argc == 2)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_substring_after, xpath_type_string, args[0], args[1]);
+                               else if (name == PUGIXML_TEXT("substring") && (argc == 2 || argc == 3))
+                                       return new (alloc_node()) xpath_ast_node(argc == 2 ? ast_func_substring_2 : ast_func_substring_3, xpath_type_string, args[0], args[1]);
+                               else if (name == PUGIXML_TEXT("sum") && argc == 1)
+                               {
+                                       if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
+                                       return new (alloc_node()) xpath_ast_node(ast_func_sum, xpath_type_number, args[0]);
+                               }
+
+                               break;
+                       
+                       case 't':
+                               if (name == PUGIXML_TEXT("translate") && argc == 3)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_translate, xpath_type_string, args[0], args[1]);
+                               else if (name == PUGIXML_TEXT("true") && argc == 0)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_true, xpath_type_boolean);
+                                       
+                               break;
+
+                       default:
+                               break;
+                       }
+
+                       throw_error("Unrecognized function or wrong parameter count");
+
+                       return 0;
+               }
+
+               axis_t parse_axis_name(const xpath_lexer_string& name, bool& specified)
+               {
+                       specified = true;
+
+                       switch (name.begin[0])
+                       {
+                       case 'a':
+                               if (name == PUGIXML_TEXT("ancestor"))
+                                       return axis_ancestor;
+                               else if (name == PUGIXML_TEXT("ancestor-or-self"))
+                                       return axis_ancestor_or_self;
+                               else if (name == PUGIXML_TEXT("attribute"))
+                                       return axis_attribute;
+                               
+                               break;
+                       
+                       case 'c':
+                               if (name == PUGIXML_TEXT("child"))
+                                       return axis_child;
+                               
+                               break;
+                       
+                       case 'd':
+                               if (name == PUGIXML_TEXT("descendant"))
+                                       return axis_descendant;
+                               else if (name == PUGIXML_TEXT("descendant-or-self"))
+                                       return axis_descendant_or_self;
+                               
+                               break;
+                       
+                       case 'f':
+                               if (name == PUGIXML_TEXT("following"))
+                                       return axis_following;
+                               else if (name == PUGIXML_TEXT("following-sibling"))
+                                       return axis_following_sibling;
+                               
+                               break;
+                       
+                       case 'n':
+                               if (name == PUGIXML_TEXT("namespace"))
+                                       return axis_namespace;
+                               
+                               break;
+                       
+                       case 'p':
+                               if (name == PUGIXML_TEXT("parent"))
+                                       return axis_parent;
+                               else if (name == PUGIXML_TEXT("preceding"))
+                                       return axis_preceding;
+                               else if (name == PUGIXML_TEXT("preceding-sibling"))
+                                       return axis_preceding_sibling;
+                               
+                               break;
+                       
+                       case 's':
+                               if (name == PUGIXML_TEXT("self"))
+                                       return axis_self;
+                               
+                               break;
+
+                       default:
+                               break;
+                       }
+
+                       specified = false;
+                       return axis_child;
+               }
+
+               nodetest_t parse_node_test_type(const xpath_lexer_string& name)
+               {
+                       switch (name.begin[0])
+                       {
+                       case 'c':
+                               if (name == PUGIXML_TEXT("comment"))
+                                       return nodetest_type_comment;
+
+                               break;
+
+                       case 'n':
+                               if (name == PUGIXML_TEXT("node"))
+                                       return nodetest_type_node;
+
+                               break;
+
+                       case 'p':
+                               if (name == PUGIXML_TEXT("processing-instruction"))
+                                       return nodetest_type_pi;
+
+                               break;
+
+                       case 't':
+                               if (name == PUGIXML_TEXT("text"))
+                                       return nodetest_type_text;
+
+                               break;
+                       
+                       default:
+                               break;
+                       }
+
+                       return nodetest_none;
+               }
+
+               // PrimaryExpr ::= VariableReference | '(' Expr ')' | Literal | Number | FunctionCall
+               xpath_ast_node* parse_primary_expression()
+               {
+                       switch (_lexer.current())
+                       {
+                       case lex_var_ref:
+                       {
+                               xpath_lexer_string name = _lexer.contents();
+
+                               if (!_variables)
+                                       throw_error("Unknown variable: variable set is not provided");
+
+                               xpath_variable* var = get_variable_scratch(_scratch, _variables, name.begin, name.end);
+
+                               if (!var)
+                                       throw_error("Unknown variable: variable set does not contain the given name");
+
+                               _lexer.next();
+
+                               return new (alloc_node()) xpath_ast_node(ast_variable, var->type(), var);
+                       }
+
+                       case lex_open_brace:
+                       {
+                               _lexer.next();
+
+                               xpath_ast_node* n = parse_expression();
+
+                               if (_lexer.current() != lex_close_brace)
+                                       throw_error("Unmatched braces");
+
+                               _lexer.next();
+
+                               return n;
+                       }
+
+                       case lex_quoted_string:
+                       {
+                               const char_t* value = alloc_string(_lexer.contents());
+
+                               xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_string_constant, xpath_type_string, value);
+                               _lexer.next();
+
+                               return n;
+                       }
+
+                       case lex_number:
+                       {
+                               double value = 0;
+
+                               if (!convert_string_to_number_scratch(_scratch, _lexer.contents().begin, _lexer.contents().end, &value))
+                                       throw_error_oom();
+
+                               xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_number_constant, xpath_type_number, value);
+                               _lexer.next();
+
+                               return n;
+                       }
+
+                       case lex_string:
+                       {
+                               xpath_ast_node* args[2] = {0};
+                               size_t argc = 0;
+                               
+                               xpath_lexer_string function = _lexer.contents();
+                               _lexer.next();
+                               
+                               xpath_ast_node* last_arg = 0;
+                               
+                               if (_lexer.current() != lex_open_brace)
+                                       throw_error("Unrecognized function call");
+                               _lexer.next();
+
+                               if (_lexer.current() != lex_close_brace)
+                                       args[argc++] = parse_expression();
+
+                               while (_lexer.current() != lex_close_brace)
+                               {
+                                       if (_lexer.current() != lex_comma)
+                                               throw_error("No comma between function arguments");
+                                       _lexer.next();
+                                       
+                                       xpath_ast_node* n = parse_expression();
+                                       
+                                       if (argc < 2) args[argc] = n;
+                                       else last_arg->set_next(n);
+
+                                       argc++;
+                                       last_arg = n;
+                               }
+                               
+                               _lexer.next();
+
+                               return parse_function(function, argc, args);
+                       }
+
+                       default:
+                               throw_error("Unrecognizable primary expression");
+
+                               return 0;
+                       }
+               }
+               
+               // FilterExpr ::= PrimaryExpr | FilterExpr Predicate
+               // Predicate ::= '[' PredicateExpr ']'
+               // PredicateExpr ::= Expr
+               xpath_ast_node* parse_filter_expression()
+               {
+                       xpath_ast_node* n = parse_primary_expression();
+
+                       while (_lexer.current() == lex_open_square_brace)
+                       {
+                               _lexer.next();
+
+                               xpath_ast_node* expr = parse_expression();
+
+                               if (n->rettype() != xpath_type_node_set) throw_error("Predicate has to be applied to node set");
+
+                               n = new (alloc_node()) xpath_ast_node(ast_filter, n, expr, predicate_default);
+
+                               if (_lexer.current() != lex_close_square_brace)
+                                       throw_error("Unmatched square brace");
+                       
+                               _lexer.next();
+                       }
+                       
+                       return n;
+               }
+               
+               // Step ::= AxisSpecifier NodeTest Predicate* | AbbreviatedStep
+               // AxisSpecifier ::= AxisName '::' | '@'?
+               // NodeTest ::= NameTest | NodeType '(' ')' | 'processing-instruction' '(' Literal ')'
+               // NameTest ::= '*' | NCName ':' '*' | QName
+               // AbbreviatedStep ::= '.' | '..'
+               xpath_ast_node* parse_step(xpath_ast_node* set)
+               {
+                       if (set && set->rettype() != xpath_type_node_set)
+                               throw_error("Step has to be applied to node set");
+
+                       bool axis_specified = false;
+                       axis_t axis = axis_child; // implied child axis
+
+                       if (_lexer.current() == lex_axis_attribute)
+                       {
+                               axis = axis_attribute;
+                               axis_specified = true;
+                               
+                               _lexer.next();
+                       }
+                       else if (_lexer.current() == lex_dot)
+                       {
+                               _lexer.next();
+                               
+                               return new (alloc_node()) xpath_ast_node(ast_step, set, axis_self, nodetest_type_node, 0);
+                       }
+                       else if (_lexer.current() == lex_double_dot)
+                       {
+                               _lexer.next();
+                               
+                               return new (alloc_node()) xpath_ast_node(ast_step, set, axis_parent, nodetest_type_node, 0);
+                       }
+               
+                       nodetest_t nt_type = nodetest_none;
+                       xpath_lexer_string nt_name;
+                       
+                       if (_lexer.current() == lex_string)
+                       {
+                               // node name test
+                               nt_name = _lexer.contents();
+                               _lexer.next();
+
+                               // was it an axis name?
+                               if (_lexer.current() == lex_double_colon)
+                               {
+                                       // parse axis name
+                                       if (axis_specified) throw_error("Two axis specifiers in one step");
+
+                                       axis = parse_axis_name(nt_name, axis_specified);
+
+                                       if (!axis_specified) throw_error("Unknown axis");
+
+                                       // read actual node test
+                                       _lexer.next();
+
+                                       if (_lexer.current() == lex_multiply)
+                                       {
+                                               nt_type = nodetest_all;
+                                               nt_name = xpath_lexer_string();
+                                               _lexer.next();
+                                       }
+                                       else if (_lexer.current() == lex_string)
+                                       {
+                                               nt_name = _lexer.contents();
+                                               _lexer.next();
+                                       }
+                                       else throw_error("Unrecognized node test");
+                               }
+                               
+                               if (nt_type == nodetest_none)
+                               {
+                                       // node type test or processing-instruction
+                                       if (_lexer.current() == lex_open_brace)
+                                       {
+                                               _lexer.next();
+                                               
+                                               if (_lexer.current() == lex_close_brace)
+                                               {
+                                                       _lexer.next();
+
+                                                       nt_type = parse_node_test_type(nt_name);
+
+                                                       if (nt_type == nodetest_none) throw_error("Unrecognized node type");
+                                                       
+                                                       nt_name = xpath_lexer_string();
+                                               }
+                                               else if (nt_name == PUGIXML_TEXT("processing-instruction"))
+                                               {
+                                                       if (_lexer.current() != lex_quoted_string)
+                                                               throw_error("Only literals are allowed as arguments to processing-instruction()");
+                                               
+                                                       nt_type = nodetest_pi;
+                                                       nt_name = _lexer.contents();
+                                                       _lexer.next();
+                                                       
+                                                       if (_lexer.current() != lex_close_brace)
+                                                               throw_error("Unmatched brace near processing-instruction()");
+                                                       _lexer.next();
+                                               }
+                                               else
+                                                       throw_error("Unmatched brace near node type test");
+
+                                       }
+                                       // QName or NCName:*
+                                       else
+                                       {
+                                               if (nt_name.end - nt_name.begin > 2 && nt_name.end[-2] == ':' && nt_name.end[-1] == '*') // NCName:*
+                                               {
+                                                       nt_name.end--; // erase *
+                                                       
+                                                       nt_type = nodetest_all_in_namespace;
+                                               }
+                                               else nt_type = nodetest_name;
+                                       }
+                               }
+                       }
+                       else if (_lexer.current() == lex_multiply)
+                       {
+                               nt_type = nodetest_all;
+                               _lexer.next();
+                       }
+                       else throw_error("Unrecognized node test");
+                       
+                       xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step, set, axis, nt_type, alloc_string(nt_name));
+                       
+                       xpath_ast_node* last = 0;
+                       
+                       while (_lexer.current() == lex_open_square_brace)
+                       {
+                               _lexer.next();
+                               
+                               xpath_ast_node* expr = parse_expression();
+
+                               xpath_ast_node* pred = new (alloc_node()) xpath_ast_node(ast_predicate, 0, expr, predicate_default);
+                               
+                               if (_lexer.current() != lex_close_square_brace)
+                                       throw_error("Unmatched square brace");
+                               _lexer.next();
+                               
+                               if (last) last->set_next(pred);
+                               else n->set_right(pred);
+                               
+                               last = pred;
+                       }
+
+                       return n;
+               }
+               
+               // RelativeLocationPath ::= Step | RelativeLocationPath '/' Step | RelativeLocationPath '//' Step
+               xpath_ast_node* parse_relative_location_path(xpath_ast_node* set)
+               {
+                       xpath_ast_node* n = parse_step(set);
+                       
+                       while (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash)
+                       {
+                               lexeme_t l = _lexer.current();
+                               _lexer.next();
+
+                               if (l == lex_double_slash)
+                                       n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+                               
+                               n = parse_step(n);
+                       }
+                       
+                       return n;
+               }
+               
+               // LocationPath ::= RelativeLocationPath | AbsoluteLocationPath
+               // AbsoluteLocationPath ::= '/' RelativeLocationPath? | '//' RelativeLocationPath
+               xpath_ast_node* parse_location_path()
+               {
+                       if (_lexer.current() == lex_slash)
+                       {
+                               _lexer.next();
+                               
+                               xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set);
+
+                               // relative location path can start from axis_attribute, dot, double_dot, multiply and string lexemes; any other lexeme means standalone root path
+                               lexeme_t l = _lexer.current();
+
+                               if (l == lex_string || l == lex_axis_attribute || l == lex_dot || l == lex_double_dot || l == lex_multiply)
+                                       return parse_relative_location_path(n);
+                               else
+                                       return n;
+                       }
+                       else if (_lexer.current() == lex_double_slash)
+                       {
+                               _lexer.next();
+                               
+                               xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set);
+                               n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+                               
+                               return parse_relative_location_path(n);
+                       }
+
+                       // else clause moved outside of if because of bogus warning 'control may reach end of non-void function being inlined' in gcc 4.0.1
+                       return parse_relative_location_path(0);
+               }
+               
+               // PathExpr ::= LocationPath
+               //                              | FilterExpr
+               //                              | FilterExpr '/' RelativeLocationPath
+               //                              | FilterExpr '//' RelativeLocationPath
+               // UnionExpr ::= PathExpr | UnionExpr '|' PathExpr
+               // UnaryExpr ::= UnionExpr | '-' UnaryExpr
+               xpath_ast_node* parse_path_or_unary_expression()
+               {
+                       // Clarification.
+                       // PathExpr begins with either LocationPath or FilterExpr.
+                       // FilterExpr begins with PrimaryExpr
+                       // PrimaryExpr begins with '$' in case of it being a variable reference,
+                       // '(' in case of it being an expression, string literal, number constant or
+                       // function call.
+
+                       if (_lexer.current() == lex_var_ref || _lexer.current() == lex_open_brace || 
+                               _lexer.current() == lex_quoted_string || _lexer.current() == lex_number ||
+                               _lexer.current() == lex_string)
+                       {
+                               if (_lexer.current() == lex_string)
+                               {
+                                       // This is either a function call, or not - if not, we shall proceed with location path
+                                       const char_t* state = _lexer.state();
+                                       
+                                       while (PUGI__IS_CHARTYPE(*state, ct_space)) ++state;
+                                       
+                                       if (*state != '(') return parse_location_path();
+
+                                       // This looks like a function call; however this still can be a node-test. Check it.
+                                       if (parse_node_test_type(_lexer.contents()) != nodetest_none) return parse_location_path();
+                               }
+                               
+                               xpath_ast_node* n = parse_filter_expression();
+
+                               if (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash)
+                               {
+                                       lexeme_t l = _lexer.current();
+                                       _lexer.next();
+                                       
+                                       if (l == lex_double_slash)
+                                       {
+                                               if (n->rettype() != xpath_type_node_set) throw_error("Step has to be applied to node set");
+
+                                               n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+                                       }
+       
+                                       // select from location path
+                                       return parse_relative_location_path(n);
+                               }
+
+                               return n;
+                       }
+                       else if (_lexer.current() == lex_minus)
+                       {
+                               _lexer.next();
+
+                               // precedence 7+ - only parses union expressions
+                               xpath_ast_node* expr = parse_expression_rec(parse_path_or_unary_expression(), 7);
+
+                               return new (alloc_node()) xpath_ast_node(ast_op_negate, xpath_type_number, expr);
+                       }
+                       else
+                               return parse_location_path();
+               }
+
+               struct binary_op_t
+               {
+                       ast_type_t asttype;
+                       xpath_value_type rettype;
+                       int precedence;
+
+                       binary_op_t(): asttype(ast_unknown), rettype(xpath_type_none), precedence(0)
+                       {
+                       }
+
+                       binary_op_t(ast_type_t asttype_, xpath_value_type rettype_, int precedence_): asttype(asttype_), rettype(rettype_), precedence(precedence_)
+                       {
+                       }
+
+                       static binary_op_t parse(xpath_lexer& lexer)
+                       {
+                               switch (lexer.current())
+                               {
+                               case lex_string:
+                                       if (lexer.contents() == PUGIXML_TEXT("or"))
+                                               return binary_op_t(ast_op_or, xpath_type_boolean, 1);
+                                       else if (lexer.contents() == PUGIXML_TEXT("and"))
+                                               return binary_op_t(ast_op_and, xpath_type_boolean, 2);
+                                       else if (lexer.contents() == PUGIXML_TEXT("div"))
+                                               return binary_op_t(ast_op_divide, xpath_type_number, 6);
+                                       else if (lexer.contents() == PUGIXML_TEXT("mod"))
+                                               return binary_op_t(ast_op_mod, xpath_type_number, 6);
+                                       else
+                                               return binary_op_t();
+
+                               case lex_equal:
+                                       return binary_op_t(ast_op_equal, xpath_type_boolean, 3);
+
+                               case lex_not_equal:
+                                       return binary_op_t(ast_op_not_equal, xpath_type_boolean, 3);
+
+                               case lex_less:
+                                       return binary_op_t(ast_op_less, xpath_type_boolean, 4);
+
+                               case lex_greater:
+                                       return binary_op_t(ast_op_greater, xpath_type_boolean, 4);
+
+                               case lex_less_or_equal:
+                                       return binary_op_t(ast_op_less_or_equal, xpath_type_boolean, 4);
+
+                               case lex_greater_or_equal:
+                                       return binary_op_t(ast_op_greater_or_equal, xpath_type_boolean, 4);
+
+                               case lex_plus:
+                                       return binary_op_t(ast_op_add, xpath_type_number, 5);
+
+                               case lex_minus:
+                                       return binary_op_t(ast_op_subtract, xpath_type_number, 5);
+
+                               case lex_multiply:
+                                       return binary_op_t(ast_op_multiply, xpath_type_number, 6);
+
+                               case lex_union:
+                                       return binary_op_t(ast_op_union, xpath_type_node_set, 7);
+
+                               default:
+                                       return binary_op_t();
+                               }
+                       }
+               };
+
+               xpath_ast_node* parse_expression_rec(xpath_ast_node* lhs, int limit)
+               {
+                       binary_op_t op = binary_op_t::parse(_lexer);
+
+                       while (op.asttype != ast_unknown && op.precedence >= limit)
+                       {
+                               _lexer.next();
+
+                               xpath_ast_node* rhs = parse_path_or_unary_expression();
+
+                               binary_op_t nextop = binary_op_t::parse(_lexer);
+
+                               while (nextop.asttype != ast_unknown && nextop.precedence > op.precedence)
+                               {
+                                       rhs = parse_expression_rec(rhs, nextop.precedence);
+
+                                       nextop = binary_op_t::parse(_lexer);
+                               }
+
+                               if (op.asttype == ast_op_union && (lhs->rettype() != xpath_type_node_set || rhs->rettype() != xpath_type_node_set))
+                                       throw_error("Union operator has to be applied to node sets");
+
+                               lhs = new (alloc_node()) xpath_ast_node(op.asttype, op.rettype, lhs, rhs);
+
+                               op = binary_op_t::parse(_lexer);
+                       }
+
+                       return lhs;
+               }
+
+               // Expr ::= OrExpr
+               // OrExpr ::= AndExpr | OrExpr 'or' AndExpr
+               // AndExpr ::= EqualityExpr | AndExpr 'and' EqualityExpr
+               // EqualityExpr ::= RelationalExpr
+               //                                      | EqualityExpr '=' RelationalExpr
+               //                                      | EqualityExpr '!=' RelationalExpr
+               // RelationalExpr ::= AdditiveExpr
+               //                                        | RelationalExpr '<' AdditiveExpr
+               //                                        | RelationalExpr '>' AdditiveExpr
+               //                                        | RelationalExpr '<=' AdditiveExpr
+               //                                        | RelationalExpr '>=' AdditiveExpr
+               // AdditiveExpr ::= MultiplicativeExpr
+               //                                      | AdditiveExpr '+' MultiplicativeExpr
+               //                                      | AdditiveExpr '-' MultiplicativeExpr
+               // MultiplicativeExpr ::= UnaryExpr
+               //                                                | MultiplicativeExpr '*' UnaryExpr
+               //                                                | MultiplicativeExpr 'div' UnaryExpr
+               //                                                | MultiplicativeExpr 'mod' UnaryExpr
+               xpath_ast_node* parse_expression()
+               {
+                       return parse_expression_rec(parse_path_or_unary_expression(), 0);
+               }
+
+               xpath_parser(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result): _alloc(alloc), _lexer(query), _query(query), _variables(variables), _result(result)
+               {
+               }
+
+               xpath_ast_node* parse()
+               {
+                       xpath_ast_node* result = parse_expression();
+                       
+                       if (_lexer.current() != lex_eof)
+                       {
+                               // there are still unparsed tokens left, error
+                               throw_error("Incorrect query");
+                       }
+                       
+                       return result;
+               }
+
+               static xpath_ast_node* parse(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result)
+               {
+                       xpath_parser parser(query, variables, alloc, result);
+
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       int error = setjmp(parser._error_handler);
+
+                       return (error == 0) ? parser.parse() : 0;
+               #else
+                       return parser.parse();
+               #endif
+               }
+       };
+
+       struct xpath_query_impl
+       {
+               static xpath_query_impl* create()
+               {
+                       void* memory = xml_memory::allocate(sizeof(xpath_query_impl));
+
+                       return new (memory) xpath_query_impl();
+               }
+
+               static void destroy(void* ptr)
+               {
+                       if (!ptr) return;
+                       
+                       // free all allocated pages
+                       static_cast<xpath_query_impl*>(ptr)->alloc.release();
+
+                       // free allocator memory (with the first page)
+                       xml_memory::deallocate(ptr);
+               }
+
+               xpath_query_impl(): root(0), alloc(&block)
+               {
+                       block.next = 0;
+                       block.capacity = sizeof(block.data);
+               }
+
+               xpath_ast_node* root;
+               xpath_allocator alloc;
+               xpath_memory_block block;
+       };
+
+       PUGI__FN xpath_string evaluate_string_impl(xpath_query_impl* impl, const xpath_node& n, xpath_stack_data& sd)
+       {
+               if (!impl) return xpath_string();
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               if (setjmp(sd.error_handler)) return xpath_string();
+       #endif
+
+               xpath_context c(n, 1, 1);
+
+               return impl->root->eval_string(c, sd.stack);
+       }
+
+       PUGI__FN impl::xpath_ast_node* evaluate_node_set_prepare(xpath_query_impl* impl)
+       {
+               if (!impl) return 0;
+
+               if (impl->root->rettype() != xpath_type_node_set)
+               {
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       return 0;
+               #else
+                       xpath_parse_result res;
+                       res.error = "Expression does not evaluate to node set";
+
+                       throw xpath_exception(res);
+               #endif
+               }
+
+               return impl->root;
+       }
+PUGI__NS_END
+
+namespace pugi
+{
+#ifndef PUGIXML_NO_EXCEPTIONS
+       PUGI__FN xpath_exception::xpath_exception(const xpath_parse_result& result_): _result(result_)
+       {
+               assert(_result.error);
+       }
+       
+       PUGI__FN const char* xpath_exception::what() const throw()
+       {
+               return _result.error;
+       }
+
+       PUGI__FN const xpath_parse_result& xpath_exception::result() const
+       {
+               return _result;
+       }
+#endif
+       
+       PUGI__FN xpath_node::xpath_node()
+       {
+       }
+               
+       PUGI__FN xpath_node::xpath_node(const xml_node& node_): _node(node_)
+       {
+       }
+               
+       PUGI__FN xpath_node::xpath_node(const xml_attribute& attribute_, const xml_node& parent_): _node(attribute_ ? parent_ : xml_node()), _attribute(attribute_)
+       {
+       }
+
+       PUGI__FN xml_node xpath_node::node() const
+       {
+               return _attribute ? xml_node() : _node;
+       }
+               
+       PUGI__FN xml_attribute xpath_node::attribute() const
+       {
+               return _attribute;
+       }
+       
+       PUGI__FN xml_node xpath_node::parent() const
+       {
+               return _attribute ? _node : _node.parent();
+       }
+
+       PUGI__FN static void unspecified_bool_xpath_node(xpath_node***)
+       {
+       }
+
+       PUGI__FN xpath_node::operator xpath_node::unspecified_bool_type() const
+       {
+               return (_node || _attribute) ? unspecified_bool_xpath_node : 0;
+       }
+       
+       PUGI__FN bool xpath_node::operator!() const
+       {
+               return !(_node || _attribute);
+       }
+
+       PUGI__FN bool xpath_node::operator==(const xpath_node& n) const
+       {
+               return _node == n._node && _attribute == n._attribute;
+       }
+       
+       PUGI__FN bool xpath_node::operator!=(const xpath_node& n) const
+       {
+               return _node != n._node || _attribute != n._attribute;
+       }
+
+#ifdef __BORLANDC__
+       PUGI__FN bool operator&&(const xpath_node& lhs, bool rhs)
+       {
+               return (bool)lhs && rhs;
+       }
+
+       PUGI__FN bool operator||(const xpath_node& lhs, bool rhs)
+       {
+               return (bool)lhs || rhs;
+       }
+#endif
+
+       PUGI__FN void xpath_node_set::_assign(const_iterator begin_, const_iterator end_)
+       {
+               assert(begin_ <= end_);
+
+               size_t size_ = static_cast<size_t>(end_ - begin_);
+
+               if (size_ <= 1)
+               {
+                       // deallocate old buffer
+                       if (_begin != &_storage) impl::xml_memory::deallocate(_begin);
+
+                       // use internal buffer
+                       if (begin_ != end_) _storage = *begin_;
+
+                       _begin = &_storage;
+                       _end = &_storage + size_;
+               }
+               else
+               {
+                       // make heap copy
+                       xpath_node* storage = static_cast<xpath_node*>(impl::xml_memory::allocate(size_ * sizeof(xpath_node)));
+
+                       if (!storage)
+                       {
+                       #ifdef PUGIXML_NO_EXCEPTIONS
+                               return;
+                       #else
+                               throw std::bad_alloc();
+                       #endif
+                       }
+
+                       memcpy(storage, begin_, size_ * sizeof(xpath_node));
+                       
+                       // deallocate old buffer
+                       if (_begin != &_storage) impl::xml_memory::deallocate(_begin);
+
+                       // finalize
+                       _begin = storage;
+                       _end = storage + size_;
+               }
+       }
+
+       PUGI__FN xpath_node_set::xpath_node_set(): _type(type_unsorted), _begin(&_storage), _end(&_storage)
+       {
+       }
+
+       PUGI__FN xpath_node_set::xpath_node_set(const_iterator begin_, const_iterator end_, type_t type_): _type(type_), _begin(&_storage), _end(&_storage)
+       {
+               _assign(begin_, end_);
+       }
+
+       PUGI__FN xpath_node_set::~xpath_node_set()
+       {
+               if (_begin != &_storage) impl::xml_memory::deallocate(_begin);
+       }
+               
+       PUGI__FN xpath_node_set::xpath_node_set(const xpath_node_set& ns): _type(ns._type), _begin(&_storage), _end(&_storage)
+       {
+               _assign(ns._begin, ns._end);
+       }
+       
+       PUGI__FN xpath_node_set& xpath_node_set::operator=(const xpath_node_set& ns)
+       {
+               if (this == &ns) return *this;
+               
+               _type = ns._type;
+               _assign(ns._begin, ns._end);
+
+               return *this;
+       }
+
+       PUGI__FN xpath_node_set::type_t xpath_node_set::type() const
+       {
+               return _type;
+       }
+               
+       PUGI__FN size_t xpath_node_set::size() const
+       {
+               return _end - _begin;
+       }
+               
+       PUGI__FN bool xpath_node_set::empty() const
+       {
+               return _begin == _end;
+       }
+               
+       PUGI__FN const xpath_node& xpath_node_set::operator[](size_t index) const
+       {
+               assert(index < size());
+               return _begin[index];
+       }
+
+       PUGI__FN xpath_node_set::const_iterator xpath_node_set::begin() const
+       {
+               return _begin;
+       }
+               
+       PUGI__FN xpath_node_set::const_iterator xpath_node_set::end() const
+       {
+               return _end;
+       }
+       
+       PUGI__FN void xpath_node_set::sort(bool reverse)
+       {
+               _type = impl::xpath_sort(_begin, _end, _type, reverse);
+       }
+
+       PUGI__FN xpath_node xpath_node_set::first() const
+       {
+               return impl::xpath_first(_begin, _end, _type);
+       }
+
+       PUGI__FN xpath_parse_result::xpath_parse_result(): error("Internal error"), offset(0)
+       {
+       }
+
+       PUGI__FN xpath_parse_result::operator bool() const
+       {
+               return error == 0;
+       }
+
+       PUGI__FN const char* xpath_parse_result::description() const
+       {
+               return error ? error : "No error";
+       }
+
+       PUGI__FN xpath_variable::xpath_variable(): _type(xpath_type_none), _next(0)
+       {
+       }
+
+       PUGI__FN const char_t* xpath_variable::name() const
+       {
+               switch (_type)
+               {
+               case xpath_type_node_set:
+                       return static_cast<const impl::xpath_variable_node_set*>(this)->name;
+
+               case xpath_type_number:
+                       return static_cast<const impl::xpath_variable_number*>(this)->name;
+
+               case xpath_type_string:
+                       return static_cast<const impl::xpath_variable_string*>(this)->name;
+
+               case xpath_type_boolean:
+                       return static_cast<const impl::xpath_variable_boolean*>(this)->name;
+
+               default:
+                       assert(!"Invalid variable type");
+                       return 0;
+               }
+       }
+
+       PUGI__FN xpath_value_type xpath_variable::type() const
+       {
+               return _type;
+       }
+
+       PUGI__FN bool xpath_variable::get_boolean() const
+       {
+               return (_type == xpath_type_boolean) ? static_cast<const impl::xpath_variable_boolean*>(this)->value : false;
+       }
+
+       PUGI__FN double xpath_variable::get_number() const
+       {
+               return (_type == xpath_type_number) ? static_cast<const impl::xpath_variable_number*>(this)->value : impl::gen_nan();
+       }
+
+       PUGI__FN const char_t* xpath_variable::get_string() const
+       {
+               const char_t* value = (_type == xpath_type_string) ? static_cast<const impl::xpath_variable_string*>(this)->value : 0;
+               return value ? value : PUGIXML_TEXT("");
+       }
+
+       PUGI__FN const xpath_node_set& xpath_variable::get_node_set() const
+       {
+               return (_type == xpath_type_node_set) ? static_cast<const impl::xpath_variable_node_set*>(this)->value : impl::dummy_node_set;
+       }
+
+       PUGI__FN bool xpath_variable::set(bool value)
+       {
+               if (_type != xpath_type_boolean) return false;
+
+               static_cast<impl::xpath_variable_boolean*>(this)->value = value;
+               return true;
+       }
+
+       PUGI__FN bool xpath_variable::set(double value)
+       {
+               if (_type != xpath_type_number) return false;
+
+               static_cast<impl::xpath_variable_number*>(this)->value = value;
+               return true;
+       }
+
+       PUGI__FN bool xpath_variable::set(const char_t* value)
+       {
+               if (_type != xpath_type_string) return false;
+
+               impl::xpath_variable_string* var = static_cast<impl::xpath_variable_string*>(this);
+
+               // duplicate string
+               size_t size = (impl::strlength(value) + 1) * sizeof(char_t);
+
+               char_t* copy = static_cast<char_t*>(impl::xml_memory::allocate(size));
+               if (!copy) return false;
+
+               memcpy(copy, value, size);
+
+               // replace old string
+               if (var->value) impl::xml_memory::deallocate(var->value);
+               var->value = copy;
+
+               return true;
+       }
+
+       PUGI__FN bool xpath_variable::set(const xpath_node_set& value)
+       {
+               if (_type != xpath_type_node_set) return false;
+
+               static_cast<impl::xpath_variable_node_set*>(this)->value = value;
+               return true;
+       }
+
+       PUGI__FN xpath_variable_set::xpath_variable_set()
+       {
+               for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i) _data[i] = 0;
+       }
+
+       PUGI__FN xpath_variable_set::~xpath_variable_set()
+       {
+               for (size_t i = 0; i < sizeof(_data) / sizeof(_data[0]); ++i)
+               {
+                       xpath_variable* var = _data[i];
+
+                       while (var)
+                       {
+                               xpath_variable* next = var->_next;
+
+                               impl::delete_xpath_variable(var->_type, var);
+
+                               var = next;
+                       }
+               }
+       }
+
+       PUGI__FN xpath_variable* xpath_variable_set::find(const char_t* name) const
+       {
+               const size_t hash_size = sizeof(_data) / sizeof(_data[0]);
+               size_t hash = impl::hash_string(name) % hash_size;
+
+               // look for existing variable
+               for (xpath_variable* var = _data[hash]; var; var = var->_next)
+                       if (impl::strequal(var->name(), name))
+                               return var;
+
+               return 0;
+       }
+
+       PUGI__FN xpath_variable* xpath_variable_set::add(const char_t* name, xpath_value_type type)
+       {
+               const size_t hash_size = sizeof(_data) / sizeof(_data[0]);
+               size_t hash = impl::hash_string(name) % hash_size;
+
+               // look for existing variable
+               for (xpath_variable* var = _data[hash]; var; var = var->_next)
+                       if (impl::strequal(var->name(), name))
+                               return var->type() == type ? var : 0;
+
+               // add new variable
+               xpath_variable* result = impl::new_xpath_variable(type, name);
+
+               if (result)
+               {
+                       result->_type = type;
+                       result->_next = _data[hash];
+
+                       _data[hash] = result;
+               }
+
+               return result;
+       }
+
+       PUGI__FN bool xpath_variable_set::set(const char_t* name, bool value)
+       {
+               xpath_variable* var = add(name, xpath_type_boolean);
+               return var ? var->set(value) : false;
+       }
+
+       PUGI__FN bool xpath_variable_set::set(const char_t* name, double value)
+       {
+               xpath_variable* var = add(name, xpath_type_number);
+               return var ? var->set(value) : false;
+       }
+
+       PUGI__FN bool xpath_variable_set::set(const char_t* name, const char_t* value)
+       {
+               xpath_variable* var = add(name, xpath_type_string);
+               return var ? var->set(value) : false;
+       }
+
+       PUGI__FN bool xpath_variable_set::set(const char_t* name, const xpath_node_set& value)
+       {
+               xpath_variable* var = add(name, xpath_type_node_set);
+               return var ? var->set(value) : false;
+       }
+
+       PUGI__FN xpath_variable* xpath_variable_set::get(const char_t* name)
+       {
+               return find(name);
+       }
+
+       PUGI__FN const xpath_variable* xpath_variable_set::get(const char_t* name) const
+       {
+               return find(name);
+       }
+
+       PUGI__FN xpath_query::xpath_query(const char_t* query, xpath_variable_set* variables): _impl(0)
+       {
+               impl::xpath_query_impl* qimpl = impl::xpath_query_impl::create();
+
+               if (!qimpl)
+               {
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       _result.error = "Out of memory";
+               #else
+                       throw std::bad_alloc();
+               #endif
+               }
+               else
+               {
+                       impl::buffer_holder impl_holder(qimpl, impl::xpath_query_impl::destroy);
+
+                       qimpl->root = impl::xpath_parser::parse(query, variables, &qimpl->alloc, &_result);
+
+                       if (qimpl->root)
+                       {
+                               qimpl->root->optimize(&qimpl->alloc);
+
+                               _impl = static_cast<impl::xpath_query_impl*>(impl_holder.release());
+                               _result.error = 0;
+                       }
+               }
+       }
+
+       PUGI__FN xpath_query::~xpath_query()
+       {
+               impl::xpath_query_impl::destroy(_impl);
+       }
+
+       PUGI__FN xpath_value_type xpath_query::return_type() const
+       {
+               if (!_impl) return xpath_type_none;
+
+               return static_cast<impl::xpath_query_impl*>(_impl)->root->rettype();
+       }
+
+       PUGI__FN bool xpath_query::evaluate_boolean(const xpath_node& n) const
+       {
+               if (!_impl) return false;
+               
+               impl::xpath_context c(n, 1, 1);
+               impl::xpath_stack_data sd;
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               if (setjmp(sd.error_handler)) return false;
+       #endif
+               
+               return static_cast<impl::xpath_query_impl*>(_impl)->root->eval_boolean(c, sd.stack);
+       }
+       
+       PUGI__FN double xpath_query::evaluate_number(const xpath_node& n) const
+       {
+               if (!_impl) return impl::gen_nan();
+               
+               impl::xpath_context c(n, 1, 1);
+               impl::xpath_stack_data sd;
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               if (setjmp(sd.error_handler)) return impl::gen_nan();
+       #endif
+
+               return static_cast<impl::xpath_query_impl*>(_impl)->root->eval_number(c, sd.stack);
+       }
+
+#ifndef PUGIXML_NO_STL
+       PUGI__FN string_t xpath_query::evaluate_string(const xpath_node& n) const
+       {
+               impl::xpath_stack_data sd;
+
+               impl::xpath_string r = impl::evaluate_string_impl(static_cast<impl::xpath_query_impl*>(_impl), n, sd);
+
+               return string_t(r.c_str(), r.length());
+       }
+#endif
+
+       PUGI__FN size_t xpath_query::evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const
+       {
+               impl::xpath_stack_data sd;
+
+               impl::xpath_string r = impl::evaluate_string_impl(static_cast<impl::xpath_query_impl*>(_impl), n, sd);
+
+               size_t full_size = r.length() + 1;
+               
+               if (capacity > 0)
+               {
+                       size_t size = (full_size < capacity) ? full_size : capacity;
+                       assert(size > 0);
+
+                       memcpy(buffer, r.c_str(), (size - 1) * sizeof(char_t));
+                       buffer[size - 1] = 0;
+               }
+               
+               return full_size;
+       }
+
+       PUGI__FN xpath_node_set xpath_query::evaluate_node_set(const xpath_node& n) const
+       {
+               impl::xpath_ast_node* root = impl::evaluate_node_set_prepare(static_cast<impl::xpath_query_impl*>(_impl));
+               if (!root) return xpath_node_set();
+
+               impl::xpath_context c(n, 1, 1);
+               impl::xpath_stack_data sd;
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               if (setjmp(sd.error_handler)) return xpath_node_set();
+       #endif
+
+               impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack, impl::nodeset_eval_all);
+
+               return xpath_node_set(r.begin(), r.end(), r.type());
+       }
+
+       PUGI__FN xpath_node xpath_query::evaluate_node(const xpath_node& n) const
+       {
+               impl::xpath_ast_node* root = impl::evaluate_node_set_prepare(static_cast<impl::xpath_query_impl*>(_impl));
+               if (!root) return xpath_node();
+
+               impl::xpath_context c(n, 1, 1);
+               impl::xpath_stack_data sd;
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               if (setjmp(sd.error_handler)) return xpath_node();
+       #endif
+
+               impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack, impl::nodeset_eval_first);
+
+               return r.first();
+       }
+
+       PUGI__FN const xpath_parse_result& xpath_query::result() const
+       {
+               return _result;
+       }
+
+       PUGI__FN static void unspecified_bool_xpath_query(xpath_query***)
+       {
+       }
+
+       PUGI__FN xpath_query::operator xpath_query::unspecified_bool_type() const
+       {
+               return _impl ? unspecified_bool_xpath_query : 0;
+       }
+
+       PUGI__FN bool xpath_query::operator!() const
+       {
+               return !_impl;
+       }
+
+       PUGI__FN xpath_node xml_node::select_node(const char_t* query, xpath_variable_set* variables) const
+       {
+               xpath_query q(query, variables);
+               return select_node(q);
+       }
+
+       PUGI__FN xpath_node xml_node::select_node(const xpath_query& query) const
+       {
+               return query.evaluate_node(*this);
+       }
+
+       PUGI__FN xpath_node_set xml_node::select_nodes(const char_t* query, xpath_variable_set* variables) const
+       {
+               xpath_query q(query, variables);
+               return select_nodes(q);
+       }
+
+       PUGI__FN xpath_node_set xml_node::select_nodes(const xpath_query& query) const
+       {
+               return query.evaluate_node_set(*this);
+       }
+
+       PUGI__FN xpath_node xml_node::select_single_node(const char_t* query, xpath_variable_set* variables) const
+       {
+               xpath_query q(query, variables);
+               return select_single_node(q);
+       }
+
+       PUGI__FN xpath_node xml_node::select_single_node(const xpath_query& query) const
+       {
+               return query.evaluate_node(*this);
+       }
+}
+
+#endif
+
+#ifdef __BORLANDC__
+#      pragma option pop
+#endif
+
+// Intel C++ does not properly keep warning state for function templates,
+// so popping warning state at the end of translation unit leads to warnings in the middle.
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#      pragma warning(pop)
+#endif
+
+// Undefine all local macros (makes sure we're not leaking macros in header-only mode)
+#undef PUGI__NO_INLINE
+#undef PUGI__UNLIKELY
+#undef PUGI__STATIC_ASSERT
+#undef PUGI__DMC_VOLATILE
+#undef PUGI__MSVC_CRT_VERSION
+#undef PUGI__NS_BEGIN
+#undef PUGI__NS_END
+#undef PUGI__FN
+#undef PUGI__FN_NO_INLINE
+#undef PUGI__NODETYPE
+#undef PUGI__IS_CHARTYPE_IMPL
+#undef PUGI__IS_CHARTYPE
+#undef PUGI__IS_CHARTYPEX
+#undef PUGI__ENDSWITH
+#undef PUGI__SKIPWS
+#undef PUGI__OPTSET
+#undef PUGI__PUSHNODE
+#undef PUGI__POPNODE
+#undef PUGI__SCANFOR
+#undef PUGI__SCANWHILE
+#undef PUGI__SCANWHILE_UNROLL
+#undef PUGI__ENDSEG
+#undef PUGI__THROW_ERROR
+#undef PUGI__CHECK_ERROR
+
+#endif
+
+/**
+ * Copyright (c) 2006-2014 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/src/pugixml/pugixml.hpp b/src/pugixml/pugixml.hpp

new file mode 100644 (file)

index 0000000..9798b46
--- /dev/null
+++ b/src/pugixml/pugixml.hpp
@@ -0,0 +1,1366 @@
+/**
+ * pugixml parser - version 1.5
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
+ */
+
+#ifndef PUGIXML_VERSION
+// Define version macro; evaluates to major * 100 + minor so that it's safe to use in less-than comparisons
+#      define PUGIXML_VERSION 150
+#endif
+
+// Include user configuration file (this can define various configuration macros)
+#include "pugiconfig.hpp"
+
+#ifndef HEADER_PUGIXML_HPP
+#define HEADER_PUGIXML_HPP
+
+// Include stddef.h for size_t and ptrdiff_t
+#include <stddef.h>
+
+// Include exception header for XPath
+#if !defined(PUGIXML_NO_XPATH) && !defined(PUGIXML_NO_EXCEPTIONS)
+#      include <exception>
+#endif
+
+// Include STL headers
+#ifndef PUGIXML_NO_STL
+#      include <iterator>
+#      include <iosfwd>
+#      include <string>
+#endif
+
+// Macro for deprecated features
+#ifndef PUGIXML_DEPRECATED
+#      if defined(__GNUC__)
+#              define PUGIXML_DEPRECATED __attribute__((deprecated))
+#      elif defined(_MSC_VER) && _MSC_VER >= 1300
+#              define PUGIXML_DEPRECATED __declspec(deprecated)
+#      else
+#              define PUGIXML_DEPRECATED
+#      endif
+#endif
+
+// If no API is defined, assume default
+#ifndef PUGIXML_API
+#      define PUGIXML_API
+#endif
+
+// If no API for classes is defined, assume default
+#ifndef PUGIXML_CLASS
+#      define PUGIXML_CLASS PUGIXML_API
+#endif
+
+// If no API for functions is defined, assume default
+#ifndef PUGIXML_FUNCTION
+#      define PUGIXML_FUNCTION PUGIXML_API
+#endif
+
+// If the platform is known to have long long support, enable long long functions
+#ifndef PUGIXML_HAS_LONG_LONG
+#      if defined(__cplusplus) && __cplusplus >= 201103
+#              define PUGIXML_HAS_LONG_LONG
+#      elif defined(_MSC_VER) && _MSC_VER >= 1400
+#              define PUGIXML_HAS_LONG_LONG
+#      endif
+#endif
+
+// Character interface macros
+#ifdef PUGIXML_WCHAR_MODE
+#      define PUGIXML_TEXT(t) L ## t
+#      define PUGIXML_CHAR wchar_t
+#else
+#      define PUGIXML_TEXT(t) t
+#      define PUGIXML_CHAR char
+#endif
+
+namespace pugi
+{
+       // Character type used for all internal storage and operations; depends on PUGIXML_WCHAR_MODE
+       typedef PUGIXML_CHAR char_t;
+
+#ifndef PUGIXML_NO_STL
+       // String type used for operations that work with STL string; depends on PUGIXML_WCHAR_MODE
+       typedef std::basic_string<PUGIXML_CHAR, std::char_traits<PUGIXML_CHAR>, std::allocator<PUGIXML_CHAR> > string_t;
+#endif
+}
+
+// The PugiXML namespace
+namespace pugi
+{
+       // Tree node types
+       enum xml_node_type
+       {
+               node_null,                      // Empty (null) node handle
+               node_document,          // A document tree's absolute root
+               node_element,           // Element tag, i.e. '<node/>'
+               node_pcdata,            // Plain character data, i.e. 'text'
+               node_cdata,                     // Character data, i.e. '<![CDATA[text]]>'
+               node_comment,           // Comment tag, i.e. '<!-- text -->'
+               node_pi,                        // Processing instruction, i.e. '<?name?>'
+               node_declaration,       // Document declaration, i.e. '<?xml version="1.0"?>'
+               node_doctype            // Document type declaration, i.e. '<!DOCTYPE doc>'
+       };
+
+       // Parsing options
+
+       // Minimal parsing mode (equivalent to turning all other flags off).
+       // Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed.
+       const unsigned int parse_minimal = 0x0000;
+
+       // This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is off by default.
+       const unsigned int parse_pi = 0x0001;
+
+       // This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by default.
+       const unsigned int parse_comments = 0x0002;
+
+       // This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by default.
+       const unsigned int parse_cdata = 0x0004;
+
+       // This flag determines if plain character data (node_pcdata) that consist only of whitespace are added to the DOM tree.
+       // This flag is off by default; turning it on usually results in slower parsing and more memory consumption.
+       const unsigned int parse_ws_pcdata = 0x0008;
+
+       // This flag determines if character and entity references are expanded during parsing. This flag is on by default.
+       const unsigned int parse_escapes = 0x0010;
+
+       // This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default.
+       const unsigned int parse_eol = 0x0020;
+       
+       // This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default.
+       const unsigned int parse_wconv_attribute = 0x0040;
+
+       // This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default.
+       const unsigned int parse_wnorm_attribute = 0x0080;
+       
+       // This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default.
+       const unsigned int parse_declaration = 0x0100;
+
+       // This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default.
+       const unsigned int parse_doctype = 0x0200;
+
+       // This flag determines if plain character data (node_pcdata) that is the only child of the parent node and that consists only
+       // of whitespace is added to the DOM tree.
+       // This flag is off by default; turning it on may result in slower parsing and more memory consumption.
+       const unsigned int parse_ws_pcdata_single = 0x0400;
+
+       // This flag determines if leading and trailing whitespace is to be removed from plain character data. This flag is off by default.
+       const unsigned int parse_trim_pcdata = 0x0800;
+
+       // This flag determines if plain character data that does not have a parent node is added to the DOM tree, and if an empty document
+       // is a valid document. This flag is off by default.
+       const unsigned int parse_fragment = 0x1000;
+
+       // The default parsing mode.
+       // Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
+       // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
+       const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol;
+
+       // The full parsing mode.
+       // Nodes of all types are added to the DOM tree, character/reference entities are expanded,
+       // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
+       const unsigned int parse_full = parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype;
+
+       // These flags determine the encoding of input data for XML document
+       enum xml_encoding
+       {
+               encoding_auto,          // Auto-detect input encoding using BOM or < / <? detection; use UTF8 if BOM is not found
+               encoding_utf8,          // UTF8 encoding
+               encoding_utf16_le,      // Little-endian UTF16
+               encoding_utf16_be,      // Big-endian UTF16
+               encoding_utf16,         // UTF16 with native endianness
+               encoding_utf32_le,      // Little-endian UTF32
+               encoding_utf32_be,      // Big-endian UTF32
+               encoding_utf32,         // UTF32 with native endianness
+               encoding_wchar,         // The same encoding wchar_t has (either UTF16 or UTF32)
+               encoding_latin1
+       };
+
+       // Formatting flags
+       
+       // Indent the nodes that are written to output stream with as many indentation strings as deep the node is in DOM tree. This flag is on by default.
+       const unsigned int format_indent = 0x01;
+       
+       // Write encoding-specific BOM to the output stream. This flag is off by default.
+       const unsigned int format_write_bom = 0x02;
+
+       // Use raw output mode (no indentation and no line breaks are written). This flag is off by default.
+       const unsigned int format_raw = 0x04;
+       
+       // Omit default XML declaration even if there is no declaration in the document. This flag is off by default.
+       const unsigned int format_no_declaration = 0x08;
+
+       // Don't escape attribute values and PCDATA contents. This flag is off by default.
+       const unsigned int format_no_escapes = 0x10;
+
+       // Open file using text mode in xml_document::save_file. This enables special character (i.e. new-line) conversions on some systems. This flag is off by default.
+       const unsigned int format_save_file_text = 0x20;
+
+       // The default set of formatting flags.
+       // Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none.
+       const unsigned int format_default = format_indent;
+               
+       // Forward declarations
+       struct xml_attribute_struct;
+       struct xml_node_struct;
+
+       class xml_node_iterator;
+       class xml_attribute_iterator;
+       class xml_named_node_iterator;
+
+       class xml_tree_walker;
+
+       struct xml_parse_result;
+
+       class xml_node;
+
+       class xml_text;
+       
+       #ifndef PUGIXML_NO_XPATH
+       class xpath_node;
+       class xpath_node_set;
+       class xpath_query;
+       class xpath_variable_set;
+       #endif
+
+       // Range-based for loop support
+       template <typename It> class xml_object_range
+       {
+       public:
+               typedef It const_iterator;
+               typedef It iterator;
+
+               xml_object_range(It b, It e): _begin(b), _end(e)
+               {
+               }
+
+               It begin() const { return _begin; }
+               It end() const { return _end; }
+
+       private:
+               It _begin, _end;
+       };
+
+       // Writer interface for node printing (see xml_node::print)
+       class PUGIXML_CLASS xml_writer
+       {
+       public:
+               virtual ~xml_writer() {}
+
+               // Write memory chunk into stream/file/whatever
+               virtual void write(const void* data, size_t size) = 0;
+       };
+
+       // xml_writer implementation for FILE*
+       class PUGIXML_CLASS xml_writer_file: public xml_writer
+       {
+       public:
+               // Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio
+               xml_writer_file(void* file);
+
+               virtual void write(const void* data, size_t size);
+
+       private:
+               void* file;
+       };
+
+       #ifndef PUGIXML_NO_STL
+       // xml_writer implementation for streams
+       class PUGIXML_CLASS xml_writer_stream: public xml_writer
+       {
+       public:
+               // Construct writer from an output stream object
+               xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream);
+               xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream);
+
+               virtual void write(const void* data, size_t size);
+
+       private:
+               std::basic_ostream<char, std::char_traits<char> >* narrow_stream;
+               std::basic_ostream<wchar_t, std::char_traits<wchar_t> >* wide_stream;
+       };
+       #endif
+
+       // A light-weight handle for manipulating attributes in DOM tree
+       class PUGIXML_CLASS xml_attribute
+       {
+               friend class xml_attribute_iterator;
+               friend class xml_node;
+
+       private:
+               xml_attribute_struct* _attr;
+       
+               typedef void (*unspecified_bool_type)(xml_attribute***);
+
+       public:
+               // Default constructor. Constructs an empty attribute.
+               xml_attribute();
+               
+               // Constructs attribute from internal pointer
+               explicit xml_attribute(xml_attribute_struct* attr);
+
+               // Safe bool conversion operator
+               operator unspecified_bool_type() const;
+
+               // Borland C++ workaround
+               bool operator!() const;
+
+               // Comparison operators (compares wrapped attribute pointers)
+               bool operator==(const xml_attribute& r) const;
+               bool operator!=(const xml_attribute& r) const;
+               bool operator<(const xml_attribute& r) const;
+               bool operator>(const xml_attribute& r) const;
+               bool operator<=(const xml_attribute& r) const;
+               bool operator>=(const xml_attribute& r) const;
+
+               // Check if attribute is empty
+               bool empty() const;
+
+               // Get attribute name/value, or "" if attribute is empty
+               const char_t* name() const;
+               const char_t* value() const;
+
+               // Get attribute value, or the default value if attribute is empty
+               const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
+
+               // Get attribute value as a number, or the default value if conversion did not succeed or attribute is empty
+               int as_int(int def = 0) const;
+               unsigned int as_uint(unsigned int def = 0) const;
+               double as_double(double def = 0) const;
+               float as_float(float def = 0) const;
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               long long as_llong(long long def = 0) const;
+               unsigned long long as_ullong(unsigned long long def = 0) const;
+       #endif
+
+               // Get attribute value as bool (returns true if first character is in '1tTyY' set), or the default value if attribute is empty
+               bool as_bool(bool def = false) const;
+
+               // Set attribute name/value (returns false if attribute is empty or there is not enough memory)
+               bool set_name(const char_t* rhs);
+               bool set_value(const char_t* rhs);
+
+               // Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
+               bool set_value(int rhs);
+               bool set_value(unsigned int rhs);
+               bool set_value(double rhs);
+               bool set_value(float rhs);
+               bool set_value(bool rhs);
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               bool set_value(long long rhs);
+               bool set_value(unsigned long long rhs);
+       #endif
+
+               // Set attribute value (equivalent to set_value without error checking)
+               xml_attribute& operator=(const char_t* rhs);
+               xml_attribute& operator=(int rhs);
+               xml_attribute& operator=(unsigned int rhs);
+               xml_attribute& operator=(double rhs);
+               xml_attribute& operator=(float rhs);
+               xml_attribute& operator=(bool rhs);
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               xml_attribute& operator=(long long rhs);
+               xml_attribute& operator=(unsigned long long rhs);
+       #endif
+
+               // Get next/previous attribute in the attribute list of the parent node
+               xml_attribute next_attribute() const;
+               xml_attribute previous_attribute() const;
+
+               // Get hash value (unique for handles to the same object)
+               size_t hash_value() const;
+
+               // Get internal pointer
+               xml_attribute_struct* internal_object() const;
+       };
+
+#ifdef __BORLANDC__
+       // Borland C++ workaround
+       bool PUGIXML_FUNCTION operator&&(const xml_attribute& lhs, bool rhs);
+       bool PUGIXML_FUNCTION operator||(const xml_attribute& lhs, bool rhs);
+#endif
+
+       // A light-weight handle for manipulating nodes in DOM tree
+       class PUGIXML_CLASS xml_node
+       {
+               friend class xml_attribute_iterator;
+               friend class xml_node_iterator;
+               friend class xml_named_node_iterator;
+
+       protected:
+               xml_node_struct* _root;
+
+               typedef void (*unspecified_bool_type)(xml_node***);
+
+       public:
+               // Default constructor. Constructs an empty node.
+               xml_node();
+
+               // Constructs node from internal pointer
+               explicit xml_node(xml_node_struct* p);
+
+               // Safe bool conversion operator
+               operator unspecified_bool_type() const;
+
+               // Borland C++ workaround
+               bool operator!() const;
+       
+               // Comparison operators (compares wrapped node pointers)
+               bool operator==(const xml_node& r) const;
+               bool operator!=(const xml_node& r) const;
+               bool operator<(const xml_node& r) const;
+               bool operator>(const xml_node& r) const;
+               bool operator<=(const xml_node& r) const;
+               bool operator>=(const xml_node& r) const;
+
+               // Check if node is empty.
+               bool empty() const;
+
+               // Get node type
+               xml_node_type type() const;
+
+               // Get node name, or "" if node is empty or it has no name
+               const char_t* name() const;
+
+               // Get node value, or "" if node is empty or it has no value
+               // Note: For <node>text</node> node.value() does not return "text"! Use child_value() or text() methods to access text inside nodes.
+               const char_t* value() const;
+       
+               // Get attribute list
+               xml_attribute first_attribute() const;
+               xml_attribute last_attribute() const;
+
+               // Get children list
+               xml_node first_child() const;
+               xml_node last_child() const;
+
+               // Get next/previous sibling in the children list of the parent node
+               xml_node next_sibling() const;
+               xml_node previous_sibling() const;
+               
+               // Get parent node
+               xml_node parent() const;
+
+               // Get root of DOM tree this node belongs to
+               xml_node root() const;
+
+               // Get text object for the current node
+               xml_text text() const;
+
+               // Get child, attribute or next/previous sibling with the specified name
+               xml_node child(const char_t* name) const;
+               xml_attribute attribute(const char_t* name) const;
+               xml_node next_sibling(const char_t* name) const;
+               xml_node previous_sibling(const char_t* name) const;
+
+               // Get child value of current node; that is, value of the first child node of type PCDATA/CDATA
+               const char_t* child_value() const;
+
+               // Get child value of child with specified name. Equivalent to child(name).child_value().
+               const char_t* child_value(const char_t* name) const;
+
+               // Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value)
+               bool set_name(const char_t* rhs);
+               bool set_value(const char_t* rhs);
+               
+               // Add attribute with specified name. Returns added attribute, or empty attribute on errors.
+               xml_attribute append_attribute(const char_t* name);
+               xml_attribute prepend_attribute(const char_t* name);
+               xml_attribute insert_attribute_after(const char_t* name, const xml_attribute& attr);
+               xml_attribute insert_attribute_before(const char_t* name, const xml_attribute& attr);
+
+               // Add a copy of the specified attribute. Returns added attribute, or empty attribute on errors.
+               xml_attribute append_copy(const xml_attribute& proto);
+               xml_attribute prepend_copy(const xml_attribute& proto);
+               xml_attribute insert_copy_after(const xml_attribute& proto, const xml_attribute& attr);
+               xml_attribute insert_copy_before(const xml_attribute& proto, const xml_attribute& attr);
+
+               // Add child node with specified type. Returns added node, or empty node on errors.
+               xml_node append_child(xml_node_type type = node_element);
+               xml_node prepend_child(xml_node_type type = node_element);
+               xml_node insert_child_after(xml_node_type type, const xml_node& node);
+               xml_node insert_child_before(xml_node_type type, const xml_node& node);
+
+               // Add child element with specified name. Returns added node, or empty node on errors.
+               xml_node append_child(const char_t* name);
+               xml_node prepend_child(const char_t* name);
+               xml_node insert_child_after(const char_t* name, const xml_node& node);
+               xml_node insert_child_before(const char_t* name, const xml_node& node);
+
+               // Add a copy of the specified node as a child. Returns added node, or empty node on errors.
+               xml_node append_copy(const xml_node& proto);
+               xml_node prepend_copy(const xml_node& proto);
+               xml_node insert_copy_after(const xml_node& proto, const xml_node& node);
+               xml_node insert_copy_before(const xml_node& proto, const xml_node& node);
+
+               // Move the specified node to become a child of this node. Returns moved node, or empty node on errors.
+               xml_node append_move(const xml_node& moved);
+               xml_node prepend_move(const xml_node& moved);
+               xml_node insert_move_after(const xml_node& moved, const xml_node& node);
+               xml_node insert_move_before(const xml_node& moved, const xml_node& node);
+
+               // Remove specified attribute
+               bool remove_attribute(const xml_attribute& a);
+               bool remove_attribute(const char_t* name);
+
+               // Remove specified child
+               bool remove_child(const xml_node& n);
+               bool remove_child(const char_t* name);
+
+               // Parses buffer as an XML document fragment and appends all nodes as children of the current node.
+               // Copies/converts the buffer, so it may be deleted or changed after the function returns.
+               // Note: append_buffer allocates memory that has the lifetime of the owning document; removing the appended nodes does not immediately reclaim that memory.
+               xml_parse_result append_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+               // Find attribute using predicate. Returns first attribute for which predicate returned true.
+               template <typename Predicate> xml_attribute find_attribute(Predicate pred) const
+               {
+                       if (!_root) return xml_attribute();
+                       
+                       for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute())
+                               if (pred(attrib))
+                                       return attrib;
+               
+                       return xml_attribute();
+               }
+
+               // Find child node using predicate. Returns first child for which predicate returned true.
+               template <typename Predicate> xml_node find_child(Predicate pred) const
+               {
+                       if (!_root) return xml_node();
+       
+                       for (xml_node node = first_child(); node; node = node.next_sibling())
+                               if (pred(node))
+                                       return node;
+               
+                       return xml_node();
+               }
+
+               // Find node from subtree using predicate. Returns first node from subtree (depth-first), for which predicate returned true.
+               template <typename Predicate> xml_node find_node(Predicate pred) const
+               {
+                       if (!_root) return xml_node();
+
+                       xml_node cur = first_child();
+                       
+                       while (cur._root && cur._root != _root)
+                       {
+                               if (pred(cur)) return cur;
+
+                               if (cur.first_child()) cur = cur.first_child();
+                               else if (cur.next_sibling()) cur = cur.next_sibling();
+                               else
+                               {
+                                       while (!cur.next_sibling() && cur._root != _root) cur = cur.parent();
+
+                                       if (cur._root != _root) cur = cur.next_sibling();
+                               }
+                       }
+
+                       return xml_node();
+               }
+
+               // Find child node by attribute name/value
+               xml_node find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const;
+               xml_node find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const;
+
+       #ifndef PUGIXML_NO_STL
+               // Get the absolute node path from root as a text string.
+               string_t path(char_t delimiter = '/') const;
+       #endif
+
+               // Search for a node by path consisting of node names and . or .. elements.
+               xml_node first_element_by_path(const char_t* path, char_t delimiter = '/') const;
+
+               // Recursively traverse subtree with xml_tree_walker
+               bool traverse(xml_tree_walker& walker);
+       
+       #ifndef PUGIXML_NO_XPATH
+               // Select single node by evaluating XPath query. Returns first node from the resulting node set.
+               xpath_node select_node(const char_t* query, xpath_variable_set* variables = 0) const;
+               xpath_node select_node(const xpath_query& query) const;
+
+               // Select node set by evaluating XPath query
+               xpath_node_set select_nodes(const char_t* query, xpath_variable_set* variables = 0) const;
+               xpath_node_set select_nodes(const xpath_query& query) const;
+
+               // (deprecated: use select_node instead) Select single node by evaluating XPath query.
+               xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
+               xpath_node select_single_node(const xpath_query& query) const;
+
+       #endif
+               
+               // Print subtree using a writer object
+               void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
+
+       #ifndef PUGIXML_NO_STL
+               // Print subtree to stream
+               void print(std::basic_ostream<char, std::char_traits<char> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
+               void print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const;
+       #endif
+
+               // Child nodes iterators
+               typedef xml_node_iterator iterator;
+
+               iterator begin() const;
+               iterator end() const;
+
+               // Attribute iterators
+               typedef xml_attribute_iterator attribute_iterator;
+
+               attribute_iterator attributes_begin() const;
+               attribute_iterator attributes_end() const;
+
+               // Range-based for support
+               xml_object_range<xml_node_iterator> children() const;
+               xml_object_range<xml_named_node_iterator> children(const char_t* name) const;
+               xml_object_range<xml_attribute_iterator> attributes() const;
+
+               // Get node offset in parsed file/string (in char_t units) for debugging purposes
+               ptrdiff_t offset_debug() const;
+
+               // Get hash value (unique for handles to the same object)
+               size_t hash_value() const;
+
+               // Get internal pointer
+               xml_node_struct* internal_object() const;
+       };
+
+#ifdef __BORLANDC__
+       // Borland C++ workaround
+       bool PUGIXML_FUNCTION operator&&(const xml_node& lhs, bool rhs);
+       bool PUGIXML_FUNCTION operator||(const xml_node& lhs, bool rhs);
+#endif
+
+       // A helper for working with text inside PCDATA nodes
+       class PUGIXML_CLASS xml_text
+       {
+               friend class xml_node;
+
+               xml_node_struct* _root;
+
+               typedef void (*unspecified_bool_type)(xml_text***);
+
+               explicit xml_text(xml_node_struct* root);
+
+               xml_node_struct* _data_new();
+               xml_node_struct* _data() const;
+
+       public:
+               // Default constructor. Constructs an empty object.
+               xml_text();
+
+               // Safe bool conversion operator
+               operator unspecified_bool_type() const;
+
+               // Borland C++ workaround
+               bool operator!() const;
+
+               // Check if text object is empty
+               bool empty() const;
+
+               // Get text, or "" if object is empty
+               const char_t* get() const;
+
+               // Get text, or the default value if object is empty
+               const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
+
+               // Get text as a number, or the default value if conversion did not succeed or object is empty
+               int as_int(int def = 0) const;
+               unsigned int as_uint(unsigned int def = 0) const;
+               double as_double(double def = 0) const;
+               float as_float(float def = 0) const;
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               long long as_llong(long long def = 0) const;
+               unsigned long long as_ullong(unsigned long long def = 0) const;
+       #endif
+
+               // Get text as bool (returns true if first character is in '1tTyY' set), or the default value if object is empty
+               bool as_bool(bool def = false) const;
+
+               // Set text (returns false if object is empty or there is not enough memory)
+               bool set(const char_t* rhs);
+
+               // Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
+               bool set(int rhs);
+               bool set(unsigned int rhs);
+               bool set(double rhs);
+               bool set(float rhs);
+               bool set(bool rhs);
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               bool set(long long rhs);
+               bool set(unsigned long long rhs);
+       #endif
+
+               // Set text (equivalent to set without error checking)
+               xml_text& operator=(const char_t* rhs);
+               xml_text& operator=(int rhs);
+               xml_text& operator=(unsigned int rhs);
+               xml_text& operator=(double rhs);
+               xml_text& operator=(float rhs);
+               xml_text& operator=(bool rhs);
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               xml_text& operator=(long long rhs);
+               xml_text& operator=(unsigned long long rhs);
+       #endif
+
+               // Get the data node (node_pcdata or node_cdata) for this object
+               xml_node data() const;
+       };
+
+#ifdef __BORLANDC__
+       // Borland C++ workaround
+       bool PUGIXML_FUNCTION operator&&(const xml_text& lhs, bool rhs);
+       bool PUGIXML_FUNCTION operator||(const xml_text& lhs, bool rhs);
+#endif
+
+       // Child node iterator (a bidirectional iterator over a collection of xml_node)
+       class PUGIXML_CLASS xml_node_iterator
+       {
+               friend class xml_node;
+
+       private:
+               mutable xml_node _wrap;
+               xml_node _parent;
+
+               xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent);
+
+       public:
+               // Iterator traits
+               typedef ptrdiff_t difference_type;
+               typedef xml_node value_type;
+               typedef xml_node* pointer;
+               typedef xml_node& reference;
+
+       #ifndef PUGIXML_NO_STL
+               typedef std::bidirectional_iterator_tag iterator_category;
+       #endif
+
+               // Default constructor
+               xml_node_iterator();
+
+               // Construct an iterator which points to the specified node
+               xml_node_iterator(const xml_node& node);
+
+               // Iterator operators
+               bool operator==(const xml_node_iterator& rhs) const;
+               bool operator!=(const xml_node_iterator& rhs) const;
+
+               xml_node& operator*() const;
+               xml_node* operator->() const;
+
+               const xml_node_iterator& operator++();
+               xml_node_iterator operator++(int);
+
+               const xml_node_iterator& operator--();
+               xml_node_iterator operator--(int);
+       };
+
+       // Attribute iterator (a bidirectional iterator over a collection of xml_attribute)
+       class PUGIXML_CLASS xml_attribute_iterator
+       {
+               friend class xml_node;
+
+       private:
+               mutable xml_attribute _wrap;
+               xml_node _parent;
+
+               xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent);
+
+       public:
+               // Iterator traits
+               typedef ptrdiff_t difference_type;
+               typedef xml_attribute value_type;
+               typedef xml_attribute* pointer;
+               typedef xml_attribute& reference;
+
+       #ifndef PUGIXML_NO_STL
+               typedef std::bidirectional_iterator_tag iterator_category;
+       #endif
+
+               // Default constructor
+               xml_attribute_iterator();
+
+               // Construct an iterator which points to the specified attribute
+               xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent);
+
+               // Iterator operators
+               bool operator==(const xml_attribute_iterator& rhs) const;
+               bool operator!=(const xml_attribute_iterator& rhs) const;
+
+               xml_attribute& operator*() const;
+               xml_attribute* operator->() const;
+
+               const xml_attribute_iterator& operator++();
+               xml_attribute_iterator operator++(int);
+
+               const xml_attribute_iterator& operator--();
+               xml_attribute_iterator operator--(int);
+       };
+
+       // Named node range helper
+       class PUGIXML_CLASS xml_named_node_iterator
+       {
+               friend class xml_node;
+
+       public:
+               // Iterator traits
+               typedef ptrdiff_t difference_type;
+               typedef xml_node value_type;
+               typedef xml_node* pointer;
+               typedef xml_node& reference;
+
+       #ifndef PUGIXML_NO_STL
+               typedef std::bidirectional_iterator_tag iterator_category;
+       #endif
+
+               // Default constructor
+               xml_named_node_iterator();
+
+               // Construct an iterator which points to the specified node
+               xml_named_node_iterator(const xml_node& node, const char_t* name);
+
+               // Iterator operators
+               bool operator==(const xml_named_node_iterator& rhs) const;
+               bool operator!=(const xml_named_node_iterator& rhs) const;
+
+               xml_node& operator*() const;
+               xml_node* operator->() const;
+
+               const xml_named_node_iterator& operator++();
+               xml_named_node_iterator operator++(int);
+
+               const xml_named_node_iterator& operator--();
+               xml_named_node_iterator operator--(int);
+
+       private:
+               mutable xml_node _wrap;
+               xml_node _parent;
+               const char_t* _name;
+
+               xml_named_node_iterator(xml_node_struct* ref, xml_node_struct* parent, const char_t* name);
+       };
+
+       // Abstract tree walker class (see xml_node::traverse)
+       class PUGIXML_CLASS xml_tree_walker
+       {
+               friend class xml_node;
+
+       private:
+               int _depth;
+       
+       protected:
+               // Get current traversal depth
+               int depth() const;
+       
+       public:
+               xml_tree_walker();
+               virtual ~xml_tree_walker();
+
+               // Callback that is called when traversal begins
+               virtual bool begin(xml_node& node);
+
+               // Callback that is called for each node traversed
+               virtual bool for_each(xml_node& node) = 0;
+
+               // Callback that is called when traversal ends
+               virtual bool end(xml_node& node);
+       };
+
+       // Parsing status, returned as part of xml_parse_result object
+       enum xml_parse_status
+       {
+               status_ok = 0,                          // No error
+
+               status_file_not_found,          // File was not found during load_file()
+               status_io_error,                        // Error reading from file/stream
+               status_out_of_memory,           // Could not allocate memory
+               status_internal_error,          // Internal error occurred
+
+               status_unrecognized_tag,        // Parser could not determine tag type
+
+               status_bad_pi,                          // Parsing error occurred while parsing document declaration/processing instruction
+               status_bad_comment,                     // Parsing error occurred while parsing comment
+               status_bad_cdata,                       // Parsing error occurred while parsing CDATA section
+               status_bad_doctype,                     // Parsing error occurred while parsing document type declaration
+               status_bad_pcdata,                      // Parsing error occurred while parsing PCDATA section
+               status_bad_start_element,       // Parsing error occurred while parsing start element tag
+               status_bad_attribute,           // Parsing error occurred while parsing element attribute
+               status_bad_end_element,         // Parsing error occurred while parsing end element tag
+               status_end_element_mismatch,// There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag)
+
+               status_append_invalid_root,     // Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer)
+
+               status_no_document_element      // Parsing resulted in a document without element nodes
+       };
+
+       // Parsing result
+       struct PUGIXML_CLASS xml_parse_result
+       {
+               // Parsing status (see xml_parse_status)
+               xml_parse_status status;
+
+               // Last parsed offset (in char_t units from start of input data)
+               ptrdiff_t offset;
+
+               // Source document encoding
+               xml_encoding encoding;
+
+               // Default constructor, initializes object to failed state
+               xml_parse_result();
+
+               // Cast to bool operator
+               operator bool() const;
+
+               // Get error description
+               const char* description() const;
+       };
+
+       // Document class (DOM tree root)
+       class PUGIXML_CLASS xml_document: public xml_node
+       {
+       private:
+               char_t* _buffer;
+
+               char _memory[192];
+               
+               // Non-copyable semantics
+               xml_document(const xml_document&);
+               const xml_document& operator=(const xml_document&);
+
+               void create();
+               void destroy();
+
+       public:
+               // Default constructor, makes empty document
+               xml_document();
+
+               // Destructor, invalidates all node/attribute handles to this document
+               ~xml_document();
+
+               // Removes all nodes, leaving the empty document
+               void reset();
+
+               // Removes all nodes, then copies the entire contents of the specified document
+               void reset(const xml_document& proto);
+
+       #ifndef PUGIXML_NO_STL
+               // Load document from stream.
+               xml_parse_result load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+               xml_parse_result load(std::basic_istream<wchar_t, std::char_traits<wchar_t> >& stream, unsigned int options = parse_default);
+       #endif
+
+               // (deprecated: use load_string instead) Load document from zero-terminated string. No encoding conversions are applied.
+               xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
+
+               // Load document from zero-terminated string. No encoding conversions are applied.
+               xml_parse_result load_string(const char_t* contents, unsigned int options = parse_default);
+
+               // Load document from file
+               xml_parse_result load_file(const char* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+               xml_parse_result load_file(const wchar_t* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+               // Load document from buffer. Copies/converts the buffer, so it may be deleted or changed after the function returns.
+               xml_parse_result load_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+               // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
+               // You should ensure that buffer data will persist throughout the document's lifetime, and free the buffer memory manually once document is destroyed.
+               xml_parse_result load_buffer_inplace(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+               // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
+               // You should allocate the buffer with pugixml allocation function; document will free the buffer when it is no longer needed (you can't use it anymore).
+               xml_parse_result load_buffer_inplace_own(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+               // Save XML document to writer (semantics is slightly different from xml_node::print, see documentation for details).
+               void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+
+       #ifndef PUGIXML_NO_STL
+               // Save XML document to stream (semantics is slightly different from xml_node::print, see documentation for details).
+               void save(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+               void save(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const;
+       #endif
+
+               // Save XML to file
+               bool save_file(const char* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+               bool save_file(const wchar_t* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+
+               // Get document element
+               xml_node document_element() const;
+       };
+
+#ifndef PUGIXML_NO_XPATH
+       // XPath query return type
+       enum xpath_value_type
+       {
+               xpath_type_none,          // Unknown type (query failed to compile)
+               xpath_type_node_set,  // Node set (xpath_node_set)
+               xpath_type_number,        // Number
+               xpath_type_string,        // String
+               xpath_type_boolean        // Boolean
+       };
+
+       // XPath parsing result
+       struct PUGIXML_CLASS xpath_parse_result
+       {
+               // Error message (0 if no error)
+               const char* error;
+
+               // Last parsed offset (in char_t units from string start)
+               ptrdiff_t offset;
+
+               // Default constructor, initializes object to failed state
+               xpath_parse_result();
+
+               // Cast to bool operator
+               operator bool() const;
+
+               // Get error description
+               const char* description() const;
+       };
+
+       // A single XPath variable
+       class PUGIXML_CLASS xpath_variable
+       {
+               friend class xpath_variable_set;
+
+       protected:
+               xpath_value_type _type;
+               xpath_variable* _next;
+
+               xpath_variable();
+
+               // Non-copyable semantics
+               xpath_variable(const xpath_variable&);
+               xpath_variable& operator=(const xpath_variable&);
+               
+       public:
+               // Get variable name
+               const char_t* name() const;
+
+               // Get variable type
+               xpath_value_type type() const;
+
+               // Get variable value; no type conversion is performed, default value (false, NaN, empty string, empty node set) is returned on type mismatch error
+               bool get_boolean() const;
+               double get_number() const;
+               const char_t* get_string() const;
+               const xpath_node_set& get_node_set() const;
+
+               // Set variable value; no type conversion is performed, false is returned on type mismatch error
+               bool set(bool value);
+               bool set(double value);
+               bool set(const char_t* value);
+               bool set(const xpath_node_set& value);
+       };
+
+       // A set of XPath variables
+       class PUGIXML_CLASS xpath_variable_set
+       {
+       private:
+               xpath_variable* _data[64];
+
+               // Non-copyable semantics
+               xpath_variable_set(const xpath_variable_set&);
+               xpath_variable_set& operator=(const xpath_variable_set&);
+
+               xpath_variable* find(const char_t* name) const;
+
+       public:
+               // Default constructor/destructor
+               xpath_variable_set();
+               ~xpath_variable_set();
+
+               // Add a new variable or get the existing one, if the types match
+               xpath_variable* add(const char_t* name, xpath_value_type type);
+
+               // Set value of an existing variable; no type conversion is performed, false is returned if there is no such variable or if types mismatch
+               bool set(const char_t* name, bool value);
+               bool set(const char_t* name, double value);
+               bool set(const char_t* name, const char_t* value);
+               bool set(const char_t* name, const xpath_node_set& value);
+
+               // Get existing variable by name
+               xpath_variable* get(const char_t* name);
+               const xpath_variable* get(const char_t* name) const;
+       };
+
+       // A compiled XPath query object
+       class PUGIXML_CLASS xpath_query
+       {
+       private:
+               void* _impl;
+               xpath_parse_result _result;
+
+               typedef void (*unspecified_bool_type)(xpath_query***);
+
+               // Non-copyable semantics
+               xpath_query(const xpath_query&);
+               xpath_query& operator=(const xpath_query&);
+
+       public:
+               // Construct a compiled object from XPath expression.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on compilation errors.
+               explicit xpath_query(const char_t* query, xpath_variable_set* variables = 0);
+
+               // Destructor
+               ~xpath_query();
+
+               // Get query expression return type
+               xpath_value_type return_type() const;
+               
+               // Evaluate expression as boolean value in the specified context; performs type conversion if necessary.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+               bool evaluate_boolean(const xpath_node& n) const;
+               
+               // Evaluate expression as double value in the specified context; performs type conversion if necessary.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+               double evaluate_number(const xpath_node& n) const;
+               
+       #ifndef PUGIXML_NO_STL
+               // Evaluate expression as string value in the specified context; performs type conversion if necessary.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+               string_t evaluate_string(const xpath_node& n) const;
+       #endif
+               
+               // Evaluate expression as string value in the specified context; performs type conversion if necessary.
+               // At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero).
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+               // If PUGIXML_NO_EXCEPTIONS is defined, returns empty  set instead.
+               size_t evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const;
+
+               // Evaluate expression as node set in the specified context.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors.
+               // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node set instead.
+               xpath_node_set evaluate_node_set(const xpath_node& n) const;
+
+               // Evaluate expression as node set in the specified context.
+               // Return first node in document order, or empty node if node set is empty.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors.
+               // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node instead.
+               xpath_node evaluate_node(const xpath_node& n) const;
+
+               // Get parsing result (used to get compilation errors in PUGIXML_NO_EXCEPTIONS mode)
+               const xpath_parse_result& result() const;
+
+               // Safe bool conversion operator
+               operator unspecified_bool_type() const;
+
+               // Borland C++ workaround
+               bool operator!() const;
+       };
+       
+       #ifndef PUGIXML_NO_EXCEPTIONS
+       // XPath exception class
+       class PUGIXML_CLASS xpath_exception: public std::exception
+       {
+       private:
+               xpath_parse_result _result;
+
+       public:
+               // Construct exception from parse result
+               explicit xpath_exception(const xpath_parse_result& result);
+
+               // Get error message
+               virtual const char* what() const throw();
+
+               // Get parse result
+               const xpath_parse_result& result() const;
+       };
+       #endif
+       
+       // XPath node class (either xml_node or xml_attribute)
+       class PUGIXML_CLASS xpath_node
+       {
+       private:
+               xml_node _node;
+               xml_attribute _attribute;
+       
+               typedef void (*unspecified_bool_type)(xpath_node***);
+
+       public:
+               // Default constructor; constructs empty XPath node
+               xpath_node();
+               
+               // Construct XPath node from XML node/attribute
+               xpath_node(const xml_node& node);
+               xpath_node(const xml_attribute& attribute, const xml_node& parent);
+
+               // Get node/attribute, if any
+               xml_node node() const;
+               xml_attribute attribute() const;
+               
+               // Get parent of contained node/attribute
+               xml_node parent() const;
+
+               // Safe bool conversion operator
+               operator unspecified_bool_type() const;
+               
+               // Borland C++ workaround
+               bool operator!() const;
+
+               // Comparison operators
+               bool operator==(const xpath_node& n) const;
+               bool operator!=(const xpath_node& n) const;
+       };
+
+#ifdef __BORLANDC__
+       // Borland C++ workaround
+       bool PUGIXML_FUNCTION operator&&(const xpath_node& lhs, bool rhs);
+       bool PUGIXML_FUNCTION operator||(const xpath_node& lhs, bool rhs);
+#endif
+
+       // A fixed-size collection of XPath nodes
+       class PUGIXML_CLASS xpath_node_set
+       {
+       public:
+               // Collection type
+               enum type_t
+               {
+                       type_unsorted,                  // Not ordered
+                       type_sorted,                    // Sorted by document order (ascending)
+                       type_sorted_reverse             // Sorted by document order (descending)
+               };
+               
+               // Constant iterator type
+               typedef const xpath_node* const_iterator;
+
+               // We define non-constant iterator to be the same as constant iterator so that various generic algorithms (i.e. boost foreach) work
+               typedef const xpath_node* iterator;
+       
+               // Default constructor. Constructs empty set.
+               xpath_node_set();
+
+               // Constructs a set from iterator range; data is not checked for duplicates and is not sorted according to provided type, so be careful
+               xpath_node_set(const_iterator begin, const_iterator end, type_t type = type_unsorted);
+
+               // Destructor
+               ~xpath_node_set();
+               
+               // Copy constructor/assignment operator
+               xpath_node_set(const xpath_node_set& ns);
+               xpath_node_set& operator=(const xpath_node_set& ns);
+
+               // Get collection type
+               type_t type() const;
+               
+               // Get collection size
+               size_t size() const;
+
+               // Indexing operator
+               const xpath_node& operator[](size_t index) const;
+               
+               // Collection iterators
+               const_iterator begin() const;
+               const_iterator end() const;
+
+               // Sort the collection in ascending/descending order by document order
+               void sort(bool reverse = false);
+               
+               // Get first node in the collection by document order
+               xpath_node first() const;
+               
+               // Check if collection is empty
+               bool empty() const;
+       
+       private:
+               type_t _type;
+               
+               xpath_node _storage;
+               
+               xpath_node* _begin;
+               xpath_node* _end;
+
+               void _assign(const_iterator begin, const_iterator end);
+       };
+#endif
+
+#ifndef PUGIXML_NO_STL
+       // Convert wide string to UTF8
+       std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const wchar_t* str);
+       std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >& str);
+       
+       // Convert UTF8 to wide string
+       std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const char* str);
+       std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const std::basic_string<char, std::char_traits<char>, std::allocator<char> >& str);
+#endif
+
+       // Memory allocation function interface; returns pointer to allocated memory or NULL on failure
+       typedef void* (*allocation_function)(size_t size);
+       
+       // Memory deallocation function interface
+       typedef void (*deallocation_function)(void* ptr);
+
+       // Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions.
+       void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate);
+       
+       // Get current memory management functions
+       allocation_function PUGIXML_FUNCTION get_memory_allocation_function();
+       deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function();
+}
+
+#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC))
+namespace std
+{
+       // Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier)
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_node_iterator&);
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_attribute_iterator&);
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_named_node_iterator&);
+}
+#endif
+
+#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC)
+namespace std
+{
+       // Workarounds for (non-standard) iterator category detection
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_node_iterator&);
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_attribute_iterator&);
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_named_node_iterator&);
+}
+#endif
+
+#endif
+
+// Make sure implementation is included in header-only mode
+// Use macro expansion in #include to work around QMake (QTBUG-11923)
+#if defined(PUGIXML_HEADER_ONLY) && !defined(PUGIXML_SOURCE)
+#      define PUGIXML_SOURCE "pugixml.cpp"
+#      include PUGIXML_SOURCE
+#endif
+
+/**
+ * Copyright (c) 2006-2014 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/src/swig/Accuracy.i b/src/swig/Accuracy.i

new file mode 100644 (file)

index 0000000..1e6015c
--- /dev/null
+++ b/src/swig/Accuracy.i
@@ -0,0 +1,17 @@
+/* Accuracy.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/Accuracy.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+#ifdef SWIGPYTHON
+%rename(__float__) PacBio::BAM::Accuracy::operator float;
+#else // C#, R
+%rename(ToFloat) PacBio::BAM::Accuracy::operator float;
+#endif
+
+%include <pbbam/Accuracy.h>
+\ No newline at end of file
diff --git a/src/swig/AlignmentPrinter.i b/src/swig/AlignmentPrinter.i

new file mode 100644 (file)

index 0000000..6c4fc69
--- /dev/null
+++ b/src/swig/AlignmentPrinter.i
@@ -0,0 +1,11 @@
+/* AlignmentPrinter.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/AlignmentPrinter.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/AligmentPrinter.h>
+\ No newline at end of file
diff --git a/src/swig/BamFile.i b/src/swig/BamFile.i

new file mode 100644 (file)

index 0000000..4a429e9
--- /dev/null
+++ b/src/swig/BamFile.i
@@ -0,0 +1,18 @@
+/* BamFile.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/BamFile.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+#ifdef SWIGR
+%ignore PacBio::BAM::BamFile::BamFile(const BamFile&);
+#endif 
+
+%ignore PacBio::BAM::BamFile::BamFile(BamFile&&);
+%ignore PacBio::BAM::BamFile::operator=;
+
+%include <pbbam/BamFile.h>
diff --git a/src/swig/BamHeader.i b/src/swig/BamHeader.i

new file mode 100644 (file)

index 0000000..3572f04
--- /dev/null
+++ b/src/swig/BamHeader.i
@@ -0,0 +1,21 @@
+/* BamHeader.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/BamHeader.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+// Hide warnings about "internal" being a C# reserved word
+%warnfilter(314) PacBio::BAM::internal;
+
+%ignore PacBio::BAM::BamHeader::BamHeader(BamHeader&&);      // move ctors not used
+%ignore PacBio::BAM::BamHeader::operator=;                   // assignment operators not used
+
+%template(ProgramInfoList)   std::vector<PacBio::BAM::ProgramInfo>;
+%template(ReadGroupInfoList) std::vector<PacBio::BAM::ReadGroupInfo>;
+%template(SequenceInfoList)  std::vector<PacBio::BAM::SequenceInfo>;
+
+%include <pbbam/BamHeader.h>
diff --git a/src/swig/BamRecord.i b/src/swig/BamRecord.i

new file mode 100644 (file)

index 0000000..4b8cee6
--- /dev/null
+++ b/src/swig/BamRecord.i
@@ -0,0 +1,33 @@
+/* BamRecord.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/BamRecord.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+// Hide warnings about "internal" being a C# reserved word
+%warnfilter(314) PacBio::BAM::internal;
+
+// hide warnings about unused methods
+%ignore PacBio::BAM::BamRecord::BamRecord(BamRecordImpl&&);
+%ignore PacBio::BAM::BamRecord::BamRecord(BamRecord&&);
+%ignore PacBio::BAM::BamRecord::operator=;
+
+// ignore static methods, to allow member
+%ignore PacBio::BAM::BamRecord::Clipped(const BamRecord&, const ClipType, const PacBio::BAM::Position, const PacBio::BAM::Position);
+%ignore PacBio::BAM::BamRecord::Mapped(const BamRecord&, const int32_t, const Position, const Strand, const Cigar&, const uint8_t);
+
+// C# gets confused by the const and nonconst overloads
+%ignore PacBio::BAM::BamRecord::Impl() const;
+
+#if defined(SWIGR) || defined(SWIGPYTHON)
+%rename("EncodedPkmean") PacBio::BAM::BamRecord::Pkmean(const std::vector<uint16_t>&);
+%rename("EncodedPkmid")  PacBio::BAM::BamRecord::Pkmid(const std::vector<uint16_t>&);
+%rename("EncodedPkmean2") PacBio::BAM::BamRecord::Pkmean2(const std::vector<uint16_t>&);
+%rename("EncodedPkmid2")  PacBio::BAM::BamRecord::Pkmid2(const std::vector<uint16_t>&);
+#endif
+
+%include <pbbam/BamRecord.h>
diff --git a/src/swig/BamRecordBuilder.i b/src/swig/BamRecordBuilder.i

new file mode 100644 (file)

index 0000000..52e7690
--- /dev/null
+++ b/src/swig/BamRecordBuilder.i
@@ -0,0 +1,18 @@
+/* BamRecord.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/BamRecordBuilder.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%ignore PacBio::BAM::BamRecordBuilder::BamRecordBuilder(BamRecordBuilder&&);      // move ctors not used
+%ignore PacBio::BAM::BamRecordBuilder::operator=;
+
+%ignore PacBio::BAM::BamRecordBuilder::Reset(BamRecord&&);
+%ignore PacBio::BAM::BamRecordBuilder::Cigar(PacBio::BAM::Cigar&&);
+%ignore PacBio::BAM::BamRecordBuilder::Tags(TagCollection&&);
+
+%include <pbbam/BamRecordBuilder.h>
diff --git a/src/swig/BamRecordImpl.i b/src/swig/BamRecordImpl.i

new file mode 100644 (file)

index 0000000..2c8a48f
--- /dev/null
+++ b/src/swig/BamRecordImpl.i
@@ -0,0 +1,14 @@
+/* BamRecordImpl.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/BamRecordImpl.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%ignore PacBio::BAM::BamRecordImpl::BamRecordImpl(BamRecordImpl&&); 
+%ignore PacBio::BAM::BamRecordImpl::operator=;
+
+%include <pbbam/BamRecordImpl.h>
diff --git a/src/swig/BamRecordTag.i b/src/swig/BamRecordTag.i

new file mode 100644 (file)

index 0000000..a34b661
--- /dev/null
+++ b/src/swig/BamRecordTag.i
@@ -0,0 +1,11 @@
+/* BamRecordTag.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/BamRecordTag.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/BamRecordTag.h>
diff --git a/src/swig/BamTagCodec.i b/src/swig/BamTagCodec.i

new file mode 100644 (file)

index 0000000..4a4326b
--- /dev/null
+++ b/src/swig/BamTagCodec.i
@@ -0,0 +1,11 @@
+/* BamTagCodec.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/BamTagCodec.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/BamTagCodec.h>
+\ No newline at end of file
diff --git a/src/swig/BamWriter.i b/src/swig/BamWriter.i

new file mode 100644 (file)

index 0000000..dd23e5b
--- /dev/null
+++ b/src/swig/BamWriter.i
@@ -0,0 +1,15 @@
+/* BamWriter.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/BamWriter.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+ 
+%ignore PacBio::BAM::BamWriter(const BamWriter&);  // copy ctor not used
+%ignore PacBio::BAM::BamWriter(BamWriter&&);       // move ctor not used
+%ignore PacBio::BAM::BamWriter::operator=;         // assignment operators not used
+
+%include <pbbam/BamWriter.h>
diff --git a/src/swig/CMakeLists.txt b/src/swig/CMakeLists.txt

new file mode 100644 (file)

index 0000000..a8869c3
--- /dev/null
+++ b/src/swig/CMakeLists.txt
@@ -0,0 +1,68 @@
+# --------------------------------------------- @
+# SWIG
+# --------------------------------------------- @
+
+# general SWIG
+if(${wrapping_swig})
+
+    find_package(SWIG 3.0.5 REQUIRED)
+
+    include(${SWIG_USE_FILE})
+    include_directories(${PacBioBAM_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR})
+
+    #
+    # quash compiler warnings from SWIG-generated code
+    #
+    check_cxx_compiler_flag("-Wno-unused-local-typedefs" HAS_NO_UNUSED_BUT_SET_VARIABLE)
+    if(HAS_NO_UNUSED_BUT_SET_VARIABLE)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-but-set-variable")
+    endif()
+
+    check_cxx_compiler_flag("-Wno-dynamic-class-memaccess" HAS_NO_DYNAMIC_CLASS_MEMACCESS)
+    if (HAS_NO_DYNAMIC_CLASS_MEMACCESS)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-dynamic-class-memaccess")
+    endif()
+
+    check_cxx_compiler_flag("-Wno-unused-parameter" HAS_NO_UNUSED_PARAMETER)
+    if (HAS_NO_UNUSED_PARAMETER)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter")
+    endif()
+
+    check_cxx_compiler_flag("-Wno-return-local-addr" HAS_NO_RETURN_LOCAL_ADDR)
+    if (HAS_NO_RETURN_LOCAL_ADDR)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-return-local-addr")
+    endif()
+
+    check_cxx_compiler_flag("-Wno-return-type" HAS_NO_RETURN_TYPE)
+    if (HAS_NO_RETURN_TYPE)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-return-type")
+    endif()
+
+    #
+    # SWIG source file properties
+    #
+    set_source_files_properties(PacBioBam.i PROPERTIES CPLUSPLUS ON)
+
+    if (APPLE)
+    else()
+        if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+            set_source_files_properties(PacBioBam.i PROPERTIES SWIG_FLAGS "-DSWIGWORDSIZE64")
+        endif()
+    endif()
+
+endif()
+
+# Python
+if(PacBioBAM_wrap_python)
+    include(WrapPython.cmake)
+endif()
+
+# R
+if(PacBioBAM_wrap_r)
+    include(WrapR.cmake)
+endif()
+
+# CSharp
+if(PacBioBAM_wrap_csharp)
+    include(WrapCSharp.cmake)
+endif()
diff --git a/src/swig/Cigar.i b/src/swig/Cigar.i

new file mode 100644 (file)

index 0000000..2c5cf8c
--- /dev/null
+++ b/src/swig/Cigar.i
@@ -0,0 +1,16 @@
+/* Cigar.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/Cigar.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%template(CigarOpList) std::vector<PacBio::BAM::CigarOperation>;
+
+%ignore PacBio::BAM::Cigar::Cigar(Cigar&&);
+%ignore PacBio::BAM::Cigar::operator=;
+
+%include <pbbam/Cigar.h>
diff --git a/src/swig/CigarOperation.i b/src/swig/CigarOperation.i

new file mode 100644 (file)

index 0000000..0a23a17
--- /dev/null
+++ b/src/swig/CigarOperation.i
@@ -0,0 +1,47 @@
+/* CigarOperation.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/CigarOperation.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%ignore PacBio::BAM::CigarOperation::CigarOperation(CigarOperation&&);
+%ignore PacBio::BAM::CigarOperation::operator=;
+
+#ifdef SWIGR
+%ignore PacBio::BAM::CigarOperation::CigarOperation(CigarOperationType, uint32_t);
+#endif
+
+%include <pbbam/CigarOperation.h>
+
+// enums aren't always named consistently (at least between Mac/clang/swig & Linux/gcc/swig)
+// so, keep this after the main %include so client source can be consistent 
+#ifdef SWIGPYTHON
+%pythoncode %{
+try:
+       UNKNOWN_OP
+       ALIGNMENT_MATCH
+       INSERTION
+       DELETION
+       REFERENCE_SKIP
+       SOFT_CLIP
+       HARD_CLIP
+       PADDING
+       SEQUENCE_MATCH
+       SEQUENCE_MISMATCH
+except NameError:
+       UNKNOWN_OP = CigarOperationType_UNKNOWN_OP
+       ALIGNMENT_MATCH = CigarOperationType_ALIGNMENT_MATCH
+       INSERTION = CigarOperationType_INSERTION
+       DELETION = CigarOperationType_DELETION
+       REFERENCE_SKIP = CigarOperationType_REFERENCE_SKIP
+       SOFT_CLIP = CigarOperationType_SOFT_CLIP
+       HARD_CLIP = CigarOperationType_HARD_CLIP
+       PADDING = CigarOperationType_PADDING 
+       SEQUENCE_MATCH = CigarOperationType_SEQUENCE_MATCH 
+       SEQUENCE_MISMATCH = CigarOperationType_SEQUENCE_MISMATCH
+%}
+#endif
diff --git a/src/swig/ClipType.i b/src/swig/ClipType.i

new file mode 100644 (file)

index 0000000..350416c
--- /dev/null
+++ b/src/swig/ClipType.i
@@ -0,0 +1,11 @@
+/* ClipType.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/ClipType.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/ClipType.h>
diff --git a/src/swig/DataSet.i b/src/swig/DataSet.i

new file mode 100644 (file)

index 0000000..f8cba2b
--- /dev/null
+++ b/src/swig/DataSet.i
@@ -0,0 +1,49 @@
+/* DataSet.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/DataSet.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+// move ctors not used
+%ignore PacBio::BAM::DataSet::DataSet(DataSet&&);      
+
+// assignment operators not used
+%ignore PacBio::BAM::DataSet::operator=;                 
+
+#ifdef SWIGCSHARP
+
+// ignore non-const accessors
+%ignore PacBio::BAM::DataSet::Attribute(const std::string&);
+%ignore PacBio::BAM::DataSet::CreatedAt();
+%ignore PacBio::BAM::DataSet::Extensions();
+%ignore PacBio::BAM::DataSet::ExternalResources();
+%ignore PacBio::BAM::DataSet::Filters();
+%ignore PacBio::BAM::DataSet::Format();
+%ignore PacBio::BAM::DataSet::Metadata();
+%ignore PacBio::BAM::DataSet::MetaType();
+%ignore PacBio::BAM::DataSet::ModifiedAt();
+%ignore PacBio::BAM::DataSet::Name();
+%ignore PacBio::BAM::DataSet::Namespaces();
+%ignore PacBio::BAM::DataSet::ResourceId();
+%ignore PacBio::BAM::DataSet::SubDataSets();
+%ignore PacBio::BAM::DataSet::Tags();
+%ignore PacBio::BAM::DataSet::TimeStampedName();
+%ignore PacBio::BAM::DataSet::UniqueId();
+%ignore PacBio::BAM::DataSet::Version();
+
+// disable operator(s)
+%ignore PacBio::BAM::DataSet::operator+=;
+
+#endif // C#
+
+#ifdef SWIGR
+%ignore PacBio::BAM::DataSet::DataSet(const DataSet::TypeEnum type);
+/*%ignore PacBio::BAM::DataSet::DataSet(const BamFile& bamFile);*/
+#endif // R
+
+
+%include <pbbam/DataSet.h>
diff --git a/src/swig/DataSetTypes.i b/src/swig/DataSetTypes.i

new file mode 100644 (file)

index 0000000..5644d3f
--- /dev/null
+++ b/src/swig/DataSetTypes.i
@@ -0,0 +1,121 @@
+/* DataSetTypes.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/internal/DataSetElement.h>
+#include <pbbam/internal/DataSetListElement.h>
+#include <pbbam/internal/DataSetBaseTypes.h>
+#include <pbbam/DataSetTypes.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+%}
+
+%ignore PacBio::BAM::internal::DataSetElement::DataSetElement(DataSetElement&&);
+%ignore PacBio::BAM::internal::DataSetElement::operator=;
+%ignore PacBio::BAM::internal::DataSetElement::operator[];
+/*%rename(__getitem__) PacBio::BAM::internal::DataSetElement::operator[];*/
+
+%ignore PacBio::BAM::internal::XmlName::XmlName(XmlName&&);
+%ignore PacBio::BAM::internal::XmlName::operator=;
+
+#ifdef SWIGCSHARP
+
+// ignore non-const accessors
+%ignore PacBio::BAM::DataSetBase::ExternalResources();
+%ignore PacBio::BAM::DataSetBase::Filters();
+%ignore PacBio::BAM::DataSetBase::Metadata();
+%ignore PacBio::BAM::DataSetBase::Namespaces();
+%ignore PacBio::BAM::DataSetBase::SubDataSets();
+%ignore PacBio::BAM::DataSetMetadata::NumRecords();
+%ignore PacBio::BAM::DataSetMetadata::Provenance();
+%ignore PacBio::BAM::DataSetMetadata::TotalLength();
+%ignore PacBio::BAM::ExternalResource::ExternalResources();
+%ignore PacBio::BAM::Filter::Properties();
+%ignore PacBio::BAM::Property::Name();
+%ignore PacBio::BAM::Property::Operator();
+%ignore PacBio::BAM::Property::Value();
+%ignore PacBio::BAM::Provenance::CreatedBy();
+%ignore PacBio::BAM::Provenance::CommonServicesInstanceId();
+%ignore PacBio::BAM::Provenance::CreatorUserId();
+%ignore PacBio::BAM::Provenance::ParentJobId();
+%ignore PacBio::BAM::Provenance::ParentTool();
+%ignore PacBio::BAM::internal::BaseEntityType::Description();
+%ignore PacBio::BAM::internal::BaseEntityType::Extensions();
+%ignore PacBio::BAM::internal::BaseEntityType::Format();
+%ignore PacBio::BAM::internal::BaseEntityType::ModifiedAt();
+%ignore PacBio::BAM::internal::BaseEntityType::Name();
+%ignore PacBio::BAM::internal::BaseEntityType::ResourceId();
+%ignore PacBio::BAM::internal::BaseEntityType::Tags();
+%ignore PacBio::BAM::internal::BaseEntityType::Version();
+%ignore PacBio::BAM::internal::DataEntityType::Checksum();
+%ignore PacBio::BAM::internal::DataEntityType::EncodedValue();
+%ignore PacBio::BAM::internal::DataEntityType::MetaType();
+%ignore PacBio::BAM::internal::DataEntityType::SimpleValue();
+%ignore PacBio::BAM::internal::DataEntityType::TimeStampedName();
+%ignore PacBio::BAM::internal::DataEntityType::UniqueId();
+%ignore PacBio::BAM::internal::DataEntityType::ValueDataType();
+%ignore PacBio::BAM::internal::DataSetElement::Attribute(const std::string&);
+%ignore PacBio::BAM::internal::DataSetElement::Attributes();
+%ignore PacBio::BAM::internal::DataSetElement::Children();
+%ignore PacBio::BAM::internal::DataSetElement::ChildText(const std::string&);
+%ignore PacBio::BAM::internal::DataSetElement::CreatedAt();
+%ignore PacBio::BAM::internal::DataSetElement::Text();
+%ignore PacBio::BAM::internal::IndexedDataType::FileIndices();
+%ignore PacBio::BAM::internal::StrictEntityType::MetaType();
+%ignore PacBio::BAM::internal::StrictEntityType::TimeStampedName();
+%ignore PacBio::BAM::internal::StrictEntityType::UniqueId();
+
+// disable operator(s)
+%ignore PacBio::BAM::DataSetMetadata::operator+=;
+%ignore PacBio::BAM::ExternalResources::operator+=;
+%ignore PacBio::BAM::Filters::operator+=;
+%ignore PacBio::BAM::DataSetBase::operator+=;
+%ignore PacBio::BAM::SubDataSets::operator+=;
+
+#endif // C#
+
+%include <pbbam/internal/DataSetElement.h>
+
+%ignore PacBio::BAM::internal::DataSetElementList::operator[];
+%ignore PacBio::BAM::internal::DataSetListIterator::operator++;
+%ignore PacBio::BAM::internal::DataSetListConstIterator::operator++;
+
+%include <pbbam/internal/DataSetListElement.h>
+
+%template(ExtensionListElement)        PacBio::BAM::internal::DataSetListElement<PacBio::BAM::ExtensionElement>;
+%template(ExternalResourceListElement) PacBio::BAM::internal::DataSetListElement<PacBio::BAM::ExternalResource>;
+%template(FileIndexListElement)        PacBio::BAM::internal::DataSetListElement<PacBio::BAM::FileIndex>;
+%template(FilterListElement)           PacBio::BAM::internal::DataSetListElement<PacBio::BAM::Filter>;
+%template(PropertyListElement)         PacBio::BAM::internal::DataSetListElement<PacBio::BAM::Property>;
+%template(SubDataSetListElement)       PacBio::BAM::internal::DataSetListElement<PacBio::BAM::DataSetBase>;
+
+%extend PacBio::BAM::internal::DataSetListElement<PacBio::BAM::ExtensionElement> {
+    PacBio::BAM::ExtensionElement& __getitem__(unsigned int i)       { return $self->Child<PacBio::BAM::ExtensionElement>(i); }
+    PacBio::BAM::ExtensionElement& __getitem__(const std::string& s) { return $self->Child<PacBio::BAM::ExtensionElement>(s); }
+}
+%extend PacBio::BAM::internal::DataSetListElement<PacBio::BAM::ExternalResource> {
+    PacBio::BAM::ExternalResource& __getitem__(unsigned int i)       { return $self->Child<PacBio::BAM::ExternalResource>(i); }
+    PacBio::BAM::ExternalResource& __getitem__(const std::string& s) { return $self->Child<PacBio::BAM::ExternalResource>(s); }
+}
+%extend PacBio::BAM::internal::DataSetListElement<PacBio::BAM::FileIndex> {
+    PacBio::BAM::FileIndex& __getitem__(unsigned int i)       { return $self->Child<PacBio::BAM::FileIndex>(i);}
+    PacBio::BAM::FileIndex& __getitem__(const std::string& s) { return $self->Child<PacBio::BAM::FileIndex>(s);}
+}
+%extend PacBio::BAM::internal::DataSetListElement<PacBio::BAM::Filter> {
+    PacBio::BAM::Filter& __getitem__(unsigned int i)       { return $self->Child<PacBio::BAM::Filter>(i); }
+    PacBio::BAM::Filter& __getitem__(const std::string& s) { return $self->Child<PacBio::BAM::Filter>(s); }
+}
+%extend PacBio::BAM::internal::DataSetListElement<PacBio::BAM::Property> {
+    PacBio::BAM::Property& __getitem__(unsigned int i)       { return $self->Child<PacBio::BAM::Property>(i); }
+    PacBio::BAM::Property& __getitem__(const std::string& s) { return $self->Child<PacBio::BAM::Property>(s); }
+}
+%extend PacBio::BAM::internal::DataSetListElement<PacBio::BAM::DataSetBase> {
+    PacBio::BAM::DataSetBase& __getitem__(unsigned int i)       { return $self->Child<PacBio::BAM::DataSetBase>(i); }
+    PacBio::BAM::DataSetBase& __getitem__(const std::string& s) { return $self->Child<PacBio::BAM::DataSetBase>(s); }
+}
+
+%include <pbbam/internal/DataSetBaseTypes.h>
+%include <pbbam/DataSetTypes.h>
diff --git a/src/swig/EntireFileQuery.i b/src/swig/EntireFileQuery.i

new file mode 100644 (file)

index 0000000..c7c0b06
--- /dev/null
+++ b/src/swig/EntireFileQuery.i
@@ -0,0 +1,15 @@
+/* EntireFileQuery.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/DataSet.h>
+#include <pbbam/internal/QueryBase.h>
+#include <pbbam/EntireFileQuery.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/DataSet.h>
+%include <pbbam/internal/QueryBase.h>
+%include <pbbam/EntireFileQuery.h>
diff --git a/src/swig/FrameEncodingType.i b/src/swig/FrameEncodingType.i

new file mode 100644 (file)

index 0000000..1bf1552
--- /dev/null
+++ b/src/swig/FrameEncodingType.i
@@ -0,0 +1,11 @@
+/* FrameEncodingType.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/FrameEncodingType.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/FrameEncodingType.h>
diff --git a/src/swig/Frames.i b/src/swig/Frames.i

new file mode 100644 (file)

index 0000000..d3e001a
--- /dev/null
+++ b/src/swig/Frames.i
@@ -0,0 +1,19 @@
+/* Frames.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/Frames.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%ignore PacBio::BAM::Frames::Frames(Frames&&);
+%ignore PacBio::BAM::Frames::Frames(std::vector<uint16_t>&&);
+%ignore PacBio::BAM::Frames::operator=;
+%ignore PacBio::BAM::Frames::Data(std::vector<uint16_t>&&);
+
+%template(UInt8List)  std::vector<uint8_t>;
+%template(UInt16List) std::vector<uint16_t>;
+
+%include <pbbam/Frames.h>
diff --git a/src/swig/GenomicInterval.i b/src/swig/GenomicInterval.i

new file mode 100644 (file)

index 0000000..199a3c3
--- /dev/null
+++ b/src/swig/GenomicInterval.i
@@ -0,0 +1,13 @@
+/* GenomicInterval.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/GenomicInterval.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%ignore PacBio::BAM::GenomicInterval::operator=;
+
+%include <pbbam/GenomicInterval.h>
diff --git a/src/swig/GenomicIntervalQuery.i b/src/swig/GenomicIntervalQuery.i

new file mode 100644 (file)

index 0000000..d3f9fa7
--- /dev/null
+++ b/src/swig/GenomicIntervalQuery.i
@@ -0,0 +1,11 @@
+/* GenomicIntervalQuery.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/GenomicIntervalQuery.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/GenomicIntervalQuery.h>
diff --git a/src/swig/GroupQuery.i b/src/swig/GroupQuery.i

new file mode 100644 (file)

index 0000000..3128e15
--- /dev/null
+++ b/src/swig/GroupQuery.i
@@ -0,0 +1,11 @@
+/* GroupQuery.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/GroupQuery.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/GroupQuery.h>
+\ No newline at end of file
diff --git a/src/swig/GroupQueryBase.i b/src/swig/GroupQueryBase.i

new file mode 100644 (file)

index 0000000..ade4526
--- /dev/null
+++ b/src/swig/GroupQueryBase.i
@@ -0,0 +1,32 @@
+/* GroupQueryBase.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/GroupQueryBase.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%ignore PacBio::BAM::GroupQueryIterator::operator++;
+%ignore PacBio::BAM::GroupQueryConstIterator::operator++;
+
+%include <pbbam/GroupQueryBase.h>
+
+%extend PacBio::BAM::GroupQueryIterator
+{
+    PacBio::BAM::GroupQueryIterator& incr(void)
+    { return $self->operator++(); }
+
+    std::vector<PacBio::BAM::BamRecord>* value(void)
+    { return $self->operator->(); }
+}
+
+%extend PacBio::BAM::GroupQueryConstIterator
+{
+    PacBio::BAM::GroupQueryConstIterator& incr(void)
+    { return $self->operator++(); }
+
+    const std::vector<PacBio::BAM::BamRecord>* value(void) const
+    { return $self->operator->(); }
+}
+\ No newline at end of file
diff --git a/src/swig/IRecordWriter.i b/src/swig/IRecordWriter.i

new file mode 100644 (file)

index 0000000..a64b083
--- /dev/null
+++ b/src/swig/IRecordWriter.i
@@ -0,0 +1,9 @@
+/* IRecordWriter.i */
+%module PacBioBam
+%{
+#include <pbbam/IRecordWriter.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/IRecordWriter.h>
diff --git a/src/swig/IndexedFastaReader.i b/src/swig/IndexedFastaReader.i

new file mode 100644 (file)

index 0000000..25ef650
--- /dev/null
+++ b/src/swig/IndexedFastaReader.i
@@ -0,0 +1,13 @@
+/* IndexedFastaReader.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/IndexedFastaReader.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%ignore PacBio::BAM::IndexedFastaReader::operator=; // assignment operators not used
+
+%include <pbbam/IndexedFastaReader.h>
diff --git a/src/swig/Interval.i b/src/swig/Interval.i

new file mode 100644 (file)

index 0000000..0867eb4
--- /dev/null
+++ b/src/swig/Interval.i
@@ -0,0 +1,12 @@
+/* Interval.i */
+%module PacBioBam
+%{
+#include <pbbam/Interval.h>
+#include <pbbam/Position.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/Interval.h>
+
+%template(PositionInterval) PacBio::BAM::Interval<PacBio::BAM::Position>;
diff --git a/src/swig/LocalContextFlags.i b/src/swig/LocalContextFlags.i

new file mode 100644 (file)

index 0000000..66ee990
--- /dev/null
+++ b/src/swig/LocalContextFlags.i
@@ -0,0 +1,15 @@
+/* LocalContextFlags.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/LocalContextFlags.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+#ifdef SWIGCSHARP
+%ignore operator|(const LocalContextFlags, const LocalContextFlags);
+#endif
+
+%include <pbbam/LocalContextFlags.h>
diff --git a/src/swig/Orientation.i b/src/swig/Orientation.i

new file mode 100644 (file)

index 0000000..2f10a7a
--- /dev/null
+++ b/src/swig/Orientation.i
@@ -0,0 +1,11 @@
+/* Orientation.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/Orientation.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/Orientation.h>
+\ No newline at end of file
diff --git a/src/swig/PacBio.BAM.csproj.in b/src/swig/PacBio.BAM.csproj.in

new file mode 100644 (file)

index 0000000..61872db
--- /dev/null
+++ b/src/swig/PacBio.BAM.csproj.in
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <ProjectGuid>{6E414044-5469-48E4-BA14-1B9888875DD5}</ProjectGuid>
+    <OutputType>Library</OutputType>
+    <RootNamespace>PacBio.BAM</RootNamespace>
+    <AssemblyName>PacBio.BAM</AssemblyName>
+    <TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+    <DebugSymbols>true</DebugSymbols>
+    <DebugType>full</DebugType>
+    <Optimize>false</Optimize>
+    <OutputPath>bin\Debug</OutputPath>
+    <DefineConstants>DEBUG;</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+    <ConsolePause>false</ConsolePause>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+    <DebugType>full</DebugType>
+    <Optimize>true</Optimize>
+    <OutputPath>bin\Release</OutputPath>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+    <ConsolePause>false</ConsolePause>
+  </PropertyGroup>
+  <ItemGroup>
+    <Reference Include="System" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="*.cs" />
+  </ItemGroup>
+  <Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
+</Project>
diff --git a/src/swig/PacBioBam.i b/src/swig/PacBioBam.i

new file mode 100644 (file)

index 0000000..213387f
--- /dev/null
+++ b/src/swig/PacBioBam.i
@@ -0,0 +1,161 @@
+/* pbbam.i */
+%module PacBioBam
+%{
+
+/*ifdef SWIGR
+ define SWIG_SHARED_PTR_NAMESPACE boost
+ define SWIG_SHARED_PTR_SUBNAMESPACE
+endif*/
+
+#include <pbbam/Config.h>
+#include <string>
+#include <vector>
+%}
+
+#define SWIG_FILE_WITH_INIT
+#define PBBAM_EXPORT
+
+#ifdef SWIGCSHARP
+%rename(Equals)        *::operator==;
+%rename(ToBool)        *::operator bool;
+%rename(ToInt)         *::operator int;
+%rename(ToUint8)       *::operator uint8_t;
+
+%ignore *::operator !=;
+
+// Iterator interfaces are not useful outside of C++
+%ignore *::begin;
+%ignore *::end;
+
+%csmethodmodifiers *::ToString() const "public override";
+
+#endif // SWIGCSHARP
+
+/********* SWIG includes ************/
+
+%include "stdint.i"
+%include "std_common.i"
+
+#ifdef SWIGR
+%include "boost_shared_ptr.i"
+#else
+%include "std_shared_ptr.i"
+#endif
+
+%include "std_map.i"
+%include "std_pair.i"
+%include "std_string.i"
+%include "std_vector.i"
+
+ // TODO: can we call these vectors!?
+%template(StringList) std::vector<std::string>;
+%template(IntList)    std::vector<int>;
+%template(UIntList)   std::vector<unsigned int>;
+%template(FloatList)  std::vector<float>;
+%template(ShortList)  std::vector<short>;
+%template(CharList)   std::vector<char>;
+
+// exception handling
+%include "exception.i"
+%exception {
+    try {
+        $action
+    } catch (const std::exception& e) {
+        SWIG_exception(SWIG_RuntimeError, e.what());
+    }
+}
+
+/********* PacBioBAM includes ************/
+
+#ifdef SWIGCSHARP
+ // Renames to play nice with C#
+ // (These are used in the dataset support code, where things like
+ //  this happen in C++:
+ //
+ //    void Extensions(Extensions x) { ... }
+ //
+ //  and this poses problems for C#.  Renaming should be fine
+ //  as it is doubtful we will refer to these classes by name anyway.)
+ //
+%rename(ExtensionsType)        PacBio::BAM::Extensions;
+%rename(ExternalResourcesType) PacBio::BAM::ExternalResources;
+%rename(FiltersType)           PacBio::BAM::Filters;
+%rename(SubDataSetsType)       PacBio::BAM::SubDataSets;
+%rename(ProvenanceType)        PacBio::BAM::Provenance;
+%rename(PropertiesType)        PacBio::BAM::Properties;
+%rename(FileIndicesType)       PacBio::BAM::FileIndices;
+%rename(ParentToolType)        PacBio::BAM::ParentTool;
+%rename(CigarType)             PacBio::BAM::Cigar;
+#endif
+
+// Basic types
+%include "Accuracy.i"
+%include "BamRecordTag.i"
+%include "CigarOperation.i"
+%include "ClipType.i"
+%include "FrameEncodingType.i"
+%include "Interval.i"
+%include "LocalContextFlags.i"
+%include "Orientation.i"
+%include "Position.i"
+%include "PulseBehavior.i"
+%include "QualityValue.i"
+%include "RecordType.i"
+%include "Strand.i"
+%include "Tag.i"
+
+// Basic type aggregates
+%include "Cigar.i"
+%include "GenomicInterval.i"
+%include "QualityValues.i"
+%include "TagCollection.i"
+
+// keep this guy after the other basic types, hacky but works
+%include "Frames.i"
+
+// Header API 
+%include "ProgramInfo.i"
+%include "ReadGroupInfo.i"
+%include "SequenceInfo.i"
+%include "BamHeader.i"
+
+// SAM/BAM format
+%include "IRecordWriter.i"
+%include "BamFile.i"
+%include "BamRecordImpl.i"
+%include "BamRecord.i"
+%include "BamRecordBuilder.i"
+%include "BamTagCodec.i"
+%include "BamWriter.i"
+%include "SamTagCodec.i"
+%include "SamWriter.i"
+
+// DataSet
+%include "DataSetTypes.i"
+%include "DataSet.i"
+
+// Query/iterator API
+%include "QueryBase.i"
+%include "EntireFileQuery.i"
+%include "GenomicIntervalQuery.i"
+%include "ZmwQuery.i"
+%include "ZmwGroupQuery.i"
+
+// PBI
+%include "PbiFile.i"
+%include "PbiRawData.i"
+%include "PbiIndex.i"
+
+// FASTA
+%include "IndexedFastaReader.i"
+
+// Virtual record API
+%include "VirtualRegion.i"
+%include "VirtualZmwBamRecord.i"
+%include "ZmwReadStitcher.i"
+%include "WhitelistedZmwReadStitcher.i"
+
+// Virtual record API - deprecated
+%include "VirtualPolymeraseBamRecord.i"
+%include "VirtualPolymeraseReader.i"
+%include "ZmwWhitelistVirtualReader.i"
diff --git a/src/swig/PbiFile.i b/src/swig/PbiFile.i

new file mode 100644 (file)

index 0000000..096ff10
--- /dev/null
+++ b/src/swig/PbiFile.i
@@ -0,0 +1,11 @@
+/* PbiFile.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/PbiFile.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/PbiFile.h>
+\ No newline at end of file
diff --git a/src/swig/PbiIndex.i b/src/swig/PbiIndex.i

new file mode 100644 (file)

index 0000000..d903f67
--- /dev/null
+++ b/src/swig/PbiIndex.i
@@ -0,0 +1,18 @@
+/* PbiIndex.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/PbiIndex.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+/*%ignore PacBio::BAM::IndexResultBlock::IndexResultBlock();*/
+%ignore PacBio::BAM::IndexResultBlock::IndexResultBlock(size_t, size_t);
+
+%ignore PacBio::BAM::PbiIndex::PbiIndex(PbiIndex&&);      // move ctors not used
+%ignore PacBio::BAM::PbiIndex::operator=;                 // assignment operators not used
+%ignore PacBio::BAM::PbiIndeX::VirtualFileOffsets; 
+
+%include <pbbam/PbiIndex.h>
diff --git a/src/swig/PbiRawData.i b/src/swig/PbiRawData.i

new file mode 100644 (file)

index 0000000..3db9ece
--- /dev/null
+++ b/src/swig/PbiRawData.i
@@ -0,0 +1,35 @@
+/* PbiRawData.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/PbiRawData.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+// move ctors not used
+%ignore PacBio::BAM::PbiRawBarcodeData::PbiRawBarcodeData(PbiRawBarcodeData&&);   
+%ignore PacBio::BAM::PbiRawMappedData::PbiRawMappedData(PbiRawMappedData&&);
+%ignore PacBio::BAM::PbiReferenceEntry::PbiReferenceEntry(PbiReferenceEntry&&);
+%ignore PacBio::BAM::PbiRawReferenceData::PbiRawReferenceData(PbiRawReferenceData&&);
+%ignore PacBio::BAM::PbiRawBasicData::PbiRawBasicData(PbiRawBasicData&&);
+%ignore PacBio::BAM::PbiRawData::PbiRawData(PbiRawData&&); 
+
+// assignment operators not used
+%ignore PacBio::BAM::PbiRawBarcodeData::operator=;                   
+%ignore PacBio::BAM::PbiRawMappedData::operator=;
+%ignore PacBio::BAM::PbiReferenceEntry::operator=;
+%ignore PacBio::BAM::PbiRawReferenceData::operator=;
+%ignore PacBio::BAM::PbiRawBasicData::operator=;
+%ignore PacBio::BAM::PbiRawData::operator=;
+
+#ifdef SWIGCSHARP
+// ignore non-const accessors
+%ignore PacBio::BAM::PbiRawData::BarcodeData();
+%ignore PacBio::BAM::PbiRawData::MappedData();
+%ignore PacBio::BAM::PbiRawData::ReferenceData();
+%ignore PacBio::BAM::PbiRawData::BasicData();
+#endif // C#
+
+%include <pbbam/PbiRawData.h>
diff --git a/src/swig/Position.i b/src/swig/Position.i

new file mode 100644 (file)

index 0000000..9917024
--- /dev/null
+++ b/src/swig/Position.i
@@ -0,0 +1,11 @@
+/* Position.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/Position.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/Position.h>
diff --git a/src/swig/ProgramInfo.i b/src/swig/ProgramInfo.i

new file mode 100644 (file)

index 0000000..9f2a2aa
--- /dev/null
+++ b/src/swig/ProgramInfo.i
@@ -0,0 +1,15 @@
+/* ProgramInfo.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/ProgramInfo.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%ignore PacBio::BAM::ProgramInfo::ProgramInfo(ProgramInfo&&);
+%ignore PacBio::BAM::ProgramInfo::operator=;
+%ignore PacBio::BAM::ProgramInfo::ToSam(const ProgramInfo&);    // ignore static method, to allow member
+
+%include <pbbam/ProgramInfo.h>
+\ No newline at end of file
diff --git a/src/swig/PulseBehavior.i b/src/swig/PulseBehavior.i

new file mode 100644 (file)

index 0000000..65d90f0
--- /dev/null
+++ b/src/swig/PulseBehavior.i
@@ -0,0 +1,11 @@
+/* PulseBehavior.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/PulseBehavior.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/PulseBehavior.h>
diff --git a/src/swig/QualityValue.i b/src/swig/QualityValue.i

new file mode 100644 (file)

index 0000000..29874e1
--- /dev/null
+++ b/src/swig/QualityValue.i
@@ -0,0 +1,20 @@
+/* QualityValue.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/QualityValue.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%ignore PacBio::BAM::QualityValue::operator=;
+
+#ifdef SWIGPYTHON
+%rename(__int__) PacBio::BAM::QualityValue::operator uint8_t;
+#else // R, C#
+%rename(ToInt) PacBio::BAM::QualityValue::operator uint8_t;
+#endif
+
+%include <pbbam/QualityValue.h>
+
diff --git a/src/swig/QualityValues.i b/src/swig/QualityValues.i

new file mode 100644 (file)

index 0000000..815a1b2
--- /dev/null
+++ b/src/swig/QualityValues.i
@@ -0,0 +1,17 @@
+/* QualityValues.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/QualityValues.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%template(QualityValueList) std::vector<PacBio::BAM::QualityValue>;
+
+%ignore PacBio::BAM::QualityValues::operator=;
+%ignore PacBio::BAM::QualityValues::QualityValues(QualityValues&&);
+%ignore PacBio::BAM::QualityValues::QualityValues(std::vector<QualityValue>&&);
+
+%include <pbbam/QualityValues.h>
diff --git a/src/swig/QueryBase.i b/src/swig/QueryBase.i

new file mode 100644 (file)

index 0000000..f2220d9
--- /dev/null
+++ b/src/swig/QueryBase.i
@@ -0,0 +1,127 @@
+/* QueryBase.i */
+
+%module PacBioBam
+
+%{
+
+#include <pbbam/internal/QueryBase.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+
+%ignore PacBio::BAM::QueryIterator::operator++;
+%ignore PacBio::BAM::QueryConstIterator::operator++;
+
+%ignore PacBio::BAM::internal::QueryIterator::operator++;
+%ignore PacBio::BAM::internal::QueryConstIterator::operator++;
+
+%typemap(csinterfaces) PacBio::BAM::internal::QueryBase<BamRecord>  "global::System.Collections.IEnumerable\n, global::System.Collections.Generic.IEnumerable<PacBio.BAM.BamRecord>\n";
+%typemap(cscode) PacBio::BAM::internal::QueryBase<BamRecord>
+%{
+
+    public global::System.Collections.Generic.IEnumerator<PacBio.BAM.BamRecord> GetEnumerator()
+    {
+        var i = this.cbegin();
+        var e = this.cend();
+        while (!i.Equals(e))
+        {
+            yield return i.value();
+            i.incr();
+        }
+    }
+
+    global::System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
+    {
+        return GetEnumerator();
+    }
+
+%}
+
+namespace std {
+    %template(BamRecordList) std::vector<PacBio::BAM::BamRecord>;
+}
+
+%typemap(csinterfaces) PacBio::BAM::internal::QueryBase<std::vector<BamRecord> >  "global::System.Collections.IEnumerable\n, global::System.Collections.Generic.IEnumerable<BamRecordList>\n";
+%typemap(cscode) PacBio::BAM::internal::QueryBase<std::vector<BamRecord> >
+%{
+
+    public global::System.Collections.Generic.IEnumerator<BamRecordList> GetEnumerator()
+    {
+        var i = this.cbegin();
+        var e = this.cend();
+        while (!i.Equals(e))
+        {
+            yield return i.value();
+            i.incr();
+        }
+    }
+
+    global::System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
+    {
+        return GetEnumerator();
+    }
+
+%}
+
+%include <pbbam/internal/QueryBase.h>
+
+%template(IQuery)      PacBio::BAM::internal::QueryBase<BamRecord>;
+%template(IGroupQuery) PacBio::BAM::internal::QueryBase<std::vector<BamRecord> >;
+
+// IEnumerable<BamRecord> interfaces for Queries
+%template(BamQueryIteratorBase)       PacBio::BAM::internal::QueryIteratorBase<BamRecord>;
+%template(BamGroupQueryIteratorBase)  PacBio::BAM::internal::QueryIteratorBase<std::vector<BamRecord> >;
+%template(BamQueryIterator)           PacBio::BAM::internal::QueryIterator<BamRecord>;
+%template(BamGroupQueryIterator)      PacBio::BAM::internal::QueryIterator<std::vector<BamRecord> >;
+%template(BamQueryConstIterator)      PacBio::BAM::internal::QueryConstIterator<BamRecord>;
+%template(BamGroupQueryConstIterator) PacBio::BAM::internal::QueryConstIterator<std::vector<BamRecord> >;
+
+// Iterator API
+#ifdef SWIGPYTHON
+%pythoncode %{
+def Iterate(c):
+       i = c.begin()
+       e = c.end()
+       while i != e:
+               yield i.value()
+               i.incr()
+%}
+#endif
+
+%extend PacBio::BAM::internal::QueryIterator<BamRecord>
+{
+        PacBio::BAM::internal::QueryIterator<BamRecord>& incr(void)
+       { return $self->operator++(); }
+       
+       PacBio::BAM::BamRecord* value(void)
+       { return $self->operator->(); }
+}
+
+%extend PacBio::BAM::internal::QueryConstIterator<BamRecord>
+{
+        PacBio::BAM::internal::QueryConstIterator<BamRecord>& incr(void)
+       { return $self->operator++(); }
+       
+       const PacBio::BAM::BamRecord* value(void) const
+       { return $self->operator->(); }
+}
+
+%extend PacBio::BAM::internal::QueryIterator<std::vector<BamRecord> >
+{
+        PacBio::BAM::internal::QueryIterator<std::vector<BamRecord> >& incr(void)
+       { return $self->operator++(); }
+       
+       std::vector<PacBio::BAM::BamRecord>* value(void)
+       { return $self->operator->(); }
+}
+
+%extend PacBio::BAM::internal::QueryConstIterator<std::vector<BamRecord> >
+{
+        PacBio::BAM::internal::QueryConstIterator<std::vector<BamRecord> >& incr(void)
+       { return $self->operator++(); }
+       
+       const std::vector<PacBio::BAM::BamRecord>* value(void) const
+       { return $self->operator->(); }
+}
diff --git a/src/swig/ReadGroupInfo.i b/src/swig/ReadGroupInfo.i

new file mode 100644 (file)

index 0000000..a02deda
--- /dev/null
+++ b/src/swig/ReadGroupInfo.i
@@ -0,0 +1,15 @@
+/* ReadGroupInfo.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/ReadGroupInfo.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%ignore PacBio::BAM::ReadGroupInfo::ReadGroupInfo(ReadGroupInfo&&);
+%ignore PacBio::BAM::ReadGroupInfo::operator=;
+%ignore PacBio::BAM::ReadGroupInfo::ToSam(const ReadGroupInfo&); 
+
+%include <pbbam/ReadGroupInfo.h>
+\ No newline at end of file
diff --git a/src/swig/RecordType.i b/src/swig/RecordType.i

new file mode 100644 (file)

index 0000000..ef7947e
--- /dev/null
+++ b/src/swig/RecordType.i
@@ -0,0 +1,11 @@
+/* RecordType.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/RecordType.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/RecordType.h>
diff --git a/src/swig/SamTagCodec.i b/src/swig/SamTagCodec.i

new file mode 100644 (file)

index 0000000..320d488
--- /dev/null
+++ b/src/swig/SamTagCodec.i
@@ -0,0 +1,11 @@
+/* SamTagCodec.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/SamTagCodec.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/SamTagCodec.h>
+\ No newline at end of file
diff --git a/src/swig/SamWriter.i b/src/swig/SamWriter.i

new file mode 100644 (file)

index 0000000..f01cec4
--- /dev/null
+++ b/src/swig/SamWriter.i
@@ -0,0 +1,15 @@
+/* SamWriter.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/SamWriter.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%ignore PacBio::BAM::SamWriter(const SamWriter&);  // copy ctor not used
+%ignore PacBio::BAM::SamWriter(SamWriter&&);       // move ctor not used
+%ignore PacBio::BAM::SamWriter::operator=;         // assignment operators not used
+
+%include <pbbam/SamWriter.h>
diff --git a/src/swig/SequenceInfo.i b/src/swig/SequenceInfo.i

new file mode 100644 (file)

index 0000000..3b0ce67
--- /dev/null
+++ b/src/swig/SequenceInfo.i
@@ -0,0 +1,15 @@
+/* SequenceInfo.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/SequenceInfo.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%ignore PacBio::BAM::SequenceInfo::SequenceInfo(SequenceInfo&&);
+%ignore PacBio::BAM::SequenceInfo::operator=;
+%ignore PacBio::BAM::SequenceInfo::ToSam(const SequenceInfo&);    // ignore static method, to allow member
+
+%include <pbbam/SequenceInfo.h>
+\ No newline at end of file
diff --git a/src/swig/Strand.i b/src/swig/Strand.i

new file mode 100644 (file)

index 0000000..96f71f1
--- /dev/null
+++ b/src/swig/Strand.i
@@ -0,0 +1,11 @@
+/* Strand.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/Strand.h>      
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/Strand.h>
+\ No newline at end of file
diff --git a/src/swig/Tag.i b/src/swig/Tag.i

new file mode 100644 (file)

index 0000000..832c856
--- /dev/null
+++ b/src/swig/Tag.i
@@ -0,0 +1,114 @@
+/* Tag.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/Tag.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%ignore PacBio::BAM::Tag::Tag(Tag&&);
+%ignore PacBio::BAM::Tag::operator=;
+
+#if defined(SWIGR) || defined(SWIGPYTHON)
+
+%ignore PacBio::BAM::Tag::Tag(int8_t value);
+%ignore PacBio::BAM::Tag::Tag(uint8_t value);
+%ignore PacBio::BAM::Tag::Tag(int16_t value);
+%ignore PacBio::BAM::Tag::Tag(uint16_t value);
+%ignore PacBio::BAM::Tag::Tag(int32_t value);
+%ignore PacBio::BAM::Tag::Tag(uint32_t value);
+%ignore PacBio::BAM::Tag::Tag(float value);
+%ignore PacBio::BAM::Tag::Tag(const std::string& value);
+%ignore PacBio::BAM::Tag::Tag(const std::vector<int8_t>& value);
+%ignore PacBio::BAM::Tag::Tag(const std::vector<uint8_t>& value);
+%ignore PacBio::BAM::Tag::Tag(const std::vector<int16_t>& value);
+%ignore PacBio::BAM::Tag::Tag(const std::vector<uint16_t>& value);
+%ignore PacBio::BAM::Tag::Tag(const std::vector<int32_t>& value);
+%ignore PacBio::BAM::Tag::Tag(const std::vector<uint32_t>& value);
+%ignore PacBio::BAM::Tag::Tag(const std::vector<float>& value);
+
+%extend PacBio::BAM::Tag {
+       
+    PacBio::BAM::Tag FromInt8(int x)   { return PacBio::BAM::Tag(static_cast<int8_t>(x));   }
+    PacBio::BAM::Tag FromUInt8(int x)  { return PacBio::BAM::Tag(static_cast<uint8_t>(x));  }
+    PacBio::BAM::Tag FromInt16(int x)  { return PacBio::BAM::Tag(static_cast<int16_t>(x));  }
+    PacBio::BAM::Tag FromUInt16(int x) { return PacBio::BAM::Tag(static_cast<uint16_t>(x)); }
+    PacBio::BAM::Tag FromInt32(int x)  { return PacBio::BAM::Tag(static_cast<int32_t>(x));  }
+    PacBio::BAM::Tag FromUInt32(int x) { return PacBio::BAM::Tag(static_cast<uint32_t>(x)); }
+    PacBio::BAM::Tag FromFloat(int x)  { return PacBio::BAM::Tag(static_cast<float>(x));    }
+
+    PacBio::BAM::Tag FromInt8Array(const std::vector<int>& v)
+    {
+        std::vector<int8_t> result;
+        const size_t numElements = v.size();
+        result.reserve(numElements);
+        for (size_t i = 0; i < numElements; ++i)
+            result.push_back(static_cast<int8_t>(v.at(i)));
+        return PacBio::BAM::Tag(result);
+    }
+
+    PacBio::BAM::Tag FromUInt8Array(const std::vector<int>& v)
+    {
+        std::vector<uint8_t> result;
+        const size_t numElements = v.size();
+        result.reserve(numElements);
+        for (size_t i = 0; i < numElements; ++i)
+            result.push_back(static_cast<uint8_t>(v.at(i)));
+        return PacBio::BAM::Tag(result);
+    }
+
+    PacBio::BAM::Tag FromInt16Array(const std::vector<int>& v)
+    {
+        std::vector<int16_t> result;
+        const size_t numElements = v.size();
+        result.reserve(numElements);
+        for (size_t i = 0; i < numElements; ++i)
+            result.push_back(static_cast<int16_t>(v.at(i)));
+        return PacBio::BAM::Tag(result);
+    }
+
+    PacBio::BAM::Tag FromUInt16Array(const std::vector<int>& v)
+    {
+        std::vector<int16_t> result;
+        const size_t numElements = v.size();
+        result.reserve(numElements);
+        for (size_t i = 0; i < numElements; ++i)
+            result.push_back(static_cast<uint16_t>(v.at(i)));
+        return PacBio::BAM::Tag(result);
+    }
+
+    PacBio::BAM::Tag FromInt32Array(const std::vector<int>& v)
+    {
+        std::vector<int16_t> result;
+        const size_t numElements = v.size();
+        result.reserve(numElements);
+        for (size_t i = 0; i < numElements; ++i)
+            result.push_back(static_cast<int32_t>(v.at(i)));
+        return PacBio::BAM::Tag(result);
+    }
+
+    PacBio::BAM::Tag FromUInt32Array(const std::vector<int>& v)
+    {
+        std::vector<int16_t> result;
+        const size_t numElements = v.size();
+        result.reserve(numElements);
+        for (size_t i = 0; i < numElements; ++i)
+            result.push_back(static_cast<uint32_t>(v.at(i)));
+        return PacBio::BAM::Tag(result);
+    }
+
+    PacBio::BAM::Tag FromFloatArray(const std::vector<int>& v)
+    {
+        std::vector<int16_t> result;
+        const size_t numElements = v.size();
+        result.reserve(numElements);
+        for (size_t i = 0; i < numElements; ++i)
+            result.push_back(static_cast<float>(v.at(i)));
+        return PacBio::BAM::Tag(result);
+    }
+}
+#endif // SWIGR
+
+%include <pbbam/Tag.h>
diff --git a/src/swig/TagCollection.i b/src/swig/TagCollection.i

new file mode 100644 (file)

index 0000000..bcfc707
--- /dev/null
+++ b/src/swig/TagCollection.i
@@ -0,0 +1,13 @@
+/* TagCollection.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/TagCollection.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%template(TagCollectionType) std::map<std::string, PacBio::BAM::Tag>;
+
+%include <pbbam/TagCollection.h>
+\ No newline at end of file
diff --git a/src/swig/VirtualPolymeraseBamRecord.i b/src/swig/VirtualPolymeraseBamRecord.i

new file mode 100644 (file)

index 0000000..a74e673
--- /dev/null
+++ b/src/swig/VirtualPolymeraseBamRecord.i
@@ -0,0 +1,36 @@
+/* VirtualPolymeraseBamRecord.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/virtual/VirtualRegionType.h>
+#include <pbbam/virtual/VirtualRegion.h>
+#include <pbbam/virtual/VirtualPolymeraseBamRecord.h>
+#include <pbbam/virtual/VirtualZmwBamRecord.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+typedef PacBio::BAM::VirtualZmwBamRecord VirtualPolymeraseBamRecord;
+%}
+
+///*%ignore PacBio::BAM::VirtualPolymeraseBamRecord::VirtualPolymeraseBamRecord(const VirtualPolymeraseBamRecord&);*/
+//%ignore PacBio::BAM::VirtualPolymeraseBamRecord::VirtualPolymeraseBamRecord(VirtualPolymeraseBamRecord&&);
+//%ignore PacBio::BAM::VirtualPolymeraseBamRecord::operator=;
+
+//// disabled - can't get it to work right (at least in Python)
+//// but the same info is available (& correct) from record.VirtualRegionsTable(regionType)
+//%ignore PacBio::BAM::VirtualPolymeraseBamRecord::VirtualRegionsMap;
+
+//%template(VirtualRegionList) std::vector<PacBio::BAM::VirtualRegion>;
+//%template(VirtualRegionsMap) std::map<PacBio::BAM::VirtualRegionType, std::vector<PacBio::BAM::VirtualRegion> >;
+
+%include <pbbam/virtual/VirtualPolymeraseBamRecord.h>
+%include <pbbam/virtual/VirtualZmwBamRecord.h>
+typedef PacBio::BAM::VirtualZmwBamRecord VirtualPolymeraseBamRecord;
+
+#ifdef SWIGPYTHON
+%pythoncode %{
+
+VirtualPolymeraseBamRecord = VirtualZmwBamRecord
+
+%}
+#endif 
diff --git a/src/swig/VirtualPolymeraseReader.i b/src/swig/VirtualPolymeraseReader.i

new file mode 100644 (file)

index 0000000..238768f
--- /dev/null
+++ b/src/swig/VirtualPolymeraseReader.i
@@ -0,0 +1,23 @@
+/* VirtualPolymeraseReader.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/virtual/VirtualPolymeraseReader.h>
+#include <pbbam/virtual/ZmwReadStitcher.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+typedef PacBio::BAM::ZmwReadStitcher VirtualPolymeraseReader;
+%}
+
+%include <pbbam/virtual/VirtualPolymeraseReader.h>
+%include <pbbam/virtual/ZmwReadStitcher.h>
+typedef PacBio::BAM::ZmwReadStitcher VirtualPolymeraseReader;
+
+#ifdef SWIGPYTHON
+%pythoncode %{
+
+VirtualPolymeraseReader = ZmwReadStitcher
+
+%}
+#endif 
+\ No newline at end of file
diff --git a/src/swig/VirtualRegion.i b/src/swig/VirtualRegion.i

new file mode 100644 (file)

index 0000000..2436de2
--- /dev/null
+++ b/src/swig/VirtualRegion.i
@@ -0,0 +1,18 @@
+/* VirtualRegion.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/virtual/VirtualRegionType.h>
+#include <pbbam/virtual/VirtualRegion.h>
+#include <map>
+#include <vector>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%ignore PacBio::BAM::VirtualRegion::VirtualRegion(VirtualRegion&&);
+%ignore PacBio::BAM::VirtualRegion::operator=;
+
+%include <pbbam/virtual/VirtualRegionType.h>
+%include <pbbam/virtual/VirtualRegion.h>
diff --git a/src/swig/VirtualZmwBamRecord.i b/src/swig/VirtualZmwBamRecord.i

new file mode 100644 (file)

index 0000000..edae5fe
--- /dev/null
+++ b/src/swig/VirtualZmwBamRecord.i
@@ -0,0 +1,26 @@
+/* VirtualZmwBamRecord.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/virtual/VirtualRegionType.h>
+#include <pbbam/virtual/VirtualRegion.h>
+#include <pbbam/virtual/VirtualZmwBamRecord.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%feature("valuewrapper") PacBio::BAM::VirtualZmwBamRecord;
+
+/*%ignore PacBio::BAM::VirtualZmwBamRecord::VirtualZmwBamRecord(const VirtualZmwBamRecord&);*/
+%ignore PacBio::BAM::VirtualZmwBamRecord::VirtualZmwBamRecord(VirtualZmwBamRecord&&);
+%ignore PacBio::BAM::VirtualZmwBamRecord::operator=;
+
+// disabled - can't get it to work right (at least in Python)
+// but the same info is available (& correct) from record.VirtualRegionsTable(regionType)
+%ignore PacBio::BAM::VirtualZmwBamRecord::VirtualRegionsMap;
+
+%template(VirtualRegionList) std::vector<PacBio::BAM::VirtualRegion>;
+%template(VirtualRegionsMap) std::map<PacBio::BAM::VirtualRegionType, std::vector<PacBio::BAM::VirtualRegion> >;
+
+%include <pbbam/virtual/VirtualZmwBamRecord.h>
+\ No newline at end of file
diff --git a/src/swig/WhitelistedZmwReadStitcher.i b/src/swig/WhitelistedZmwReadStitcher.i

new file mode 100644 (file)

index 0000000..5ecd9d7
--- /dev/null
+++ b/src/swig/WhitelistedZmwReadStitcher.i
@@ -0,0 +1,11 @@
+/* WhitelistedZmwReadStitcher.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/virtual/WhitelistedZmwReadStitcher.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/virtual/WhitelistedZmwReadStitcher.h>
+\ No newline at end of file
diff --git a/src/swig/WrapCSharp.cmake b/src/swig/WrapCSharp.cmake

new file mode 100644 (file)

index 0000000..cb44d74
--- /dev/null
+++ b/src/swig/WrapCSharp.cmake
@@ -0,0 +1,46 @@
+
+find_package(CSharp REQUIRED)
+include (${CSHARP_USE_FILE})
+
+set(PacBioBAM_CSharpLibDir  ${PacBioBAM_LibDir}/csharp/PacBio.BAM)
+set(PacBioBAM_CSharpDLL     ${PacBioBAM_CSharpLibDir}/bin/Debug/PacBio.BAM.dll)
+set(CSharpTestRootDir       ${PacBioBAM_TestsDir}/src/CSharp)
+set(NativeLibraryPaths      ${PacBioBAM_CSharpLibDir}:${PacBioBAM_LibDir}:${Htslib_LibDir})
+
+#
+# Create SWIG wrapper
+#
+file(MAKE_DIRECTORY ${PacBioBAM_CSharpLibDir})
+set(CMAKE_SWIG_OUTDIR ${PacBioBAM_CSharpLibDir})   # ensure any swig files in lib/csharp
+set_source_files_properties(
+  PacBioBam.i PROPERTIES
+  CPLUSPLUS ON
+  SWIG_FLAGS "-namespace;PacBio.BAM")
+swig_add_module(PacBioBam csharp PacBioBam.i)
+swig_link_libraries(PacBioBam ${PacBioBAM_LIBRARIES}) # add any C# libs you need from <Find|Use>CSharp.cmake
+set_target_properties(
+    ${SWIG_MODULE_PacBioBam_REAL_NAME}            # ensure wrapper lib in lib/csharp
+    PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY ${PacBioBAM_CSharpLibDir}
+)
+add_dependencies(${SWIG_MODULE_PacBioBam_REAL_NAME} pbbam)
+
+#
+# Write a csproj, then shell out to build and check the assembly---
+# can't get it working nicely in CMake yet
+#
+configure_file(
+   ${PacBioBAM_SwigSourceDir}/PacBio.BAM.csproj.in
+   ${PacBioBAM_CSharpLibDir}/PacBio.BAM.csproj)
+configure_file(
+  ${CSharpTestRootDir}/TestPbbam.cs.in
+  ${CSharpTestRootDir}/TestPbbam.cs)
+configure_file(
+   ${CSharpTestRootDir}/buildAssembly.sh.in
+   buildAssembly.sh)
+add_custom_command(
+  OUTPUT ${PacBioBAM_CSharpDLL}
+  DEPENDS ${SWIG_MODULE_PacBioBam_REAL_NAME}
+  COMMAND bash ./buildAssembly.sh "${HTSLIB_LIBRARIES}"
+)
+add_custom_target(CSharpAssembly ALL DEPENDS ${PacBioBAM_CSharpDLL})
diff --git a/src/swig/WrapPython.cmake b/src/swig/WrapPython.cmake

new file mode 100644 (file)

index 0000000..719c5c2
--- /dev/null
+++ b/src/swig/WrapPython.cmake
@@ -0,0 +1,55 @@
+
+# setup
+find_package(PythonLibs REQUIRED)
+include_directories(${PYTHON_INCLUDE_PATH})
+set(PacBioBAM_PythonLibDir ${PacBioBAM_LibDir}/python)
+set(PythonTestRootDir ${PacBioBAM_TestsDir}/src/python)
+
+# create wrapper
+file(MAKE_DIRECTORY ${PacBioBAM_PythonLibDir})
+set(CMAKE_SWIG_OUTDIR ${PacBioBAM_PythonLibDir})  # put PacBioBam.py in lib/python
+
+swig_add_module(PacBioBam python PacBioBam.i)
+swig_link_libraries(PacBioBam ${PacBioBAM_LIBRARIES} ${PYTHON_LIBRARIES})
+set_target_properties(
+    ${SWIG_MODULE_PacBioBam_REAL_NAME}            # put _PacBioBam.so in lib/python
+    PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY ${PacBioBAM_PythonLibDir}
+)
+#add_dependencies(${SWIG_MODULE_PacBioBam_REAL_NAME} pbbam ${PacBioBAM_LIBRARIES})
+target_link_libraries(${SWIG_MODULE_PacBioBam_REAL_NAME} pbbam)
+
+# simple "wrapper worked" check
+# this is run every build, to check importing from Python, but does NOT run full Python-side unit tests
+add_custom_target(
+    check_swig_python
+    ALL
+    "PYTHONPATH=${PacBioBAM_PythonLibDir}" python check_swig.py
+    COMMENT "Checking Python wrapper"
+    WORKING_DIRECTORY ${PythonTestRootDir}
+)
+add_dependencies(check_swig_python ${SWIG_MODULE_PacBioBam_REAL_NAME})
+
+# unit tests
+if(PacBioBAM_build_tests)
+
+    # configure data directory info
+    configure_file(
+        ${PythonTestRootDir}/test/config.py.in
+        ${PythonTestRootDir}/test/config.py
+    )
+
+    # test runner
+    add_test(
+        NAME PythonUnitTests
+        WORKING_DIRECTORY ${PythonTestRootDir}
+        COMMAND "python" test_pbbam.py
+    )
+    set_tests_properties(
+        PythonUnitTests
+        PROPERTIES
+        ENVIRONMENT "PYTHONPATH=${PacBioBAM_PythonLibDir}"
+    )
+
+endif() # unit tests
+
diff --git a/src/swig/WrapR.cmake b/src/swig/WrapR.cmake

new file mode 100644 (file)

index 0000000..5dccedd
--- /dev/null
+++ b/src/swig/WrapR.cmake
@@ -0,0 +1,71 @@
+# setup
+set(R_INCLUDE_DIR_HINT /mnt/software/r/R/3.1.1/usr/share/R/include) # TODO: hard-coded hint for now, clean up later
+find_package(R REQUIRED)
+include_directories(${R_INCLUDE_DIR})
+set(PacBioBAM_RLibDir ${PacBioBAM_LibDir}/R)
+set(RTestRootDir ${PacBioBAM_TestsDir}/src/R)
+
+# Suppress warnings from generated code
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag("-Wno-unused-parameter" HAS_NO_UNUSED_PARAMETER)
+if(HAS_NO_UNUSED_PARAMETER)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter")
+endif()
+
+# SWIG R does not support PBBAM_SHARED_PTR, but it does support boost::shared_ptr
+# So force boost if we're wrapping for R.
+add_definitions(-DPBBAM_USE_BOOST_SHARED_PTR)
+
+# create wrapper & library
+file(MAKE_DIRECTORY ${PacBioBAM_RLibDir})
+set(CMAKE_SWIG_OUTDIR ${PacBioBAM_RLibDir})      # put PacBioBam.R wrapper in lib/R
+swig_add_module(PacBioBam r PacBioBam.i)
+swig_link_libraries(PacBioBam ${PacBioBAM_LIBRARIES})
+if(R_LIBRARIES)
+    swig_link_libraries(PacBioBam ${R_LIBRARIES})
+endif()
+
+# make sure the library is named "PacBioBam.so" explicitly
+# no "lib" prefix... that gets in the way of the name lookups between SWIG/R
+# and make sure library ends up in lib/R
+set_target_properties(
+    ${SWIG_MODULE_PacBioBam_REAL_NAME}
+    PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY ${PacBioBAM_RLibDir}
+    RUNTIME_OUTPUT_DIRECTORY ${PacBioBAM_RLibDir}
+    SONAME PacBioBam.so
+    PREFIX ""
+)
+add_dependencies(${SWIG_MODULE_PacBioBam_REAL_NAME} pbbam)
+
+# simple "wrapper worked" check
+configure_file(
+    ${RTestRootDir}/check_swig.R.in
+    ${RTestRootDir}/check_swig.R
+)
+
+add_custom_target(
+    check_swig_R
+    ALL
+    "R" --slave --no-save < ${RTestRootDir}/check_swig.R
+    COMMENT "Checking R wrapper"
+    WORKING_DIRECTORY ${PacBioBAM_RLibDir}
+)
+add_dependencies(check_swig_R ${SWIG_MODULE_PacBioBam_REAL_NAME})
+
+# unit tests
+if(PacBioBAM_build_tests)
+
+    # configure script
+    configure_file(
+        ${RTestRootDir}/test_pbbam.sh.in
+        ${RTestRootDir}/test_pbbam.sh
+    )
+
+    # test runner
+    add_test(
+        NAME RUnitTests
+        COMMAND "sh" ${RTestRootDir}/test_pbbam.sh
+        WORKING_DIRECTORY ${PacBioBAM_RLibDir}
+    )
+endif()
diff --git a/src/swig/ZmwGroupQuery.i b/src/swig/ZmwGroupQuery.i

new file mode 100644 (file)

index 0000000..c020eb5
--- /dev/null
+++ b/src/swig/ZmwGroupQuery.i
@@ -0,0 +1,14 @@
+/* ZmwGroupQuery.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/internal/QueryBase.h>
+#include <pbbam/ZmwGroupQuery.h>
+       
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/internal/QueryBase.h>
+%include <pbbam/ZmwGroupQuery.h>
diff --git a/src/swig/ZmwQuery.i b/src/swig/ZmwQuery.i

new file mode 100644 (file)

index 0000000..8ad33d7
--- /dev/null
+++ b/src/swig/ZmwQuery.i
@@ -0,0 +1,11 @@
+/* ZmwQuery.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/ZmwQuery.h>    
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/ZmwQuery.h>
+\ No newline at end of file
diff --git a/src/swig/ZmwReadStitcher.i b/src/swig/ZmwReadStitcher.i

new file mode 100644 (file)

index 0000000..1eadcaf
--- /dev/null
+++ b/src/swig/ZmwReadStitcher.i
@@ -0,0 +1,11 @@
+/* ZmwReadStitcher.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/virtual/ZmwReadStitcher.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+%}
+
+%include <pbbam/virtual/ZmwReadStitcher.h>
+\ No newline at end of file
diff --git a/src/swig/ZmwWhitelistVirtualReader.i b/src/swig/ZmwWhitelistVirtualReader.i

new file mode 100644 (file)

index 0000000..36f7e74
--- /dev/null
+++ b/src/swig/ZmwWhitelistVirtualReader.i
@@ -0,0 +1,23 @@
+/* ZmwWhitelistVirtualReader.i */
+
+%module PacBioBam
+
+%{
+#include <pbbam/virtual/ZmwWhitelistVirtualReader.h>
+#include <pbbam/virtual/WhitelistedZmwReadStitcher.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+typedef PacBio::BAM::WhitelistedZmwReadStitcher ZmwWhitelistVirtualReader;
+%}
+
+%include <pbbam/virtual/ZmwWhitelistVirtualReader.h>
+%include <pbbam/virtual/WhitelistedZmwReadStitcher.h>
+typedef PacBio::BAM::WhitelistedZmwReadStitcher ZmwWhitelistVirtualReader;
+
+#ifdef SWIGPYTHON
+%pythoncode %{
+
+ZmwWhitelistVirtualReader = WhitelistedZmwReadStitcher
+
+%}
+#endif 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt

new file mode 100644 (file)

index 0000000..bbcc1e5
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,80 @@
+
+if(PacBioBAM_build_tests)
+
+    # setup GoogleTest
+    if (NOT GTEST_SRC_DIR)
+        set(PREBUILT_GTEST_SRC ${PacBioBAM_RootDir}/../../../../prebuilt.tmpout/gtest/gtest_1.7.0/)
+        if(EXISTS ${PREBUILT_GTEST_SRC})
+            set(GTEST_SRC_DIR ${PREBUILT_GTEST_SRC})
+        else()
+            set(GTEST_SRC_DIR ${PacBioBAM_RootDir}/../gtest) # keep old fallback behavior for external builds, for now at least
+        endif()
+    endif()
+    add_subdirectory(${GTEST_SRC_DIR} external/gtest/build)
+
+    # generate paths/values used by for unit tests
+    configure_file(
+        ${PacBioBAM_TestsDir}/src/TestData.h.in
+        ${CMAKE_BINARY_DIR}/generated/TestData.h
+    )
+    configure_file(
+        ${PacBioBAM_TestsDir}/data/group/group.fofn.in
+        ${CMAKE_BINARY_DIR}/generated/group.fofn
+    )
+
+    # grab PacBioBAM unit test source files
+    include(files.cmake)
+    set(SOURCES
+        ${PacBioBAMTest_H}
+        ${PacBioBAMTest_CPP}
+    )
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PacBioBAM_CXX_FLAGS}")
+
+    # define unit test executable
+    add_definitions(-DPBBAM_TESTING)
+    if(MSVC)
+        # VS2012+ pooh-pooh's Derek's "#define private public" trick
+        add_definitions(-D_ALLOW_KEYWORD_MACROS)
+    endif()
+
+    if(PacBioBAM_wrap_r)
+        # SWIG R does not support std::shared_ptr, but it does support boost::shared_ptr
+        # So force boost if we're wrapping for R.
+        add_definitions(-DPBBAM_USE_BOOST_SHARED_PTR)
+    endif()
+
+    add_executable(test_pbbam ${SOURCES})
+    set_target_properties(test_pbbam PROPERTIES
+        RUNTIME_OUTPUT_DIRECTORY ${PacBioBAM_BinDir}
+    )
+    target_include_directories(test_pbbam
+        PUBLIC
+        ${CMAKE_BINARY_DIR}/generated
+        ${PacBioBAM_INCLUDE_DIRS}
+        ${gtest_SOURCE_DIR}/include
+        ${gtest_SOURCE_DIR}
+    )
+
+    # generate test data
+    add_custom_target(
+        generate_test_data
+        WORKING_DIRECTORY ${PacBioBAM_TestsDir}/scripts
+        COMMAND "python" generate_data.py
+            ${PacBioBAM_TestsDir}/data/
+            ${GeneratedTestDataDir}
+    )
+
+    # add unit tests to test framework
+    add_test(
+        NAME UnitTests
+        WORKING_DIRECTORY ${PacBioBAM_BinDir}
+        COMMAND test_pbbam
+    )
+    add_dependencies(test_pbbam generate_test_data)
+    target_link_libraries(test_pbbam
+        pbbam
+        ${CMAKE_THREAD_LIBS_INIT} # quirky pthreads
+        gtest
+        gtest_main
+    )
+endif() # PacBioBAM_build_tests
diff --git a/tests/data/aligned.bam b/tests/data/aligned.bam

new file mode 100644 (file)

index 0000000..34d81e5

Binary files /dev/null and b/tests/data/aligned.bam differ
diff --git a/tests/data/aligned.bam.bai b/tests/data/aligned.bam.bai

new file mode 100644 (file)

index 0000000..66ba855

Binary files /dev/null and b/tests/data/aligned.bam.bai differ
diff --git a/tests/data/aligned.bam.pbi b/tests/data/aligned.bam.pbi

new file mode 100644 (file)

index 0000000..f2cf207

Binary files /dev/null and b/tests/data/aligned.bam.pbi differ
diff --git a/tests/data/aligned.sam b/tests/data/aligned.sam

new file mode 100644 (file)

index 0000000..ad45e63
--- /dev/null
+++ b/tests/data/aligned.sam
@@ -0,0 +1,8 @@
+@HD    VN:1.3.1        SO:coordinate   pb:3.0.3
+@SQ    SN:lambda_NEB3011       LN:48502        M5:a1319ff90e994c8190a4fe6569d0822a
+@RG    ID:0d7b28fa     PL:PACBIO       DS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100      PU:singleInsertion      PM:SEQUEL
+@PG    ID:bwa  PN:bwa  VN:0.7.10-r1017-dirty   CL:bwa mem lambdaNEB.fa singleInsertion.fasta
+singleInsertion/100/0_49       2048    lambda_NEB3011  5211    60      3H8=1D19=1I21=59H       *       0       0       GGCTGCAGGTACAGCGGTCAGGAGGCCAATTGATGCCGGACTGGCTGAT       *       NM:i:2  MD:Z:8^T40      AS:i:34 XS:i:0  RG:Z:0d7b28fa   SA:Z:lambda_NEB3011,9378,+,52S37=2D10=1I11=,60,3;       qe:i:49 qs:i:0  np:i:1  zm:i:100        rq:f:0.6        sn:B:f,0.2,0.2,0.2,0.2
+singleInsertion/200/0_49       2048    lambda_NEB3011  5211    60      3H8=1D19=1I21=59H       *       0       0       GGCTGCAGGTACAGCGGTCAGGAGGCCAATTGATGCCGGACTGGCTGAT       *       NM:i:2  MD:Z:8^T40      AS:i:34 XS:i:0  RG:Z:0d7b28fa   SA:Z:lambda_NEB3011,9378,-,37=2D10=1I11=52S,60,3;       qe:i:49 qs:i:0  np:i:1  zm:i:200        rq:f:0.6        sn:B:f,0.2,0.2,0.2,0.2
+singleInsertion/100/0_111      0       lambda_NEB3011  9378    60      52S37=2D10=1I11=        *       0       0       TTTGGCTGCAGGTACAGCGGTCAGGAGGCCAATTGATGCCGGACTGGCTGATAAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGAGCAGCACGGTAAACAGCGGCAA *       NM:i:3  MD:Z:37^TC21    AS:i:43 XS:i:0  RG:Z:0d7b28fa   SA:Z:lambda_NEB3011,5211,+,3S8=1D19=1I21=59S,60,2;      qe:i:111        qs:i:0  np:i:1  zm:i:100        rq:f:0.6        sn:B:f,0.2,0.2,0.2,0.2
+singleInsertion/100/0_111      16      lambda_NEB3011  9378    60      37=2D10=1I11=52S        *       0       0       AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGAGCAGCACGGTAAACAGCGGCAAATCAGCCAGTCCGGCATCAATTGGCCTCCTGACCGCTGTACCTGCAGCCAAA *       NM:i:3  MD:Z:37^TC21    AS:i:43 XS:i:0  RG:Z:0d7b28fa   SA:Z:lambda_NEB3011,5211,+,3S8=1D19=1I21=59S,60,2;      qe:i:111        qs:i:0  np:i:1  zm:i:100        rq:f:0.6        sn:B:f,0.2,0.2,0.2,0.2
diff --git a/tests/data/aligned2.bam b/tests/data/aligned2.bam

new file mode 100644 (file)

index 0000000..672e5e5

Binary files /dev/null and b/tests/data/aligned2.bam differ
diff --git a/tests/data/aligned2.bam.bai b/tests/data/aligned2.bam.bai

new file mode 100644 (file)

index 0000000..f954ab0

Binary files /dev/null and b/tests/data/aligned2.bam.bai differ
diff --git a/tests/data/aligned2.bam.pbi b/tests/data/aligned2.bam.pbi

new file mode 100644 (file)

index 0000000..c1e82de

Binary files /dev/null and b/tests/data/aligned2.bam.pbi differ
diff --git a/tests/data/chunking/chunking.subreadset.xml b/tests/data/chunking/chunking.subreadset.xml

new file mode 100644 (file)

index 0000000..6d15ff1
--- /dev/null
+++ b/tests/data/chunking/chunking.subreadset.xml
@@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet 
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" 
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd" 
+    UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" 
+    TimeStampedName="subreadset_150304_231155" 
+    MetaType="PacBio.DataSet.SubreadSet" 
+    Name="DataSet_SubreadSet" 
+    Tags="" 
+    Version="3.0.0" 
+    CreatedAt="2015-01-27T09:00:01"> 
+<pbbase:ExternalResources>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam">
+        <pbbase:FileIndices>
+            <pbbase:FileIndex 
+                UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5194" 
+                TimeStampedName="bam_index_150304_231155" 
+                MetaType="PacBio.Index.PacBioIndex" 
+                ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi"/>
+        </pbbase:FileIndices>
+    </pbbase:ExternalResource>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5197" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam">
+        <pbbase:FileIndices>
+            <pbbase:FileIndex 
+                UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5198" 
+                TimeStampedName="bam_index_150304_231155" 
+                MetaType="PacBio.Index.PacBioIndex" 
+                ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi"/>
+        </pbbase:FileIndices>
+    </pbbase:ExternalResource><pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5195" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam">
+        <pbbase:FileIndices>
+            <pbbase:FileIndex 
+                UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5196" 
+                TimeStampedName="bam_index_150304_231155" 
+                MetaType="PacBio.Index.PacBioIndex" 
+                ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi"/>
+        </pbbase:FileIndices>
+    </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+<pbds:Filters>
+    <pbds:Filter>
+        <pbbase:Properties>
+            <pbbase:Property Name="movie" Operator="=" Value="m150404_101626_42267_c100807920800000001823174110291514_s1_p0"/>
+            <pbbase:Property Name="zm" Operator="lt" Value="1816"/>
+        </pbbase:Properties>
+    </pbds:Filter>
+</pbds:Filters>
+</pbds:SubreadSet>
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam

new file mode 100644 (file)

index 0000000..c4ec7ea

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam differ
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..4af87e2

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi differ
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam

new file mode 100644 (file)

index 0000000..e623aca

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam differ
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..6479979

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi differ
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam

new file mode 100644 (file)

index 0000000..8544f6a

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam differ
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..a9f4edb

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi differ
diff --git a/tests/data/dataset/ali1.xml b/tests/data/dataset/ali1.xml

new file mode 100644 (file)

index 0000000..ab0a82a
--- /dev/null
+++ b/tests/data/dataset/ali1.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:AlignmentSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.AlignmentSet" Name="DataSet_AlignmentSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0"  xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First Alignments BAM" Description="Points to an example Alignments BAM file." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:///mnt/path/to/alignments0.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments0.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource Name="Second Alignments BAM" Description="Points to another example Alignments BAM file, by relative path." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:./alignments1.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments1.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSets>
+               <pbds:DataSet UniqueId="ab95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="HighQuality Read Alignments">
+                       <pbds:Filters> <!-- These Filters are in addition to those above. This provides a means to subset and label the parent DataSet further. -->
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="rq" Value="0.85" Operator=">"/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+               <pbds:DataSet UniqueId="ac95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="Alignments to chromosome 1">
+                       <pbds:Filters>
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="RNAME" Value="chr1" Operator="=="/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+       </pbds:DataSets>
+</pbds:AlignmentSet>
diff --git a/tests/data/dataset/ali2.xml b/tests/data/dataset/ali2.xml

new file mode 100644 (file)

index 0000000..c35f9ec
--- /dev/null
+++ b/tests/data/dataset/ali2.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:AlignmentSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.AlignmentSet" Name="DataSet_AlignmentSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First Alignments BAM" Description="Points to an example Alignments BAM file." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:///mnt/path/to/alignments2.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments2.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource Name="Second Alignments BAM" Description="Points to another example Alignments BAM file, by relative path." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:./alignments3.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments3.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSets>
+               <pbds:DataSet UniqueId="ab95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="HighQuality Read Alignments">
+                       <pbds:Filters> <!-- These Filters are in addition to those above. This provides a means to subset and label the parent DataSet further. -->
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="rq" Value="0.85" Operator=">"/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+               <pbds:DataSet UniqueId="ac95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="Alignments to chromosome 1">
+                       <pbds:Filters>
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="RNAME" Value="chr1" Operator="=="/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+       </pbds:DataSets>
+</pbds:AlignmentSet>
diff --git a/tests/data/dataset/ali3.xml b/tests/data/dataset/ali3.xml

new file mode 100644 (file)

index 0000000..f58d25f
--- /dev/null
+++ b/tests/data/dataset/ali3.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:AlignmentSet CreatedAt="2015-01-27T09:00:01"  MetaType="PacBio.DataSet.AlignmentSet" Name="DataSet_AlignmentSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First Alignments BAM" Description="Points to an example Alignments BAM file." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:///mnt/path/to/alignments2.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments2.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource Name="Second Alignments BAM" Description="Points to another example Alignments BAM file, by relative path." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:./alignments3.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments3.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSets>
+               <pbds:DataSet UniqueId="ab95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="HighQuality Read Alignments">
+                       <pbds:Filters> <!-- These Filters are in addition to those above. This provides a means to subset and label the parent DataSet further. -->
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="rq" Value="0.75" Operator=">"/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+               <pbds:DataSet UniqueId="ac95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="Alignments to chromosome 1">
+                       <pbds:Filters>
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="RNAME" Value="chr1" Operator="=="/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+       </pbds:DataSets>
+</pbds:AlignmentSet>
diff --git a/tests/data/dataset/ali4.xml b/tests/data/dataset/ali4.xml

new file mode 100644 (file)

index 0000000..ab0a82a
--- /dev/null
+++ b/tests/data/dataset/ali4.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:AlignmentSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.AlignmentSet" Name="DataSet_AlignmentSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0"  xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First Alignments BAM" Description="Points to an example Alignments BAM file." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:///mnt/path/to/alignments0.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments0.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource Name="Second Alignments BAM" Description="Points to another example Alignments BAM file, by relative path." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:./alignments1.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments1.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSets>
+               <pbds:DataSet UniqueId="ab95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="HighQuality Read Alignments">
+                       <pbds:Filters> <!-- These Filters are in addition to those above. This provides a means to subset and label the parent DataSet further. -->
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="rq" Value="0.85" Operator=">"/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+               <pbds:DataSet UniqueId="ac95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="Alignments to chromosome 1">
+                       <pbds:Filters>
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="RNAME" Value="chr1" Operator="=="/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+       </pbds:DataSets>
+</pbds:AlignmentSet>
diff --git a/tests/data/dataset/bam_mapping.bam b/tests/data/dataset/bam_mapping.bam

new file mode 100644 (file)

index 0000000..2d4ae7b

Binary files /dev/null and b/tests/data/dataset/bam_mapping.bam differ
diff --git a/tests/data/dataset/bam_mapping.bam.pbi b/tests/data/dataset/bam_mapping.bam.pbi

new file mode 100644 (file)

index 0000000..fe7c3be

Binary files /dev/null and b/tests/data/dataset/bam_mapping.bam.pbi differ
diff --git a/tests/data/dataset/bam_mapping_1.bam b/tests/data/dataset/bam_mapping_1.bam

new file mode 100644 (file)

index 0000000..1e9670e

Binary files /dev/null and b/tests/data/dataset/bam_mapping_1.bam differ
diff --git a/tests/data/dataset/bam_mapping_1.bam.pbi b/tests/data/dataset/bam_mapping_1.bam.pbi

new file mode 100644 (file)

index 0000000..d99a174

Binary files /dev/null and b/tests/data/dataset/bam_mapping_1.bam.pbi differ
diff --git a/tests/data/dataset/bam_mapping_2.bam b/tests/data/dataset/bam_mapping_2.bam

new file mode 100644 (file)

index 0000000..09678ea

Binary files /dev/null and b/tests/data/dataset/bam_mapping_2.bam differ
diff --git a/tests/data/dataset/bam_mapping_2.bam.pbi b/tests/data/dataset/bam_mapping_2.bam.pbi

new file mode 100644 (file)

index 0000000..d1765ef

Binary files /dev/null and b/tests/data/dataset/bam_mapping_2.bam.pbi differ
diff --git a/tests/data/dataset/bam_mapping_new.bam b/tests/data/dataset/bam_mapping_new.bam

new file mode 100644 (file)

index 0000000..3039331

Binary files /dev/null and b/tests/data/dataset/bam_mapping_new.bam differ
diff --git a/tests/data/dataset/bam_mapping_new.bam.pbi b/tests/data/dataset/bam_mapping_new.bam.pbi

new file mode 100644 (file)

index 0000000..82d497c

Binary files /dev/null and b/tests/data/dataset/bam_mapping_new.bam.pbi differ
diff --git a/tests/data/dataset/bam_mapping_staggered.xml b/tests/data/dataset/bam_mapping_staggered.xml

new file mode 100644 (file)

index 0000000..879c193
--- /dev/null
+++ b/tests/data/dataset/bam_mapping_staggered.xml
@@ -0,0 +1,35 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<pbds:DataSet CreatedAt="2015-05-13T10:58:26" MetaType="PacBio.DataSet.DataSet" Name="" Tags="" UniqueId="30f72098-bc5b-e06b-566c-8b28dda909a8" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"  xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource ResourceId="file:tests/data/bam_mapping_1.bam">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:tests/data/bam_mapping_1.bam.bai"/>
+                       </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+        <pbbase:ExternalResource ResourceId="file:tests/data/bam_mapping_2.bam">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:tests/data/bam_mapping_2.bam.bai"/>
+                       </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+    </pbbase:ExternalResources>
+    <pbds:DataSets>
+        <pbds:DataSet CreatedAt="2015-05-13T10:58:26" UniqueId="c5402d06-4643-057c-e300-fe229b4e8909" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+            <pbbase:ExternalResources>
+                <pbbase:ExternalResource ResourceId="file:tests/data/bam_mapping_2.bam">
+                               <pbbase:FileIndices>
+                                       <pbbase:FileIndex ResourceId="file:tests/data/bam_mapping_2.bam.bai"/>
+                               </pbbase:FileIndices>
+                </pbbase:ExternalResource>
+            </pbbase:ExternalResources>
+        </pbds:DataSet>
+        <pbds:DataSet CreatedAt="2015-05-13T10:58:26" UniqueId="f8b54a55-5fb7-706f-ab35-39afc9c86924" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+            <pbbase:ExternalResources>
+                <pbbase:ExternalResource ResourceId="file:tests/data/bam_mapping_1.bam">
+                               <pbbase:FileIndices>
+                                       <pbbase:FileIndex ResourceId="file:tests/data/bam_mapping_1.bam.bai"/>
+                               </pbbase:FileIndices>
+                </pbbase:ExternalResource>
+            </pbbase:ExternalResources>
+        </pbds:DataSet>
+    </pbds:DataSets>
+</pbds:DataSet>
diff --git a/tests/data/dataset/barcode.dataset.xml b/tests/data/dataset/barcode.dataset.xml

new file mode 100644 (file)

index 0000000..1fbbb18
--- /dev/null
+++ b/tests/data/dataset/barcode.dataset.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:BarcodeSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.BarcodeSet" Name="DataSet_BarcodeSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First Barcodes FASTA" Description="Points to an example Barcodes FASTA file." MetaType="BarcodeFile.BarcodeFastaFile" ResourceId="file:///mnt/path/to/barcode.fasta" Tags="Example"/>
+       </pbbase:ExternalResources>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>400</pbds:TotalLength>
+               <pbds:NumRecords>30</pbds:NumRecords>
+               <pbds:BarcodeConstruction>paired</pbds:BarcodeConstruction>
+       </pbds:DataSetMetadata>
+</pbds:BarcodeSet>
diff --git a/tests/data/dataset/ccsread.dataset.xml b/tests/data/dataset/ccsread.dataset.xml

new file mode 100644 (file)

index 0000000..97b5943
--- /dev/null
+++ b/tests/data/dataset/ccsread.dataset.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:ConsensusReadSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.ConsensusReadSet" Name="DataSet_ConsensusReadSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First ConsensusRead BAM" Description="Points to an example ConsensusRead BAM file." MetaType="PacBio.ConsensusReadFile.ConsensusReadBamFile" ResourceId="file:///mnt/path/to/ccsreads0.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="file:///mnt/path/to/ccsreads0.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource Name="Second ConsensusRead BAM" Description="Points to another example ConsensusRead BAM file." MetaType="PacBio.ConsensusReadFile.ConsensusReadBamFile" ResourceId="file:///mnt/path/to/ccsreads1.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="file:///mnt/path/to/ccsreads0.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+</pbds:ConsensusReadSet>
diff --git a/tests/data/dataset/lambda_contigs.xml b/tests/data/dataset/lambda_contigs.xml

new file mode 100644 (file)

index 0000000..4abc8cc
--- /dev/null
+++ b/tests/data/dataset/lambda_contigs.xml
@@ -0,0 +1,6 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<pbds:ReferenceSet CreatedAt="2015-05-28T10:56:36" MetaType="PacBio.DataSet.ReferenceSet" Name="" Tags="" UniqueId="596e87db-34f9-d2fd-c905-b017543170e1" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource ResourceId="file:tests/data/lambda_contigs.fasta"/>
+    </pbbase:ExternalResources>
+</pbds:ReferenceSet>
+\ No newline at end of file
diff --git a/tests/data/dataset/malformed.xml b/tests/data/dataset/malformed.xml

new file mode 100644 (file)

index 0000000..31e0942
--- /dev/null
+++ b/tests/data/dataset/malformed.xml
@@ -0,0 +1,85 @@
+<?xml version="1.0" encoding="utf-8"?>
+<SubreadSet 
+    CreatedAt="2015-08-19T15:39:36.331"
+    Description="Merged dataset from 1 files using DatasetMerger 0.1.2" 
+    MetaType="PacBio.DataSet.HdfSubreadSet" 
+    Name="Subreads from runr000013_42267_150403" 
+    Tags="pacbio.secondary.instrument=RS" 
+    TimeStampedName="hdfsubreadset_2015-08-19T15:39:36.331-07:00" 
+    UniqueId="b4741521-2a4c-42df-8a13-0a755ca9ed1e"
+    Version="0.5" 
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" 
+    xmlns:ns0="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:ns1="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+    xmlns:ns2="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:ns3="http://pacificbiosciences.com/PacBioReagentKit.xsd">
+       <ns0:ExternalResources>
+        <ns0:ExternalResource 
+            MetaType="SubreadFile.SubreadBamFile"
+            TimeStampedName="SubreadFile.SubreadBamFile_00000000000000"
+            UniqueId="251acf71-9eb0-489e-9dd1-cdbd11432753" 
+            ResourceId="file:///mnt/secondary-siv/jenkins/jenkins-bot01/workspace/Ubuntu1404_Mainline_SA3_Tiny_tests/software/smrtanalysis/siv/testkit-jobs/sa3_pipelines/mapping/tiny/job_output-ubuntu1404/tasks/pbsmrtpipe.tasks.h5_subreads_to_subread-0//mnt/secondary-siv/jenkins/jenkins-bot01/workspace/Ubuntu1404_Mainline_SA3_Tiny_tests/software/smrtanalysis/siv/testkit-jobs/sa3_pipelines/mapping/tiny/job_output-ubuntu1404/tasks/pbsmrtpipe.tasks.h5_subreads_to_subread-0/file.subreads.subreads.bam"  />
+    </ns0:ExternalResources>
+    <DataSetMetadata>
+        <TotalLength>50000000</TotalLength>
+        <NumRecords>150000</NumRecords>
+        <ns2:Collections>
+            <ns2:CollectionMetadata 
+                Context="m150404_101626_42267_c100807920800000001823174110291514_s1_p0" 
+                InstrumentId="1" 
+                InstrumentName="42267" 
+                MetaType="PacBio.Collection" 
+                TimeStampedName="m150404_101626_42267_c100807920800000001823174110291514_s1_p0" 
+                UniqueId="d66c8372-2b70-4dcf-b64f-9f8b5cc351fd">
+                <ns2:InstCtrlVer>2.3.0.1.142990</ns2:InstCtrlVer>
+                <ns2:SigProcVer>NRT@172.31.128.10:8082, SwVer=2301.142990, HwVer=1.0</ns2:SigProcVer>
+                <ns2:RunDetails>
+                    <ns2:RunId>r000013_42267_150403</ns2:RunId>
+                    <ns2:Name>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:Name>
+                </ns2:RunDetails>
+                <ns2:WellSample Name="Inst42267-040315-SAT-100pM-2kb-P6C4">
+                    <ns2:PlateId>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:PlateId>
+                    <ns2:WellName>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:WellName>
+                    <ns2:Concentration>0.0</ns2:Concentration>                         
+                    <ns2:SampleReuseEnabled>false</ns2:SampleReuseEnabled>
+                    <ns2:StageHotstartEnabled>false</ns2:StageHotstartEnabled>
+                    <ns2:SizeSelectionEnabled>false</ns2:SizeSelectionEnabled>
+                    <ns2:UseCount>1</ns2:UseCount>
+                    <ns1:BioSamplePointers>
+                        <ns1:BioSamplePointer>251acf71-9eb0-489e-9dd1-cdbd11432752</ns1:BioSamplePointer>
+                    </ns1:BioSamplePointers>
+                </ns2:WellSample>
+                <ns2:Automation>
+                    <ns0:AutomationParameters>
+                        <ns0:AutomationParameter />
+                    </ns0:AutomationParameters>
+                </ns2:Automation>
+                <ns2:CollectionNumber>7</ns2:CollectionNumber>
+                <ns2:CellIndex>4</ns2:CellIndex>
+                <ns2:CellPac Barcode="10080792080000000182317411029151" />
+                <ns2:Primary>
+                    <ns2:AutomationName>BasecallerV1</ns2:AutomationName>
+                    <ns2:ConfigFileName>2-3-0_P6-C4.xml</ns2:ConfigFileName>
+                    <ns2:SequencingCondition />
+                    <ns2:OutputOptions>
+                        <ns2:ResultsFolder>Analysis_Results</ns2:ResultsFolder>
+                        <ns2:CollectionPathUri>rsy://mp-rsync/vol55//RS_DATA_STAGING/42267/Inst42267-040315-SAT-100pM-2kb-P6C4_13/A04_7/</ns2:CollectionPathUri>
+                        <ns2:CopyFiles>
+                            <ns2:CollectionFileCopy>Fasta</ns2:CollectionFileCopy>
+                        </ns2:CopyFiles>
+                        <ns2:Readout>Bases</ns2:Readout>
+                        <ns2:MetricsVerbosity>Minimal</ns2:MetricsVerbosity>
+                     </ns2:OutputOptions>
+                 </ns2:Primary>
+             </ns2:CollectionMetadata>
+         </ns2:Collections>
+         <ns1:BioSamples>
+             <ns1:BioSample
+                 Description="Inst42267-SAT-100pM-2kbLambda-P6C4-Std120_CPS_040315"
+                 MetaType="PacBio.Sample" 
+                 Name="Inst42267-040315-SAT-100pM-2kb-P6C4" 
+                 TimeStampedName="biosample_2015-08-19T15:39:36.331-07:00" 
+                 UniqueId="251acf71-9eb0-489e-9dd1-cdbd11432752" />
+         </ns1:BioSamples>
+      </DataSetMetadata>
+</SubreadSet>
diff --git a/tests/data/dataset/pbalchemy10kbp.xml b/tests/data/dataset/pbalchemy10kbp.xml

new file mode 100644 (file)

index 0000000..96189ad
--- /dev/null
+++ b/tests/data/dataset/pbalchemy10kbp.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:DataSet CreatedAt="2015-05-22T16:56:16" MetaType="PacBio.DataSet.DataSet" Name="" Tags="" UniqueId="58e3f7c5-24c1-b58b-fbd5-37de268cc2f0" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+  <pbbase:ExternalResources>
+    <pbbase:ExternalResource ResourceId="file:tests/data/pbalchemy10kbp.pbalign.sorted.pbver1.bam">
+      <pbbase:FileIndices>
+        <pbbase:FileIndex ResourceId="file:tests/data/pbalchemy10kbp.pbalign.sorted.pbver1.bam.bai"/>
+      </pbbase:FileIndices>
+    </pbbase:ExternalResource>
+  </pbbase:ExternalResources>
+  <pbds:Filters>
+      <pbds:Filter>
+          <pbbase:Properties>
+              <pbbase:Property Name="rname" Value="E.faecalis.1" Operator="=" />
+          </pbbase:Properties>
+      </pbds:Filter>
+  </pbds:Filters>
+</pbds:DataSet>
diff --git a/tests/data/dataset/reference.dataset.xml b/tests/data/dataset/reference.dataset.xml

new file mode 100644 (file)

index 0000000..3cfbe8c
--- /dev/null
+++ b/tests/data/dataset/reference.dataset.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:ReferenceSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.ReferenceSet" Name="DataSet_ReferenceSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0"  xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First References FASTA" Description="Points to an example references FASTA file." MetaType="PacBio.ReferenceFile.ReferenceFastaFile" ResourceId="file:///mnt/path/to/reference.fasta" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.SaWriterIndex" ResourceId="file:///mnt/path/to/reference.fasta.sa"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.SamIndex" ResourceId="file:///mnt/path/to/reference.fasta.fai"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>5000000</pbds:TotalLength>
+               <pbds:NumRecords>500</pbds:NumRecords>
+               <pbds:Organism>Tribble</pbds:Organism>
+               <pbds:Ploidy>Diploid</pbds:Ploidy>
+               <pbds:Contigs>
+                       <pbds:Contig Name="gi|229359445|emb|AM181176.4|" Description="Pseudomonas fluorescens SBW25 complete genome|quiver" Length="6722109" Digest="f627c795efad7ce0050ed42b942d408e"/>
+               </pbds:Contigs>
+       </pbds:DataSetMetadata>
+</pbds:ReferenceSet>
diff --git a/tests/data/dataset/subread_dataset1.xml b/tests/data/dataset/subread_dataset1.xml

new file mode 100644 (file)

index 0000000..1d64e79
--- /dev/null
+++ b/tests/data/dataset/subread_dataset1.xml
@@ -0,0 +1,76 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.SubreadSet" Name="DataSet_SubreadSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0"  xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd" xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd" >
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource Name="First Subreads BAM" Description="Points to an example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads0.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads0.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+        <pbbase:ExternalResource Name="Second Subreads BAM" Description="Points to another example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads1.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads0.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+    </pbbase:ExternalResources>
+    <pbds:Filters>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="rq" Value="0.75" Operator=">"/>
+            </pbbase:Properties>
+        </pbds:Filter>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="QNAME" Value="100/0/0_100" Operator="=="/>
+            </pbbase:Properties>
+        </pbds:Filter>
+    </pbds:Filters>
+    <pbds:DataSetMetadata>
+        <pbds:TotalLength>500000</pbds:TotalLength>
+        <pbds:NumRecords>500</pbds:NumRecords>
+        <pbmeta:Collections>
+            <pbmeta:CollectionMetadata Context="m152720_092723_00114_c100480560100000001823075906281381_s1_p0" InstrumentName="RS" InstrumentId="43210">
+                <pbmeta:InstCtrlVer>2.3.0.0.140640</pbmeta:InstCtrlVer>
+                <pbmeta:SigProcVer>NRT@172.31.128.10:8082, SwVer=2300.140640, HwVer=1.0</pbmeta:SigProcVer>
+                <pbmeta:RunDetails>
+                    <pbmeta:RunId>e903682f-e502-465c-a2b6-9dd77c9f43fc</pbmeta:RunId>
+                    <pbmeta:Name>beta4_130726_biotin_DEV_vs_MFG_PB11K_9458p</pbmeta:Name>
+                </pbmeta:RunDetails>
+                <pbmeta:WellSample Name="Well Sample 1" UniqueId="aaa2df90-d44f-4a48-9f35-3b99473c68f5">
+                    <pbmeta:PlateId>2014-12-24_141_NGAT_Igor_bisPNA Enrichment_Mag Bead Elution Buffers</pbmeta:PlateId>
+                    <pbmeta:WellName>B01</pbmeta:WellName>
+                    <pbmeta:Concentration>10</pbmeta:Concentration>
+                    <pbmeta:SampleReuseEnabled>true</pbmeta:SampleReuseEnabled>
+                    <pbmeta:StageHotstartEnabled>true</pbmeta:StageHotstartEnabled>
+                    <pbmeta:SizeSelectionEnabled>true</pbmeta:SizeSelectionEnabled>
+                    <pbmeta:UseCount>0</pbmeta:UseCount>
+                    <pbmeta:Comments>Lorem ipsum</pbmeta:Comments>
+                    <pbsample:BioSamplePointers>
+                        <pbsample:BioSamplePointer>abc2df90-d44f-4a48-9f35-3b99473c68f5</pbsample:BioSamplePointer>
+                    </pbsample:BioSamplePointers>
+                </pbmeta:WellSample>
+                <pbmeta:Automation>
+                    <pbbase:AutomationParameters>
+                                   <pbbase:AutomationParameter/>
+                    </pbbase:AutomationParameters>
+                </pbmeta:Automation>
+                <pbmeta:CollectionNumber>0</pbmeta:CollectionNumber>
+                <pbmeta:CellIndex>0</pbmeta:CellIndex>
+                <pbmeta:CellPac Barcode="100480560100000001823075906281381"/>
+                <pbmeta:Primary>
+                    <pbmeta:AutomationName>BasecallerV1</pbmeta:AutomationName>
+                    <pbmeta:ConfigFileName>1-3-0_Standard_C2.xml</pbmeta:ConfigFileName>
+                    <pbmeta:SequencingCondition/>
+                    <pbmeta:ResultsFolder>Analysis_Results</pbmeta:ResultsFolder>
+                    <pbmeta:CollectionPathUri>rsy://mp-rsync/vol56//RS_DATA_STAGING//2014-12-24_141_NGAT_Igor_bisPNA%20Enrichment_Mag%20Bead%20Elution%20Buffers_1094/B01_1</pbmeta:CollectionPathUri>
+                    <pbmeta:CopyFiles>
+                        <pbmeta:CollectionFileCopy>Bam</pbmeta:CollectionFileCopy>
+                    </pbmeta:CopyFiles>
+                </pbmeta:Primary>
+            </pbmeta:CollectionMetadata>
+        </pbmeta:Collections>
+        <pbsample:BioSamples>
+            <pbsample:BioSample UniqueId="abc2df90-d44f-4a48-9f35-3b99473c68f5" Name="consectetur purus" Description="Risus sit amet lectus vehicula vulputate quisque porta accumsan venenatis." CreatedAt="2015-01-20T13:27:23.9271737-08:00"/>
+        </pbsample:BioSamples>
+    </pbds:DataSetMetadata>
+</pbds:SubreadSet>
+<!-- TODO what do internal references look like?-->
diff --git a/tests/data/dataset/subread_dataset2.xml b/tests/data/dataset/subread_dataset2.xml

new file mode 100644 (file)

index 0000000..a395330
--- /dev/null
+++ b/tests/data/dataset/subread_dataset2.xml
@@ -0,0 +1,76 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.SubreadSet" Name="DataSet_SubreadSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd" xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd" >
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource Name="First Subreads BAM" Description="Points to an example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads2.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads2.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+        <pbbase:ExternalResource Name="Second Subreads BAM" Description="Points to another example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads3.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads3.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+    </pbbase:ExternalResources>
+    <pbds:Filters>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="rq" Value="0.75" Operator=">"/>
+            </pbbase:Properties>
+        </pbds:Filter>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="QNAME" Value="100/0/0_100" Operator="=="/>
+            </pbbase:Properties>
+        </pbds:Filter>
+    </pbds:Filters>
+    <pbds:DataSetMetadata>
+        <pbds:TotalLength>500000</pbds:TotalLength>
+        <pbds:NumRecords>500</pbds:NumRecords>
+        <pbmeta:Collections>
+            <pbmeta:CollectionMetadata Context="m152720_092723_00114_c100480560100000001823075906281381_s1_p0" InstrumentName="RS" InstrumentId="43210">
+                <pbmeta:InstCtrlVer>2.3.0.0.140640</pbmeta:InstCtrlVer>
+                <pbmeta:SigProcVer>NRT@172.31.128.10:8082, SwVer=2300.140640, HwVer=1.0</pbmeta:SigProcVer>
+                <pbmeta:RunDetails>
+                    <pbmeta:RunId>e903682f-e502-465c-a2b6-9dd77c9f43fc</pbmeta:RunId>
+                    <pbmeta:Name>beta4_130726_biotin_DEV_vs_MFG_PB11K_9458p</pbmeta:Name>
+                </pbmeta:RunDetails>
+                <pbmeta:WellSample Name="Well Sample 1" UniqueId="aaa2df90-d44f-4a48-9f35-3b99473c68f5">
+                    <pbmeta:PlateId>2014-12-24_141_NGAT_Igor_bisPNA Enrichment_Mag Bead Elution Buffers</pbmeta:PlateId>
+                    <pbmeta:WellName>B01</pbmeta:WellName>
+                    <pbmeta:Concentration>10</pbmeta:Concentration>
+                    <pbmeta:SampleReuseEnabled>true</pbmeta:SampleReuseEnabled>
+                    <pbmeta:StageHotstartEnabled>true</pbmeta:StageHotstartEnabled>
+                    <pbmeta:SizeSelectionEnabled>true</pbmeta:SizeSelectionEnabled>
+                    <pbmeta:UseCount>0</pbmeta:UseCount>
+                    <pbmeta:Comments>Lorem ipsum</pbmeta:Comments>
+                    <pbsample:BioSamplePointers>
+                        <pbsample:BioSamplePointer>abc2df90-d44f-4a48-9f35-3b99473c68f5</pbsample:BioSamplePointer>
+                    </pbsample:BioSamplePointers>
+                </pbmeta:WellSample>
+                <pbmeta:Automation>
+                    <pbbase:AutomationParameters>
+                            <pbbase:AutomationParameter/>
+                    </pbbase:AutomationParameters>
+                </pbmeta:Automation>
+                <pbmeta:CollectionNumber>0</pbmeta:CollectionNumber>
+                <pbmeta:CellIndex>0</pbmeta:CellIndex>
+                <pbmeta:CellPac Barcode="100480560100000001823075906281381"/>
+                <pbmeta:Primary>
+                    <pbmeta:AutomationName>BasecallerV1</pbmeta:AutomationName>
+                    <pbmeta:ConfigFileName>1-3-0_Standard_C2.xml</pbmeta:ConfigFileName>
+                    <pbmeta:SequencingCondition/>
+                    <pbmeta:ResultsFolder>Analysis_Results</pbmeta:ResultsFolder>
+                    <pbmeta:CollectionPathUri>rsy://mp-rsync/vol56//RS_DATA_STAGING//2014-12-24_141_NGAT_Igor_bisPNA%20Enrichment_Mag%20Bead%20Elution%20Buffers_1094/B01_1</pbmeta:CollectionPathUri>
+                    <pbmeta:CopyFiles>
+                        <pbmeta:CollectionFileCopy>Bam</pbmeta:CollectionFileCopy>
+                    </pbmeta:CopyFiles>
+                </pbmeta:Primary>
+            </pbmeta:CollectionMetadata>
+        </pbmeta:Collections>
+        <pbsample:BioSamples>
+            <pbsample:BioSample UniqueId="abc2df90-d44f-4a48-9f35-3b99473c68f5" Name="consectetur purus" Description="Risus sit amet lectus vehicula vulputate quisque porta accumsan venenatis." CreatedAt="2015-01-20T13:27:23.9271737-08:00"/>
+        </pbsample:BioSamples>
+    </pbds:DataSetMetadata>
+</pbds:SubreadSet>
+<!-- TODO what do internal references look like?-->
diff --git a/tests/data/dataset/subread_dataset3.xml b/tests/data/dataset/subread_dataset3.xml

new file mode 100644 (file)

index 0000000..91923a8
--- /dev/null
+++ b/tests/data/dataset/subread_dataset3.xml
@@ -0,0 +1,76 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.SubreadSet" Name="DataSet_SubreadSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0"  xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd" xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd" >
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource Name="First Subreads BAM" Description="Points to an example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads2.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads2.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+        <pbbase:ExternalResource Name="Second Subreads BAM" Description="Points to another example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads3.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads3.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+    </pbbase:ExternalResources>
+    <pbds:Filters>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="rq" Value="0.85" Operator=">"/>
+            </pbbase:Properties>
+        </pbds:Filter>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="QNAME" Value="100/0/0_100" Operator="=="/>
+            </pbbase:Properties>
+        </pbds:Filter>
+    </pbds:Filters>
+    <pbds:DataSetMetadata>
+        <pbds:TotalLength>500000</pbds:TotalLength>
+        <pbds:NumRecords>500</pbds:NumRecords>
+        <pbmeta:Collections>
+            <pbmeta:CollectionMetadata Context="m152720_092723_00114_c100480560100000001823075906281381_s1_p0" InstrumentName="RS" InstrumentId="43210">
+                <pbmeta:InstCtrlVer>2.3.0.0.140640</pbmeta:InstCtrlVer>
+                <pbmeta:SigProcVer>NRT@172.31.128.10:8082, SwVer=2300.140640, HwVer=1.0</pbmeta:SigProcVer>
+                <pbmeta:RunDetails>
+                    <pbmeta:RunId>e903682f-e502-465c-a2b6-9dd77c9f43fc</pbmeta:RunId>
+                    <pbmeta:Name>beta4_130726_biotin_DEV_vs_MFG_PB11K_9458p</pbmeta:Name>
+                </pbmeta:RunDetails>
+                <pbmeta:WellSample Name="Well Sample 1" UniqueId="aaa2df90-d44f-4a48-9f35-3b99473c68f5">
+                    <pbmeta:PlateId>2014-12-24_141_NGAT_Igor_bisPNA Enrichment_Mag Bead Elution Buffers</pbmeta:PlateId>
+                    <pbmeta:WellName>B01</pbmeta:WellName>
+                    <pbmeta:Concentration>10</pbmeta:Concentration>
+                    <pbmeta:SampleReuseEnabled>true</pbmeta:SampleReuseEnabled>
+                    <pbmeta:StageHotstartEnabled>true</pbmeta:StageHotstartEnabled>
+                    <pbmeta:SizeSelectionEnabled>true</pbmeta:SizeSelectionEnabled>
+                    <pbmeta:UseCount>0</pbmeta:UseCount>
+                    <pbmeta:Comments>Lorem ipsum</pbmeta:Comments>
+                    <pbsample:BioSamplePointers>
+                        <pbsample:BioSamplePointer>abc2df90-d44f-4a48-9f35-3b99473c68f5</pbsample:BioSamplePointer>
+                    </pbsample:BioSamplePointers>
+                </pbmeta:WellSample>
+                <pbmeta:Automation>
+                    <pbbase:AutomationParameters>
+                                   <pbbase:AutomationParameter/>
+                    </pbbase:AutomationParameters>
+                </pbmeta:Automation>
+                <pbmeta:CollectionNumber>0</pbmeta:CollectionNumber>
+                <pbmeta:CellIndex>0</pbmeta:CellIndex>
+                <pbmeta:CellPac Barcode="100480560100000001823075906281381"/>
+                <pbmeta:Primary>
+                    <pbmeta:AutomationName>BasecallerV1</pbmeta:AutomationName>
+                    <pbmeta:ConfigFileName>1-3-0_Standard_C2.xml</pbmeta:ConfigFileName>
+                    <pbmeta:SequencingCondition/>
+                    <pbmeta:ResultsFolder>Analysis_Results</pbmeta:ResultsFolder>
+                    <pbmeta:CollectionPathUri>rsy://mp-rsync/vol56//RS_DATA_STAGING//2014-12-24_141_NGAT_Igor_bisPNA%20Enrichment_Mag%20Bead%20Elution%20Buffers_1094/B01_1</pbmeta:CollectionPathUri>
+                    <pbmeta:CopyFiles>
+                        <pbmeta:CollectionFileCopy>Bam</pbmeta:CollectionFileCopy>
+                    </pbmeta:CopyFiles>
+                </pbmeta:Primary>
+            </pbmeta:CollectionMetadata>
+        </pbmeta:Collections>
+        <pbsample:BioSamples>
+            <pbsample:BioSample UniqueId="abc2df90-d44f-4a48-9f35-3b99473c68f5" Name="consectetur purus" Description="Risus sit amet lectus vehicula vulputate quisque porta accumsan venenatis." CreatedAt="2015-01-20T13:27:23.9271737-08:00"/>
+        </pbsample:BioSamples>
+    </pbds:DataSetMetadata>
+</pbds:SubreadSet>
+<!-- TODO what do internal references look like?-->
diff --git a/tests/data/dataset/transformed_rs_subread_dataset.xml b/tests/data/dataset/transformed_rs_subread_dataset.xml

new file mode 100644 (file)

index 0000000..465d9a6
--- /dev/null
+++ b/tests/data/dataset/transformed_rs_subread_dataset.xml
@@ -0,0 +1,75 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:HdfSubreadSet 
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xmlns:bax="http://whatever"
+    xmlns:fn="http://www.w3.org/2005/xpath-functions"
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd" 
+    xmlns:uuid="java:java.util.UUID" 
+    xmlns:xs="http://www.w3.org/2001/XMLSchema"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+    xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    Name="Subreads from run r001173_42129_130607"
+    MetaType="PacBio.DataSet.SubreadSet"
+    Tags="pacbio.secondary.instrument=RS"
+    Version="0.5"
+    UniqueId="abbc9183-b01e-4671-8c12-19efee534647">
+   <pbbase:ExternalResources>
+      <pbbase:ExternalResource MetaType="PacBio.SubreadFile.BaxFile"
+          ResourceId="file:///mnt/secondary-siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.0.bax.h5"/>
+      <pbbase:ExternalResource MetaType="PacBio.SubreadFile.BaxFile"
+          ResourceId="file:///mnt/secondary-siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.1.bax.h5"/>
+      <pbbase:ExternalResource MetaType="PacBio.SubreadFile.BaxFile"
+          ResourceId="file:///mnt/secondary-siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.2.bax.h5"/>
+   </pbbase:ExternalResources>
+   <pbds:DataSetMetadata>
+      <pbds:TotalLength>50000000</pbds:TotalLength>
+      <pbds:NumRecords>150000</pbds:NumRecords>
+      <pbmeta:Collections>
+         <pbmeta:CollectionMetadata Context="m130608_033634_42129_c100515232550000001823076608221351_s1_p0"
+                             InstrumentName="42129"
+                             InstrumentId="1">
+            <pbmeta:InstCtrlVer>2.0.1.0.124174</pbmeta:InstCtrlVer>
+            <pbmeta:SigProcVer>NRT@172.31.128.10:8082, SwVer=2010.124174, HwVer=1.0</pbmeta:SigProcVer>
+            <pbmeta:RunDetails>
+               <pbmeta:RunId>r001173_42129_130607</pbmeta:RunId>
+               <pbmeta:Name>2013-06-07_42129_10kb_Ecoli_201-validation_2</pbmeta:Name>
+            </pbmeta:RunDetails>
+            <pbmeta:WellSample Name="P4-C2_Ecoli_10kb_MBS_stageHS">
+               <pbmeta:PlateId>2013-06-07_42129_10kb_Ecoli_201-validation_2</pbmeta:PlateId>
+               <pbmeta:WellName>P4-C2_Ecoli_10kb_MBS_stageHS</pbmeta:WellName>
+               <pbmeta:Concentration>0</pbmeta:Concentration>
+               <pbmeta:SampleReuseEnabled>false</pbmeta:SampleReuseEnabled>
+               <pbmeta:StageHotstartEnabled>true</pbmeta:StageHotstartEnabled>
+               <pbmeta:SizeSelectionEnabled>
+                                    false
+                                                               </pbmeta:SizeSelectionEnabled>
+               <pbmeta:UseCount>1</pbmeta:UseCount>
+               <pbmeta:Comments>P4-C2_Ecoli_10kb_MBS_stageHS</pbmeta:Comments>
+               <pbsample:BioSamplePointers>
+                  <pbsample:BioSamplePointer>abafd4ed-5cf7-4b83-a869-1a5d239d30e2</pbsample:BioSamplePointer>
+               </pbsample:BioSamplePointers>
+            </pbmeta:WellSample>
+            <pbmeta:AutomationName>MagBead Standard Seq v2</pbmeta:AutomationName>
+            <pbmeta:CollectionNumber>2</pbmeta:CollectionNumber>
+            <pbmeta:CellIndex>1</pbmeta:CellIndex>
+            <pbmeta:CellPac Barcode="10051523255000000182307660822135"/>
+            <pbmeta:Primary>
+               <pbmeta:AutomationName>BasecallerV1</pbmeta:AutomationName>
+               <pbmeta:ConfigFileName>2-0-0_P4-C2.xml</pbmeta:ConfigFileName>
+               <pbmeta:SequencingCondition/>
+               <pbmeta:ResultsFolder>Analysis_Results</pbmeta:ResultsFolder>
+               <pbmeta:CollectionPathUri>rsy://mp-f030-io/vol54//RS_DATA_STAGING/42129/2013-06-07_42129_10kb_Ecoli_201-validation_2_1173/A01_2/</pbmeta:CollectionPathUri>
+               <pbmeta:CopyFiles>
+                  <pbmeta:CollectionFileCopy>Fasta</pbmeta:CollectionFileCopy>
+               </pbmeta:CopyFiles>
+            </pbmeta:Primary>
+         </pbmeta:CollectionMetadata>
+      </pbmeta:Collections>
+      <pbsample:BioSamples>
+         <pbsample:BioSample Name="P4-C2_Ecoli_10kb_MBS_stageHS" Description="P4-C2_Ecoli_10kb_MBS_stageHS"
+                    UniqueId="abafd4ed-5cf7-4b83-a869-1a5d239d30e2"/>
+      </pbsample:BioSamples>
+   </pbds:DataSetMetadata>
+</pbds:HdfSubreadSet>
diff --git a/tests/data/empty.bam b/tests/data/empty.bam

new file mode 100644 (file)

index 0000000..1b22456

Binary files /dev/null and b/tests/data/empty.bam differ
diff --git a/tests/data/empty.bam.pbi b/tests/data/empty.bam.pbi

new file mode 100644 (file)

index 0000000..e398d79

Binary files /dev/null and b/tests/data/empty.bam.pbi differ
diff --git a/tests/data/group/group.fofn.in b/tests/data/group/group.fofn.in

new file mode 100644 (file)

index 0000000..c2621c5
--- /dev/null
+++ b/tests/data/group/group.fofn.in
@@ -0,0 +1,3 @@
+@PacBioBAM_TestsDir@/data/group/test1.bam
+@PacBioBAM_TestsDir@/data/group/test2.bam
+@PacBioBAM_TestsDir@/data/group/test3.bam
diff --git a/tests/data/group/test1.bam b/tests/data/group/test1.bam

new file mode 100644 (file)

index 0000000..2ba687b

Binary files /dev/null and b/tests/data/group/test1.bam differ
diff --git a/tests/data/group/test2.bam b/tests/data/group/test2.bam

new file mode 100644 (file)

index 0000000..9e22b30

Binary files /dev/null and b/tests/data/group/test2.bam differ
diff --git a/tests/data/group/test2.bam.pbi b/tests/data/group/test2.bam.pbi

new file mode 100644 (file)

index 0000000..761600b

Binary files /dev/null and b/tests/data/group/test2.bam.pbi differ
diff --git a/tests/data/group/test3.bam b/tests/data/group/test3.bam

new file mode 100644 (file)

index 0000000..093e93a

Binary files /dev/null and b/tests/data/group/test3.bam differ
diff --git a/tests/data/lambdaNEB.fa b/tests/data/lambdaNEB.fa

new file mode 100644 (file)

index 0000000..33011e5
--- /dev/null
+++ b/tests/data/lambdaNEB.fa
@@ -0,0 +1,608 @@
+>lambda_NEB3011
+GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTA
+ATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGACAGGTGCTGAAAGCGAGCTTTTTGGCCTCTGTCGTTTCC
+TTTCTCTGTTTTTGTCCGTGGAATGAACAATGGAAGTCAACAAAAAGCAGCTGGCTGACATTTTCGGTGCGAGTATCCGT
+ACCATTCAGAACTGGCAGGAACAGGGAATGCCCGTTCTGCGAGGCGGTGGCAAGGGTAATGAGGTGCTTTATGACTCTGC
+CGCCGTCATAAAATGGTATGCCGAAAGGGATGCTGAAATTGAGAACGAAAAGCTGCGCCGGGAGGTTGAAGAACTGCGGC
+AGGCCAGCGAGGCAGATCTCCAGCCAGGAACTATTGAGTACGAACGCCATCGACTTACGCGTGCGCAGGCCGACGCACAG
+GAACTGAAGAATGCCAGAGACTCCGCTGAAGTGGTGGAAACCGCATTCTGTACTTTCGTGCTGTCGCGGATCGCAGGTGA
+AATTGCCAGTATTCTCGACGGGCTCCCCCTGTCGGTGCAGCGGCGTTTTCCGGAACTGGAAAACCGACATGTTGATTTCC
+TGAAACGGGATATCATCAAAGCCATGAACAAAGCAGCCGCGCTGGATGAACTGATACCGGGGTTGCTGAGTGAATATATC
+GAACAGTCAGGTTAACAGGCTGCGGCATTTTGTCCGCGCCGGGCTTCGCTCACTGTTCAGGCCGGAGCCACAGACCGCCG
+TTGAATGGGCGGATGCTAATTACTATCTCCCGAAAGAATCCGCATACCAGGAAGGGCGCTGGGAAACACTGCCCTTTCAG
+CGGGCCATCATGAATGCGATGGGCAGCGACTACATCCGTGAGGTGAATGTGGTGAAGTCTGCCCGTGTCGGTTATTCCAA
+AATGCTGCTGGGTGTTTATGCCTACTTTATAGAGCATAAGCAGCGCAACACCCTTATCTGGTTGCCGACGGATGGTGATG
+CCGAGAACTTTATGAAAACCCACGTTGAGCCGACTATTCGTGATATTCCGTCGCTGCTGGCGCTGGCCCCGTGGTATGGC
+AAAAAGCACCGGGATAACACGCTCACCATGAAGCGTTTCACTAATGGGCGTGGCTTCTGGTGCCTGGGCGGTAAAGCGGC
+AAAAAACTACCGTGAAAAGTCGGTGGATGTGGCGGGTTATGATGAACTTGCTGCTTTTGATGATGATATTGAACAGGAAG
+GCTCTCCGACGTTCCTGGGTGACAAGCGTATTGAAGGCTCGGTCTGGCCAAAGTCCATCCGTGGCTCCACGCCAAAAGTG
+AGAGGCACCTGTCAGATTGAGCGTGCAGCCAGTGAATCCCCGCATTTTATGCGTTTTCATGTTGCCTGCCCGCATTGCGG
+GGAGGAGCAGTATCTTAAATTTGGCGACAAAGAGACGCCGTTTGGCCTCAAATGGACGCCGGATGACCCCTCCAGCGTGT
+TTTATCTCTGCGAGCATAATGCCTGCGTCATCCGCCAGCAGGAGCTGGACTTTACTGATGCCCGTTATATCTGCGAAAAG
+ACCGGGATCTGGACCCGTGATGGCATTCTCTGGTTTTCGTCATCCGGTGAAGAGATTGAGCCACCTGACAGTGTGACCTT
+TCACATCTGGACAGCGTACAGCCCGTTCACCACCTGGGTGCAGATTGTCAAAGACTGGATGAAAACGAAAGGGGATACGG
+GAAAACGTAAAACCTTCGTAAACACCACGCTCGGTGAGACGTGGGAGGCGAAAATTGGCGAACGTCCGGATGCTGAAGTG
+ATGGCAGAGCGGAAAGAGCATTATTCAGCGCCCGTTCCTGACCGTGTGGCTTACCTGACCGCCGGTATCGACTCCCAGCT
+GGACCGCTACGAAATGCGCGTATGGGGATGGGGGCCGGGTGAGGAAAGCTGGCTGATTGACCGGCAGATTATTATGGGCC
+GCCACGACGATGAACAGACGCTGCTGCGTGTGGATGAGGCCATCAATAAAACCTATACCCGCCGGAATGGTGCAGAAATG
+TCGATATCCCGTATCTGCTGGGATACTGGCGGGATTGACCCGACCATTGTGTATGAACGCTCGAAAAAACATGGGCTGTT
+CCGGGTGATCCCCATTAAAGGGGCATCCGTCTACGGAAAGCCGGTGGCCAGCATGCCACGTAAGCGAAACAAAAACGGGG
+TTTACCTTACCGAAATCGGTACGGATACCGCGAAAGAGCAGATTTATAACCGCTTCACACTGACGCCGGAAGGGGATGAA
+CCGCTTCCCGGTGCCGTTCACTTCCCGAATAACCCGGATATTTTTGATCTGACCGAAGCGCAGCAGCTGACTGCTGAAGA
+GCAGGTCGAAAAATGGGTGGATGGCAGGAAAAAAATACTGTGGGACAGCAAAAAGCGACGCAATGAGGCACTCGACTGCT
+TCGTTTATGCGCTGGCGGCGCTGCGCATCAGTATTTCCCGCTGGCAGCTGGATCTCAGTGCGCTGCTGGCGAGCCTGCAG
+GAAGAGGATGGTGCAGCAACCAACAAGAAAACACTGGCAGATTACGCCCGTGCCTTATCCGGAGAGGATGAATGACGCGA
+CAGGAAGAACTTGCCGCTGCCCGTGCGGCACTGCATGACCTGATGACAGGTAAACGGGTGGCAACAGTACAGAAAGACGG
+ACGAAGGGTGGAGTTTACGGCCACTTCCGTGTCTGACCTGAAAAAATATATTGCAGAGCTGGAAGTGCAGACCGGCATGA
+CACAGCGACGCAGGGGACCTGCAGGATTTTATGTATGAAAACGCCCACCATTCCCACCCTTCTGGGGCCGGACGGCATGA
+CATCGCTGCGCGAATATGCCGGTTATCACGGCGGTGGCAGCGGATTTGGAGGGCAGTTGCGGTCGTGGAACCCACCGAGT
+GAAAGTGTGGATGCAGCCCTGTTGCCCAACTTTACCCGTGGCAATGCCCGCGCAGACGATCTGGTACGCAATAACGGCTA
+TGCCGCCAACGCCATCCAGCTGCATCAGGATCATATCGTCGGGTCTTTTTTCCGGCTCAGTCATCGCCCAAGCTGGCGCT
+ATCTGGGCATCGGGGAGGAAGAAGCCCGTGCCTTTTCCCGCGAGGTTGAAGCGGCATGGAAAGAGTTTGCCGAGGATGAC
+TGCTGCTGCATTGACGTTGAGCGAAAACGCACGTTTACCATGATGATTCGGGAAGGTGTGGCCATGCACGCCTTTAACGG
+TGAACTGTTCGTTCAGGCCACCTGGGATACCAGTTCGTCGCGGCTTTTCCGGACACAGTTCCGGATGGTCAGCCCGAAGC
+GCATCAGCAACCCGAACAATACCGGCGACAGCCGGAACTGCCGTGCCGGTGTGCAGATTAATGACAGCGGTGCGGCGCTG
+GGATATTACGTCAGCGAGGACGGGTATCCTGGCTGGATGCCGCAGAAATGGACATGGATACCCCGTGAGTTACCCGGCGG
+GCGCGCCTCGTTCATTCACGTTTTTGAACCCGTGGAGGACGGGCAGACTCGCGGTGCAAATGTGTTTTACAGCGTGATGG
+AGCAGATGAAGATGCTCGACACGCTGCAGAACACGCAGCTGCAGAGCGCCATTGTGAAGGCGATGTATGCCGCCACCATT
+GAGAGTGAGCTGGATACGCAGTCAGCGATGGATTTTATTCTGGGCGCGAACAGTCAGGAGCAGCGGGAAAGGCTGACCGG
+CTGGATTGGTGAAATTGCCGCGTATTACGCCGCAGCGCCGGTCCGGCTGGGAGGCGCAAAAGTACCGCACCTGATGCCGG
+GTGACTCACTGAACCTGCAGACGGCTCAGGATACGGATAACGGCTACTCCGTGTTTGAGCAGTCACTGCTGCGGTATATC
+GCTGCCGGGCTGGGTGTCTCGTATGAGCAGCTTTCCCGGAATTACGCCCAGATGAGCTACTCCACGGCACGGGCCAGTGC
+GAACGAGTCGTGGGCGTACTTTATGGGGCGGCGAAAATTCGTCGCATCCCGTCAGGCGAGCCAGATGTTTCTGTGCTGGC
+TGGAAGAGGCCATCGTTCGCCGCGTGGTGACGTTACCTTCAAAAGCGCGCTTCAGTTTTCAGGAAGCCCGCAGTGCCTGG
+GGGAACTGCGACTGGATAGGCTCCGGTCGTATGGCCATCGATGGTCTGAAAGAAGTTCAGGAAGCGGTGATGCTGATAGA
+AGCCGGACTGAGTACCTACGAGAAAGAGTGCGCAAAACGCGGTGACGACTATCAGGAAATTTTTGCCCAGCAGGTCCGTG
+AAACGATGGAGCGCCGTGCAGCCGGTCTTAAACCGCCCGCCTGGGCGGCTGCAGCATTTGAATCCGGGCTGCGACAATCA
+ACAGAGGAGGAGAAGAGTGACAGCAGAGCTGCGTAATCTCCCGCATATTGCCAGCATGGCCTTTAATGAGCCGCTGATGC
+TTGAACCCGCCTATGCGCGGGTTTTCTTTTGTGCGCTTGCAGGCCAGCTTGGGATCAGCAGCCTGACGGATGCGGTGTCC
+GGCGACAGCCTGACTGCCCAGGAGGCACTCGCGACGCTGGCATTATCCGGTGATGATGACGGACCACGACAGGCCCGCAG
+TTATCAGGTCATGAACGGCATCGCCGTGCTGCCGGTGTCCGGCACGCTGGTCAGCCGGACGCGGGCGCTGCAGCCGTACT
+CGGGGATGACCGGTTACAACGGCATTATCGCCCGTCTGCAACAGGCTGCCAGCGATCCGATGGTGGACGGCATTCTGCTC
+GATATGGACACGCCCGGCGGGATGGTGGCGGGGGCATTTGACTGCGCTGACATCATCGCCCGTGTGCGTGACATAAAACC
+GGTATGGGCGCTTGCCAACGACATGAACTGCAGTGCAGGTCAGTTGCTTGCCAGTGCCGCCTCCCGGCGTCTGGTCACGC
+AGACCGCCCGGACAGGCTCCATCGGCGTCATGATGGCTCACAGTAATTACGGTGCTGCGCTGGAGAAACAGGGTGTGGAA
+ATCACGCTGATTTACAGCGGCAGCCATAAGGTGGATGGCAACCCCTACAGCCATCTTCCGGATGACGTCCGGGAGACACT
+GCAGTCCCGGATGGACGCAACCCGCCAGATGTTTGCGCAGAAGGTGTCGGCATATACCGGCCTGTCCGTGCAGGTTGTGC
+TGGATACCGAGGCTGCAGTGTACAGCGGTCAGGAGGCCATTGATGCCGGACTGGCTGATGAACTTGTTAACAGCACCGAT
+GCGATCACCGTCATGCGTGATGCACTGGATGCACGTAAATCCCGTCTCTCAGGAGGGCGAATGACCAAAGAGACTCAATC
+AACAACTGTTTCAGCCACTGCTTCGCAGGCTGACGTTACTGACGTGGTGCCAGCGACGGAGGGCGAGAACGCCAGCGCGG
+CGCAGCCGGACGTGAACGCGCAGATCACCGCAGCGGTTGCGGCAGAAAACAGCCGCATTATGGGGATCCTCAACTGTGAG
+GAGGCTCACGGACGCGAAGAACAGGCACGCGTGCTGGCAGAAACCCCCGGTATGACCGTGAAAACGGCCCGCCGCATTCT
+GGCCGCAGCACCACAGAGTGCACAGGCGCGCAGTGACACTGCGCTGGATCGTCTGATGCAGGGGGCACCGGCACCGCTGG
+CTGCAGGTAACCCGGCATCTGATGCCGTTAACGATTTGCTGAACACACCAGTGTAAGGGATGTTTATGACGAGCAAAGAA
+ACCTTTACCCATTACCAGCCGCAGGGCAACAGTGACCCGGCTCATACCGCAACCGCGCCCGGCGGATTGAGTGCGAAAGC
+GCCTGCAATGACCCCGCTGATGCTGGACACCTCCAGCCGTAAGCTGGTTGCGTGGGATGGCACCACCGACGGTGCTGCCG
+TTGGCATTCTTGCGGTTGCTGCTGACCAGACCAGCACCACGCTGACGTTCTACAAGTCCGGCACGTTCCGTTATGAGGAT
+GTGCTCTGGCCGGAGGCTGCCAGCGACGAGACGAAAAAACGGACCGCGTTTGCCGGAACGGCAATCAGCATCGTTTAACT
+TTACCCTTCATCACTAAAGGCCGCCTGTGCGGCTTTTTTTACGGGATTTTTTTATGTCGATGTACACAACCGCCCAACTG
+CTGGCGGCAAATGAGCAGAAATTTAAGTTTGATCCGCTGTTTCTGCGTCTCTTTTTCCGTGAGAGCTATCCCTTCACCAC
+GGAGAAAGTCTATCTCTCACAAATTCCGGGACTGGTAAACATGGCGCTGTACGTTTCGCCGATTGTTTCCGGTGAGGTTA
+TCCGTTCCCGTGGCGGCTCCACCTCTGAATTTACGCCGGGATATGTCAAGCCGAAGCATGAAGTGAATCCGCAGATGACC
+CTGCGTCGCCTGCCGGATGAAGATCCGCAGAATCTGGCGGACCCGGCTTACCGCCGCCGTCGCATCATCATGCAGAACAT
+GCGTGACGAAGAGCTGGCCATTGCTCAGGTCGAAGAGATGCAGGCAGTTTCTGCCGTGCTTAAGGGCAAATACACCATGA
+CCGGTGAAGCCTTCGATCCGGTTGAGGTGGATATGGGCCGCAGTGAGGAGAATAACATCACGCAGTCCGGCGGCACGGAG
+TGGAGCAAGCGTGACAAGTCCACGTATGACCCGACCGACGATATCGAAGCCTACGCGCTGAACGCCAGCGGTGTGGTGAA
+TATCATCGTGTTCGATCCGAAAGGCTGGGCGCTGTTCCGTTCCTTCAAAGCCGTCAAGGAGAAGCTGGATACCCGTCGTG
+GCTCTAATTCCGAGCTGGAGACAGCGGTGAAAGACCTGGGCAAAGCGGTGTCCTATAAGGGGATGTATGGCGATGTGGCC
+ATCGTCGTGTATTCCGGACAGTACGTGGAAAACGGCGTCAAAAAGAACTTCCTGCCGGACAACACGATGGTGCTGGGGAA
+CACTCAGGCACGCGGTCTGCGCACCTATGGCTGCATTCAGGATGCGGACGCACAGCGCGAAGGCATTAACGCCTCTGCCC
+GTTACCCGAAAAACTGGGTGACCACCGGCGATCCGGCGCGTGAGTTCACCATGATTCAGTCAGCACCGCTGATGCTGCTG
+GCTGACCCTGATGAGTTCGTGTCCGTACAACTGGCGTAATCATGGCCCTTCGGGGCCATTGTTTCTCTGTGGAGGAGTCC
+ATGACGAAAGATGAACTGATTGCCCGTCTCCGCTCGCTGGGTGAACAACTGAACCGTGATGTCAGCCTGACGGGGACGAA
+AGAAGAACTGGCGCTCCGTGTGGCAGAGCTGAAAGAGGAGCTTGATGACACGGATGAAACTGCCGGTCAGGACACCCCTC
+TCAGCCGGGAAAATGTGCTGACCGGACATGAAAATGAGGTGGGATCAGCGCAGCCGGATACCGTGATTCTGGATACGTCT
+GAACTGGTCACGGTCGTGGCACTGGTGAAGCTGCATACTGATGCACTTCACGCCACGCGGGATGAACCTGTGGCATTTGT
+GCTGCCGGGAACGGCGTTTCGTGTCTCTGCCGGTGTGGCAGCCGAAATGACAGAGCGCGGCCTGGCCAGAATGCAATAAC
+GGGAGGCGCTGTGGCTGATTTCGATAACCTGTTCGATGCTGCCATTGCCCGCGCCGATGAAACGATACGCGGGTACATGG
+GAACGTCAGCCACCATTACATCCGGTGAGCAGTCAGGTGCGGTGATACGTGGTGTTTTTGATGACCCTGAAAATATCAGC
+TATGCCGGACAGGGCGTGCGCGTTGAAGGCTCCAGCCCGTCCCTGTTTGTCCGGACTGATGAGGTGCGGCAGCTGCGGCG
+TGGAGACACGCTGACCATCGGTGAGGAAAATTTCTGGGTAGATCGGGTTTCGCCGGATGATGGCGGAAGTTGTCATCTCT
+GGCTTGGACGGGGCGTACCGCCTGCCGTTAACCGTCGCCGCTGAAAGGGGGATGTATGGCCATAAAAGGTCTTGAGCAGG
+CCGTTGAAAACCTCAGCCGTATCAGCAAAACGGCGGTGCCTGGTGCCGCCGCAATGGCCATTAACCGCGTTGCTTCATCC
+GCGATATCGCAGTCGGCGTCACAGGTTGCCCGTGAGACAAAGGTACGCCGGAAACTGGTAAAGGAAAGGGCCAGGCTGAA
+AAGGGCCACGGTCAAAAATCCGCAGGCCAGAATCAAAGTTAACCGGGGGGATTTGCCCGTAATCAAGCTGGGTAATGCGC
+GGGTTGTCCTTTCGCGCCGCAGGCGTCGTAAAAAGGGGCAGCGTTCATCCCTGAAAGGTGGCGGCAGCGTGCTTGTGGTG
+GGTAACCGTCGTATTCCCGGCGCGTTTATTCAGCAACTGAAAAATGGCCGGTGGCATGTCATGCAGCGTGTGGCTGGGAA
+AAACCGTTACCCCATTGATGTGGTGAAAATCCCGATGGCGGTGCCGCTGACCACGGCGTTTAAACAAAATATTGAGCGGA
+TACGGCGTGAACGTCTTCCGAAAGAGCTGGGCTATGCGCTGCAGCATCAACTGAGGATGGTAATAAAGCGATGAAACATA
+CTGAACTCCGTGCAGCCGTACTGGATGCACTGGAGAAGCATGACACCGGGGCGACGTTTTTTGATGGTCGCCCCGCTGTT
+TTTGATGAGGCGGATTTTCCGGCAGTTGCCGTTTATCTCACCGGCGCTGAATACACGGGCGAAGAGCTGGACAGCGATAC
+CTGGCAGGCGGAGCTGCATATCGAAGTTTTCCTGCCTGCTCAGGTGCCGGATTCAGAGCTGGATGCGTGGATGGAGTCCC
+GGATTTATCCGGTGATGAGCGATATCCCGGCACTGTCAGATTTGATCACCAGTATGGTGGCCAGCGGCTATGACTACCGG
+CGCGACGATGATGCGGGCTTGTGGAGTTCAGCCGATCTGACTTATGTCATTACCTATGAAATGTGAGGACGCTATGCCTG
+TACCAAATCCTACAATGCCGGTGAAAGGTGCCGGGACCACCCTGTGGGTTTATAAGGGGAGCGGTGACCCTTACGCGAAT
+CCGCTTTCAGACGTTGACTGGTCGCGTCTGGCAAAAGTTAAAGACCTGACGCCCGGCGAACTGACCGCTGAGTCCTATGA
+CGACAGCTATCTCGATGATGAAGATGCAGACTGGACTGCGACCGGGCAGGGGCAGAAATCTGCCGGAGATACCAGCTTCA
+CGCTGGCGTGGATGCCCGGAGAGCAGGGGCAGCAGGCGCTGCTGGCGTGGTTTAATGAAGGCGATACCCGTGCCTATAAA
+ATCCGCTTCCCGAACGGCACGGTCGATGTGTTCCGTGGCTGGGTCAGCAGTATCGGTAAGGCGGTGACGGCGAAGGAAGT
+GATCACCCGCACGGTGAAAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGTAACAGCGGCAACCG
+GCATGACCGTGACGCCTGCCAGCACCTCGGTGGTGAAAGGGCAGAGCACCACGCTGACCGTGGCCTTCCAGCCGGAGGGC
+GTAACCGACAAGAGCTTTCGTGCGGTGTCTGCGGATAAAACAAAAGCCACCGTGTCGGTCAGTGGTATGACCATCACCGT
+GAACGGCGTTGCTGCAGGCAAGGTCAACATTCCGGTTGTATCCGGTAATGGTGAGTTTGCTGCGGTTGCAGAAATTACCG
+TCACCGCCAGTTAATCCGGAGAGTCAGCGATGTTCCTGAAAACCGAATCATTTGAACATAACGGTGTGACCGTCACGCTT
+TCTGAACTGTCAGCCCTGCAGCGCATTGAGCATCTCGCCCTGATGAAACGGCAGGCAGAACAGGCGGAGTCAGACAGCAA
+CCGGAAGTTTACTGTGGAAGACGCCATCAGAACCGGCGCGTTTCTGGTGGCGATGTCCCTGTGGCATAACCATCCGCAGA
+AGACGCAGATGCCGTCCATGAATGAAGCCGTTAAACAGATTGAGCAGGAAGTGCTTACCACCTGGCCCACGGAGGCAATT
+TCTCATGCTGAAAACGTGGTGTACCGGCTGTCTGGTATGTATGAGTTTGTGGTGAATAATGCCCCTGAACAGACAGAGGA
+CGCCGGGCCCGCAGAGCCTGTTTCTGCGGGAAAGTGTTCGACGGTGAGCTGAGTTTTGCCCTGAAACTGGCGCGTGAGAT
+GGGGCGACCCGACTGGCGTGCCATGCTTGCCGGGATGTCATCCACGGAGTATGCCGACTGGCACCGCTTTTACAGTACCC
+ATTATTTTCATGATGTTCTGCTGGATATGCACTTTTCCGGGCTGACGTACACCGTGCTCAGCCTGTTTTTCAGCGATCCG
+GATATGCATCCGCTGGATTTCAGTCTGCTGAACCGGCGCGAGGCTGACGAAGAGCCTGAAGATGATGTGCTGATGCAGAA
+AGCGGCAGGGCTTGCCGGAGGTGTCCGCTTTGGCCCGGACGGGAATGAAGTTATCCCCGCTTCCCCGGATGTGGCGGACA
+TGACGGAGGATGACGTAATGCTGATGACAGTATCAGAAGGGATCGCAGGAGGAGTCCGGTATGGCTGAACCGGTAGGCGA
+TCTGGTCGTTGATTTGAGTCTGGATGCGGCCAGATTTGACGAGCAGATGGCCAGAGTCAGGCGTCATTTTTCTGGTACGG
+AAAGTGATGCGAAAAAAACAGCGGCAGTCGTTGAACAGTCGCTGAGCCGACAGGCGCTGGCTGCACAGAAAGCGGGGATT
+TCCGTCGGGCAGTATAAAGCCGCCATGCGTATGCTGCCTGCACAGTTCACCGACGTGGCCACGCAGCTTGCAGGCGGGCA
+AAGTCCGTGGCTGATCCTGCTGCAACAGGGGGGGCAGGTGAAGGACTCCTTCGGCGGGATGATCCCCATGTTCAGGGGGC
+TTGCCGGTGCGATCACCCTGCCGATGGTGGGGGCCACCTCGCTGGCGGTGGCGACCGGTGCGCTGGCGTATGCCTGGTAT
+CAGGGCAACTCAACCCTGTCCGATTTCAACAAAACGCTGGTCCTTTCCGGCAATCAGGCGGGACTGACGGCAGATCGTAT
+GCTGGTCCTGTCCAGAGCCGGGCAGGCGGCAGGGCTGACGTTTAACCAGACCAGCGAGTCACTCAGCGCACTGGTTAAGG
+CGGGGGTAAGCGGTGAGGCTCAGATTGCGTCCATCAGCCAGAGTGTGGCGCGTTTCTCCTCTGCATCCGGCGTGGAGGTG
+GACAAGGTCGCTGAAGCCTTCGGGAAGCTGACCACAGACCCGACGTCGGGGCTGACGGCGATGGCTCGCCAGTTCCATAA
+CGTGTCGGCGGAGCAGATTGCGTATGTTGCTCAGTTGCAGCGTTCCGGCGATGAAGCCGGGGCATTGCAGGCGGCGAACG
+AGGCCGCAACGAAAGGGTTTGATGACCAGACCCGCCGCCTGAAAGAGAACATGGGCACGCTGGAGACCTGGGCAGACAGG
+ACTGCGCGGGCATTCAAATCCATGTGGGATGCGGTGCTGGATATTGGTCGTCCTGATACCGCGCAGGAGATGCTGATTAA
+GGCAGAGGCTGCGTATAAGAAAGCAGACGACATCTGGAATCTGCGCAAGGATGATTATTTTGTTAACGATGAAGCGCGGG
+CGCGTTACTGGGATGATCGTGAAAAGGCCCGTCTTGCGCTTGAAGCCGCCCGAAAGAAGGCTGAGCAGCAGACTCAACAG
+GACAAAAATGCGCAGCAGCAGAGCGATACCGAAGCGTCACGGCTGAAATATACCGAAGAGGCGCAGAAGGCTTACGAACG
+GCTGCAGACGCCGCTGGAGAAATATACCGCCCGTCAGGAAGAACTGAACAAGGCACTGAAAGACGGGAAAATCCTGCAGG
+CGGATTACAACACGCTGATGGCGGCGGCGAAAAAGGATTATGAAGCGACGCTGAAAAAGCCGAAACAGTCCAGCGTGAAG
+GTGTCTGCGGGCGATCGTCAGGAAGACAGTGCTCATGCTGCCCTGCTGACGCTTCAGGCAGAACTCCGGACGCTGGAGAA
+GCATGCCGGAGCAAATGAGAAAATCAGCCAGCAGCGCCGGGATTTGTGGAAGGCGGAGAGTCAGTTCGCGGTACTGGAGG
+AGGCGGCGCAACGTCGCCAGCTGTCTGCACAGGAGAAATCCCTGCTGGCGCATAAAGATGAGACGCTGGAGTACAAACGC
+CAGCTGGCTGCACTTGGCGACAAGGTTACGTATCAGGAGCGCCTGAACGCGCTGGCGCAGCAGGCGGATAAATTCGCACA
+GCAGCAACGGGCAAAACGGGCCGCCATTGATGCGAAAAGCCGGGGGCTGACTGACCGGCAGGCAGAACGGGAAGCCACGG
+AACAGCGCCTGAAGGAACAGTATGGCGATAATCCGCTGGCGCTGAATAACGTCATGTCAGAGCAGAAAAAGACCTGGGCG
+GCTGAAGACCAGCTTCGCGGGAACTGGATGGCAGGCCTGAAGTCCGGCTGGAGTGAGTGGGAAGAGAGCGCCACGGACAG
+TATGTCGCAGGTAAAAAGTGCAGCCACGCAGACCTTTGATGGTATTGCACAGAATATGGCGGCGATGCTGACCGGCAGTG
+AGCAGAACTGGCGCAGCTTCACCCGTTCCGTGCTGTCCATGATGACAGAAATTCTGCTTAAGCAGGCAATGGTGGGGATT
+GTCGGGAGTATCGGCAGCGCCATTGGCGGGGCTGTTGGTGGCGGCGCATCCGCGTCAGGCGGTACAGCCATTCAGGCCGC
+TGCGGCGAAATTCCATTTTGCAACCGGAGGATTTACGGGAACCGGCGGCAAATATGAGCCAGCGGGGATTGTTCACCGTG
+GTGAGTTTGTCTTCACGAAGGAGGCAACCAGCCGGATTGGCGTGGGGAATCTTTACCGGCTGATGCGCGGCTATGCCACC
+GGCGGTTATGTCGGTACACCGGGCAGCATGGCAGACAGCCGGTCGCAGGCGTCCGGGACGTTTGAGCAGAATAACCATGT
+GGTGATTAACAACGACGGCACGAACGGGCAGATAGGTCCGGCTGCTCTGAAGGCGGTGTATGACATGGCCCGCAAGGGTG
+CCCGTGATGAAATTCAGACACAGATGCGTGATGGTGGCCTGTTCTCCGGAGGTGGACGATGAAGACCTTCCGCTGGAAAG
+TGAAACCCGGTATGGATGTGGCTTCGGTCCCTTCTGTAAGAAAGGTGCGCTTTGGTGATGGCTATTCTCAGCGAGCGCCT
+GCCGGGCTGAATGCCAACCTGAAAACGTACAGCGTGACGCTTTCTGTCCCCCGTGAGGAGGCCACGGTACTGGAGTCGTT
+TCTGGAAGAGCACGGGGGCTGGAAATCCTTTCTGTGGACGCCGCCTTATGAGTGGCGGCAGATAAAGGTGACCTGCGCAA
+AATGGTCGTCGCGGGTCAGTATGCTGCGTGTTGAGTTCAGCGCAGAGTTTGAACAGGTGGTGAACTGATGCAGGATATCC
+GGCAGGAAACACTGAATGAATGCACCCGTGCGGAGCAGTCGGCCAGCGTGGTGCTCTGGGAAATCGACCTGACAGAGGTC
+GGTGGAGAACGTTATTTTTTCTGTAATGAGCAGAACGAAAAAGGTGAGCCGGTCACCTGGCAGGGGCGACAGTATCAGCC
+GTATCCCATTCAGGGGAGCGGTTTTGAACTGAATGGCAAAGGCACCAGTACGCGCCCCACGCTGACGGTTTCTAACCTGT
+ACGGTATGGTCACCGGGATGGCGGAAGATATGCAGAGTCTGGTCGGCGGAACGGTGGTCCGGCGTAAGGTTTACGCCCGT
+TTTCTGGATGCGGTGAACTTCGTCAACGGAAACAGTTACGCCGATCCGGAGCAGGAGGTGATCAGCCGCTGGCGCATTGA
+GCAGTGCAGCGAACTGAGCGCGGTGAGTGCCTCCTTTGTACTGTCCACGCCGACGGAAACGGATGGCGCTGTTTTTCCGG
+GACGTATCATGCTGGCCAACACCTGCACCTGGACCTATCGCGGTGACGAGTGCGGTTATAGCGGTCCGGCTGTCGCGGAT
+GAATATGACCAGCCAACGTCCGATATCACGAAGGATAAATGCAGCAAATGCCTGAGCGGTTGTAAGTTCCGCAATAACGT
+CGGCAACTTTGGCGGCTTCCTTTCCATTAACAAACTTTCGCAGTAAATCCCATGACACAGACAGAATCAGCGATTCTGGC
+GCACGCCCGGCGATGTGCGCCAGCGGAGTCGTGCGGCTTCGTGGTAAGCACGCCGGAGGGGGAAAGATATTTCCCCTGCG
+TGAATATCTCCGGTGAGCCGGAGGCGTATTTCCGTATGTCGCCGGAAGACTGGCTGCAGGCAGAAATGCAGGGTGAGATT
+GTGGCGCTGGTCCACAGCCACCCCGGTGGTCTGCCCTGGCTGAGTGAGGCCGACCGGCGGCTGCAGGTGCAGAGTGATTT
+GCCGTGGTGGCTGGTCTGCCGGGGGACGATTCATAAGTTCCGCTGTGTGCCGCATCTCACCGGGCGGCGCTTTGAGCACG
+GTGTGACGGACTGTTACACACTGTTCCGGGATGCTTATCATCTGGCGGGGATTGAGATGCCGGACTTTCATCGTGAGGAT
+GACTGGTGGCGTAACGGCCAGAATCTCTATCTGGATAATCTGGAGGCGACGGGGCTGTATCAGGTGCCGTTGTCAGCGGC
+ACAGCCGGGCGATGTGCTGCTGTGCTGTTTTGGTTCATCAGTGCCGAATCACGCCGCAATTTACTGCGGCGACGGCGAGC
+TGCTGCACCATATTCCTGAACAACTGAGCAAACGAGAGAGGTACACCGACAAATGGCAGCGACGCACACACTCCCTCTGG
+CGTCACCGGGCATGGCGCGCATCTGCCTTTACGGGGATTTACAACGATTTGGTCGCCGCATCGACCTTCGTGTGAAAACG
+GGGGCTGAAGCCATCCGGGCACTGGCCACACAGCTCCCGGCGTTTCGTCAGAAACTGAGCGACGGCTGGTATCAGGTACG
+GATTGCCGGGCGGGACGTCAGCACGTCCGGGTTAACGGCGCAGTTACATGAGACTCTGCCTGATGGCGCTGTAATTCATA
+TTGTTCCCAGAGTCGCCGGGGCCAAGTCAGGTGGCGTATTCCAGATTGTCCTGGGGGCTGCCGCCATTGCCGGATCATTC
+TTTACCGCCGGAGCCACCCTTGCAGCATGGGGGGCAGCCATTGGGGCCGGTGGTATGACCGGCATCCTGTTTTCTCTCGG
+TGCCAGTATGGTGCTCGGTGGTGTGGCGCAGATGCTGGCACCGAAAGCCAGAACTCCCCGTATACAGACAACGGATAACG
+GTAAGCAGAACACCTATTTCTCCTCACTGGATAACATGGTTGCCCAGGGCAATGTTCTGCCTGTTCTGTACGGGGAAATG
+CGCGTGGGGTCACGCGTGGTTTCTCAGGAGATCAGCACGGCAGACGAAGGGGACGGTGGTCAGGTTGTGGTGATTGGTCG
+CTGATGCAAAATGTTTTATGTGAAACCGCCTGCGGGCGGTTTTGTCATTTATGGAGCGTGAGGAATGGGTAAAGGAAGCA
+GTAAGGGGCATACCCCGCGCGAAGCGAAGGACAACCTGAAGTCCACGCAGTTGCTGAGTGTGATCGATGCCATCAGCGAA
+GGGCCGATTGAAGGTCCGGTGGATGGCTTAAAAAGCGTGCTGCTGAACAGTACGCCGGTGCTGGACACTGAGGGGAATAC
+CAACATATCCGGTGTCACGGTGGTGTTCCGGGCTGGTGAGCAGGAGCAGACTCCGCCGGAGGGATTTGAATCCTCCGGCT
+CCGAGACGGTGCTGGGTACGGAAGTGAAATATGACACGCCGATCACCCGCACCATTACGTCTGCAAACATCGACCGTCTG
+CGCTTTACCTTCGGTGTACAGGCACTGGTGGAAACCACCTCAAAGGGTGACAGGAATCCGTCGGAAGTCCGCCTGCTGGT
+TCAGATACAACGTAACGGTGGCTGGGTGACGGAAAAAGACATCACCATTAAGGGCAAAACCACCTCGCAGTATCTGGCCT
+CGGTGGTGATGGGTAACCTGCCGCCGCGCCCGTTTAATATCCGGATGCGCAGGATGACGCCGGACAGCACCACAGACCAG
+CTGCAGAACAAAACGCTCTGGTCGTCATACACTGAAATCATCGATGTGAAACAGTGCTACCCGAACACGGCACTGGTCGG
+CGTGCAGGTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGTATTCTGCAGGTGC
+CGTCGAACTATAACCCGCAGACGCGGCAATACAGCGGTATCTGGGACGGAACGTTTAAACCGGCATACAGCAACAACATG
+GCCTGGTGTCTGTGGGATATGCTGACCCATCCGCGCTACGGCATGGGGAAACGTCTTGGTGCGGCGGATGTGGATAAATG
+GGCGCTGTATGTCATCGGCCAGTACTGCGACCAGTCAGTGCCGGACGGCTTTGGCGGCACGGAGCCGCGCATCACCTGTA
+ATGCGTACCTGACCACACAGCGTAAGGCGTGGGATGTGCTCAGCGATTTCTGCTCGGCGATGCGCTGTATGCCGGTATGG
+AACGGGCAGACGCTGACGTTCGTGCAGGACCGACCGTCGGATAAGACGTGGACCTATAACCGCAGTAATGTGGTGATGCC
+GGATGATGGCGCGCCGTTCCGCTACAGCTTCAGCGCCCTGAAGGACCGCCATAATGCCGTTGAGGTGAACTGGATTGACC
+CGAACAACGGCTGGGAGACGGCGACAGAGCTTGTTGAAGATACGCAGGCCATTGCCCGTTACGGTCGTAATGTTACGAAG
+ATGGATGCCTTTGGCTGTACCAGCCGGGGGCAGGCACACCGCGCCGGGCTGTGGCTGATTAAAACAGAACTGCTGGAAAC
+GCAGACCGTGGATTTCAGCGTCGGCGCAGAAGGGCTTCGCCATGTACCGGGCGATGTTATTGAAATCTGCGATGATGACT
+ATGCCGGTATCAGCACCGGTGGTCGTGTGCTGGCGGTGAACAGCCAGACCCGGACGCTGACGCTCGACCGTGAAATCACG
+CTGCCATCCTCCGGTACCGCGCTGATAAGCCTGGTTGACGGAAGTGGCAATCCGGTCAGCGTGGAGGTTCAGTCCGTCAC
+CGACGGCGTGAAGGTAAAAGTGAGCCGTGTTCCTGACGGTGTTGCTGAATACAGCGTATGGGAGCTGAAGCTGCCGACGC
+TGCGCCAGCGACTGTTCCGCTGCGTGAGTATCCGTGAGAACGACGACGGCACGTATGCCATCACCGCCGTGCAGCATGTG
+CCGGAAAAAGAGGCCATCGTGGATAACGGGGCGCACTTTGACGGCGAACAGAGTGGCACGGTGAATGGTGTCACGCCGCC
+AGCGGTGCAGCACCTGACCGCAGAAGTCACTGCAGACAGCGGGGAATATCAGGTGCTGGCGCGATGGGACACACCGAAGG
+TGGTGAAGGGCGTGAGTTTCCTGCTCCGTCTGACCGTAACAGCGGACGACGGCAGTGAGCGGCTGGTCAGCACGGCCCGG
+ACGACGGAAACCACATACCGCTTCACGCAACTGGCGCTGGGGAACTACAGGCTGACAGTCCGGGCGGTAAATGCGTGGGG
+GCAGCAGGGCGATCCGGCGTCGGTATCGTTCCGGATTGCCGCACCGGCAGCACCGTCGAGGATTGAGCTGACGCCGGGCT
+ATTTTCAGATAACCGCCACGCCGCATCTTGCCGTTTATGACCCGACGGTACAGTTTGAGTTCTGGTTCTCGGAAAAGCAG
+ATTGCGGATATCAGACAGGTTGAAACCAGCACGCGTTATCTTGGTACGGCGCTGTACTGGATAGCCGCCAGTATCAATAT
+CAAACCGGGCCATGATTATTACTTTTATATCCGCAGTGTGAACACCGTTGGCAAATCGGCATTCGTGGAGGCCGTCGGTC
+GGGCGAGCGATGATGCGGAAGGTTACCTGGATTTTTTCAAAGGCAAGATAACCGAATCCCATCTCGGCAAGGAGCTGCTG
+GAAAAAGTCGAGCTGACGGAGGATAACGCCAGCAGACTGGAGGAGTTTTCGAAAGAGTGGAAGGATGCCAGTGATAAGTG
+GAATGCCATGTGGGCTGTCAAAATTGAGCAGACCAAAGACGGCAAACATTATGTCGCGGGTATTGGCCTCAGCATGGAGG
+ACACGGAGGAAGGCAAACTGAGCCAGTTTCTGGTTGCCGCCAATCGTATCGCATTTATTGACCCGGCAAACGGGAATGAA
+ACGCCGATGTTTGTGGCGCAGGGCAACCAGATATTCATGAACGACGTGTTCCTGAAGCGCCTGACGGCCCCCACCATTAC
+CAGCGGCGGCAATCCTCCGGCCTTTTCCCTGACACCGGACGGAAAGCTGACCGCTAAAAATGCGGATATCAGTGGCAGTG
+TGAATGCGAACTCCGGGACGCTCAGTAATGTGACGATAGCTGAAAACTGTACGATAAACGGTACGCTGAGGGCGGAAAAA
+ATCGTCGGGGACATTGTAAAGGCGGCGAGCGCGGCTTTTCCGCGCCAGCGTGAAAGCAGTGTGGACTGGCCGTCAGGTAC
+CCGTACTGTCACCGTGACCGATGACCATCCTTTTGATCGCCAGATAGTGGTGCTTCCGCTGACGTTTCGCGGAAGTAAGC
+GTACTGTCAGCGGCAGGACAACGTATTCGATGTGTTATCTGAAAGTACTGATGAACGGTGCGGTGATTTATGATGGCGCG
+GCGAACGAGGCGGTACAGGTGTTCTCCCGTATTGTTGACATGCCAGCGGGTCGGGGAAACGTGATCCTGACGTTCACGCT
+TACGTCCACACGGCATTCGGCAGATATTCCGCCGTATACGTTTGCCAGCGATGTGCAGGTTATGGTGATTAAGAAACAGG
+CGCTGGGCATCAGCGTGGTCTGAGTGTGTTACAGAGGTTCGTCCGGGAACGGGCGTTTTATTATAAAACAGTGAGAGGTG
+AACGATGCGTAATGTGTGTATTGCCGTTGCTGTCTTTGCCGCACTTGCGGTGACAGTCACTCCGGCCCGTGCGGAAGGTG
+GACATGGTACGTTTACGGTGGGCTATTTTCAAGTGAAACCGGGTACATTGCCGTCGTTGTCGGGCGGGGATACCGGTGTG
+AGTCATCTGAAAGGGATTAACGTGAAGTACCGTTATGAGCTGACGGACAGTGTGGGGGTGATGGCTTCCCTGGGGTTCGC
+CGCGTCGAAAAAGAGCAGCACAGTGATGACCGGGGAGGATACGTTTCACTATGAGAGCCTGCGTGGACGTTATGTGAGCG
+TGATGGCCGGACCGGTTTTACAAATCAGTAAGCAGGTCAGTGCGTACGCCATGGCCGGAGTGGCTCACAGTCGGTGGTCC
+GGCAGTACAATGGATTACCGTAAGACGGAAATCACTCCCGGGTATATGAAAGAGACGACCACTGCCAGGGACGAAAGTGC
+AATGCGGCATACCTCAGTGGCGTGGAGTGCAGGTATACAGATTAATCCGGCAGCGTCCGTCGTTGTTGATATTGCTTATG
+AAGGCTCCGGCAGTGGCGACTGGCGTACTGACGGATTCATCGTTGGGGTCGGTTATAAATTCTGATTAGCCAGGTAACAC
+AGTGTTATGACAGCCCGCCGGAACCGGTGGGCTTTTTTGTGGGGTGAATATGGCAGTAAAGATTTCAGGAGTCCTGAAAG
+ACGGCACAGGAAAACCGGTACAGAACTGCACCATTCAGCTGAAAGCCAGACGTAACAGCACCACGGTGGTGGTGAACACG
+GTGGGCTCAGAGAATCCGGATGAAGCCGGGCGTTACAGCATGGATGTGGAGTACGGTCAGTACAGTGTCATCCTGCAGGT
+TGACGGTTTTCCACCATCGCACGCCGGGACCATCACCGTGTATGAAGATTCACAACCGGGGACGCTGAATGATTTTCTCT
+GTGCCATGACGGAGGATGATGCCCGGCCGGAGGTGCTGCGTCGTCTTGAACTGATGGTGGAAGAGGTGGCGCGTAACGCG
+TCCGTGGTGGCACAGAGTACGGCAGACGCGAAGAAATCAGCCGGCGATGCCAGTGCATCAGCTGCTCAGGTCGCGGCCCT
+TGTGACTGATGCAACTGACTCAGCACGCGCCGCCAGCACGTCCGCCGGACAGGCTGCATCGTCAGCTCAGGAAGCGTCCT
+CCGGCGCAGAAGCGGCATCAGCAAAGGCCACTGAAGCGGAAAAAAGTGCCGCAGCCGCAGAGTCCTCAAAAAACGCGGCG
+GCCACCAGTGCCGGTGCGGCGAAAACGTCAGAAACGAATGCTGCAGCGTCACAACAATCAGCCGCCACGTCTGCCTCCAC
+CGCGGCCACGAAAGCGTCAGAGGCCGCCACTTCAGCACGAGATGCGGTGGCCTCAAAAGAGGCAGCAAAATCATCAGAAA
+CGAACGCATCATCAAGTGCCGGTCGTGCAGCTTCCTCGGCAACGGCGGCAGAAAATTCTGCCAGGGCGGCAAAAACGTCC
+GAGACGAATGCCAGGTCATCTGAAACAGCAGCGGAACGGAGCGCCTCTGCCGCGGCAGACGCAAAAACAGCGGCGGCGGG
+GAGTGCGTCAACGGCATCCACGAAGGCGACAGAGGCTGCGGGAAGTGCGGTATCAGCATCGCAGAGCAAAAGTGCGGCAG
+AAGCGGCGGCAATACGTGCAAAAAATTCGGCAAAACGTGCAGAAGATATAGCTTCAGCTGTCGCGCTTGAGGATGCGGAC
+ACAACGAGAAAGGGGATAGTGCAGCTCAGCAGTGCAACCAACAGCACGTCTGAAACGCTTGCTGCAACGCCAAAGGCGGT
+TAAGGTGGTAATGGATGAAACGAACAGAAAAGCCCACTGGACAGTCCGGCACTGACCGGAACGCCAACAGCACCAACCGC
+GCTCAGGGGAACAAACAATACCCAGATTGCGAACACCGCTTTTGTACTGGCCGCGATTGCAGATGTTATCGACGCGTCAC
+CTGACGCACTGAATACGCTGAATGAACTGGCCGCAGCGCTCGGGAATGATCCAGATTTTGCTACCACCATGACTAACGCG
+CTTGCGGGTAAACAACCGAAGAATGCGACACTGACGGCGCTGGCAGGGCTTTCCACGGCGAAAAATAAATTACCGTATTT
+TGCGGAAAATGATGCCGCCAGCCTGACTGAACTGACTCAGGTTGGCAGGGATATTCTGGCAAAAAATTCCGTTGCAGATG
+TTCTTGAATACCTTGGGGCCGGTGAGAATTCGGCCTTTCCGGCAGGTGCGCCGATCCCGTGGCCATCAGATATCGTTCCG
+TCTGGCTACGTCCTGATGCAGGGGCAGGCGTTTGACAAATCAGCCTACCCAAAACTTGCTGTCGCGTATCCATCGGGTGT
+GCTTCCTGATATGCGAGGCTGGACAATCAAGGGGAAACCCGCCAGCGGTCGTGCTGTATTGTCTCAGGAACAGGATGGAA
+TTAAGTCGCACACCCACAGTGCCAGTGCATCCGGTACGGATTTGGGGACGAAAACCACATCGTCGTTTGATTACGGGACG
+AAAACAACAGGCAGTTTCGATTACGGCACCAAATCGACGAATAACACGGGGGCTCATGCTCACAGTCTGAGCGGTTCAAC
+AGGGGCCGCGGGTGCTCATGCCCACACAAGTGGTTTAAGGATGAACAGTTCTGGCTGGAGTCAGTATGGAACAGCAACCA
+TTACAGGAAGTTTATCCACAGTTAAAGGAACCAGCACACAGGGTATTGCTTATTTATCGAAAACGGACAGTCAGGGCAGC
+CACAGTCACTCATTGTCCGGTACAGCCGTGAGTGCCGGTGCACATGCGCATACAGTTGGTATTGGTGCGCACCAGCATCC
+GGTTGTTATCGGTGCTCATGCCCATTCTTTCAGTATTGGTTCACACGGACACACCATCACCGTTAACGCTGCGGGTAACG
+CGGAAAACACCGTCAAAAACATTGCATTTAACTATATTGTGAGGCTTGCATAATGGCATTCAGAATGAGTGAACAACCAC
+GGACCATAAAAATTTATAATCTGCTGGCCGGAACTAATGAATTTATTGGTGAAGGTGACGCATATATTCCGCCTCATACC
+GGTCTGCCTGCAAACAGTACCGATATTGCACCGCCAGATATTCCGGCTGGCTTTGTGGCTGTTTTCAACAGTGATGAGGC
+ATCGTGGCATCTCGTTGAAGACCATCGGGGTAAAACCGTCTATGACGTGGCTTCCGGCGACGCGTTATTTATTTCTGAAC
+TCGGTCCGTTACCGGAAAATTTTACCTGGTTATCGCCGGGAGGGGAATATCAGAAGTGGAACGGCACAGCCTGGGTGAAG
+GATACGGAAGCAGAAAAACTGTTCCGGATCCGGGAGGCGGAAGAAACAAAAAAAAGCCTGATGCAGGTAGCCAGTGAGCA
+TATTGCGCCGCTTCAGGATGCTGCAGATCTGGAAATTGCAACGAAGGAAGAAACCTCGTTGCTGGAAGCCTGGAAGAAGT
+ATCGGGTGTTGCTGAACCGTGTTGATACATCAACTGCACCTGATATTGAGTGGCCTGCTGTCCCTGTTATGGAGTAATCG
+TTTTGTGATATGCCGCAGAAACGTTGTATGAAATAACGTTCTGCGGTTAGTTAGTATATTGTAAAGCTGAGTATTGGTTT
+ATTTGGCGATTATTATCTTCAGGAGAATAATGGAAGTTCTATGACTCAATTGTTCATAGTGTTTACATCACCGCCAATTG
+CTTTTAAGACTGAACGCATGAAATATGGTTTTTCGTCATGTTTTGAGTCTGCTGTTGATATTTCTAAAGTCGGTTTTTTT
+TCTTCGTTTTCTCTAACTATTTTCCATGAAATACATTTTTGATTATTATTTGAATCAATTCCAATTACCTGAAGTCTTTC
+ATCTATAATTGGCATTGTATGTATTGGTTTATTGGAGTAGATGCTTGCTTTTCTGAGCCATAGCTCTGATATCCAAATGA
+AGCCATAGGCATTTGTTATTTTGGCTCTGTCAGCTGCATAACGCCAAAAAATATATTTATCTGCTTGATCTTCAAATGTT
+GTATTGATTAAATCAATTGGATGGAATTGTTTATCATAAAAAATTAATGTTTGAATGTGATAACCGTCCTTTAAAAAAGT
+CGTTTCTGCAAGCTTGGCTGTATAGTCAACTAACTCTTCTGTCGAAGTGATATTTTTAGGCTTATCTACCAGTTTTAGAC
+GCTCTTTAATATCTTCAGGAATTATTTTATTGTCATATTGTATCATGCTAAATGACAATTTGCTTATGGAGTAATCTTTT
+AATTTTAAATAAGTTATTCTCCTGGCTTCATCAAATAAAGAGTCGAATGATGTTGGCGAAATCACATCGTCACCCATTGG
+ATTGTTTATTTGTATGCCAAGAGAGTTACAGCAGTTATACATTCTGCCATAGATTATAGCTAAGGCATGTAATAATTCGT
+AATCTTTTAGCGTATTAGCGACCCATCGTCTTTCTGATTTAATAATAGATGATTCAGTTAAATATGAAGGTAATTTCTTT
+TGTGCAAGTCTGACTAACTTTTTTATACCAATGTTTAACATACTTTCATTTGTAATAAACTCAATGTCATTTTCTTCAAT
+GTAAGATGAAATAAGAGTAGCCTTTGCCTCGCTATACATTTCTAAATCGCCTTGTTTTTCTATCGTATTGCGAGAATTTT
+TAGCCCAAGCCATTAATGGATCATTTTTCCATTTTTCAATAACATTATTGTTATACCAAATGTCATATCCTATAATCTGG
+TTTTTGTTTTTTTGAATAATAAATGTTACTGTTCTTGCGGTTTGGAGGAATTGATTCAAATTCAAGCGAAATAATTCAGG
+GTCAAAATATGTATCAATGCAGCATTTGAGCAAGTGCGATAAATCTTTAAGTCTTCTTTCCCATGGTTTTTTAGTCATAA
+AACTCTCCATTTTGATAGGTTGCATGCTAGATGCTGATATATTTTAGAGGTGATAAAATTAACTGCTTAACTGTCAATGT
+AATACAAGTTGTTTGATCTTTGCAATGATTCTTATCAGAAACCATATAGTAAATTAGTTACACAGGAAATTTTTAATATT
+ATTATTATCATTCATTATGTATTAAAATTAGAGTTGTGGCTTGGCTCTGCTAACACGTTGCTCATAGGAGATATGGTAGA
+GCCGCAGACACGTCGTATGCAGGAACGTGCTGCGGCTGGCTGGTGAACTTCCGATAGTGCGGGTGTTGAATGATTTCCAG
+TTGCTACCGATTTTACATATTTTTTGCATGAGAGAATTTGTACCACCTCCCACCGACCATCTATGACTGTACGCCACTGT
+CCCTAGGACTGCTATGTGCCGGAGCGGACATTACAAACGTCCTTCTCGGTGCATGCCACTGTTGCCAATGACCTGCCTAG
+GAATTGGTTAGCAAGTTACTACCGGATTTTGTAAAAACAGCCCTCCTCATATAAAAAGTATTCGTTCACTTCCGATAAGC
+GTCGTAATTTTCTATCTTTCATCATATTCTAGATCCCTCTGAAAAAATCTTCCGAGTTTGCTAGGCACTGATACATAACT
+CTTTTCCAATAATTGGGGAAGTCATTCAAATCTATAATAGGTTTCAGATTTGCTTCAATAAATTCTGACTGTAGCTGCTG
+AAACGTTGCGGTTGAACTATATTTCCTTATAACTTTTACGAAAGAGTTTCTTTGAGTAATCACTTCACTCAAGTGCTTCC
+CTGCCTCCAAACGATACCTGTTAGCAATATTTAATAGCTTGAAATGATGAAGAGCTCTGTGTTTGTCTTCCTGCCTCCAG
+TTCGCCGGGCATTCAACATAAAAACTGATAGCACCCGGAGTTCCGGAAACGAAATTTGCATATACCCATTGCTCACGAAA
+AAAAATGTCCTTGTCGATATAGGGATGAATCGCTTGGTGTACCTCATCTACTGCGAAAACTTGACCTTTCTCTCCCATAT
+TGCAGTCGCGGCACGATGGAACTAAATTAATAGGCATCACCGAAAATTCAGGATAATGTGCAATAGGAAGAAAATGATCT
+ATATTTTTTGTCTGTCCTATATCACCACAAAATGGACATTTTTCACCTGATGAAACAAGCATGTCATCGTAATATGTTCT
+AGCGGGTTTGTTTTTATCTCGGAGATTATTTTCATAAAGCTTTTCTAATTTAACCTTTGTCAGGTTACCAACTACTAAGG
+TTGTAGGCTCAAGAGGGTGTGTCCTGTCGTAGGTAAATAACTGACCTGTCGAGCTTAATATTCTATATTGTTGTTCTTTC
+TGCAAAAAAGTGGGGAAGTGAGTAATGAAATTATTTCTAACATTTATCTGCATCATACCTTCCGAGCATTTATTAAGCAT
+TTCGCTATAAGTTCTCGCTGGAAGAGGTAGTTTTTTCATTGTACTTTACCTTCATCTCTGTTCATTATCATCGCTTTTAA
+AACGGTTCGACCTTCTAATCCTATCTGACCATTATAATTTTTTAGAATGGTTTCATAAGAAAGCTCTGAATCAACGGACT
+GCGATAATAAGTGGTGGTATCCAGAATTTGTCACTTCAAGTAAAAACACCTCACGAGTTAAAACACCTAAGTTCTCACCG
+AATGTCTCAATATCCGGACGGATAATATTTATTGCTTCTCTTGACCGTAGGACTTTCCACATGCAGGATTTTGGAACCTC
+TTGCAGTACTACTGGGGAATGAGTTGCAATTATTGCTACACCATTGCGTGCATCGAGTAAGTCGCTTAATGTTCGTAAAA
+AAGCAGAGAGCAAAGGTGGATGCAGATGAACCTCTGGTTCATCGAATAAAACTAATGACTTTTCGCCAACGACATCTACT
+AATCTTGTGATAGTAAATAAAACAATTGCATGTCCAGAGCTCATTCGAAGCAGATATTTCTGGATATTGTCATAAAACAA
+TTTAGTGAATTTATCATCGTCCACTTGAATCTGTGGTTCATTACGTCTTAACTCTTCATATTTAGAAATGAGGCTGATGA
+GTTCCATATTTGAAAAGTTTTCATCACTACTTAGTTTTTTGATAGCTTCAAGCCAGAGTTGTCTTTTTCTATCTACTCTC
+ATACAACCAATAAATGCTGAAATGAATTCTAAGCGGAGATCGCCTAGTGATTTTAAACTATTGCTGGCAGCATTCTTGAG
+TCCAATATAAAAGTATTGTGTACCTTTTGCTGGGTCAGGTTGTTCTTTAGGAGGAGTAAAAGGATCAAATGCACTAAACG
+AAACTGAAACAAGCGATCGAAAATATCCCTTTGGGATTCTTGACTCGATAAGTCTATTATTTTCAGAGAAAAAATATTCA
+TTGTTTTCTGGGTTGGTGATTGCACCAATCATTCCATTCAAAATTGTTGTTTTACCACACCCATTCCGCCCGATAAAAGC
+ATGAATGTTCGTGCTGGGCATAGAATTAACCGTCACCTCAAAAGGTATAGTTAAATCACTGAATCCGGGAGCACTTTTTC
+TATTAAATGAAAAGTGGAAATCTGACAATTCTGGCAAACCATTTAACACACGTGCGAACTGTCCATGAATTTCTGAAAGA
+GTTACCCCTCTAAGTAATGAGGTGTTAAGGACGCTTTCATTTTCAATGTCGGCTAATCGATTTGGCCATACTACTAAATC
+CTGAATAGCTTTAAGAAGGTTATGTTTAAAACCATCGCTTAATTTGCTGAGATTAACATAGTAGTCAATGCTTTCACCTA
+AGGAAAAAAACATTTCAGGGAGTTGACTGAATTTTTTATCTATTAATGAATAAGTGCTTACTTCTTCTTTTTGACCTACA
+AAACCAATTTTAACATTTCCGATATCGCATTTTTCACCATGCTCATCAAAGACAGTAAGATAAAACATTGTAACAAAGGA
+ATAGTCATTCCAACCATCTGCTCGTAGGAATGCCTTATTTTTTTCTACTGCAGGAATATACCCGCCTCTTTCAATAACAC
+TAAACTCCAACATATAGTAACCCTTAATTTTATTAAAATAACCGCAATTTATTTGGCGGCAACACAGGATCTCTCTTTTA
+AGTTACTCTCTATTACATACGTTTTCCATCTAAAAATTAGTAGTATTGAACTTAACGGGGCATCGTATTGTAGTTTTCCA
+TATTTAGCTTTCTGCTTCCTTTTGGATAACCCACTGTTATTCATGTTGCATGGTGCACTGTTTATACCAACGATATAGTC
+TATTAATGCATATATAGTATCGCCGAACGATTAGCTCTTCAGGCTTCTGAAGAAGCGTTTCAAGTACTAATAAGCCGATA
+GATAGCCACGGACTTCGTAGCCATTTTTCATAAGTGTTAACTTCCGCTCCTCGCTCATAACAGACATTCACTACAGTTAT
+GGCGGAAAGGTATGCATGCTGGGTGTGGGGAAGTCGTGAAAGAAAAGAAGTCAGCTGCGTCGTTTGACATCACTGCTATC
+TTCTTACTGGTTATGCAGGTCGTAGTGGGTGGCACACAAAGCTTTGCACTGGATTGCGAGGCTTTGTGCTTCTCTGGAGT
+GCGACAGGTTTGATGACAAAAAATTAGCGCAAGAAGACAAAAATCACCTTGCGCTAATGCTCTGTTACAGGTCACTAATA
+CCATCTAAGTAGTTGATTCATAGTGACTGCATATGTTGTGTTTTACAGTATTATGTAGTCTGTTTTTTATGCAAAATCTA
+ATTTAATATATTGATATTTATATCATTTTACGTTTCTCGTTCAGCTTTTTTATACTAAGTTGGCATTATAAAAAAGCATT
+GCTTATCAATTTGTTGCAACGAACAGGTCACTATCAGTCAAAATAAAATCATTATTTGATTTCAATTTTGTCCCACTCCC
+TGCCTCTGTCATCACGATACTGTGATGCCATGGTGTCCGACTTATGCCCGAGAAGATGTTGAGCAAACTTATCGCTTATC
+TGCTTCTCATAGAGTCTTGCAGACAAACTGCGCAACTCGTGAAAGGTAGGCGGATCCCCTTCGAAGGAAAGACCTGATGC
+TTTTCGTGCGCGCATAAAATACCTTGATACTGTGCCGGATGAAAGCGGTTCGCGACGAGTAGATGCAATTATGGTTTCTC
+CGCCAAGAATCTCTTTGCATTTATCAAGTGTTTCCTTCATTGATATTCCGAGAGCATCAATATGCAATGCTGTTGGGATG
+GCAATTTTTACGCCTGTTTTGCTTTGCTCGACATAAAGATATCCATCTACGATATCAGACCACTTCATTTCGCATAAATC
+ACCAACTCGTTGCCCGGTAACAACAGCCAGTTCCATTGCAAGTCTGAGCCAACATGGTGATGATTCTGCTGCTTGATAAA
+TTTTCAGGTATTCGTCAGCCGTAAGTCTTGATCTCCTTACCTCTGATTTTGCTGCGCGAGTGGCAGCGACATGGTTTGTT
+GTTATATGGCCTTCAGCTATTGCCTCTCGGAATGCATCGCTCAGTGTTGATCTGATTAACTTGGCTGACGCCGCCTTGCC
+CTCGTCTATGTATCCATTGAGCATTGCCGCAATTTCTTTTGTGGTGATGTCTTCAAGTGGAGCATCAGGCAGACCCCTCC
+TTATTGCTTTAATTTTGCTCATGTAATTTATGAGTGTCTTCTGCTTGATTCCTCTGCTGGCCAGGATTTTTTCGTAGCGA
+TCAAGCCATGAATGTAACGTAACGGAATTATCACTGTTGATTCTCGCTGTCAGAGGCTTGTGTTTGTGTCCTGAAAATAA
+CTCAATGTTGGCCTGTATAGCTTCAGTGATTGCGATTCGCCTGTCTCTGCCTAATCCAAACTCTTTACCCGTCCTTGGGT
+CCCTGTAGCAGTAATATCCATTGTTTCTTATATAAAGGTTAGGGGGTAAATCCCGGCGCTCATGACTTCGCCTTCTTCCC
+ATTTCTGATCCTCTTCAAAAGGCCACCTGTTACTGGTCGATTTAAGTCAACCTTTACCGCTGATTCGTGGAACAGATACT
+CTCTTCCATCCTTAACCGGAGGTGGGAATATCCTGCATTCCCGAACCCATCGACGAACTGTTTCAAGGCTTCTTGGACGT
+CGCTGGCGTGCGTTCCACTCCTGAAGTGTCAAGTACATCGCAAAGTCTCCGCAATTACACGCAAGAAAAAACCGCCATCA
+GGCGGCTTGGTGTTCTTTCAGTTCTTCAATTCGAATATTGGTTACGTCTGCATGTGCTATCTGCGCCCATATCATCCAGT
+GGTCGTAGCAGTCGTTGATGTTCTCCGCTTCGATAACTCTGTTGAATGGCTCTCCATTCCATTCTCCTGTGACTCGGAAG
+TGCATTTATCATCTCCATAAAACAAAACCCGCCGTAGCGAGTTCAGATAAAATAAATCCCCGCGAGTGCGAGGATTGTTA
+TGTAATATTGGGTTTAATCATCTATATGTTTTGTACAGAGAGGGCAAGTATCGTTTCCACCGTACTCGTGATAATAATTT
+TGCACGGTATCAGTCATTTCTCGCACATTGCAGAATGGGGATTTGTCTTCATTAGACTTATAAACCTTCATGGAATATTT
+GTATGCCGACTCTATATCTATACCTTCATCTACATAAACACCTTCGTGATGTCTGCATGGAGACAAGACACCGGATCTGC
+ACAACATTGATAACGCCCAATCTTTTTGCTCAGACTCTAACTCATTGATACTCATTTATAAACTCCTTGCAATGTATGTC
+GTTTCAGCTAAACGGTATCAGCAATGTTTATGTAAAGAAACAGTAAGATAATACTCAACCCGATGTTTGAGTACGGTCAT
+CATCTGACACTACAGACTCTGGCATCGCTGTGAAGACGACGCGAAATTCAGCATTTTCACAAGCGTTATCTTTTACAAAA
+CCGATCTCACTCTCCTTTGATGCGAATGCCAGCGTCAGACATCATATGCAGATACTCACCTGCATCCTGAACCCATTGAC
+CTCCAACCCCGTAATAGCGATGCGTAATGATGTCGATAGTTACTAACGGGTCTTGTTCGATTAACTGCCGCAGAAACTCT
+TCCAGGTCACCAGTGCAGTGCTTGATAACAGGAGTCTTCCCAGGATGGCGAACAACAAGAAACTGGTTTCCGTCTTCACG
+GACTTCGTTGCTTTCCAGTTTAGCAATACGCTTACTCCCATCCGAGATAACACCTTCGTAATACTCACGCTGCTCGTTGA
+GTTTTGATTTTGCTGTTTCAAGCTCAACACGCAGTTTCCCTACTGTTAGCGCAATATCCTCGTTCTCCTGGTCGCGGCGT
+TTGATGTATTGCTGGTTTCTTTCCCGTTCATCCAGCAGTTCCAGCACAATCGATGGTGTTACCAATTCATGGAAAAGGTC
+TGCGTCAAATCCCCAGTCGTCATGCATTGCCTGCTCTGCCGCTTCACGCAGTGCCTGAGAGTTAATTTCGCTCACTTCGA
+ACCTCTCTGTTTACTGATAAGTTCCAGATCCTCCTGGCAACTTGCACAAGTCCGACAACCCTGAACGACCAGGCGTCTTC
+GTTCATCTATCGGATCGCCACACTCACAACAATGAGTGGCAGATATAGCCTGGTGGTTCAGGCGGCGCATTTTTATTGCT
+GTGTTGCGCTGTAATTCTTCTATTTCTGATGCTGAATCAATGATGTCTGCCATCTTTCATTAATCCCTGAACTGTTGGTT
+AATACGCTTGAGGGTGAATGCGAATAATAAAAAAGGAGCCTGTAGCTCCCTGATGATTTTGCTTTTCATGTTCATCGTTC
+CTTAAAGACGCCGTTTAACATGCCGATTGCCAGGCTTAAATGAGTCGGTGTGAATCCCATCAGCGTTACCGTTTCGCGGT
+GCTTCTTCAGTACGCTACGGCAAATGTCATCGACGTTTTTATCCGGAAACTGCTGTCTGGCTTTTTTTGATTTCAGAATT
+AGCCTGACGGGCAATGCTGCGAAGGGCGTTTTCCTGCTGAGGTGTCATTGAACAAGTCCCATGTCGGCAAGCATAAGCAC
+ACAGAATATGAAGCCCGCTGCCAGAAAAATGCATTCCGTGGTTGTCATACCTGGTTTCTCTCATCTGCTTCTGCTTTCGC
+CACCATCATTTCCAGCTTTTGTGAAAGGGATGCGGCTAACGTATGAAATTCTTCGTCTGTTTCTACTGGTATTGGCACAA
+ACCTGATTCCAATTTGAGCAAGGCTATGTGCCATCTCGATACTCGTTCTTAACTCAACAGAAGATGCTTTGTGCATACAG
+CCCCTCGTTTATTATTTATCTCCTCAGCCAGCCGCTGTGCTTTCAGTGGATTTCGGATAACAGAAAGGCCGGGAAATACC
+CAGCCTCGCTTTGTAACGGAGTAGACGAAAGTGATTGCGCCTACCCGGATATTATCGTGAGGATGCGTCATCGCCATTGC
+TCCCCAAATACAAAACCAATTTCAGCCAGTGCCTCGTCCATTTTTTCGATGAACTCCGGCACGATCTCGTCAAAACTCGC
+CATGTACTTTTCATCCCGCTCAATCACGACATAATGCAGGCCTTCACGCTTCATACGCGGGTCATAGTTGGCAAAGTACC
+AGGCATTTTTTCGCGTCACCCACATGCTGTACTGCACCTGGGCCATGTAAGCTGACTTTATGGCCTCGAAACCACCGAGC
+CGGAACTTCATGAAATCCCGGGAGGTAAACGGGCATTTCAGTTCAAGGCCGTTGCCGTCACTGCATAAACCATCGGGAGA
+GCAGGCGGTACGCATACTTTCGTCGCGATAGATGATCGGGGATTCAGTAACATTCACGCCGGAAGTGAATTCAAACAGGG
+TTCTGGCGTCGTTCTCGTACTGTTTTCCCCAGGCCAGTGCTTTAGCGTTAACTTCCGGAGCCACACCGGTGCAAACCTCA
+GCAAGCAGGGTGTGGAAGTAGGACATTTTCATGTCAGGCCACTTCTTTCCGGAGCGGGGTTTTGCTATCACGTTGTGAAC
+TTCTGAAGCGGTGATGACGCCGAGCCGTAATTTGTGCCACGCATCATCCCCCTGTTCGACAGCTCTCACATCGATCCCGG
+TACGCTGCAGGATAATGTCCGGTGTCATGCTGCCACCTTCTGCTCTGCGGCTTTCTGTTTCAGGAATCCAAGAGCTTTTA
+CTGCTTCGGCCTGTGTCAGTTCTGACGATGCACGAATGTCGCGGCGAAATATCTGGGAACAGAGCGGCAATAAGTCGTCA
+TCCCATGTTTTATCCAGGGCGATCAGCAGAGTGTTAATCTCCTGCATGGTTTCATCGTTAACCGGAGTGATGTCGCGTTC
+CGGCTGACGTTCTGCAGTGTATGCAGTATTTTCGACAATGCGCTCGGCTTCATCCTTGTCATAGATACCAGCAAATCCGA
+AGGCCAGACGGGCACACTGAATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTGATTTCT
+CTGCCTTCGCGAGTTTTGAATGGTTCGCGGCGGCATTCATCCATCCATTCGGTAACGCAGATCGGATGATTACGGTCCTT
+GCGGTAAATCCGGCATGTACAGGATTCATTGTCCTGCTCAAAGTCCATGCCATCAAACTGCTGGTTTTCATTGATGATGC
+GGGACCAGCCATCAACGCCCACCACCGGAACGATGCCATTCTGCTTATCAGGAAAGGCGTAAATTTCTTTCGTCCACGGA
+TTAAGGCCGTACTGGTTGGCAACGATCAGTAATGCGATGAACTGCGCATCGCTGGCATCACCTTTAAATGCCGTCTGGCG
+AAGAGTGGTGATCAGTTCCTGTGGGTCGACAGAATCCATGCCGACACGTTCAGCCAGCTTCCCAGCCAGCGTTGCGAGTG
+CAGTACTCATTCGTTTTATACCTCTGAATCAATATCAACCTGGTGGTGAGCAATGGTTTCAACCATGTACCGGATGTGTT
+CTGCCATGCGCTCCTGAAACTCAACATCGTCATCAAACGCACGGGTAATGGATTTTTTGCTGGCCCCGTGGCGTTGCAAA
+TGATCGATGCATAGCGATTCAAACAGGTGCTGGGGCAGGCCTTTTTCCATGTCGTCTGCCAGTTCTGCCTCTTTCTCTTC
+ACGGGCGAGCTGCTGGTAGTGACGCGCCCAGCTCTGAGCCTCAAGACGATCCTGAATGTAATAAGCGTTCATGGCTGAAC
+TCCTGAAATAGCTGTGAAAATATCGCCCGCGAAATGCCGGGCTGATTAGGAAAACAGGAAAGGGGGTTAGTGAATGCTTT
+TGCTTGATCTCAGTTTCAGTATTAATATCCATTTTTTATAAGCGTCGACGGCTTCACGAAACATCTTTTCATCGCCAATA
+AAAGTGGCGATAGTGAATTTAGTCTGGATAGCCATAAGTGTTTGATCCATTCTTTGGGACTCCTGGCTGATTAAGTATGT
+CGATAAGGCGTTTCCATCCGTCACGTAATTTACGGGTGATTCGTTCAAGTAAAGATTCGGAAGGGCAGCCAGCAACAGGC
+CACCCTGCAATGGCATATTGCATGGTGTGCTCCTTATTTATACATAACGAAAAACGCCTCGAGTGAAGCGTTATTGGTAT
+GCGGTAAAACCGCACTCAGGCGGCCTTGATAGTCATATCATCTGAATCAAATATTCCTGATGTATCGATATCGGTAATTC
+TTATTCCTTCGCTACCATCCATTGGAGGCCATCCTTCCTGACCATTTCCATCATTCCAGTCGAACTCACACACAACACCA
+TATGCATTTAAGTCGCTTGAAATTGCTATAAGCAGAGCATGTTGCGCCAGCATGATTAATACAGCATTTAATACAGAGCC
+GTGTTTATTGAGTCGGTATTCAGAGTCTGACCAGAAATTATTAATCTGGTGAAGTTTTTCCTCTGTCATTACGTCATGGT
+CGATTTCAATTTCTATTGATGCTTTCCAGTCGTAATCAATGATGTATTTTTTGATGTTTGACATCTGTTCATATCCTCAC
+AGATAAAAAATCGCCCTCACACTGGAGGGCAAAGAAGATTTCCAATAATCAGAACAAGTCGGCTCCTGTTTAGTTACGAG
+CGACATTGCTCCGTGTATTCACTCGTTGGAATGAATACACAGTGCAGTGTTTATTCTGTTATTTATGCCAAAAATAAAGG
+CCACTATCAGGCAGCTTTGTTGTTCTGTTTACCAAGTTCTCTGGCAATCATTGCCGTCGTTCGTATTGCCCATTTATCGA
+CATATTTCCCATCTTCCATTACAGGAAACATTTCTTCAGGCTTAACCATGCATTCCGATTGCAGCTTGCATCCATTGCAT
+CGCTTGAATTGTCCACACCATTGATTTTTATCAATAGTCGTAGTCATACGGATAGTCCTGGTATTGTTCCATCACATCCT
+GAGGATGCTCTTCGAACTCTTCAAATTCTTCTTCCATATATCACCTTAAATAGTGGATTGCGGTAGTAAAGATTGTGCCT
+GTCTTTTAACCACATCAGGCTCGGTGGTTCTCGTGTACCCCTACAGCGAGAAATCGGATAAACTATTACAACCCCTACAG
+TTTGATGAGTATAGAAATGGATCCACTCGTTATTCTCGGACGAGTGTTCAGTAATGAACCTCTGGAGAGAACCATGTATA
+TGATCGTTATCTGGGTTGGACTTCTGCTTTTAAGCCCAGATAACTGGCCTGAATATGTTAATGAGAGAATCGGTATTCCT
+CATGTGTGGCATGTTTTCGTCTTTGCTCTTGCATTTTCGCTAGCAATTAATGTGCATCGATTATCAGCTATTGCCAGCGC
+CAGATATAAGCGATTTAAGCTAAGAAAACGCATTAAGATGCAAAACGATAAAGTGCGATCAGTAATTCAAAACCTTACAG
+AAGAGCAATCTATGGTTTTGTGCGCAGCCCTTAATGAAGGCAGGAAGTATGTGGTTACATCAAAACAATTCCCATACATT
+AGTGAGTTGATTGAGCTTGGTGTGTTGAACAAAACTTTTTCCCGATGGAATGGAAAGCATATATTATTCCCTATTGAGGA
+TATTTACTGGACTGAATTAGTTGCCAGCTATGATCCATATAATATTGAGATAAAGCCAAGGCCAATATCTAAGTAACTAG
+ATAAGAGGAATCGATTTTCCCTTAATTTTCTGGCGTCCACTGCATGTTATGCCGCGTTCGCCAGGCTTGCTGTACCATGT
+GCGCTGATTCTTGCGCTCAATACGTTGCAGGTTGCTTTCAATCTGTTTGTGGTATTCAGCCAGCACTGTAAGGTCTATCG
+GATTTAGTGCGCTTTCTACTCGTGATTTCGGTTTGCGATTCAGCGAGAGAATAGGGCGGTTAACTGGTTTTGCGCTTACC
+CCAACCAACAGGGGATTTGCTGCTTTCCATTGAGCCTGTTTCTCTGCGCGACGTTCGCGGCGGCGTGTTTGTGCATCCAT
+CTGGATTCTCCTGTCAGTTAGCTTTGGTGGTGTGTGGCAGTTGTAGTCCTGAACGAAAACCCCCCGCGATTGGCACATTG
+GCAGCTAATCCGGAATCGCACTTACGGCCAATGCTTCGTTTCGTATCACACACCCCAAAGCCTTCTGCTTTGAATGCTGC
+CCTTCTTCAGGGCTTAATTTTTAAGAGCGTCACCTTCATGGTGGTCAGTGCGTCCTGCTGATGTGCTCAGTATCACCGCC
+AGTGGTATTTATGTCAACACCGCCAGAGATAATTTATCACCGCAGATGGTTATCTGTATGTTTTTTATATGAATTTATTT
+TTTGCAGGGGGGCATTGTTTGGTAGGTGAGAGATCTGAATTGCTATGTTTAGTGAGTTGTATCTATTTATTTTTCAATAA
+ATACAATTGGTTATGTGTTTTGGGGGCGATCGTGAGGCAAAGAAAACCCGGCGCTGAGGCCGGGTTATTCTTGTTCTCTG
+GTCAAATTATATAGTTGGAAAACAAGGATGCATATATGAATGAACGATGCAGAGGCAATGCCGATGGCGATAGTGGGTAT
+CATGTAGCCGCTTATGCTGGAAAGAAGCAATAACCCGCAGAAAAACAAAGCTCCAAGCTCAACAAAACTAAGGGCATAGA
+CAATAACTACCGATGTCATATACCCATACTCTCTAATCTTGGCCAGTCGGCGCGTTCTGCTTCCGATTAGAAACGTCAAG
+GCAGCAATCAGGATTGCAATCATGGTTCCTGCATATGATGACAATGTCGCCCCAAGACCATCTCTATGAGCTGAAAAAGA
+AACACCAGGAATGTAGTGGCGGAAAAGGAGATAGCAAATGCTTACGATAACGTAAGGAATTATTACTATGTAAACACCAG
+GCATGATTCTGTTCCGCATAATTACTCCTGATAATTAATCCTTAACTTTGCCCACCTGCCTTTTAAAACATTCCAGTATA
+TCACTTTTCATTCTTGCGTAGCAATATGCCATCTCTTCAGCTATCTCAGCATTGGTGACCTTGTTCAGAGGCGCTGAGAG
+ATGGCCTTTTTCTGATAGATAATGTTCTGTTAAAATATCTCCGGCCTCATCTTTTGCCCGCAGGCTAATGTCTGAAAATT
+GAGGTGACGGGTTAAAAATAATATCCTTGGCAACCTTTTTTATATCCCTTTTAAATTTTGGCTTAATGACTATATCCAAT
+GAGTCAAAAAGCTCCCCTTCAATATCTGTTGCCCCTAAGACCTTTAATATATCGCCAAATACAGGTAGCTTGGCTTCTAC
+CTTCACCGTTGTTCGGCCGATGAAATGCATATGCATAACATCGTCTTTGGTGGTTCCCCTCATCAGTGGCTCTATCTGAA
+CGCGCTCTCCACTGCTTAATGACATTCCTTTCCCGATTAAAAAATCTGTCAGATCGGATGTGGTCGGCCCGAAAACAGTT
+CTGGCAAAACCAATGGTGTCGCCTTCAACAAACAAAAAAGATGGGAATCCCAATGATTCGTCATCTGCGAGGCTGTTCTT
+AATATCTTCAACTGAAGCTTTAGAGCGATTTATCTTCTGAACCAGACTCTTGTCATTTGTTTTGGTAAAGAGAAAAGTTT
+TTCCATCGATTTTATGAATATACAAATAATTGGAGCCAACCTGCAGGTGATGATTATCAGCCAGCAGAGAATTAAGGAAA
+ACAGACAGGTTTATTGAGCGCTTATCTTTCCCTTTATTTTTGCTGCGGTAAGTCGCATAAAAACCATTCTTCATAATTCA
+ATCCATTTACTATGTTATGTTCTGAGGGGAGTGAAAATTCCCCTAATTCGATGAAGATTCTTGCTCAATTGTTATCAGCT
+ATGCGCCGACCAGAACACCTTGCCGATCAGCCAAACGTCTCTTCAGGCCACTGACTAGCGATAACTTTCCCCACAACGGA
+ACAACTCTCATTGCATGGGATCATTGGGTACTGTGGGTTTAGTGGTTGTAAAAACACCTGACCGCTATCCCTGATCAGTT
+TCTTGAAGGTAAACTCATCACCCCCAAGTCTGGCTATGCAGAAATCACCTGGCTCAACAGCCTGCTCAGGGTCAACGAGA
+ATTAACATTCCGTCAGGAAAGCTTGGCTTGGAGCCTGTTGGTGCGGTCATGGAATTACCTTCAACCTCAAGCCAGAATGC
+AGAATCACTGGCTTTTTTGGTTGTGCTTACCCATCTCTCCGCATCACCTTTGGTAAAGGTTCTAAGCTTAGGTGAGAACA
+TCCCTGCCTGAACATGAGAAAAAACAGGGTACTCATACTCACTTCTAAGTGACGGCTGCATACTAACCGCTTCATACATC
+TCGTAGATTTCTCTGGCGATTGAAGGGCTAAATTCTTCAACGCTAACTTTGAGAATTTTTGTAAGCAATGCGGCGTTATA
+AGCATTTAATGCATTGATGCCATTAAATAAAGCACCAACGCCTGACTGCCCCATCCCCATCTTGTCTGCGACAGATTCCT
+GGGATAAGCCAAGTTCATTTTTCTTTTTTTCATAAATTGCTTTAAGGCGACGTGCGTCCTCAAGCTGCTCTTGTGTTAAT
+GGTTTCTTTTTTGTGCTCATACGTTAAATCTATCACCGCAAGGGATAAATATCTAACACCGTGCGTGTTGACTATTTTAC
+CTCTGGCGGTGATAATGGTTGCATGTACTAAGGAGGTTGTATGGAACAACGCATAACCCTGAAAGATTATGCAATGCGCT
+TTGGGCAAACCAAGACAGCTAAAGATCTCGGCGTATATCAAAGCGCGATCAACAAGGCCATTCATGCAGGCCGAAAGATT
+TTTTTAACTATAAACGCTGATGGAAGCGTTTATGCGGAAGAGGTAAAGCCCTTCCCGAGTAACAAAAAAACAACAGCATA
+AATAACCCCGCTCTTACACATTCCAGCCCTGAAAAAGGGCATCAAATTAAACCACACCTATGGTGTATGCATTTATTTGC
+ATACATTCAATCAATTGTTATCTAAGGAAATACTTACATATGGTTCGTGCAAACAAACGCAACGAGGCTCTACGAATCGA
+GAGTGCGTTGCTTAACAAAATCGCAATGCTTGGAACTGAGAAGACAGCGGAAGCTGTGGGCGTTGATAAGTCGCAGATCA
+GCAGGTGGAAGAGGGACTGGATTCCAAAGTTCTCAATGCTGCTTGCTGTTCTTGAATGGGGGGTCGTTGACGACGACATG
+GCTCGATTGGCGCGACAAGTTGCTGCGATTCTCACCAATAAAAAACGCCCGGCGGCAACCGAGCGTTCTGAACAAATCCA
+GATGGAGTTCTGAGGTCATTACTGGATCTATCAACAGGAGTCATTATGACAAATACAGCAAAAATACTCAACTTCGGCAG
+AGGTAACTTTGCCGGACAGGAGCGTAATGTGGCAGATCTCGATGATGGTTACGCCAGACTATCAAATATGCTGCTTGAGG
+CTTATTCGGGCGCAGATCTGACCAAGCGACAGTTTAAAGTGCTGCTTGCCATTCTGCGTAAAACCTATGGGTGGAATAAA
+CCAATGGACAGAATCACCGATTCTCAACTTAGCGAGATTACAAAGTTACCTGTCAAACGGTGCAATGAAGCCAAGTTAGA
+ACTCGTCAGAATGAATATTATCAAGCAGCAAGGCGGCATGTTTGGACCAAATAAAAACATCTCAGAATGGTGCATCCCTC
+AAAACGAGGGAAAATCCCCTAAAACGAGGGATAAAACATCCCTCAAATTGGGGGATTGCTATCCCTCAAAACAGGGGGAC
+ACAAAAGACACTATTACAAAAGAAAAAAGAAAAGATTATTCGTCAGAGAATTCTGGCGAATCCTCTGACCAGCCAGAAAA
+CGACCTTTCTGTGGTGAAACCGGATGCTGCAATTCAGAGCGGCAGCAAGTGGGGGACAGCAGAAGACCTGACCGCCGCAG
+AGTGGATGTTTGACATGGTGAAGACTATCGCACCATCAGCCAGAAAACCGAATTTTGCTGGGTGGGCTAACGATATCCGC
+CTGATGCGTGAACGTGACGGACGTAACCACCGCGACATGTGTGTGCTGTTCCGCTGGGCATGCCAGGACAACTTCTGGTC
+CGGTAACGTGCTGAGCCCGGCCAAACTCCGCGATAAGTGGACCCAACTCGAAATCAACCGTAACAAGCAACAGGCAGGCG
+TGACAGCCAGCAAACCAAAACTCGACCTGACAAACACAGACTGGATTTACGGGGTGGATCTATGAAAAACATCGCCGCAC
+AGATGGTTAACTTTGACCGTGAGCAGATGCGTCGGATCGCCAACAACATGCCGGAACAGTACGACGAAAAGCCGCAGGTA
+CAGCAGGTAGCGCAGATCATCAACGGTGTGTTCAGCCAGTTACTGGCAACTTTCCCGGCGAGCCTGGCTAACCGTGACCA
+GAACGAAGTGAACGAAATCCGTCGCCAGTGGGTTCTGGCTTTTCGGGAAAACGGGATCACCACGATGGAACAGGTTAACG
+CAGGAATGCGCGTAGCCCGTCGGCAGAATCGACCATTTCTGCCATCACCCGGGCAGTTTGTTGCATGGTGCCGGGAAGAA
+GCATCCGTTACCGCCGGACTGCCAAACGTCAGCGAGCTGGTTGATATGGTTTACGAGTATTGCCGGAAGCGAGGCCTGTA
+TCCGGATGCGGAGTCTTATCCGTGGAAATCAAACGCGCACTACTGGCTGGTTACCAACCTGTATCAGAACATGCGGGCCA
+ATGCGCTTACTGATGCGGAATTACGCCGTAAGGCCGCAGATGAGCTTGTCCATATGACTGCGAGAATTAACCGTGGTGAG
+GCGATCCCTGAACCAGTAAAACAACTTCCTGTCATGGGCGGTAGACCTCTAAATCGTGCACAGGCTCTGGCGAAGATCGC
+AGAAATCAAAGCTAAGTTCGGACTGAAAGGAGCAAGTGTATGACGGGCAAAGAGGCAATTATTCATTACCTGGGGACGCA
+TAATAGCTTCTGTGCGCCGGACGTTGCCGCGCTAACAGGCGCAACAGTAACCAGCATAAATCAGGCCGCGGCTAAAATGG
+CACGGGCAGGTCTTCTGGTTATCGAAGGTAAGGTCTGGCGAACGGTGTATTACCGGTTTGCTACCAGGGAAGAACGGGAA
+GGAAAGATGAGCACGAACCTGGTTTTTAAGGAGTGTCGCCAGAGTGCCGCGATGAAACGGGTATTGGCGGTATATGGAGT
+TAAAAGATGACCATCTACATTACTGAGCTAATAACAGGCCTGCTGGTAATCGCAGGCCTTTTTATTTGGGGGAGAGGGAA
+GTCATGAAAAAACTAACCTTTGAAATTCGATCTCCAGCACATCAGCAAAACGCTATTCACGCAGTACAGCAAATCCTTCC
+AGACCCAACCAAACCAATCGTAGTAACCATTCAGGAACGCAACCGCAGCTTAGACCAAAACAGGAAGCTATGGGCCTGCT
+TAGGTGACGTCTCTCGTCAGGTTGAATGGCATGGTCGCTGGCTGGATGCAGAAAGCTGGAAGTGTGTGTTTACCGCAGCA
+TTAAAGCAGCAGGATGTTGTTCCTAACCTTGCCGGGAATGGCTTTGTGGTAATAGGCCAGTCAACCAGCAGGATGCGTGT
+AGGCGAATTTGCGGAGCTATTAGAGCTTATACAGGCATTCGGTACAGAGCGTGGCGTTAAGTGGTCAGACGAAGCGAGAC
+TGGCTCTGGAGTGGAAAGCGAGATGGGGAGACAGGGCTGCATGATAAATGTCGTTAGTTTCTCCGGTGGCAGGACGTCAG
+CATATTTGCTCTGGCTAATGGAGCAAAAGCGACGGGCAGGTAAAGACGTGCATTACGTTTTCATGGATACAGGTTGTGAA
+CATCCAATGACATATCGGTTTGTCAGGGAAGTTGTGAAGTTCTGGGATATACCGCTCACCGTATTGCAGGTTGATATCAA
+CCCGGAGCTTGGACAGCCAAATGGTTATACGGTATGGGAACCAAAGGATATTCAGACGCGAATGCCTGTTCTGAAGCCAT
+TTATCGATATGGTAAAGAAATATGGCACTCCATACGTCGGCGGCGCGTTCTGCACTGACAGATTAAAACTCGTTCCCTTC
+ACCAAATACTGTGATGACCATTTCGGGCGAGGGAATTACACCACGTGGATTGGCATCAGAGCTGATGAACCGAAGCGGCT
+AAAGCCAAAGCCTGGAATCAGATATCTTGCTGAACTGTCAGACTTTGAGAAGGAAGATATCCTCGCATGGTGGAAGCAAC
+AACCATTCGATTTGCAAATACCGGAACATCTCGGTAACTGCATATTCTGCATTAAAAAATCAACGCAAAAAATCGGACTT
+GCCTGCAAAGATGAGGAGGGATTGCAGCGTGTTTTTAATGAGGTCATCACGGGATCCCATGTGCGTGACGGACATCGGGA
+AACGCCAAAGGAGATTATGTACCGAGGAAGAATGTCGCTGGACGGTATCGCGAAAATGTATTCAGAAAATGATTATCAAG
+CCCTGTATCAGGACATGGTACGAGCTAAAAGATTCGATACCGGCTCTTGTTCTGAGTCATGCGAAATATTTGGAGGGCAG
+CTTGATTTCGACTTCGGGAGGGAAGCTGCATGATGCGATGTTATCGGTGCGGTGAATGCAAAGAAGATAACCGCTTCCGA
+CCAAATCAACCTTACTGGAATCGATGGTGTCTCCGGTGTGAAAGAACACCAACAGGGGTGTTACCACTACCGCAGGAAAA
+GGAGGACGTGTGGCGAGACAGCGACGAAGTATCACCGACATAATCTGCGAAAACTGCAAATACCTTCCAACGAAACGCAC
+CAGAAATAAACCCAAGCCAATCCCAAAAGAATCTGACGTAAAAACCTTCAACTACACGGCTCACCTGTGGGATATCCGGT
+GGCTAAGACGTCGTGCGAGGAAAACAAGGTGATTGACCAAAATCGAAGTTACGAACAAGAAAGCGTCGAGCGAGCTTTAA
+CGTGCGCTAACTGCGGTCAGAAGCTGCATGTGCTGGAAGTTCACGTGTGTGAGCACTGCTGCGCAGAACTGATGAGCGAT
+CCGAATAGCTCGATGCACGAGGAAGAAGATGATGGCTAAACCAGCGCGAAGACGATGTAAAAACGATGAATGCCGGGAAT
+GGTTTCACCCTGCATTCGCTAATCAGTGGTGGTGCTCTCCAGAGTGTGGAACCAAGATAGCACTCGAACGACGAAGTAAA
+GAACGCGAAAAAGCGGAAAAAGCAGCAGAGAAGAAACGACGACGAGAGGAGCAGAAACAGAAAGATAAACTTAAGATTCG
+AAAACTCGCCTTAAAGCCCCGCAGTTACTGGATTAAACAAGCCCAACAAGCCGTAAACGCCTTCATCAGAGAAAGAGACC
+GCGACTTACCATGTATCTCGTGCGGAACGCTCACGTCTGCTCAGTGGGATGCCGGACATTACCGGACAACTGCTGCGGCA
+CCTCAACTCCGATTTAATGAACGCAATATTCACAAGCAATGCGTGGTGTGCAACCAGCACAAAAGCGGAAATCTCGTTCC
+GTATCGCGTCGAACTGATTAGCCGCATCGGGCAGGAAGCAGTAGACGAAATCGAATCAAACCATAACCGCCATCGCTGGA
+CTATCGAAGAGTGCAAGGCGATCAAGGCAGAGTACCAACAGAAACTCAAAGACCTGCGAAATAGCAGAAGTGAGGCCGCA
+TGACGTTCTCAGTAAAAACCATTCCAGACATGCTCGTTGAAACATACGGAAATCAGACAGAAGTAGCACGCAGACTGAAA
+TGTAGTCGCGGTACGGTCAGAAAATACGTTGATGATAAAGACGGGAAAATGCACGCCATCGTCAACGACGTTCTCATGGT
+TCATCGCGGATGGAGTGAAAGAGATGCGCTATTACGAAAAAATTGATGGCAGCAAATACCGAAATATTTGGGTAGTTGGC
+GATCTGCACGGATGCTACACGAACCTGATGAACAAACTGGATACGATTGGATTCGACAACAAAAAAGACCTGCTTATCTC
+GGTGGGCGATTTGGTTGATCGTGGTGCAGAGAACGTTGAATGCCTGGAATTAATCACATTCCCCTGGTTCAGAGCTGTAC
+GTGGAAACCATGAGCAAATGATGATTGATGGCTTATCAGAGCGTGGAAACGTTAATCACTGGCTGCTTAATGGCGGTGGC
+TGGTTCTTTAATCTCGATTACGACAAAGAAATTCTGGCTAAAGCTCTTGCCCATAAAGCAGATGAACTTCCGTTAATCAT
+CGAACTGGTGAGCAAAGATAAAAAATATGTTATCTGCCACGCCGATTATCCCTTTGACGAATACGAGTTTGGAAAGCCAG
+TTGATCATCAGCAGGTAATCTGGAACCGCGAACGAATCAGCAACTCACAAAACGGGATCGTGAAAGAAATCAAAGGCGCG
+GACACGTTCATCTTTGGTCATACGCCAGCAGTGAAACCACTCAAGTTTGCCAACCAAATGTATATCGATACCGGCGCAGT
+GTTCTGCGGAAACCTAACATTGATTCAGGTACAGGGAGAAGGCGCATGAGACTCGAAAGCGTAGCTAAATTTCATTCGCC
+AAAAAGCCCGATGATGAGCGACTCACCACGGGCCACGGCTTCTGACTCTCTTTCCGGTACTGATGTGATGGCTGCTATGG
+GGATGGCGCAATCACAAGCCGGATTCGGTATGGCTGCATTCTGCGGTAAGCACGAACTCAGCCAGAACGACAAACAAAAG
+GCTATCAACTATCTGATGCAATTTGCACACAAGGTATCGGGGAAATACCGTGGTGTGGCAAAGCTTGAAGGAAATACTAA
+GGCAAAGGTACTGCAAGTGCTCGCAACATTCGCTTATGCGGATTATTGCCGTAGTGCCGCGACGCCGGGGGCAAGATGCA
+GAGATTGCCATGGTACAGGCCGTGCGGTTGATATTGCCAAAACAGAGCTGTGGGGGAGAGTTGTCGAGAAAGAGTGCGGA
+AGATGCAAAGGCGTCGGCTATTCAAGGATGCCAGCAAGCGCAGCATATCGCGCTGTGACGATGCTAATCCCAAACCTTAC
+CCAACCCACCTGGTCACGCACTGTTAAGCCGCTGTATGACGCTCTGGTGGTGCAATGCCACAAAGAAGAGTCAATCGCAG
+ACAACATTTTGAATGCGGTCACACGTTAGCAGCATGATTGCCACGGATGGCAACATATTAACGGCATGATATTGACTTAT
+TGAATAAAATTGGGTAAATTTGACTCAACGATGGGTTAATTCGCTCGTTGTGGTAGTGAGATGAAAAGAGGCGGCGCTTA
+CTACCGATTCCGCCTAGTTGGTCACTTCGACGTATCGTCTGGAACTCCAACCATCGCAGGCAGAGAGGTCTGCAAAATGC
+AATCCCGAAACAGTTCGCAGGTAATAGTTAGAGCCTGCATAACGGTTTCGGGATTTTTTATATCTGCACAACAGGTAAGA
+GCATTGAGTCGATAATCGTGAAGAGTCGGCGAGCCTGGTTAGCCAGTGCTCTTTCCGTTGTGCTGAATTAAGCGAATACC
+GGAAGCAGAACCGGATCACCAAATGCGTACAGGCGTCATCGCCGCCCAGCAACAGCACAACCCAAACTGAGCCGTAGCCA
+CTGTCTGTCCTGAATTCATTAGTAATAGTTACGCTGCGGCCTTTTACACATGACCTTCGTGAAAGCGGGTGGCAGGAGGT
+CGCGCTAACAACCTCCTGCCGTTTTGCCCGTGCATATCGGTCACGAACAAATCTGATTACTAAACACAGTAGCCTGGATT
+TGTTCTATCAGTAATCGACCTTATTCCTAATTAAATAGAGCAAATCCCCTTATTGGGGGTAAGACATGAAGATGCCAGAA
+AAACATGACCTGTTGGCCGCCATTCTCGCGGCAAAGGAACAAGGCATCGGGGCAATCCTTGCGTTTGCAATGGCGTACCT
+TCGCGGCAGATATAATGGCGGTGCGTTTACAAAAACAGTAATCGACGCAACGATGTGCGCCATTATCGCCTAGTTCATTC
+GTGACCTTCTCGACTTCGCCGGACTAAGTAGCAATCTCGCTTATATAACGAGCGTGTTTATCGGCTACATCGGTACTGAC
+TCGATTGGTTCGCTTATCAAACGCTTCGCTGCTAAAAAAGCCGGAGTAGAAGATGGTAGAAATCAATAATCAACGTAAGG
+CGTTCCTCGATATGCTGGCGTGGTCGGAGGGAACTGATAACGGACGTCAGAAAACCAGAAATCATGGTTATGACGTCATT
+GTAGGCGGAGAGCTATTTACTGATTACTCCGATCACCCTCGCAAACTTGTCACGCTAAACCCAAAACTCAAATCAACAGG
+CGCCGGACGCTACCAGCTTCTTTCCCGTTGGTGGGATGCCTACCGCAAGCAGCTTGGCCTGAAAGACTTCTCTCCGAAAA
+GTCAGGACGCTGTGGCATTGCAGCAGATTAAGGAGCGTGGCGCTTTACCTATGATTGATCGTGGTGATATCCGTCAGGCA
+ATCGACCGTTGCAGCAATATCTGGGCTTCACTGCCGGGCGCTGGTTATGGTCAGTTCGAGCATAAGGCTGACAGCCTGAT
+TGCAAAATTCAAAGAAGCGGGCGGAACGGTCAGAGAGATTGATGTATGAGCAGAGTCACCGCGATTATCTCCGCTCTGGT
+TATCTGCATCATCGTCTGCCTGTCATGGGCTGTTAATCATTACCGTGATAACGCCATTACCTACAAAGCCCAGCGCGACA
+AAAATGCCAGAGAACTGAAGCTGGCGAACGCGGCAATTACTGACATGCAGATGCGTCAGCGTGATGTTGCTGCGCTCGAT
+GCAAAATACACGAAGGAGTTAGCTGATGCTAAAGCTGAAAATGATGCTCTGCGTGATGATGTTGCCGCTGGTCGTCGTCG
+GTTGCACATCAAAGCAGTCTGTCAGTCAGTGCGTGAAGCCACCACCGCCTCCGGCGTGGATAATGCAGCCTCCCCCCGAC
+TGGCAGACACCGCTGAACGGGATTATTTCACCCTCAGAGAGAGGCTGATCACTATGCAAAAACAACTGGAAGGAACCCAG
+AAGTATATTAATGAGCAGTGCAGATAGAGTTGCCCATATCGATGGGCAACTCATGCAATTATTGTGAGCAATACACACGC
+GCTTCCAGCGGAGTATAAATGCCTAAAGTAATAAAACCGAGCAATCCATTTACGAATGTTTGCTGGGTTTCTGTTTTAAC
+AACATTTTCTGCGCCGCCACAAATTTTGGCTGCATCGACAGTTTTCTTCTGCCCAATTCCAGAAACGAAGAAATGATGGG
+TGATGGTTTCCTTTGGTGCTACTGCTGCCGGTTTGTTTTGAACAGTAAACGTCTGTTGAGCACATCCTGTAATAAGCAGG
+GCCAGCGCAGTAGCGAGTAGCATTTTTTTCATGGTGTTATTCCCGATGCTTTTTGAAGTTCGCAGAATCGTATGTGTAGA
+AAATTAAACAAACCCTAAACAATGAGTTGAAATTTCATATTGTTAATATTTATTAATGTATGTCAGGTGCGATGAATCGT
+CATTGTATTCCCGGATTAACTATGTCCACAGCCCTGACGGGGAACTTCTCTGCGGGAGTGTCCGGGAATAATTAAAACGA
+TGCACACAGGGTTTAGCGCGTACACGTATTGCATTATGCCAACGCCCCGGTGCTGACACGGAAGAAACCGGACGTTATGA
+TTTAGCGTGGAAAGATTTGTGTAGTGTTCTGAATGCTCTCAGTAAATAGTAATGAATTATCAAAGGTATAGTAATATCTT
+TTATGTTCATGGATATTTGTAACCCATCGGAAAACTCCTGCTTTAGCAAGATTTTCCCTGTATTGCTGAAATGTGATTTC
+TCTTGATTTCAACCTATCATAGGACGTTTCTATAAGATGCGTGTTTCTTGAGAATTTAACATTTACAACCTTTTTAAGTC
+CTTTTATTAACACGGTGTTATCGTTTTCTAACACGATGTGAATATTATCTGTGGCTAGATAGTAAATATAATGTGAGACG
+TTGTGACGTTTTAGTTCAGAATAAAACAATTCACAGTCTAAATCTTTTCGCACTTGATCGAATATTTCTTTAAAAATGGC
+AACCTGAGCCATTGGTAAAACCTTCCATGTGATACGAGGGCGCGTAGTTTGCATTATCGTTTTTATCGTTTCAATCTGGT
+CTGACCTCCTTGTGTTTTGTTGATGATTTATGTCAAATATTAGGAATGTTTTCACTTAATAGTATTGGTTGCGTAACAAA
+GTGCGGTCCTGCTGGCATTCTGGAGGGAAATACAACCGACAGATGTATGTAAGGCCAACGTGCTCAAATCTTCATACAGA
+AAGATTTGAAGTAATATTTTAACCGCTAGATGAAGAGCAAGCGCATGGAGCGACAAAATGAATAAAGAACAATCTGCTGA
+TGATCCCTCCGTGGATCTGATTCGTGTAAAAAATATGCTTAATAGCACCATTTCTATGAGTTACCCTGATGTTGTAATTG
+CATGTATAGAACATAAGGTGTCTCTGGAAGCATTCAGAGCAATTGAGGCAGCGTTGGTGAAGCACGATAATAATATGAAG
+GATTATTCCCTGGTGGTTGACTGATCACCATAACTGCTAATCATTCAAACTATTTAGTCTGTGACAGAGCCAACACGCAG
+TCTGTCACTGTCAGGAAAGTGGTAAAACTGCAACTCAATTACTGCAATGCCCTCGTAATTAAGTGAATTTACAATATCGT
+CCTGTTCGGAGGGAAGAACGCGGGATGTTCATTCTTCATCACTTTTAATTGATGTATATGCTCTCTTTTCTGACGTTAGT
+CTCCGACGGCAGGCTTCAATGACCCAGGCTGAGAAATTCCCGGACCCTTTTTGCTCAAGAGCGATGTTAATTTGTTCAAT
+CATTTGGTTAGGAAAGCGGATGTTGCGGGTTGTTGTTCTGCGGGTTCTGTTCTTCGTTGACATGAGGTTGCCCCGTATTC
+AGTGTCGCTGATTTGTATTGTCTGAAGTTGTTTTTACGTTAAGTTGATGCAGATCAATTAATACGATACCTGCGTCATAA
+TTGATTATTTGACGTGGTTTGATGGCCTCCACGCACGTTGTGATATGTAGATGATAATCATTATCACTTTACGGGTCCTT
+TCCGGTGATCCGACAGGTTACG
diff --git a/tests/data/lambdaNEB.fa.fai b/tests/data/lambdaNEB.fa.fai

new file mode 100644 (file)

index 0000000..064af36
--- /dev/null
+++ b/tests/data/lambdaNEB.fa.fai
@@ -0,0 +1 @@
+lambda_NEB3011 48502   16      80      81
diff --git a/tests/data/merge.fofn b/tests/data/merge.fofn

new file mode 100644 (file)

index 0000000..8a79dff
--- /dev/null
+++ b/tests/data/merge.fofn
@@ -0,0 +1,2 @@
+aligned.bam
+aligned2.bam
diff --git a/tests/data/phi29.bam b/tests/data/phi29.bam

new file mode 100644 (file)

index 0000000..46176b6

Binary files /dev/null and b/tests/data/phi29.bam differ
diff --git a/tests/data/phi29.bam.pbi b/tests/data/phi29.bam.pbi

new file mode 100644 (file)

index 0000000..5282b94

Binary files /dev/null and b/tests/data/phi29.bam.pbi differ
diff --git a/tests/data/polymerase/consolidate.subread.dataset.xml b/tests/data/polymerase/consolidate.subread.dataset.xml

new file mode 100644 (file)

index 0000000..ca85a7a
--- /dev/null
+++ b/tests/data/polymerase/consolidate.subread.dataset.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet 
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" 
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd" 
+    UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" 
+    TimeStampedName="subreadset_150304_231155" 
+    MetaType="PacBio.DataSet.SubreadSet" 
+    Name="DataSet_SubreadSet" 
+    Version="3.0.0" 
+    CreatedAt="2015-01-27T09:00:01"> 
+<pbbase:ExternalResources>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="production.subreads.bam">
+    </pbbase:ExternalResource>
+    <pbbase:ExternalResource
+        UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5195"
+        TimeStampedName="scraps_bam_150304_231155"
+        MetaType="PacBio.SubreadFile.ScrapsBamFile"
+        ResourceId="production.scraps.bam">
+    </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+<pbds:Filters>
+    <pbds:Filter>
+        <pbbase:Properties>
+            <pbbase:Property Name="qStart" Value="4000" Operator=">"/>
+            <pbbase:Property Name="qStart" Value="5000" Operator="<"/>
+        </pbbase:Properties>
+    </pbds:Filter>
+</pbds:Filters>
+</pbds:SubreadSet>
diff --git a/tests/data/polymerase/filtered_resources.subread.dataset.xml b/tests/data/polymerase/filtered_resources.subread.dataset.xml

new file mode 100644 (file)

index 0000000..e414e00
--- /dev/null
+++ b/tests/data/polymerase/filtered_resources.subread.dataset.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet 
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" 
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd" 
+    UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" 
+    TimeStampedName="subreadset_150304_231155" 
+    MetaType="PacBio.DataSet.SubreadSet" 
+    Name="DataSet_SubreadSet" 
+    Version="3.0.0" 
+    CreatedAt="2015-01-27T09:00:01"> 
+<pbbase:ExternalResources>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="./production.subreads.bam">
+        <pbbase:ExternalResources>
+            <pbbase:ExternalResource 
+                UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5195" 
+                TimeStampedName="scraps_bam_150304_231155" 
+                MetaType="PacBio.SubreadFile.ScrapsBamFile" 
+                ResourceId="./production.scraps.bam">
+            </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+    </pbbase:ExternalResource>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="./internal.subreads.bam">
+        <pbbase:ExternalResources>
+            <pbbase:ExternalResource 
+                UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5195" 
+                TimeStampedName="scraps_bam_150304_231155" 
+                MetaType="PacBio.SubreadFile.ScrapsBamFile" 
+                ResourceId="./internal.scraps.bam">
+            </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+    </pbbase:ExternalResource>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5197" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.HqRegionBamFile" 
+        ResourceId="./production_hq.hqregion.bam">
+        <pbbase:ExternalResources>
+            <pbbase:ExternalResource 
+                UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5199" 
+                TimeStampedName="scraps_bam_150304_231155" 
+                MetaType="PacBio.SubreadFile.HqScrapsBamFile" 
+                ResourceId="./production_hq.scraps.bam">
+            </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+    </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+<pbds:Filters>
+    <pbds:Filter>
+        <pbbase:Properties>
+            <pbbase:Property Name="zm" Value="100000" Operator="=="/>
+        </pbbase:Properties>
+    </pbds:Filter>
+</pbds:Filters>
+</pbds:SubreadSet>
+\ No newline at end of file
diff --git a/tests/data/polymerase/internal.hqregions.bam b/tests/data/polymerase/internal.hqregions.bam

new file mode 100644 (file)

index 0000000..8e31e6b

Binary files /dev/null and b/tests/data/polymerase/internal.hqregions.bam differ
diff --git a/tests/data/polymerase/internal.hqregions.bam.pbi b/tests/data/polymerase/internal.hqregions.bam.pbi

new file mode 100644 (file)

index 0000000..b79e661

Binary files /dev/null and b/tests/data/polymerase/internal.hqregions.bam.pbi differ
diff --git a/tests/data/polymerase/internal.lqregions.bam b/tests/data/polymerase/internal.lqregions.bam

new file mode 100644 (file)

index 0000000..96878a3

Binary files /dev/null and b/tests/data/polymerase/internal.lqregions.bam differ
diff --git a/tests/data/polymerase/internal.lqregions.bam.pbi b/tests/data/polymerase/internal.lqregions.bam.pbi

new file mode 100644 (file)

index 0000000..a4b7237

Binary files /dev/null and b/tests/data/polymerase/internal.lqregions.bam.pbi differ
diff --git a/tests/data/polymerase/internal.polymerase.bam b/tests/data/polymerase/internal.polymerase.bam

new file mode 100644 (file)

index 0000000..8f293c1

Binary files /dev/null and b/tests/data/polymerase/internal.polymerase.bam differ
diff --git a/tests/data/polymerase/internal.polymerase.bam.pbi b/tests/data/polymerase/internal.polymerase.bam.pbi

new file mode 100644 (file)

index 0000000..c423905

Binary files /dev/null and b/tests/data/polymerase/internal.polymerase.bam.pbi differ
diff --git a/tests/data/polymerase/internal.scraps.bam b/tests/data/polymerase/internal.scraps.bam

new file mode 100644 (file)

index 0000000..47c1689

Binary files /dev/null and b/tests/data/polymerase/internal.scraps.bam differ
diff --git a/tests/data/polymerase/internal.scraps.bam.pbi b/tests/data/polymerase/internal.scraps.bam.pbi

new file mode 100644 (file)

index 0000000..9db21f2

Binary files /dev/null and b/tests/data/polymerase/internal.scraps.bam.pbi differ
diff --git a/tests/data/polymerase/internal.subreads.bam b/tests/data/polymerase/internal.subreads.bam

new file mode 100644 (file)

index 0000000..00ad171

Binary files /dev/null and b/tests/data/polymerase/internal.subreads.bam differ
diff --git a/tests/data/polymerase/internal.subreads.bam.pbi b/tests/data/polymerase/internal.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..b0d7e28

Binary files /dev/null and b/tests/data/polymerase/internal.subreads.bam.pbi differ
diff --git a/tests/data/polymerase/multiple_resources.subread.dataset.xml b/tests/data/polymerase/multiple_resources.subread.dataset.xml

new file mode 100644 (file)

index 0000000..109535d
--- /dev/null
+++ b/tests/data/polymerase/multiple_resources.subread.dataset.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet 
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" 
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd" 
+    UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" 
+    TimeStampedName="subreadset_150304_231155" 
+    MetaType="PacBio.DataSet.SubreadSet" 
+    Name="DataSet_SubreadSet" 
+    Version="3.0.0" 
+    CreatedAt="2015-01-27T09:00:01"> 
+<pbbase:ExternalResources>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="./production.subreads.bam">
+        <pbbase:ExternalResources>
+            <pbbase:ExternalResource 
+                UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5195" 
+                TimeStampedName="scraps_bam_150304_231155" 
+                MetaType="PacBio.SubreadFile.ScrapsBamFile" 
+                ResourceId="./production.scraps.bam">
+            </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+    </pbbase:ExternalResource>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5197" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.HqRegionBamFile" 
+        ResourceId="./production_hq.hqregion.bam">
+        <pbbase:ExternalResources>
+            <pbbase:ExternalResource 
+                UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5199" 
+                TimeStampedName="scraps_bam_150304_231155" 
+                MetaType="PacBio.SubreadFile.HqScrapsBamFile" 
+                ResourceId="./production_hq.scraps.bam">
+            </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+    </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+</pbds:SubreadSet>
+\ No newline at end of file
diff --git a/tests/data/polymerase/production.polymerase.bam b/tests/data/polymerase/production.polymerase.bam

new file mode 100644 (file)

index 0000000..4c84b23

Binary files /dev/null and b/tests/data/polymerase/production.polymerase.bam differ
diff --git a/tests/data/polymerase/production.scraps.bam b/tests/data/polymerase/production.scraps.bam

new file mode 100644 (file)

index 0000000..a32bdfb

Binary files /dev/null and b/tests/data/polymerase/production.scraps.bam differ
diff --git a/tests/data/polymerase/production.scraps.bam.pbi b/tests/data/polymerase/production.scraps.bam.pbi

new file mode 100644 (file)

index 0000000..5ef119d

Binary files /dev/null and b/tests/data/polymerase/production.scraps.bam.pbi differ
diff --git a/tests/data/polymerase/production.subreads.bam b/tests/data/polymerase/production.subreads.bam

new file mode 100644 (file)

index 0000000..452aad5

Binary files /dev/null and b/tests/data/polymerase/production.subreads.bam differ
diff --git a/tests/data/polymerase/production.subreads.bam.pbi b/tests/data/polymerase/production.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..7ff2fcc

Binary files /dev/null and b/tests/data/polymerase/production.subreads.bam.pbi differ
diff --git a/tests/data/polymerase/production_hq.hqregion.bam b/tests/data/polymerase/production_hq.hqregion.bam

new file mode 100644 (file)

index 0000000..66d436b

Binary files /dev/null and b/tests/data/polymerase/production_hq.hqregion.bam differ
diff --git a/tests/data/polymerase/production_hq.hqregion.bam.pbi b/tests/data/polymerase/production_hq.hqregion.bam.pbi

new file mode 100644 (file)

index 0000000..ec8f166

Binary files /dev/null and b/tests/data/polymerase/production_hq.hqregion.bam.pbi differ
diff --git a/tests/data/polymerase/production_hq.scraps.bam b/tests/data/polymerase/production_hq.scraps.bam

new file mode 100644 (file)

index 0000000..716e098

Binary files /dev/null and b/tests/data/polymerase/production_hq.scraps.bam differ
diff --git a/tests/data/polymerase/production_hq.scraps.bam.pbi b/tests/data/polymerase/production_hq.scraps.bam.pbi

new file mode 100644 (file)

index 0000000..1017562

Binary files /dev/null and b/tests/data/polymerase/production_hq.scraps.bam.pbi differ
diff --git a/tests/data/polymerase/qnameFiltered.subreads.dataset.xml b/tests/data/polymerase/qnameFiltered.subreads.dataset.xml

new file mode 100644 (file)

index 0000000..c200ded
--- /dev/null
+++ b/tests/data/polymerase/qnameFiltered.subreads.dataset.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd"
+    UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c"
+    TimeStampedName="subreadset_150304_231155"
+    MetaType="PacBio.DataSet.SubreadSet"
+    Name="DataSet_SubreadSet"
+    Version="3.0.0"
+    CreatedAt="2015-01-27T09:00:01">
+<pbbase:ExternalResources>
+    <pbbase:ExternalResource
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193"
+        TimeStampedName="subread_bam_150304_231155"
+        MetaType="PacBio.SubreadFile.SubreadBamFile"
+        ResourceId="production.subreads.bam">
+    </pbbase:ExternalResource>
+    <pbbase:ExternalResource
+        UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5195"
+        TimeStampedName="scraps_bam_150304_231155"
+        MetaType="PacBio.SubreadFile.ScrapsBamFile"
+        ResourceId="production.scraps.bam">
+    </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+<pbds:Filters>
+    <pbds:Filter>
+        <pbbase:Properties>
+            <pbbase:Property Name="qname_file" Value="qname_whitelist.txt" Operator="="/>
+        </pbbase:Properties>
+    </pbds:Filter>
+</pbds:Filters>
+</pbds:SubreadSet>
diff --git a/tests/data/polymerase/qname_whitelist.txt b/tests/data/polymerase/qname_whitelist.txt

new file mode 100644 (file)

index 0000000..0004061
--- /dev/null
+++ b/tests/data/polymerase/qname_whitelist.txt
@@ -0,0 +1,3 @@
+ArminsFakeMovie/0/3116_3628
+ArminsFakeMovie/0/3722_4267
+ArminsFakeMovie/0/6812_7034
diff --git a/tests/data/polymerase/scrapless.scraps.bam b/tests/data/polymerase/scrapless.scraps.bam

new file mode 100644 (file)

index 0000000..7b989c4

Binary files /dev/null and b/tests/data/polymerase/scrapless.scraps.bam differ
diff --git a/tests/data/polymerase/scrapless.scraps.bam.pbi b/tests/data/polymerase/scrapless.scraps.bam.pbi

new file mode 100644 (file)

index 0000000..140af8a

Binary files /dev/null and b/tests/data/polymerase/scrapless.scraps.bam.pbi differ
diff --git a/tests/data/polymerase/scrapless.subreads.bam b/tests/data/polymerase/scrapless.subreads.bam

new file mode 100644 (file)

index 0000000..739b3b4

Binary files /dev/null and b/tests/data/polymerase/scrapless.subreads.bam differ
diff --git a/tests/data/polymerase/scrapless.subreads.bam.pbi b/tests/data/polymerase/scrapless.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..a20a00f

Binary files /dev/null and b/tests/data/polymerase/scrapless.subreads.bam.pbi differ
diff --git a/tests/data/relative/a/test.bam b/tests/data/relative/a/test.bam

new file mode 100644 (file)

index 0000000..bd06b8a

Binary files /dev/null and b/tests/data/relative/a/test.bam differ
diff --git a/tests/data/relative/b/test1.bam b/tests/data/relative/b/test1.bam

new file mode 100644 (file)

index 0000000..bd06b8a

Binary files /dev/null and b/tests/data/relative/b/test1.bam differ
diff --git a/tests/data/relative/b/test2.bam b/tests/data/relative/b/test2.bam

new file mode 100644 (file)

index 0000000..bd06b8a

Binary files /dev/null and b/tests/data/relative/b/test2.bam differ
diff --git a/tests/data/relative/relative.fofn b/tests/data/relative/relative.fofn

new file mode 100644 (file)

index 0000000..755c589
--- /dev/null
+++ b/tests/data/relative/relative.fofn
@@ -0,0 +1,3 @@
+a/test.bam
+b/test1.bam
+b/test2.bam
diff --git a/tests/data/relative/relative.xml b/tests/data/relative/relative.xml

new file mode 100644 (file)

index 0000000..0e78fe4
--- /dev/null
+++ b/tests/data/relative/relative.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" MetaType="PacBio.DataSet.AlignmentSet" Name="DataSet_AlignmentSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" CreatedAt="2015-01-27T09:00:01" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+        <pbbase:ExternalResources>
+                <pbbase:ExternalResource MetaType="SubreadFile.SubreadBamFile" ResourceId="./a/test.bam" />
+                <pbbase:ExternalResource MetaType="SubreadFile.SubreadBamFile" ResourceId="./b/test1.bam" />
+                <pbbase:ExternalResource MetaType="SubreadFile.SubreadBamFile" ResourceId="./b/test2.bam"/>
+        </pbbase:ExternalResources>
+</pbds:SubreadSet>
diff --git a/tests/data/relative/relative2.fofn b/tests/data/relative/relative2.fofn

new file mode 100644 (file)

index 0000000..f1969ac
--- /dev/null
+++ b/tests/data/relative/relative2.fofn
@@ -0,0 +1,4 @@
+a/test.bam
+b/test1.bam
+b/test2.bam
+relative.xml
diff --git a/tests/data/segfault.bam b/tests/data/segfault.bam

new file mode 100644 (file)

index 0000000..755c7eb

Binary files /dev/null and b/tests/data/segfault.bam differ
diff --git a/tests/data/test_group_query/group.fofn.in b/tests/data/test_group_query/group.fofn.in

new file mode 100644 (file)

index 0000000..4af9e82
--- /dev/null
+++ b/tests/data/test_group_query/group.fofn.in
@@ -0,0 +1,3 @@
+@PacBioBAM_TestsDir@/data/test_group_query/test1.bam
+@PacBioBAM_TestsDir@/data/test_group_query/test2.bam
+@PacBioBAM_TestsDir@/data/test_group_query/test3.bam
diff --git a/tests/data/test_group_query/test1.bam b/tests/data/test_group_query/test1.bam

new file mode 100644 (file)

index 0000000..5673abc

Binary files /dev/null and b/tests/data/test_group_query/test1.bam differ
diff --git a/tests/data/test_group_query/test2.bam b/tests/data/test_group_query/test2.bam

new file mode 100644 (file)

index 0000000..565b224

Binary files /dev/null and b/tests/data/test_group_query/test2.bam differ
diff --git a/tests/data/test_group_query/test2.bam.pbi b/tests/data/test_group_query/test2.bam.pbi

new file mode 100644 (file)

index 0000000..384ad28

Binary files /dev/null and b/tests/data/test_group_query/test2.bam.pbi differ
diff --git a/tests/data/test_group_query/test3.bam b/tests/data/test_group_query/test3.bam

new file mode 100644 (file)

index 0000000..3b1e21b

Binary files /dev/null and b/tests/data/test_group_query/test3.bam differ
diff --git a/tests/data/unmap1.bam b/tests/data/unmap1.bam

new file mode 100644 (file)

index 0000000..3fe2af5

Binary files /dev/null and b/tests/data/unmap1.bam differ
diff --git a/tests/data/unmap1.bam.bai b/tests/data/unmap1.bam.bai

new file mode 100644 (file)

index 0000000..dd19971

Binary files /dev/null and b/tests/data/unmap1.bam.bai differ
diff --git a/tests/data/unmap2.bam b/tests/data/unmap2.bam

new file mode 100644 (file)

index 0000000..8feed79

Binary files /dev/null and b/tests/data/unmap2.bam differ
diff --git a/tests/data/unmap2.bam.bai b/tests/data/unmap2.bam.bai

new file mode 100644 (file)

index 0000000..f495714

Binary files /dev/null and b/tests/data/unmap2.bam.bai differ
diff --git a/tests/files.cmake b/tests/files.cmake

new file mode 100644 (file)

index 0000000..61370ac
--- /dev/null
+++ b/tests/files.cmake
@@ -0,0 +1,56 @@
+# test case headers
+set( PacBioBAMTest_H
+
+)
+
+# test case sources
+set( PacBioBAMTest_CPP
+
+    ${PacBioBAM_TestsDir}/src/test_Accuracy.cpp
+    ${PacBioBAM_TestsDir}/src/test_AlignmentPrinter.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamFile.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamHeader.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamRecord.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamRecordBuilder.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamRecordClipping.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamRecordImplCore.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamRecordImplTags.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamRecordImplVariableData.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamRecordMapping.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamWriter.cpp
+    ${PacBioBAM_TestsDir}/src/test_BarcodeQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_Cigar.cpp
+    ${PacBioBAM_TestsDir}/src/test_Compare.cpp
+    ${PacBioBAM_TestsDir}/src/test_DataSetCore.cpp
+    ${PacBioBAM_TestsDir}/src/test_DataSetIO.cpp
+    ${PacBioBAM_TestsDir}/src/test_DataSetQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_DataSetXsd.cpp
+    ${PacBioBAM_TestsDir}/src/test_EndToEnd.cpp
+    ${PacBioBAM_TestsDir}/src/test_EntireFileQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_Fasta.cpp
+    ${PacBioBAM_TestsDir}/src/test_FileUtils.cpp
+    ${PacBioBAM_TestsDir}/src/test_Frames.cpp
+    ${PacBioBAM_TestsDir}/src/test_GenomicIntervalQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_IndexedFastaReader.cpp
+    ${PacBioBAM_TestsDir}/src/test_Intervals.cpp
+    ${PacBioBAM_TestsDir}/src/test_PacBioIndex.cpp
+    ${PacBioBAM_TestsDir}/src/test_PbiFilter.cpp
+    ${PacBioBAM_TestsDir}/src/test_PbiFilterQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_QNameQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_QualityValues.cpp
+    ${PacBioBAM_TestsDir}/src/test_Pulse2BaseCache.cpp
+    ${PacBioBAM_TestsDir}/src/test_ReadAccuracyQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_ReadGroupInfo.cpp
+    ${PacBioBAM_TestsDir}/src/test_SamWriter.cpp
+    ${PacBioBAM_TestsDir}/src/test_SequenceUtils.cpp
+    ${PacBioBAM_TestsDir}/src/test_StringUtils.cpp
+    ${PacBioBAM_TestsDir}/src/test_SubreadLengthQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_Tags.cpp
+    ${PacBioBAM_TestsDir}/src/test_TimeUtils.cpp
+    # ${PacBioBAM_TestsDir}/src/test_UnmappedReadsQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_Validator.cpp
+    ${PacBioBAM_TestsDir}/src/test_Version.cpp
+    ${PacBioBAM_TestsDir}/src/test_WhitelistedZmwReadStitcher.cpp
+    ${PacBioBAM_TestsDir}/src/test_ZmwReadStitcher.cpp
+    ${PacBioBAM_TestsDir}/src/test_ZmwQuery.cpp
+)
diff --git a/tests/scripts/cram.py b/tests/scripts/cram.py

new file mode 100755 (executable)

index 0000000..20c4681
--- /dev/null
+++ b/tests/scripts/cram.py
@@ -0,0 +1,516 @@
+#!/usr/bin/env python
+"""Functional testing framework for command line applications"""
+
+import difflib
+import itertools
+import optparse
+import os
+import re
+import signal
+import subprocess
+import sys
+import shutil
+import time
+import tempfile
+
+try:
+    import configparser
+except ImportError:
+    import ConfigParser as configparser
+
+__all__ = ['main', 'test']
+
+def findtests(paths):
+    """Yield tests in paths in sorted order"""
+    for p in paths:
+        if os.path.isdir(p):
+            for root, dirs, files in os.walk(p):
+                if os.path.basename(root).startswith('.'):
+                    continue
+                for f in sorted(files):
+                    if not f.startswith('.') and f.endswith('.t'):
+                        yield os.path.normpath(os.path.join(root, f))
+        else:
+            yield os.path.normpath(p)
+
+def regex(pattern, s):
+    """Match a regular expression or return False if invalid.
+
+    >>> [bool(regex(r, 'foobar')) for r in ('foo.*', '***')]
+    [True, False]
+    """
+    try:
+        return re.match(pattern + r'\Z', s)
+    except re.error:
+        return False
+
+def glob(el, l):
+    r"""Match a glob-like pattern.
+
+    The only supported special characters are * and ?. Escaping is
+    supported.
+
+    >>> bool(glob(r'\* \\ \? fo?b*', '* \\ ? foobar'))
+    True
+    """
+    i, n = 0, len(el)
+    res = ''
+    while i < n:
+        c = el[i]
+        i += 1
+        if c == '\\' and el[i] in '*?\\':
+            res += el[i - 1:i + 1]
+            i += 1
+        elif c == '*':
+            res += '.*'
+        elif c == '?':
+            res += '.'
+        else:
+            res += re.escape(c)
+    return regex(res, l)
+
+annotations = {'glob': glob, 're': regex}
+
+def match(el, l):
+    """Match patterns based on annotations"""
+    for k in annotations:
+        ann = ' (%s)\n' % k
+        if el.endswith(ann) and annotations[k](el[:-len(ann)], l[:-1]):
+            return True
+    return False
+
+class SequenceMatcher(difflib.SequenceMatcher, object):
+    """Like difflib.SequenceMatcher, but matches globs and regexes"""
+
+    def find_longest_match(self, alo, ahi, blo, bhi):
+        """Find longest matching block in a[alo:ahi] and b[blo:bhi]"""
+        # SequenceMatcher uses find_longest_match() to slowly whittle down
+        # the differences between a and b until it has each matching block.
+        # Because of this, we can end up doing the same matches many times.
+        matches = []
+        for n, (el, line) in enumerate(zip(self.a[alo:ahi], self.b[blo:bhi])):
+            if el != line and match(el, line):
+                # This fools the superclass's method into thinking that the
+                # regex/glob in a is identical to b by replacing a's line (the
+                # expected output) with b's line (the actual output).
+                self.a[alo + n] = line
+                matches.append((n, el))
+        ret = super(SequenceMatcher, self).find_longest_match(alo, ahi,
+                                                              blo, bhi)
+        # Restore the lines replaced above. Otherwise, the diff output
+        # would seem to imply that the tests never had any regexes/globs.
+        for n, el in matches:
+            self.a[alo + n] = el
+        return ret
+
+def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
+                 tofiledate='', n=3, lineterm='\n', matcher=SequenceMatcher):
+    """Compare two sequences of lines; generate the delta as a unified diff.
+
+    This is like difflib.unified_diff(), but allows custom matchers.
+    """
+    started = False
+    for group in matcher(None, a, b).get_grouped_opcodes(n):
+        if not started:
+            fromdate = fromfiledate and '\t%s' % fromfiledate or ''
+            todate = fromfiledate and '\t%s' % tofiledate or ''
+            yield '--- %s%s%s' % (fromfile, fromdate, lineterm)
+            yield '+++ %s%s%s' % (tofile, todate, lineterm)
+            started = True
+        i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4]
+        yield "@@ -%d,%d +%d,%d @@%s" % (i1 + 1, i2 - i1, j1 + 1, j2 - j1,
+                                         lineterm)
+        for tag, i1, i2, j1, j2 in group:
+            if tag == 'equal':
+                for line in a[i1:i2]:
+                    yield ' ' + line
+                continue
+            if tag == 'replace' or tag == 'delete':
+                for line in a[i1:i2]:
+                    yield '-' + line
+            if tag == 'replace' or tag == 'insert':
+                for line in b[j1:j2]:
+                    yield '+' + line
+
+needescape = re.compile(r'[\x00-\x09\x0b-\x1f\x7f-\xff]').search
+escapesub = re.compile(r'[\x00-\x09\x0b-\x1f\\\x7f-\xff]').sub
+escapemap = dict((chr(i), r'\x%02x' % i) for i in range(256))
+escapemap.update({'\\': '\\\\', '\r': r'\r', '\t': r'\t'})
+
+def escape(s):
+    """Like the string-escape codec, but doesn't escape quotes"""
+    return escapesub(lambda m: escapemap[m.group(0)], s[:-1]) + ' (esc)\n'
+
+def makeresetsigpipe():
+    """Make a function to reset SIGPIPE to SIG_DFL (for use in subprocesses).
+
+    Doing subprocess.Popen(..., preexec_fn=makeresetsigpipe()) will prevent
+    Python's SIGPIPE handler (SIG_IGN) from being inherited by the
+    child process.
+    """
+    if sys.platform == 'win32' or getattr(signal, 'SIGPIPE', None) is None:
+        return None
+    return lambda: signal.signal(signal.SIGPIPE, signal.SIG_DFL)
+
+def test(path, shell, indent=2):
+    """Run test at path and return input, output, and diff.
+
+    This returns a 3-tuple containing the following:
+
+        (list of lines in test, same list with actual output, diff)
+
+    diff is a generator that yields the diff between the two lists.
+
+    If a test exits with return code 80, the actual output is set to
+    None and diff is set to [].
+    """
+    indent = ' ' * indent
+    cmdline = '%s$ ' % indent
+    conline = '%s> ' % indent
+
+    f = open(path)
+    abspath = os.path.abspath(path)
+    env = os.environ.copy()
+    env['TESTDIR'] = os.path.dirname(abspath)
+    env['TESTFILE'] = os.path.basename(abspath)
+    p = subprocess.Popen([shell, '-'], bufsize=-1, stdin=subprocess.PIPE,
+                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+                         universal_newlines=True, env=env,
+                         preexec_fn=makeresetsigpipe(),
+                         close_fds=os.name == 'posix')
+    salt = 'CRAM%s' % time.time()
+
+    after = {}
+    refout, postout = [], []
+    i = pos = prepos = -1
+    stdin = []
+    for i, line in enumerate(f):
+        refout.append(line)
+        if line.startswith(cmdline):
+            after.setdefault(pos, []).append(line)
+            prepos = pos
+            pos = i
+            stdin.append('echo "\n%s %s $?"\n' % (salt, i))
+            stdin.append(line[len(cmdline):])
+        elif line.startswith(conline):
+            after.setdefault(prepos, []).append(line)
+            stdin.append(line[len(conline):])
+        elif not line.startswith(indent):
+            after.setdefault(pos, []).append(line)
+    stdin.append('echo "\n%s %s $?"\n' % (salt, i + 1))
+
+    output = p.communicate(input=''.join(stdin))[0]
+    if p.returncode == 80:
+        return (refout, None, [])
+
+    # Add a trailing newline to the input script if it's missing.
+    if refout and not refout[-1].endswith('\n'):
+        refout[-1] += '\n'
+
+    # We use str.split instead of splitlines to get consistent
+    # behavior between Python 2 and 3. In 3, we use unicode strings,
+    # which has more line breaks than \n and \r.
+    pos = -1
+    ret = 0
+    for i, line in enumerate(output[:-1].split('\n')):
+        line += '\n'
+        if line.startswith(salt):
+            presalt = postout.pop()
+            if presalt != '%s\n' % indent:
+                postout.append(presalt[:-1] + ' (no-eol)\n')
+            ret = int(line.split()[2])
+            if ret != 0:
+                postout.append('%s[%s]\n' % (indent, ret))
+            postout += after.pop(pos, [])
+            pos = int(line.split()[1])
+        else:
+            if needescape(line):
+                line = escape(line)
+            postout.append(indent + line)
+    postout += after.pop(pos, [])
+
+    diffpath = os.path.basename(abspath)
+    diff = unified_diff(refout, postout, diffpath, diffpath + '.err')
+    for firstline in diff:
+        return refout, postout, itertools.chain([firstline], diff)
+    return refout, postout, []
+
+def prompt(question, answers, auto=None):
+    """Write a prompt to stdout and ask for answer in stdin.
+
+    answers should be a string, with each character a single
+    answer. An uppercase letter is considered the default answer.
+
+    If an invalid answer is given, this asks again until it gets a
+    valid one.
+
+    If auto is set, the question is answered automatically with the
+    specified value.
+    """
+    default = [c for c in answers if c.isupper()]
+    while True:
+        sys.stdout.write('%s [%s] ' % (question, answers))
+        sys.stdout.flush()
+        if auto is not None:
+            sys.stdout.write(auto + '\n')
+            sys.stdout.flush()
+            return auto
+
+        answer = sys.stdin.readline().strip().lower()
+        if not answer and default:
+            return default[0]
+        elif answer and answer in answers.lower():
+            return answer
+
+def log(msg=None, verbosemsg=None, verbose=False):
+    """Write msg to standard out and flush.
+
+    If verbose is True, write verbosemsg instead.
+    """
+    if verbose:
+        msg = verbosemsg
+    if msg:
+        sys.stdout.write(msg)
+        sys.stdout.flush()
+
+def patch(cmd, diff, path):
+    """Run echo [lines from diff] | cmd -p0"""
+    p = subprocess.Popen([cmd, '-p0'], bufsize=-1, stdin=subprocess.PIPE,
+                         universal_newlines=True,
+                         preexec_fn=makeresetsigpipe(),
+                         cwd=path,
+                         close_fds=os.name == 'posix')
+    p.communicate(''.join(diff))
+    return p.returncode == 0
+
+def run(paths, tmpdir, shell, quiet=False, verbose=False, patchcmd=None,
+        answer=None, indent=2):
+    """Run tests in paths in tmpdir.
+
+    If quiet is True, diffs aren't printed. If verbose is True,
+    filenames and status information are printed.
+
+    If patchcmd is set, a prompt is written to stdout asking if
+    changed output should be merged back into the original test. The
+    answer is read from stdin. If 'y', the test is patched using patch
+    based on the changed output.
+    """
+    cwd = os.getcwd()
+    seen = set()
+    basenames = set()
+    skipped = failed = 0
+    for i, path in enumerate(findtests(paths)):
+        abspath = os.path.abspath(path)
+        if abspath in seen:
+            continue
+        seen.add(abspath)
+
+        log(None, '%s: ' % path, verbose)
+        if not os.stat(abspath).st_size:
+            skipped += 1
+            log('s', 'empty\n', verbose)
+        else:
+            basename = os.path.basename(path)
+            if basename in basenames:
+                basename = '%s-%s' % (basename, i)
+            else:
+                basenames.add(basename)
+            testdir = os.path.join(tmpdir, basename)
+            os.mkdir(testdir)
+            try:
+                os.chdir(testdir)
+                refout, postout, diff = test(abspath, shell, indent)
+            finally:
+                os.chdir(cwd)
+
+            errpath = abspath + '.err'
+            if postout is None:
+                skipped += 1
+                log('s', 'skipped\n', verbose)
+            elif not diff:
+                log('.', 'passed\n', verbose)
+                if os.path.exists(errpath):
+                    os.remove(errpath)
+            else:
+                failed += 1
+                log('!', 'failed\n', verbose)
+                if not quiet:
+                    log('\n', None, verbose)
+                errfile = open(errpath, 'w')
+                try:
+                    for line in postout:
+                        errfile.write(line)
+                finally:
+                    errfile.close()
+                if not quiet:
+                    if patchcmd:
+                        diff = list(diff)
+                    for line in diff:
+                        log(line)
+                    if (patchcmd and
+                        prompt('Accept this change?', 'yN', answer) == 'y'):
+                        if patch(patchcmd, diff, os.path.dirname(abspath)):
+                            log(None, '%s: merged output\n' % path, verbose)
+                            os.remove(errpath)
+                        else:
+                            log('%s: merge failed\n' % path)
+    log('\n', None, verbose)
+    log('# Ran %s tests, %s skipped, %s failed.\n'
+        % (len(seen), skipped, failed))
+    return bool(failed)
+
+def which(cmd):
+    """Return the patch to cmd or None if not found"""
+    for p in os.environ['PATH'].split(os.pathsep):
+        path = os.path.join(p, cmd)
+        if os.path.isfile(path) and os.access(path, os.X_OK):
+            return os.path.abspath(path)
+    return None
+
+def expandpath(path):
+    """Expands ~ and environment variables in path"""
+    return os.path.expanduser(os.path.expandvars(path))
+
+class OptionParser(optparse.OptionParser):
+    """Like optparse.OptionParser, but supports setting values through
+    CRAM= and .cramrc."""
+
+    def __init__(self, *args, **kwargs):
+        self._config_opts = {}
+        optparse.OptionParser.__init__(self, *args, **kwargs)
+
+    def add_option(self, *args, **kwargs):
+        option = optparse.OptionParser.add_option(self, *args, **kwargs)
+        if option.dest and option.dest != 'version':
+            key = option.dest.replace('_', '-')
+            self._config_opts[key] = option.action == 'store_true'
+        return option
+
+    def parse_args(self, args=None, values=None):
+        config = configparser.RawConfigParser()
+        config.read(expandpath(os.environ.get('CRAMRC', '.cramrc')))
+        defaults = {}
+        for key, isbool in self._config_opts.items():
+            try:
+                if isbool:
+                    try:
+                        value = config.getboolean('cram', key)
+                    except ValueError:
+                        value = config.get('cram', key)
+                        self.error('--%s: invalid boolean value: %r'
+                                   % (key, value))
+                else:
+                    value = config.get('cram', key)
+            except (configparser.NoSectionError, configparser.NoOptionError):
+                pass
+            else:
+                defaults[key] = value
+        self.set_defaults(**defaults)
+
+        eargs = os.environ.get('CRAM', '').strip()
+        if eargs:
+            import shlex
+            args = args or []
+            args += shlex.split(eargs)
+
+        try:
+            return optparse.OptionParser.parse_args(self, args, values)
+        except optparse.OptionValueError:
+            self.error(str(sys.exc_info()[1]))
+
+def main(args):
+    """Main entry point.
+
+    args should not contain the script name.
+    """
+    p = OptionParser(usage='cram [OPTIONS] TESTS...', prog='cram')
+    p.add_option('-V', '--version', action='store_true',
+                 help='show version information and exit')
+    p.add_option('-q', '--quiet', action='store_true',
+                 help="don't print diffs")
+    p.add_option('-v', '--verbose', action='store_true',
+                 help='show filenames and test status')
+    p.add_option('-i', '--interactive', action='store_true',
+                 help='interactively merge changed test output')
+    p.add_option('-y', '--yes', action='store_true',
+                 help='answer yes to all questions')
+    p.add_option('-n', '--no', action='store_true',
+                 help='answer no to all questions')
+    p.add_option('-E', '--preserve-env', action='store_true',
+                 help="don't reset common environment variables")
+    p.add_option('--keep-tmpdir', action='store_true',
+                 help='keep temporary directories')
+    p.add_option('--shell', action='store', default='/bin/sh', metavar='PATH',
+                 help='shell to use for running tests')
+    p.add_option('--indent', action='store', default=2, metavar='NUM',
+                 type='int', help='number of spaces to use for indentation')
+    opts, paths = p.parse_args(args)
+
+    if opts.version:
+        sys.stdout.write("""Cram CLI testing framework (version 0.6)
+
+Copyright (C) 2010-2011 Brodie Rao <brodie@bitheap.org> and others
+This is free software; see the source for copying conditions. There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+""")
+        return
+
+    conflicts = [('-y', opts.yes, '-n', opts.no),
+                 ('-q', opts.quiet, '-i', opts.interactive)]
+    for s1, o1, s2, o2 in conflicts:
+        if o1 and o2:
+            sys.stderr.write('options %s and %s are mutually exclusive\n'
+                             % (s1, s2))
+            return 2
+
+    patchcmd = None
+    if opts.interactive:
+        patchcmd = which('patch')
+        if not patchcmd:
+            sys.stderr.write('patch(1) required for -i\n')
+            return 2
+
+    if not paths:
+        sys.stdout.write(p.get_usage())
+        return 2
+
+    badpaths = [path for path in paths if not os.path.exists(path)]
+    if badpaths:
+        sys.stderr.write('no such file: %s\n' % badpaths[0])
+        return 2
+
+    tmpdir = os.environ['CRAMTMP'] = tempfile.mkdtemp('', 'cramtests-')
+    proctmp = os.path.join(tmpdir, 'tmp')
+    os.mkdir(proctmp)
+    for s in ('TMPDIR', 'TEMP', 'TMP'):
+        os.environ[s] = proctmp
+
+    if not opts.preserve_env:
+        for s in ('LANG', 'LC_ALL', 'LANGUAGE'):
+            os.environ[s] = 'C'
+        os.environ['TZ'] = 'GMT'
+        os.environ['CDPATH'] = ''
+        os.environ['COLUMNS'] = '80'
+        os.environ['GREP_OPTIONS'] = ''
+
+    if opts.yes:
+        answer = 'y'
+    elif opts.no:
+        answer = 'n'
+    else:
+        answer = None
+
+    try:
+        return run(paths, tmpdir, opts.shell, opts.quiet, opts.verbose,
+                   patchcmd, answer, opts.indent)
+    finally:
+        if opts.keep_tmpdir:
+            log('# Kept temporary directory: %s\n' % tmpdir)
+        else:
+            shutil.rmtree(tmpdir)
+
+if __name__ == '__main__':
+    try:
+        sys.exit(main(sys.argv[1:]))
+    except KeyboardInterrupt:
+        pass
diff --git a/tests/scripts/generate_data.py b/tests/scripts/generate_data.py

new file mode 100755 (executable)

index 0000000..ac28dbb
--- /dev/null
+++ b/tests/scripts/generate_data.py
@@ -0,0 +1,164 @@
+#!/usr/bin/python
+
+from __future__ import print_function
+
+import os, shutil, sys
+import StringIO
+
+fastaSeq_1 = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG"""
+
+fastaSeq_2 = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAAC"""
+
+fastaSeq_3 = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCT"""
+
+# file creation decorator
+def fileMaker(func):
+    def inner(*args, **kwargs):
+        print(" - Creating file: %s..." % args[1], end='')
+        sys.stdout.flush()
+        retval = func(*args)
+        print("done.")
+        sys.stdout.flush()
+        return retval
+    return inner
+
+# symlink creation decorator
+def fileLinker(func):
+    def inner(*args, **kwargs):
+        print(" - Creating symlink: %s..." % args[1], end='')
+        sys.stdout.flush()
+        retval = func(*args)
+        print("done.")
+        sys.stdout.flush()
+        return retval
+    return inner
+
+# return a copy of original, minues any lines that contain an entry in blacklist
+def trimXmlElements(original, blacklist):
+    out = StringIO.StringIO()
+    for line in original.splitlines():
+        if all(x not in line for x in blacklist):
+            out.write(line + '\n')
+    result = out.getvalue()
+    out.close()
+    return result
+
+class TestDataGenerator:
+
+    def __init__(self, source, dest):
+
+        # source/destination directories
+        self.testDataDir      = source
+        self.generatedDataDir = dest
+
+        # generated output files/symlinks & 'maker' functions
+        self.outputFiles = {
+            'truncated.bam' : self.makeTruncatedBam,
+            'chunking_emptyfilters.subreadset.xml'   : self.makeChunkingXml,
+            'chunking_missingfilters.subreadset.xml' : self.makeChunkingXml,
+            'normal.fa' : self.makeNormalFasta
+        }
+        self.outputSymlinks = {
+            'aligned.bam'      : self.makeAlignedBamCopy,
+            'aligned.bam.bai'  : self.makeAlignedBamCopy,
+            'aligned.bam.pbi'  : self.makeAlignedBamCopy,
+            'aligned2.bam'     : self.makeAlignedBamCopy,
+            'aligned2.bam.bai' : self.makeAlignedBamCopy,
+            'aligned2.bam.pbi' : self.makeAlignedBamCopy,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam'     : self.makeChunkingSymlink,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi' : self.makeChunkingSymlink,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam'     : self.makeChunkingSymlink,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi' : self.makeChunkingSymlink,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam'     : self.makeChunkingSymlink,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi' : self.makeChunkingSymlink,
+            'missing_pbi.bam' : self.makeMissingPbiBam,
+        }
+
+    def editChunkingXml(self, outputFn, removeFiltersNode):
+        inputXmlFn  = os.path.join(self.testDataDir,'chunking','chunking.subreadset.xml')
+        outputXmlFn = os.path.join(self.generatedDataDir,outputFn)
+
+        blacklist = ['pbds:Filter>', 'pbbase:Properties>', '<pbbase:Property']
+        if removeFiltersNode:
+            blacklist.append('pbds:Filters>')
+
+        inputXml = ''
+        with open(inputXmlFn, 'r') as xml_infile:
+            inputXml = xml_infile.read()
+        outputXml = trimXmlElements(inputXml, blacklist)
+        with open(outputXmlFn, 'w') as xml_outfile:
+            xml_outfile.write(outputXml)
+
+    @fileLinker
+    def makeAlignedBamCopy(self, outputFn):
+        source = os.path.join(self.testDataDir,outputFn)
+        dest   = os.path.join(self.generatedDataDir, outputFn)
+        os.symlink(source, dest)
+
+    @fileLinker
+    def makeChunkingSymlink(self, outputFn):
+        source = os.path.join(self.testDataDir,'chunking', outputFn)
+        dest   = os.path.join(self.generatedDataDir, outputFn)
+        os.symlink(source, dest)
+  
+    @fileLinker
+    def makeMissingPbiBam(self, outputFn):
+        source = os.path.join(self.testDataDir, 'phi29.bam')
+        dest   = os.path.join(self.generatedDataDir, outputFn)
+        os.symlink(source, dest)
+
+    @fileMaker
+    def makeChunkingXml(self, outputFn):
+        if outputFn == 'chunking_emptyfilters.subreadset.xml':
+            removeFiltersNode = False
+        else:
+            removeFiltersNode = True
+        self.editChunkingXml(outputFn, removeFiltersNode)
+
+    @fileMaker
+    def makeNormalFasta(self, outputFn):
+        content = ">1\n" + fastaSeq_1 + "\n>2\n" + fastaSeq_2 + "\n>3\n" + fastaSeq_3
+        dest = os.path.join(self.generatedDataDir, outputFn)
+        with open(outputFn, 'w') as fasta_out:
+            fasta_out.write(content)
+
+    @fileMaker
+    def makeTruncatedBam(self, outputFn):
+        source = os.path.join(self.testDataDir, 'phi29.bam')
+        dest   = os.path.join(self.generatedDataDir, outputFn)
+        shutil.copyfile(source, dest)
+        with open(dest, 'r+b') as in_file:
+            in_file.truncate(200)
+
+    # main entry point
+    def generate(self):
+
+        # skip file if it exists
+        os.chdir(self.generatedDataDir)
+        filenames = self.outputFiles.keys()
+        for file in filenames:
+            if os.path.exists(file) :
+                del self.outputFiles[file]
+
+        # skip symlink if it exists
+        symlinks = self.outputSymlinks.keys()
+        for link in symlinks:
+            if os.path.lexists(link):
+                del self.outputSymlinks[link]
+
+        # only print message & run makers, if any files/symlinks to be created
+        # else silent success
+        if self.outputFiles or self.outputSymlinks:
+            print('Generating test data in %s ' % self.generatedDataDir)
+            for file, func in self.outputFiles.iteritems():
+                func(file)
+            for link, func in self.outputSymlinks.iteritems():
+                func(link)
+
+# script entry point
+if __name__ == '__main__':
+    g = TestDataGenerator(sys.argv[1], sys.argv[2])
+    g.generate()
diff --git a/tests/src/CSharp/CheckSWIG.cs b/tests/src/CSharp/CheckSWIG.cs

new file mode 100644 (file)

index 0000000..d748774
--- /dev/null
+++ b/tests/src/CSharp/CheckSWIG.cs
@@ -0,0 +1,15 @@
+
+
+using PacBio.BAM;
+
+public class CheckSWIG
+{
+   public static void Main()
+   {
+       var header = new BamHeader();
+       header.ToSam();
+       System.Console.WriteLine("");
+       System.Console.WriteLine("pbbam SWIG binding to C# worked!");
+       System.Console.WriteLine("");
+   }
+}
diff --git a/tests/src/CSharp/TestPbbam.cs.in b/tests/src/CSharp/TestPbbam.cs.in

new file mode 100644 (file)

index 0000000..2913fc6
--- /dev/null
+++ b/tests/src/CSharp/TestPbbam.cs.in
@@ -0,0 +1,134 @@
+#pragma warning disable 168, 219
+
+using System;
+using System.IO;
+using System.Linq;
+using System.Collections;
+using System.Collections.Generic;
+using System.Reflection;
+
+using PacBio.BAM;
+
+namespace TestStuff
+{
+    //
+    // This approach is the best we can do for now, without requiring nunit.
+    //
+    public class TestPbbam
+    {
+        public static readonly string DATA_DIR = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location) + "/TestData";
+        public static readonly string BAM_FILENAME_1 = Path.Combine(DATA_DIR, "test_group_query", "test1.bam");
+        public static readonly string BAM_FILENAME_2 = Path.Combine(DATA_DIR, "test_group_query", "test2.bam");
+        public static readonly string STITCHING_FN_1 = Path.Combine(DATA_DIR, "polymerase", "production.subreads.bam");
+        public static readonly string STITCHING_FN_2 = Path.Combine(DATA_DIR, "polymerase", "production.scraps.bam");
+        public static readonly string FASTA_FILENAME = Path.Combine(DATA_DIR, "lambdaNEB.fa");
+
+        public TestPbbam ()
+        {
+        }
+
+        public static void TestExceptions()
+        {
+            try
+            {
+                var badFile = new BamFile("nonexistent.bam");
+            }
+            catch (Exception e)
+            {
+                //Console.Write(e.ToString());
+                Console.WriteLine("Exceptions - OK!");
+                return;
+            }
+            throw new Exception("doh!");
+        }
+
+
+        public static void TestCigar()
+        {
+            string s = "3=3I3D";
+            var c = new CigarType(s);
+            string cs = c.ToStdString();
+            if (s != cs)
+            {
+                throw new Exception("Cigar not working!");
+            }
+
+            // This used to crash
+            var c2 = CigarType.FromStdString("3=3I3D");
+
+            Console.WriteLine("TestCigar - OK!");
+        }
+
+        public static void TestBamFileEnumeration()
+        {
+            var bf = new BamFile(BAM_FILENAME_2);
+            var q = new EntireFileQuery(new DataSet(bf));
+
+            if (q.Count() !=  4)
+            {
+                throw new Exception("Enumeration not working!");
+            }
+
+            Console.WriteLine("TesBamFileEnumeration - OK!");
+        }
+
+        public static void TestIndexedFasta()
+        {
+            var f = new IndexedFastaReader(FASTA_FILENAME);
+            bool check = (f.NumSequences() == 1  &&
+                          f.HasSequence("lambda_NEB3011") &&
+                          f.Subsequence("lambda_NEB3011:0-10") == "GGGCGGCGAC");
+            if (!check)
+            {
+                throw new Exception("Indexed FASTA files not working");
+            }
+
+            var b = new BamRecord();
+            var x = b.Impl();
+
+            Console.WriteLine("TestIndexedFasta - OK!");
+        }
+
+        public static void TestZmwQuery()
+        {
+            var d = new DataSet(BAM_FILENAME_2);
+            var q = new ZmwQuery(new IntList {1, 2, 3}, d);
+            var q2 = new ZmwQuery(new IntList { 14743 }, d);
+
+            if (0 != q.Count() || 4 != q2.Count())
+            {
+                throw new Exception("ZmwQuery not working");
+            }
+            Console.WriteLine("TestZmwQuery - OK!");
+        }
+        
+        public static void TestStitching()
+        {
+            var stitcher = new ZmwReadStitcher(STITCHING_FN_1, STITCHING_FN_2);
+            if (!stitcher.HasNext())
+            {
+                throw new Exception("Error stitching via ZmwReadStitcher");
+            }
+            var zmwRecord = stitcher.Next();
+            
+            Console.WriteLine("TestStitching - OK!");
+        }
+
+        public void RunAllTests()
+        {
+            TestExceptions();
+            TestCigar();
+            TestBamFileEnumeration();
+            TestIndexedFasta();
+            TestZmwQuery();
+            TestStitching();
+        }
+
+        public static void Main()
+        {
+            var t = new TestPbbam();
+            t.RunAllTests();
+        }
+
+    }
+}
diff --git a/tests/src/CSharp/buildAssembly.sh.in b/tests/src/CSharp/buildAssembly.sh.in

new file mode 100644 (file)

index 0000000..7e667b3
--- /dev/null
+++ b/tests/src/CSharp/buildAssembly.sh.in
@@ -0,0 +1,95 @@
+#!/bin/bash
+set -euo pipefail
+set -x
+
+
+# This is a temporary hack to build windows C# bindings, while I work on getting it
+# to work nicely through CMake --DHA
+
+# UNIX setup (before doing this!)
+#  - install swig, cmake, and mono
+#
+# Windows setup (before doing this!):
+#  - install msys2 (64-bit) from https://msys2.github.io/
+#  - install: (pacman -S [package-name]); in all cases use the mingw-w64 64bit version when
+#    available:
+#   + gcc, g++, make
+#   + zlib-dev
+#   + swig
+#   + cmake
+#  - for cmake, generate MSYS makefiles
+
+if [ ${WIN32} ]; then
+    PLATFORM="Windows"
+else
+    PLATFORM="Unix"
+fi
+
+MSBUILD=${CSHARP_PROJECT_BUILDER}
+CSC=${CSHARP_COMPILER}
+
+CSPROJ_ROOT=${PacBioBAM_CSharpLibDir}
+CSPROJ=${PacBioBAM_CSharpLibDir}/PacBio.BAM.csproj
+ASSEMBLY_ROOT=${PacBioBAM_CSharpLibDir}/bin/Debug
+
+# get expanded cmake generator expression ( $<TARGET:hts> or externally defined -DHTSLIB_LIBRARIES="") 
+# from cmd line
+EXPANDED_HTSLIB_LIBRARIES=("$@")
+
+#
+# Make the managed DLL
+#
+( cd $CSPROJ_ROOT; $MSBUILD $CSPROJ ) || { echo "Failed to build managed DLL" && exit -1; }
+
+#
+# Copy the dependency libs
+#
+cp ${PacBioBAM_LibDir}/libpbbam${CMAKE_SHARED_LIBRARY_SUFFIX}  $ASSEMBLY_ROOT
+cp "$EXPANDED_HTSLIB_LIBRARIES"                                $ASSEMBLY_ROOT  # Need "libhts*.dylib"
+
+if [ "$PLATFORM" == "Windows" ]
+then
+    # stuff we need to bundle on windows
+    cp /mingw64/bin/zlib1.dll                                 $ASSEMBLY_ROOT
+    cp /mingw64/bin/libwinpthread-1.dll                       $ASSEMBLY_ROOT
+    cp ${PacBioBAM_CSharpLibDir}/libPacBioBam.dll             $ASSEMBLY_ROOT/PacBioBam.dll
+else
+    # For UNIX this is .so, even Mac.  Not sure why.
+    cp ${PacBioBAM_CSharpLibDir}/libPacBioBam.so              $ASSEMBLY_ROOT
+#    cp ${HTSLIB_LIBRARIES_VERSIONED_LINK}                     $ASSEMBLY_ROOT  # Need "libhts*.dylib"
+fi
+
+# Bundle test data
+mkdir -p $ASSEMBLY_ROOT/TestData
+cp -rf ${PacBioBAM_TestsDir}/data/* $ASSEMBLY_ROOT/TestData
+
+#
+# Make the "check" program, which we need to put next to the assembly to
+# allow it to be resolved.
+#
+
+CHECK_SRC=${CSharpTestRootDir}/CheckSWIG.cs
+CHECK_BIN=$ASSEMBLY_ROOT/CheckSWIG.exe
+
+
+if [ "$PLATFORM" == "Windows" ]
+then
+    ( $CSC  /lib:$ASSEMBLY_ROOT //r:PacBio.BAM.dll /out:$CHECK_BIN $(cygpath -w $CHECK_SRC) && $CHECK_BIN)
+else
+    ( $CSC  /lib:$ASSEMBLY_ROOT /r:PacBio.BAM.dll /out:$CHECK_BIN $CHECK_SRC && cd $ASSEMBLY_ROOT && mono CheckSWIG.exe )
+fi
+
+
+#
+# Build the fuller test suite, and bundle 
+#
+TEST_SRC=${CSharpTestRootDir}/TestPbbam.cs
+TEST_BIN=$ASSEMBLY_ROOT/TestPbbam.exe
+
+
+if [ "$PLATFORM" == "Windows" ]
+then
+    ( $CSC /lib:$ASSEMBLY_ROOT //r:PacBio.BAM.dll /out:$TEST_BIN $(cygpath -w $TEST_SRC) && $TEST_BIN )
+else
+    ( $CSC /lib:$ASSEMBLY_ROOT /r:PacBio.BAM.dll /out:$TEST_BIN $TEST_SRC && cd $ASSEMBLY_ROOT && mono TestPbbam.exe )
+fi
diff --git a/tests/src/CSharp/check_swig.sh.in b/tests/src/CSharp/check_swig.sh.in

new file mode 100755 (executable)

index 0000000..ba07b75
--- /dev/null
+++ b/tests/src/CSharp/check_swig.sh.in
@@ -0,0 +1,18 @@
+#!/bin/sh
+(cd ${PacBioBAM_CSharpLibDir}; xbuild PacBio.BAM.csproj)
+
+${CSHARP_COMPILER} -lib:${PacBioBAM_CSharpLibDir}/bin/Debug -r:PacBio.BAM.dll CheckSWIG.cs
+
+#
+# For deployment these all need to be installed somewhere more sensible.
+# This is just a hack for testing if the build works.
+#
+LIBRARY_PATHS=\
+${PacBioBAM_CSharpLibDir}:\
+${PacBioBAM_LibDir}:\
+${Htslib_LibDir}
+
+DYLD_LIBRARY_PATH=$LIBRARY_PATHS \
+LD_LIBRARY_PATH=$LIBRARY_PATHS \
+MONO_PATH=${PacBioBAM_CSharpLibDir}/bin/Debug \
+mono CheckSWIG.exe
diff --git a/tests/src/R/check_swig.R.in b/tests/src/R/check_swig.R.in

new file mode 100644 (file)

index 0000000..13b07c4
--- /dev/null
+++ b/tests/src/R/check_swig.R.in
@@ -0,0 +1,58 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+# load PacBioBAM lib & wrapper script
+
+# htslib_libname <- "@Htslib_Libraries@"
+# dyn.load(htslib_libname)
+# # htslib_libname <- paste(htslib_libpath, "libhts", sep="/")
+# # dyn.load(paste(htslib_libname, ".dylib", sep=""))
+
+pbbam_libpath <- "@PacBioBAM_RLibDir@"
+pbbam_libname <- paste(pbbam_libpath, "PacBioBam",   sep="/")
+dyn.load(paste(pbbam_libname,  .Platform$dynlib.ext, sep=""))
+
+pbbam_wrapper <- paste(pbbam_libpath, "PacBioBam.R", sep="/")
+source(pbbam_wrapper)
+
+cacheMetaData(1)
+
+h <- BamHeader() 
+
+message = "\nR Wrapper OK.\n"
+cat(message)
+cat("\n")
diff --git a/tests/src/R/test_pbbam.R b/tests/src/R/test_pbbam.R

new file mode 100644 (file)

index 0000000..55c71a8
--- /dev/null
+++ b/tests/src/R/test_pbbam.R
@@ -0,0 +1,23 @@
+
+# usage: R [args] < test_pbbam.R --args <tests_scripts_dir> <PacBioBam_lib_dir> <test_data_dir>
+args <- commandArgs(TRUE)
+tests_path     <- args[1]
+lib_path       <- args[2]
+test_data_path <- args[3]
+
+# load PacBioBAM lib & wrapper script
+pbbam_libname <- paste(lib_path, "PacBioBam",   sep="/")
+pbbam_wrapper <- paste(lib_path, "PacBioBam.R", sep="/")
+dyn.load(paste(pbbam_libname, .Platform$dynlib.ext, sep=""))
+source(pbbam_wrapper)
+cacheMetaData(1)
+
+# init test utils & run test cases
+source(paste(tests_path, "utils.R", sep="/"))
+run_test_suite(tests_path)
+
+# print results & exit
+results <- test_suite_results()
+results$print_summary()
+if (results$any_failed())
+       quit(status=1)
diff --git a/tests/src/R/test_pbbam.sh.in b/tests/src/R/test_pbbam.sh.in

new file mode 100644 (file)

index 0000000..458b149
--- /dev/null
+++ b/tests/src/R/test_pbbam.sh.in
@@ -0,0 +1,54 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+#! /usr/bin/sh
+
+GENERATED_BAM=@PacBioBAM_TestsDir@/data/generated.bam
+
+touch $GENERATED_BAM
+chmod 644 $GENERATED_BAM
+
+R --slave --no-save < @RTestRootDir@/test_pbbam.R --args \
+       @RTestRootDir@/tests \
+       @PacBioBAM_RLibDir@ \
+       @PacBioBAM_TestsDir@/data
+    
+STATUS=$?
+       
+rm $GENERATED_BAM
+
+exit $STATUS
+\ No newline at end of file
diff --git a/tests/src/R/tests/test_Accuracy.R b/tests/src/R/tests/test_Accuracy.R

new file mode 100644 (file)

index 0000000..e7e98e6
--- /dev/null
+++ b/tests/src/R/tests/test_Accuracy.R
@@ -0,0 +1,62 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+test_case("Accuracy_Clamp", {
+       
+    a_zero     <- Accuracy(0.0)
+    a_neg      <- Accuracy(-0.5)
+    a_min      <- Accuracy(0.0)
+    a_normal   <- Accuracy(0.9)
+    a_max      <- Accuracy(1.0)
+    a_tooLarge <- Accuracy(1.1)
+       
+    tolerance = 1e-5
+
+    assertTrue( abs(0.0 - a_zero$ToFloat())     <= tolerance )
+    assertTrue( abs(0.0 - a_neg$ToFloat())      <= tolerance )
+    assertTrue( abs(0.0 - a_min$ToFloat())      <= tolerance )
+    assertTrue( abs(0.9 - a_normal$ToFloat())   <= tolerance )
+    assertTrue( abs(1.0 - a_max$ToFloat())      <= tolerance )
+    assertTrue( abs(1.0 - a_tooLarge$ToFloat()) <= tolerance )
+    
+    # assertEqual(0.0, a_zero$ToFloat())
+    # assertEqual(0.0, a_neg$ToFloat())
+    # assertEqual(0.0, a_min$ToFloat())
+    # assertEqual(0.9, a_normal$ToFloat())
+    # assertEqual(1.0, a_max$ToFloat())
+    # assertEqual(1.0, a_tooLarge$ToFloat())
+})
diff --git a/tests/src/R/tests/test_BamFile.R b/tests/src/R/tests/test_BamFile.R

new file mode 100644 (file)

index 0000000..a53b243
--- /dev/null
+++ b/tests/src/R/tests/test_BamFile.R
@@ -0,0 +1,76 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+test_case("BamFile_NonExistentFile", {
+       result <- tryCatch(
+               {
+                       f <- BamFile("does_not_exist.bam")
+                       assertTrue(FALSE) # should have thrown
+                       invisible()
+               },
+               warning = function(w) {
+                       assertTrue(TRUE)
+                       invisible()
+               },
+               error = function(e) {
+                       assertTrue(TRUE) 
+                       invisible()
+               }
+       )
+       return(result)
+})
+
+test_case("BamFile_Ctor", {
+       
+        fn <- paste(test_data_path, "aligned.bam", sep="/")
+       
+       result <- tryCatch(
+               {
+                       f <- BamFile(fn)
+                       invisible()
+               },
+               warning = function(w) {
+                       assertTrue(FALSE)
+                       invisible()
+               },
+               error = function(e) {
+                       assertTrue(FALSE)    # should not throw
+                       invisible()
+               }
+       )
+       return(result)
+})
diff --git a/tests/src/R/tests/test_BamHeader.R b/tests/src/R/tests/test_BamHeader.R

new file mode 100644 (file)

index 0000000..cd2716c
--- /dev/null
+++ b/tests/src/R/tests/test_BamHeader.R
@@ -0,0 +1,193 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+empty_program <- function(header) {
+       result <- tryCatch(
+               {
+                       pg <- header$Program("foo")
+                       assertTrue(FALSE) # should have thrown 
+                       invisible()
+               },
+               warning = function(w) {
+                       assertTrue(TRUE)
+                       invisible()
+               }
+       )
+       return(result)
+}
+
+empty_readgroup <- function(header) {
+       result <- tryCatch(
+               {
+                       pg <- header$ReadGroup("foo")
+                       assertTrue(FALSE) # should have thrown 
+                       invisible()
+               },
+               warning = function(w) {
+                       assertTrue(TRUE)
+                       invisible()
+               }
+       )
+       return(result)
+}
+
+empty_sequenceid <- function(header) {
+       result <- tryCatch(
+               {
+                       pg <- header$SequenceId("foo")
+                       assertTrue(FALSE) # should have thrown 
+                       invisible()
+               },
+               warning = function(w) {
+                       assertTrue(TRUE)
+                       invisible()
+               }
+       )
+       return(result)
+}
+
+test_case("BamHeader_Defaults", {
+       
+    header <- BamHeader()
+       
+       assertEqual(0L, nchar(header$Version()))
+       assertEqual(0L, nchar(header$SortOrder()))
+       assertTrue(header$ReadGroups()$empty())
+       assertTrue(header$Sequences()$empty())
+       assertTrue(header$Programs()$empty())
+       
+       pg <- empty_program(header)
+       rg <- empty_readgroup(header)
+       id <- empty_sequenceid(header)
+       
+       # TODO: get comment fetching working
+       #assertEqual(1, length(header$Comments()))
+})
+
+test_case("BamHeader_Decode", { 
+       
+    text <- paste("@HD\tVN:1.1\tSO:queryname\tpb:3.0.1",
+                             "@SQ\tSN:chr1\tLN:2038\tSP:chocobo",
+                                 "@SQ\tSN:chr2\tLN:3042\tSP:chocobo",
+                                 "@RG\tID:rg1\tSM:control",
+                                 "@RG\tID:rg2\tSM:condition1",
+                                 "@RG\tID:rg3\tSM:condition1",
+                                 "@PG\tID:_foo_\tPN:ide",
+                                 "@CO\tipsum and so on",
+                                 "@CO\tcitation needed",
+                                 sep="\n"
+                    )
+                               
+       header <- BamHeader(text)
+       
+       assertEqual("1.1",       header$Version())
+       assertEqual("queryname", header$SortOrder())
+       assertEqual("3.0.1",     header$PacBioBamVersion())
+
+       assertEqual(3L, header$ReadGroups()$size())
+       assertTrue(header$HasReadGroup("rg1"))
+       assertTrue(header$HasReadGroup("rg2"))
+       assertTrue(header$HasReadGroup("rg3"))
+       assertEqual("control",    header$ReadGroup("rg1")$Sample())
+       assertEqual("condition1", header$ReadGroup("rg2")$Sample())
+       assertEqual("condition1", header$ReadGroup("rg3")$Sample())
+       
+       assertEqual(2L, header$Sequences()$size())
+       assertTrue(header$HasSequence("chr1"))
+       assertTrue(header$HasSequence("chr2"))
+       assertEqual("chocobo", header$Sequence("chr1")$Species())
+       assertEqual("chocobo", header$Sequence("chr2")$Species())
+       assertEqual("2038",    header$Sequence("chr1")$Length())
+       assertEqual("3042",    header$Sequence("chr2")$Length())
+  
+       assertEqual(1L, header$Programs()$size())
+       assertTrue(header$HasProgram("_foo_"))
+       assertEqual("ide", header$Program("_foo_")$Name())
+
+       # TODO: get comment fetching working
+       # assertEqual(2, header$Comments()$size())
+       # assertEqual("ipsum and so on", header$Comments()[1])
+       # assertEqual("citation needed", header$Comments()[2])
+})
+       
+test_case("BamHeader_Encode", { 
+       
+    expectedText <- paste("@HD\tVN:1.1\tSO:queryname\tpb:3.0.1",
+                                     "@SQ\tSN:chr1\tLN:2038\tSP:chocobo",
+                                         "@SQ\tSN:chr2\tLN:3042\tSP:chocobo",
+                                          "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL",
+                                          "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL",
+                                          "@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL",
+                                         "@PG\tID:_foo_\tPN:ide",
+                                         "@CO\tipsum and so on",
+                                         "@CO\tcitation needed",
+                                                 "",
+                                         sep="\n"
+                            )
+
+       rg1 <- ReadGroupInfo("rg1")
+       rg1$Sample("control")
+       rg2 <- ReadGroupInfo("rg2")
+       rg2$Sample("condition1")
+       rg3 <- ReadGroupInfo("rg3")
+       rg3$Sample("condition1")
+
+       seq1 <- SequenceInfo("chr1")
+       seq1$Length("2038")
+       seq1$Species("chocobo")
+       seq2 <- SequenceInfo("chr2")
+       seq2$Length("3042")
+       seq2$Species("chocobo")
+    
+       prog1 <- ProgramInfo("_foo_")
+       prog1$Name("ide")
+
+       header <- BamHeader()
+       header$Version("1.1")
+       header$SortOrder("queryname")
+       header$PacBioBamVersion("3.0.1")
+       header$AddReadGroup(rg1)
+       header$AddReadGroup(rg2)
+       header$AddReadGroup(rg3)
+       header$AddSequence(seq1)
+       header$AddSequence(seq2)
+       header$AddProgram(prog1)
+       header$AddComment("ipsum and so on")
+       header$AddComment("citation needed")
+       
+       assertEqual(expectedText, header$ToSam())
+})
diff --git a/tests/src/R/tests/test_Cigar.R b/tests/src/R/tests/test_Cigar.R

new file mode 100644 (file)

index 0000000..dd3e544
--- /dev/null
+++ b/tests/src/R/tests/test_Cigar.R
@@ -0,0 +1,219 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+test_case("Cigar_TypeToChar", {
+    assertEqual('M', CigarOperation_TypeToChar('ALIGNMENT_MATCH'))
+       assertEqual('I', CigarOperation_TypeToChar('INSERTION'))
+       assertEqual('D', CigarOperation_TypeToChar('DELETION'))
+       assertEqual('N', CigarOperation_TypeToChar('REFERENCE_SKIP'))
+       assertEqual('S', CigarOperation_TypeToChar('SOFT_CLIP'))
+       assertEqual('H', CigarOperation_TypeToChar('HARD_CLIP'))
+       assertEqual('P', CigarOperation_TypeToChar('PADDING'))
+       assertEqual('=', CigarOperation_TypeToChar('SEQUENCE_MATCH'))
+       assertEqual('X', CigarOperation_TypeToChar('SEQUENCE_MISMATCH'))
+})
+
+test_case("Cigar_CharToType", {
+       
+    assertEqual('ALIGNMENT_MATCH',   CigarOperation_CharToType('M'))
+    assertEqual('INSERTION',         CigarOperation_CharToType('I'))
+    assertEqual('DELETION',          CigarOperation_CharToType('D'))
+    assertEqual('REFERENCE_SKIP',    CigarOperation_CharToType('N'))
+    assertEqual('SOFT_CLIP',         CigarOperation_CharToType('S'))
+    assertEqual('HARD_CLIP',         CigarOperation_CharToType('H'))
+    assertEqual('PADDING',           CigarOperation_CharToType('P'))
+    assertEqual('SEQUENCE_MATCH',    CigarOperation_CharToType('='))
+    assertEqual('SEQUENCE_MISMATCH', CigarOperation_CharToType('X'))
+})
+
+test_case("Cigar_SetType", {
+       
+    m = CigarOperation()
+    i = CigarOperation()
+    d = CigarOperation()
+    n = CigarOperation()
+    s = CigarOperation()
+    h = CigarOperation()
+    p = CigarOperation()
+    e = CigarOperation()
+    x = CigarOperation()
+    
+    m$Type('ALIGNMENT_MATCH')
+    i$Type('INSERTION')
+    d$Type('DELETION')
+    n$Type('REFERENCE_SKIP')
+    s$Type('SOFT_CLIP')
+    h$Type('HARD_CLIP')
+    p$Type('PADDING')
+    e$Type('SEQUENCE_MATCH')
+    x$Type('SEQUENCE_MISMATCH')
+    
+    assertEqual('M', m$Char())
+    assertEqual('I', i$Char())
+    assertEqual('D', d$Char())
+    assertEqual('N', n$Char())
+    assertEqual('S', s$Char())
+    assertEqual('H', h$Char())
+    assertEqual('P', p$Char())
+    assertEqual('=', e$Char())
+    assertEqual('X', x$Char())
+})
+
+test_case("Cigar_SetChar", {
+       
+    m = CigarOperation()
+    i = CigarOperation()
+    d = CigarOperation()
+    n = CigarOperation()
+    s = CigarOperation()
+    h = CigarOperation()
+    p = CigarOperation()
+    e = CigarOperation()
+    x = CigarOperation()
+
+    m$Char('M')
+    i$Char('I')
+    d$Char('D')
+    n$Char('N')
+    s$Char('S')
+    h$Char('H')
+    p$Char('P')
+    e$Char('=')
+    x$Char('X')
+
+    assertEqual('ALIGNMENT_MATCH',   m$Type())
+    assertEqual('INSERTION',         i$Type())
+    assertEqual('DELETION',          d$Type())
+    assertEqual('REFERENCE_SKIP',    n$Type())
+    assertEqual('SOFT_CLIP',         s$Type())
+    assertEqual('HARD_CLIP',         h$Type())
+    assertEqual('PADDING',           p$Type())
+    assertEqual('SEQUENCE_MATCH',    e$Type())
+    assertEqual('SEQUENCE_MISMATCH', x$Type())
+})
+
+test_case("Cigar_CigarOpCtors", {
+
+       c1 <- CigarOperation("S", 10)
+       c2 <- CigarOperation(CigarOperation_TypeToChar('SOFT_CLIP'), 10)
+
+       assertEqual('S', c1$Char())
+       assertEqual('S', c2$Char())
+       assertEqual('SOFT_CLIP', c1$Type())
+       assertEqual('SOFT_CLIP', c2$Type())
+       assertEqual(10L, c1$Length())
+       assertEqual(10L, c2$Length())
+})
+
+test_case("Cigar_FromEmptyString", {
+       
+       s <- ""
+       cigar <- Cigar(s)
+       assertEqual(0L, cigar$size())
+})
+
+test_case("Cigar_FromString", {
+       
+    singleCigarString <- "100="
+    multiCigarString  <- "100=2D34I6=6X6="
+    
+    singleCigar <- Cigar(singleCigarString)
+    multiCigar  <- Cigar(multiCigarString)
+    
+    assertEqual(1L, singleCigar$size())
+       
+       c <- singleCigar$front()
+       assertEqual('=',              c$Char())
+       assertEqual('SEQUENCE_MATCH', c$Type())
+       assertEqual(100L,             c$Length())
+
+    assertEqual(6L, multiCigar$size())
+       
+       # haven't quite figured out [ ] accessors via SWIG, 
+       # but this method does work w/ !ZERO!-based indices
+    op0 <- multiCigar$'__getitem__'(0) 
+    op1 <- multiCigar$'__getitem__'(1)
+    op2 <- multiCigar$'__getitem__'(2)
+    op3 <- multiCigar$'__getitem__'(3)
+    op4 <- multiCigar$'__getitem__'(4)
+    op5 <- multiCigar$'__getitem__'(5)
+    
+    assertEqual('=', op0$Char())
+    assertEqual('D', op1$Char())
+    assertEqual('I', op2$Char())
+    assertEqual('=', op3$Char())
+    assertEqual('X', op4$Char())
+    assertEqual('=', op5$Char())
+       assertEqual('SEQUENCE_MATCH',    op0$Type())
+       assertEqual('DELETION',          op1$Type())
+       assertEqual('INSERTION',         op2$Type())
+       assertEqual('SEQUENCE_MATCH',    op3$Type())
+    assertEqual('SEQUENCE_MISMATCH', op4$Type())
+    assertEqual('SEQUENCE_MATCH',    op5$Type())
+    assertEqual(100L, op0$Length())
+    assertEqual(2L,   op1$Length())
+    assertEqual(34L,  op2$Length())
+    assertEqual(6L,   op3$Length())
+    assertEqual(6L,   op4$Length())
+    assertEqual(6L,   op5$Length())
+})
+
+test_case("Cigar_ToEmptyString", {
+       
+       cigar <- Cigar()
+       assertEqual(0L, nchar(cigar$ToStdString())) # empty string is 1
+})
+
+test_case("Cigar_ToString", {
+       
+    singleCigarString <- "100="
+    multiCigarString  <- "100=2D34I6=6X6="
+    
+       singleCigar <- Cigar()
+       singleCigar$push_back( CigarOperation(CigarOperation_TypeToChar('SEQUENCE_MATCH'), 100) )
+
+       multiCigar <- Cigar()
+       multiCigar$push_back(CigarOperation('=', 100))
+       multiCigar$push_back(CigarOperation('D', 2))
+       multiCigar$push_back(CigarOperation('I', 34))
+       multiCigar$push_back(CigarOperation('=', 6))
+    multiCigar$push_back(CigarOperation('X', 6))
+    multiCigar$push_back(CigarOperation('=', 6))
+
+       assertEqual(singleCigarString, singleCigar$ToStdString())
+       assertEqual(multiCigarString,  multiCigar$ToStdString())
+})
diff --git a/tests/src/R/tests/test_EndToEnd.R b/tests/src/R/tests/test_EndToEnd.R

new file mode 100644 (file)

index 0000000..65f76b2
--- /dev/null
+++ b/tests/src/R/tests/test_EndToEnd.R
@@ -0,0 +1,105 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+originalNames <- function(inputFn, generatedFn) {
+       
+    result <- tryCatch(
+    {
+        file <- BamFile(inputFn)
+        writer <- BamWriter(generatedFn, file$Header())
+
+        ds <- DataSet(inputFn)
+        entireFile <- EntireFileQuery(ds)
+
+        names_in <- list()
+        iter <- entireFile$begin()
+        end <- entireFile$end()
+        while ( iter$'__ne__'(end) ) {
+            record <- iter$value()
+            names_in <- c(names_in, record$FullName())
+            writer$Write(record)
+            iter$incr()
+        }
+        writer$'delete_BamWriter'()
+        return(names_in)
+    },
+    error = function(e) {
+        assertEqual("why:", e$message)     # should not throw
+        return(list())
+    }
+    )
+    return(result)
+}
+
+generatedNames <- function(generatedFn) {
+       
+    result <- tryCatch(
+    {
+        ds <- DataSet(generatedFn)
+        entireFile <- EntireFileQuery(ds)
+
+        names_out <- list()
+        iter <- entireFile$begin()
+        end <- entireFile$end()
+        while ( iter$'__ne__'(end) ) {
+            names_out <- c(names_out, iter$FullName())
+            iter$incr()
+        }
+        return(names_out)
+    },
+    error = function(e) {
+        assertEqual("why:", e$message)     # should not throw
+        return(list())
+    }
+    )
+    return(result)
+}
+
+#test_case("EndToEnd_CopyFileAndReadBack", {
+#
+#    inputFn     <- paste(test_data_path, "aligned.bam", sep="/")
+#    generatedFn <- paste(test_data_path, "generated.bam", sep="/")
+#
+#    # loop over original file, store names, write to generated file
+#    names_in  <- originalNames(inputFn, generatedFn)
+#
+#    # read names from new file
+#    names_out <- generatedNames(generatedFn)
+#
+#    # ensure equal
+#    assertEqual(names_in, names_out)
+#})
diff --git a/tests/src/R/tests/test_Frames.R b/tests/src/R/tests/test_Frames.R

new file mode 100644 (file)

index 0000000..f9d6eb5
--- /dev/null
+++ b/tests/src/R/tests/test_Frames.R
@@ -0,0 +1,95 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTA,
+# SPECIA, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+testFrames_rawData <- list(
+    0,  8,  140, 0,   0,  7,   4,  0,  85, 2,
+    1,  3,  2,   10,  1,  20,  47, 10, 9,  60,
+    20, 3,  12,  5,   13, 165, 6,  14, 22, 12,
+    2,  4,  9,   218, 27, 3,   15, 2,  17, 2,
+    45, 24, 89,  10,  7,  1,   11, 15, 0,  7,
+    0,  28, 17,  12,  6,  10,  37, 0,  12, 52,
+    0,  7,  1,   14,  3,  26,  12, 0,  20, 17,
+    2,  13, 2,   9,   13, 7,   15, 29, 3,   6,
+    2,  1,  28,  10,  3,  14,  7,  1,  22, 1,
+    6,  6,  0,   19,  31, 6,   2,  14, 0,  0,
+    1000, 947, 948
+)
+
+encodedFrames_rawData <- list(
+    0,     8,  102,   0,   0,   7,   4,   0,  75,   2,   1,   3,   2,
+    10,    1,   20,  47,  10,   9,  60,  20,   3,  12,   5,  13, 115,
+    6,    14,   22,  12,   2,   4,   9, 135,  27,   3,  15,   2,  17,
+    2,    45,   24,  77,  10,   7,   1,  11,  15,   0,   7,   0,  28,
+    17,   12,    6,  10,  37,   0,  12,  52,   0,   7,   1,  14,   3,
+    26,   12,    0,  20,  17,   2,  13,   2,   9,  13,   7,  15,  29,
+    3,     6,    2,   1,  28,  10,   3,  14,   7,   1,  22,   1,   6,
+    6,     0,   19,  31,   6,   2,  14,   0,   0,
+    255, 254,  255
+)
+
+testFrames <- UInt16List()
+for (x in testFrames_rawData) 
+       testFrames$push_back(x)
+
+encodedFrames <- UInt16List()
+for (x in encodedFrames_rawData) 
+       encodedFrames$push_back(x)
+
+test_case("Frames_Basic", {
+       
+       f <- Frames()
+       assertEqual(0L, f$Data()$size())
+       
+       f2 <- Frames(testFrames)
+       d <- f2$Data()
+       assertEqual(length(testFrames), length(d))
+       
+       numFrames <- length(testFrames)
+       for (i in 1:numFrames) 
+               assertEqual(testFrames[i], d[i])
+})
+
+test_case("Frames_Downsample", {
+       
+       f <- Frames(testFrames)
+       d <- f$Data()
+       assertEqual(length(encodedFrames), length(d))
+       
+       numFrames <- length(encodedFrames)
+       for (i in 1:numFrames)
+               assertEqual(encodedFrames[i], d[i])
+})
diff --git a/tests/src/R/tests/test_Intervals.R b/tests/src/R/tests/test_Intervals.R

new file mode 100644 (file)

index 0000000..0071750
--- /dev/null
+++ b/tests/src/R/tests/test_Intervals.R
@@ -0,0 +1,340 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+       
+test_case("Intervals_UnmappedPosition", { 
+       assertEqual(-1L, UnmappedPosition())
+})
+
+test_case("Intervals_Ctors", { 
+       
+       empty  <- PositionInterval()
+       single <- PositionInterval(4)
+       normal <- PositionInterval(5, 8)
+
+    assertEqual(0L, empty$Start())
+    assertEqual(0L, empty$Stop())
+    assertEqual(4L, single$Start())
+    assertEqual(5L, single$Stop())
+    assertEqual(5L, normal$Start())
+    assertEqual(8L, normal$Stop())
+})
+
+test_case("Intervals_Equality", { 
+       
+    empty           <- PositionInterval()
+    empty2          <- PositionInterval()
+    singleton       <- PositionInterval(4)
+    sameAsSingleton <- PositionInterval(4, 5)
+    normal          <- PositionInterval(5, 8)
+    sameAsNormal    <- PositionInterval(5, 8)
+    different       <- PositionInterval(20, 40)
+    
+    # self-equality
+    assertEqual(empty,     empty)
+    assertEqual(singleton, singleton)
+    assertEqual(normal,    normal)
+    assertEqual(different, different)
+
+    # same values
+       # TODO: fix this to work with == or *anything* cleaner
+       assertTrue(empty$'__eq__'(empty2))
+       assertTrue(singleton$'__eq__'(sameAsSingleton))
+       assertTrue(normal$'__eq__'(sameAsNormal))
+       
+    # different values
+    assertNotEqual(empty,     singleton)
+    assertNotEqual(empty,     normal)
+    assertNotEqual(empty,     different)
+    assertNotEqual(singleton, normal)
+    assertNotEqual(normal,    different)
+})
+
+test_case("Intervals_Copy", { 
+       
+    interval1 <- PositionInterval(5,8)
+    interval2 <- PositionInterval(interval1)
+    interval3 <- interval1
+
+       # TODO: fix this to work with == or *anything* cleaner
+       assertTrue(interval1$'__eq__'(interval1))
+       assertTrue(interval1$'__eq__'(interval2))
+       assertTrue(interval1$'__eq__'(interval3))
+})
+
+test_case("Intervals_Modifiers", { 
+       
+       interval1 <- PositionInterval(5,8)
+    interval2 <- PositionInterval(interval1)
+    interval2$Start(2)
+    interval2$Stop(10)
+        
+    assertNotEqual(interval1, interval2)
+    assertEqual(2L,  interval2$Start())
+    assertEqual(10L, interval2$Stop())
+})
+
+test_case("Intervals_Cover", { 
+       
+    a <- PositionInterval(2,4)
+    b <- PositionInterval(3,5)
+    c <- PositionInterval(6,8)
+    d <- PositionInterval(1,7)
+    e <- PositionInterval(5,8)
+    
+    #   0123456789  
+    # a   --
+    # b    --
+    # c       --
+    # d  ------
+    # e      ---
+
+    # self-cover
+    assertTrue(a$Covers(a)) 
+    assertTrue(a$CoveredBy(a))
+
+    # basic covers/covered
+    assertTrue(b$CoveredBy(d))
+    assertTrue(d$Covers(b))
+    assertNotEqual(b, d)
+    assertFalse(b$Covers(d))
+
+    # completely disjoint
+    assertFalse(b$Covers(c))
+    assertFalse(c$Covers(b))
+    assertFalse(b$CoveredBy(c))
+    assertFalse(c$CoveredBy(b))
+    
+    # b.stop == e.start
+    assertFalse(b$Covers(e))
+    assertFalse(b$CoveredBy(e))
+
+    # shared endpoint, start contained
+    assertTrue(e$Covers(c))
+    assertTrue(c$CoveredBy(e))
+})
+
+test_case("Intervals_Intersect", { 
+       
+    a <- PositionInterval(2,4)
+    b <- PositionInterval(3,5)
+    c <- PositionInterval(6,8)
+    d <- PositionInterval(1,7)
+    e <- PositionInterval(5,8)
+    
+    #   0123456789  
+    # a   --
+    # b    --
+    # c       --
+    # d  ------
+    # e      ---
+    
+    # self-intersection
+    assertTrue(a$Intersects(a))
+    
+    # intersection is commutative
+    assertTrue(a$Intersects(b))
+    assertTrue(b$Intersects(a))
+    
+    # covered implies intersection
+    assertTrue(d$Covers(a))
+    assertTrue(a$Intersects(d))
+    assertTrue(d$Intersects(a))
+
+    # c.start > b.stop (obvious disjoint)
+    assertFalse(b$Intersects(c))
+    
+    # b.stop == e.start (intervals are right-open, so disjoint)
+    assertFalse(b$Intersects(e))
+})
+
+test_case("Intervals_Validity", { 
+       
+    a <- PositionInterval()     # default ctor
+    b <- PositionInterval(0,0)  # start == stop (zero)
+    c <- PositionInterval(4,4)  # start == stop (nonzero)
+    d <- PositionInterval(0,1)  # start < stop  (start is zero)
+    e <- PositionInterval(4,5)  # start < stop  (start is nonzero)
+    f <- PositionInterval(5,4)  # start > stop
+    
+    assertFalse(a$IsValid())
+    assertFalse(b$IsValid())
+    assertFalse(c$IsValid())
+    assertTrue(d$IsValid())
+    assertTrue(e$IsValid())
+    assertFalse(f$IsValid())
+})
+
+test_case("Intervals_Length",{ 
+       
+    a <- PositionInterval(2,4)
+    b <- PositionInterval(3,5)
+    c <- PositionInterval(6,8)
+    d <- PositionInterval(1,7)
+    e <- PositionInterval(5,8)
+    
+    assertEqual(2L, a$Length())
+    assertEqual(2L, b$Length())
+    assertEqual(2L, c$Length())
+    assertEqual(6L, d$Length())
+    assertEqual(3L, e$Length()) 
+})
+
+test_case("GenomicIntervals_Ctors", { 
+       
+    empty  <- GenomicInterval()
+    normal <- GenomicInterval("seq1", 100, 200)
+    
+    assertEqual("",  empty$Name())
+    assertEqual(0L,  empty$Start())
+    assertEqual(0L,  empty$Stop())
+    
+    assertEqual("seq1", normal$Name())
+    assertEqual(100L,   normal$Start())
+    assertEqual(200L,   normal$Stop())
+})
+
+test_case("GenomicIntervals_Copy", { 
+       
+    a <- GenomicInterval("seq1", 10, 20)
+    b <- GenomicInterval(a)
+    c <- a
+    
+       # TODO: fix this to work with == or *anything* cleaner
+       assertTrue(a$'__eq__'(a))
+       assertTrue(a$'__eq__'(b))
+       assertTrue(a$'__eq__'(c))
+})
+
+test_case("GenomicIntervals_Modifiers", { 
+       
+    a <- GenomicInterval("seq1", 10, 20)
+    
+    b <- GenomicInterval(a)
+    b$Name("seq5")
+       b$Start(2)
+       b$Stop(10)
+    
+    c <- GenomicInterval(a)
+    c$Interval(b$Interval())
+    
+    assertNotEqual(a, b)
+       
+    assertEqual("seq5",  b$Name())
+    assertEqual(2L,  b$Start())
+    assertEqual(10L, b$Stop())        
+       
+    assertEqual(a$Name(), c$Name())
+       
+       # TODO: fix this to work with == or *anything* cleaner
+       assertTrue(b$Interval()$'__eq__'(c$Interval()))
+})
+
+test_case("GenomicIntervals_Cover", { 
+       
+    a <- GenomicInterval("seq1",2,4)
+    b <- GenomicInterval("seq1",3,5)
+    c <- GenomicInterval("seq1",6,8)
+    d <- GenomicInterval("seq1",1,7)
+    e <- GenomicInterval("seq1",5,8)
+    f <- GenomicInterval("seq2",3,5)  # same as b, different ref
+    
+    #   0123456789  
+    # a   --
+    # b    --
+    # c       --
+    # d  ------
+    # e      ---
+    
+    # self-cover
+    assertTrue(a$Covers(a))
+    assertTrue(a$CoveredBy(a))
+    
+    # basic covers/covered
+    assertTrue(b$CoveredBy(d))
+    assertTrue(d$Covers(b))
+    assertNotEqual(b, d)
+    assertFalse(b$Covers(d))
+    
+    # same coords as b, but different ref
+    assertFalse(f$CoveredBy(d))
+    assertFalse(d$Covers(f))
+    assertNotEqual(f, d)
+    assertFalse(f$Covers(d))
+    
+    # obvious disjoint
+    assertFalse(b$Covers(c))
+    assertFalse(c$Covers(b))
+    assertFalse(b$CoveredBy(c))
+    assertFalse(c$CoveredBy(b))
+    
+    # b.stop == e.start (intervals are right-open, so disjoint)
+    assertFalse(b$Covers(e))
+    assertFalse(b$CoveredBy(e))
+    
+    # shared endpoint, start contained
+    assertTrue(e$Covers(c))
+    assertTrue(c$CoveredBy(e))
+       
+       # assertTrue(FALSE)
+})
+
+test_case("GenomicIntervals_Validity", { 
+       
+    a <- GenomicInterval()       # default
+    b <- GenomicInterval("seq1",0,0)  # valid id, start == stop (zero)
+    c <- GenomicInterval("seq1",4,4)  # valid id, start == stop (non-zero)
+    d <- GenomicInterval("seq",0,1)  # valid id, start <  stop (start == zero)     OK
+    e <- GenomicInterval("seq1",4,5)  # valid id, start <  stop (start >  zero)     OK
+    f <- GenomicInterval("seq1",5,4)  # valid id, start >  stop 
+    g <- GenomicInterval("",0,0) # invalid id, start == stop (zero)
+    h <- GenomicInterval("",4,4) # invalid id, start == stop (non-zero)
+    i <- GenomicInterval("",0,1) # invalid id, start <  stop (start == zero)
+    j <- GenomicInterval("",4,5) # invalid id, start <  stop (start >  zero)
+    k <- GenomicInterval("",5,4) # invalid id, start >  stop 
+         
+    assertTrue(d$IsValid())
+    assertTrue(e$IsValid())
+    assertFalse(a$IsValid())
+    assertFalse(b$IsValid())
+    assertFalse(c$IsValid())
+    assertFalse(f$IsValid())
+    assertFalse(g$IsValid())
+    assertFalse(h$IsValid())
+    assertFalse(i$IsValid())
+    assertFalse(j$IsValid())
+    assertFalse(k$IsValid())
+})
diff --git a/tests/src/R/tests/test_PolymeraseStitching.R b/tests/src/R/tests/test_PolymeraseStitching.R

new file mode 100644 (file)

index 0000000..401ecce
--- /dev/null
+++ b/tests/src/R/tests/test_PolymeraseStitching.R
@@ -0,0 +1,415 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES LOSS OF
+# USE, DATA, OR PROFITS OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+compareContainers <- function(c1, c2) {
+    
+       assertEqual(length(c1), length(c2))
+    
+       numElements <- length(c1)
+       for (i in 1:numElements)
+               assertEqual(c1[i], c2[i])
+}
+
+compareFrames <- function(f1, f2) {
+    
+    d1 <- f1$Data()
+    d2 <- f2$Data()
+    compareContainers(d1, d2)
+}
+
+compareRecords <- function(b1, b2) {
+
+    assertTrue(b1$HasDeletionQV())
+    assertTrue(b1$HasDeletionTag())
+    assertTrue(b1$HasInsertionQV())
+    assertTrue(b1$HasMergeQV())
+    assertTrue(b1$HasSubstitutionQV())
+    assertTrue(b1$HasSubstitutionTag())
+    assertTrue(b1$HasLabelQV())
+    assertTrue(b1$HasAltLabelQV())
+    assertTrue(b1$HasAltLabelTag())
+    assertTrue(b1$HasPkmean())
+    assertTrue(b1$HasPkmid())
+    assertTrue(b1$HasPulseCall())
+    assertTrue(b1$HasIPD())
+    assertTrue(b1$HasPulseWidth())
+    assertTrue(b1$HasPrePulseFrames())
+    assertTrue(b1$HasPulseCallWidth())
+    assertTrue(b1$HasPulseMergeQV())
+    
+    assertTrue(b2$HasDeletionQV())
+    assertTrue(b2$HasDeletionTag())
+    assertTrue(b2$HasInsertionQV())
+    assertTrue(b2$HasMergeQV())
+    assertTrue(b2$HasSubstitutionQV())
+    assertTrue(b2$HasSubstitutionTag())
+    assertTrue(b2$HasLabelQV())
+    assertTrue(b2$HasAltLabelQV())
+    assertTrue(b2$HasAltLabelTag())
+    assertTrue(b2$HasPkmean())
+    assertTrue(b2$HasPkmid())
+    assertTrue(b2$HasPulseCall())
+    assertTrue(b2$HasIPD())
+    assertTrue(b2$HasPulseWidth())
+    assertTrue(b2$HasPrePulseFrames())
+    assertTrue(b2$HasPulseCallWidth())
+    assertTrue(b2$HasPulseMergeQV())
+ 
+    assertEqual(b1$FullName(),        b2$FullName())
+    assertEqual(b1$HoleNumber(),      b2$HoleNumber())
+    assertEqual(b1$NumPasses(),       b2$NumPasses())
+    assertEqual(b1$Sequence(),        b2$Sequence())
+    assertEqual(b1$DeletionTag(),     b2$DeletionTag())
+    assertEqual(b1$SubstitutionTag(), b2$SubstitutionTag())
+    assertEqual(b1$AltLabelTag(),     b2$AltLabelTag())
+    assertEqual(b1$PulseCall(),       b2$PulseCall())
+    
+    # compareContainers(b1$Pkmean(), b2$Pkmean())
+    # compareContainers(b1$Pkmid(), b2$Pkmid())
+    #
+    # compareFrames(b1$IPD(),             b2$IPD())
+    # compareFrames(b1$PulseWidth(),      b2$PulseWidth())
+    # compareFrames(b1$PrePulseFrames(),  b2$PrePulseFrames())
+    # compareFrames(b1$PulseCallWidth(),  b2$PulseCallWidth())
+
+    assertEqual(b1$ReadGroup()$Id(), b2$ReadGroup()$Id())
+    
+    assertEqual(b1$Qualities()$Fastq(),       b2$Qualities()$Fastq())
+    assertEqual(b1$DeletionQV()$Fastq(),      b2$DeletionQV()$Fastq())
+    assertEqual(b1$InsertionQV()$Fastq(),     b2$InsertionQV()$Fastq())
+    assertEqual(b1$MergeQV()$Fastq(),         b2$MergeQV()$Fastq())
+    assertEqual(b1$SubstitutionQV()$Fastq(),  b2$SubstitutionQV()$Fastq())
+    assertEqual(b1$LabelQV()$Fastq(),         b2$LabelQV()$Fastq())
+    assertEqual(b1$AltLabelQV()$Fastq(),      b2$AltLabelQV()$Fastq())
+    assertEqual(b1$PulseMergeQV()$Fastq(),    b2$PulseMergeQV()$Fastq())
+    
+    return
+}
+
+getVirtualRecord <- function(fn1, fn2) {
+    
+    result <- tryCatch(
+        {
+            vpr <- ZmwReadStitcher(fn1, fn2)
+            assertTrue(vpr$HasNext())
+            virtualRecord <- vpr$Next()
+            return(virtualRecord)
+        },
+        error = function(e) {
+            print(paste('e:',e))
+            assertTrue(FALSE) # should not throw
+            return
+        }    
+    )
+    return(result)
+}
+
+getPolymeraseRecord <- function(fn) {
+    
+    result <- tryCatch(
+        {
+            ds <- DataSet(fn)
+            entireFile <- EntireFileQuery(ds)
+            polyIter <- entireFile$begin()
+            polyEnd <- entireFile$end()
+            assertTrue(polyIter$'__ne__'(polyEnd))
+            polyRecord <- polyIter$value()
+            return(polyRecord)
+        },
+        error = function(e) {
+            print(paste('e:',e))
+            assertTrue(FALSE) # should not throw
+            return
+        }    
+    )
+    return(result)
+}
+
+test_case("PolymeraseStitching_VirtualRegions", {
+       
+       subreadsFn <- paste(test_data_path, "polymerase/internal.subreads.bam", sep="/")
+       scrapsFn   <- paste(test_data_path, "polymerase/internal.scraps.bam", sep="/")
+    virtualRecord <- getVirtualRecord(subreadsFn, scrapsFn)
+    
+    # -- ADAPTER -- #
+    
+    adapter <- virtualRecord$VirtualRegionsTable('ADAPTER')
+    assertEqual(7L, adapter$size())
+    
+    region <- adapter$'__getitem__'(0)
+    assertEqual(3047L, region$beginPos)
+    assertEqual(3095L, region$endPos)
+    
+    region <- adapter$'__getitem__'(1)
+    assertEqual(3650L, region$beginPos)
+    assertEqual(3700L, region$endPos)
+    
+    region <- adapter$'__getitem__'(2)
+    assertEqual(4289L, region$beginPos)
+    assertEqual(4335L, region$endPos)
+    
+    region <- adapter$'__getitem__'(3)
+    assertEqual(4888L, region$beginPos)
+    assertEqual(4939L, region$endPos)
+    
+    region <- adapter$'__getitem__'(4)
+    assertEqual(5498L, region$beginPos)
+    assertEqual(5546L, region$endPos)
+    
+    region <- adapter$'__getitem__'(5)
+    assertEqual(6116L, region$beginPos)
+    assertEqual(6173L, region$endPos)
+    
+    region <- adapter$'__getitem__'(6)
+    assertEqual(6740L, region$beginPos)
+    assertEqual(6790L, region$endPos)
+
+    # -- BARCODE -- #
+
+    barcode = virtualRecord$VirtualRegionsTable('BARCODE')
+    assertEqual(14L, barcode$size())
+
+    region <- barcode$'__getitem__'(0)
+    assertEqual(3025L, region$beginPos)
+    assertEqual(3047L, region$endPos)
+    
+    region <- barcode$'__getitem__'(1)
+    assertEqual(3095L, region$beginPos)
+    assertEqual(3116L, region$endPos)
+    
+    region <- barcode$'__getitem__'(2)
+    assertEqual(3628L, region$beginPos)
+    assertEqual(3650L, region$endPos)
+    
+    region <- barcode$'__getitem__'(3)
+    assertEqual(3700L, region$beginPos)
+    assertEqual(3722L, region$endPos)
+    
+    region <- barcode$'__getitem__'(4)
+    assertEqual(4267L, region$beginPos)
+    assertEqual(4289L, region$endPos)
+    
+    region <- barcode$'__getitem__'(5)
+    assertEqual(4335L, region$beginPos)
+    assertEqual(4356L, region$endPos)
+    
+    region <- barcode$'__getitem__'(6)
+    assertEqual(4864L, region$beginPos)
+    assertEqual(4888L, region$endPos)
+
+    region <- barcode$'__getitem__'(7)
+    assertEqual(4939L, region$beginPos)
+    assertEqual(4960L, region$endPos)
+    
+    region <- barcode$'__getitem__'(8)
+    assertEqual(5477L, region$beginPos)
+    assertEqual(5498L, region$endPos)
+    
+    region <- barcode$'__getitem__'(9)
+    assertEqual(5546L, region$beginPos)
+    assertEqual(5571L, region$endPos)
+    
+    region <- barcode$'__getitem__'(10)
+    assertEqual(6087L, region$beginPos)
+    assertEqual(6116L, region$endPos)
+    
+    region <- barcode$'__getitem__'(11)
+    assertEqual(6173L, region$beginPos)
+    assertEqual(6199L, region$endPos)
+    
+    region <- barcode$'__getitem__'(12)
+    assertEqual(6719L, region$beginPos)
+    assertEqual(6740L, region$endPos)
+    
+    region <- barcode$'__getitem__'(13)
+    assertEqual(6790L, region$beginPos)
+    assertEqual(6812L, region$endPos)
+
+    # -- LQREGION -- #
+
+    lqregion = virtualRecord$VirtualRegionsTable('LQREGION')
+    assertEqual(2L, lqregion$size())
+    
+    region <- lqregion$'__getitem__'(0)
+    assertEqual(0L, region$beginPos)
+    assertEqual(2659L, region$endPos)
+    
+    region <- lqregion$'__getitem__'(1)
+    assertEqual(7034L, region$beginPos)
+    assertEqual(7035L, region$endPos)
+    
+    # -- HQREGION -- #
+
+    hqregion = virtualRecord$VirtualRegionsTable('HQREGION')
+    assertEqual(1L, hqregion$size())
+    
+    region <- hqregion$'__getitem__'(0)
+    assertEqual(2659L, region$beginPos)
+    assertEqual(7034L, region$endPos)
+})
+
+test_case("PolymeraseStitching_InternalSubreadsToOriginal", {
+  
+    # stitch virtual polymerase record
+    subreadsFn <- paste(test_data_path, "polymerase/internal.subreads.bam", sep="/")
+    scrapsFn   <- paste(test_data_path, "polymerase/internal.scraps.bam", sep="/")
+    virtualRecord <- getVirtualRecord(subreadsFn, scrapsFn)
+
+    # fetch original polymerase record
+    polyFn <- paste(test_data_path, "polymerase/internal.polymerase.bam", sep="/")
+    polyRecord <- getPolymeraseRecord(polyFn)      
+
+    # check
+    compareRecords(polyRecord, virtualRecord)
+})
+
+test_case("PolymeraseStitching_InternalHQToOriginal", {
+  
+    # stitch virtual polymerase record
+    hqRegionFn <- paste(test_data_path, "polymerase/internal.hqregions.bam", sep="/")
+    lqRegionFn <- paste(test_data_path, "polymerase/internal.lqregions.bam", sep="/")
+    virtualRecord <- getVirtualRecord(hqRegionFn, lqRegionFn)
+    
+    # fetch original polymerase record
+    polyFn <- paste(test_data_path, "polymerase/internal.polymerase.bam", sep="/")
+    polyRecord <- getPolymeraseRecord(polyFn)      
+
+    # check
+    compareRecords(polyRecord, virtualRecord)
+})
+
+test_case("PolymeraseStitching_ProductionSubreadsToOriginal", {
+  
+    # stitch virtual polymerase record
+    subreadsFn <- paste(test_data_path, "polymerase/production.subreads.bam", sep="/")
+    scrapsFn   <- paste(test_data_path, "polymerase/production.scraps.bam", sep="/")
+    virtualRecord <- getVirtualRecord(subreadsFn, scrapsFn)
+    
+    # fetch original polymerase record
+    polyFn <- paste(test_data_path, "polymerase/production.polymerase.bam", sep="/")
+    polyRecord <- getPolymeraseRecord(polyFn)  
+    
+    # compare
+    assertEqual(polyRecord$FullName(),        virtualRecord$FullName())
+    assertEqual(polyRecord$HoleNumber(),      virtualRecord$HoleNumber())
+    assertEqual(polyRecord$NumPasses(),       virtualRecord$NumPasses())
+    assertEqual(polyRecord$Sequence(),        virtualRecord$Sequence())
+    assertEqual(polyRecord$DeletionTag(),     virtualRecord$DeletionTag())
+    assertEqual(polyRecord$SubstitutionTag(), virtualRecord$SubstitutionTag())
+    
+    compareFrames(polyRecord$IPD(),                virtualRecord$IPDV1Frames())
+    assertEqual(polyRecord$ReadGroup()$Id(),       virtualRecord$ReadGroup()$Id())
+    
+    tolerance = 1e-5
+    assertTrue( abs(polyRecord$ReadAccuracy()$ToFloat() - virtualRecord$ReadAccuracy()$ToFloat()) <= tolerance )
+    # assertEqual(polyRecord$ReadAccuracy()$ToFloat(), virtualRecord$ReadAccuracy()$ToFloat())
+
+    assertEqual(polyRecord$Qualities()$Fastq(),       virtualRecord$Qualities()$Fastq())
+    assertEqual(polyRecord$DeletionQV()$Fastq(),      virtualRecord$DeletionQV()$Fastq())
+    assertEqual(polyRecord$InsertionQV()$Fastq(),     virtualRecord$InsertionQV()$Fastq())
+    assertEqual(polyRecord$MergeQV()$Fastq(),         virtualRecord$MergeQV()$Fastq())
+    assertEqual(polyRecord$SubstitutionQV()$Fastq(),  virtualRecord$SubstitutionQV()$Fastq())
+})
+
+test_case("PolymeraseStitching_ProductionHQToOriginal", {
+  
+    # stitch virtual polymerase record
+    hqRegionFn <- paste(test_data_path, "polymerase/production_hq.hqregion.bam", sep="/")
+    lqRegionFn <- paste(test_data_path, "polymerase/production_hq.scraps.bam", sep="/")
+    virtualRecord <- getVirtualRecord(hqRegionFn, lqRegionFn)
+  
+    # fetch original polymerase record
+    polyFn <- paste(test_data_path, "polymerase/production.polymerase.bam", sep="/")
+    polyRecord <- getPolymeraseRecord(polyFn)
+  
+    # compare
+    assertEqual(polyRecord$FullName(),        virtualRecord$FullName())
+    assertEqual(polyRecord$HoleNumber(),      virtualRecord$HoleNumber())
+    assertEqual(polyRecord$NumPasses(),       virtualRecord$NumPasses())
+    assertEqual(polyRecord$Sequence(),        virtualRecord$Sequence())
+    assertEqual(polyRecord$DeletionTag(),     virtualRecord$DeletionTag())
+    assertEqual(polyRecord$SubstitutionTag(), virtualRecord$SubstitutionTag())
+
+    compareFrames(polyRecord$IPD(),                virtualRecord$IPDV1Frames())
+    assertEqual(polyRecord$ReadGroup()$Id(),       virtualRecord$ReadGroup()$Id())
+    
+    tolerance = 1e-5
+    assertTrue( abs(polyRecord$ReadAccuracy()$ToFloat() - virtualRecord$ReadAccuracy()$ToFloat()) <= tolerance )
+    # assertEqual(polyRecord$ReadAccuracy()$ToInt(), virtualRecord$ReadAccuracy()$ToInt())
+    
+    assertEqual(polyRecord$Qualities()$Fastq(),       virtualRecord$Qualities()$Fastq())
+    assertEqual(polyRecord$DeletionQV()$Fastq(),      virtualRecord$DeletionQV()$Fastq())
+    assertEqual(polyRecord$InsertionQV()$Fastq(),     virtualRecord$InsertionQV()$Fastq())
+    assertEqual(polyRecord$MergeQV()$Fastq(),         virtualRecord$MergeQV()$Fastq())
+    assertEqual(polyRecord$SubstitutionQV()$Fastq(),  virtualRecord$SubstitutionQV()$Fastq())
+
+    assertTrue(polyRecord$HasDeletionQV())
+    assertTrue(polyRecord$HasDeletionTag())
+    assertTrue(polyRecord$HasInsertionQV())
+    assertTrue(polyRecord$HasMergeQV())
+    assertTrue(polyRecord$HasSubstitutionQV())
+    assertTrue(polyRecord$HasSubstitutionTag())
+    assertTrue(polyRecord$HasIPD())
+    assertFalse(polyRecord$HasLabelQV())
+    assertFalse(polyRecord$HasAltLabelQV())
+    assertFalse(polyRecord$HasAltLabelTag())
+    assertFalse(polyRecord$HasPkmean())
+    assertFalse(polyRecord$HasPkmid())
+    assertFalse(polyRecord$HasPulseCall())
+    assertFalse(polyRecord$HasPulseWidth())
+    assertFalse(polyRecord$HasPrePulseFrames())
+    assertFalse(polyRecord$HasPulseCallWidth())
+    assertFalse(polyRecord$HasPulseCall())
+
+    assertTrue(virtualRecord$HasDeletionQV())
+    assertTrue(virtualRecord$HasDeletionTag())
+    assertTrue(virtualRecord$HasInsertionQV())
+    assertTrue(virtualRecord$HasMergeQV())
+    assertTrue(virtualRecord$HasSubstitutionQV())
+    assertTrue(virtualRecord$HasSubstitutionTag())
+    assertTrue(virtualRecord$HasIPD())
+    assertFalse(virtualRecord$HasLabelQV())
+    assertFalse(virtualRecord$HasAltLabelQV())
+    assertFalse(virtualRecord$HasAltLabelTag())
+    assertFalse(virtualRecord$HasPkmean())
+    assertFalse(virtualRecord$HasPkmid())
+    assertFalse(virtualRecord$HasPulseCall())
+    assertFalse(virtualRecord$HasPulseWidth())
+    assertFalse(virtualRecord$HasPrePulseFrames())
+    assertFalse(virtualRecord$HasPulseCallWidth()) 
+    assertFalse(virtualRecord$HasPulseCall())
+})
diff --git a/tests/src/R/tests/test_QualityValues.R b/tests/src/R/tests/test_QualityValues.R

new file mode 100644 (file)

index 0000000..b0671d0
--- /dev/null
+++ b/tests/src/R/tests/test_QualityValues.R
@@ -0,0 +1,113 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+test_case("QualityValue_Defaults", { 
+       
+       value <- QualityValue()
+       assertEqual(0L, value$ToInt())
+       assertEqual('!', value$Fastq())
+})
+
+test_case("QualityValue_FromNumber", { 
+       
+    zero        <- QualityValue(0)
+    thirtythree <- QualityValue(33)
+    normal      <- QualityValue(42)
+    maxQV       <- QualityValue(93)
+    tooHigh     <- QualityValue(94)
+    max8bit     <- QualityValue(126)
+    
+    assertEqual(0L,  zero$ToInt())
+    assertEqual(33L, thirtythree$ToInt())
+    assertEqual(42L, normal$ToInt())
+    assertEqual(93L, maxQV$ToInt())
+    assertEqual(93L, tooHigh$ToInt())
+    assertEqual(93L, max8bit$ToInt())
+
+    assertEqual('!', zero$Fastq())
+    assertEqual('B', thirtythree$Fastq())
+    assertEqual('K', normal$Fastq())
+    assertEqual('~', maxQV$Fastq())
+    assertEqual('~', tooHigh$Fastq())
+    assertEqual('~', max8bit$Fastq())
+})
+
+test_case("QualityValue_FromFastq", { 
+       
+    zero        <- QualityValue_FromFastq('!')
+    thirtythree <- QualityValue_FromFastq('B')
+    normal      <- QualityValue_FromFastq('K')
+    maxQV       <- QualityValue_FromFastq('~')
+
+    assertEqual(0L,  zero$ToInt())
+    assertEqual(33L, thirtythree$ToInt())
+    assertEqual(42L, normal$ToInt())
+    assertEqual(93L, maxQV$ToInt())
+})
+
+test_case("QualityValues_Defaults", { 
+    values <- QualityValues()   
+       assertEqual(0L, nchar(values$Fastq()))
+})
+
+test_case("QualityValues_FromNumbers", { 
+       
+       fastqString <- "~~~KKBB!!"
+       values <- c(93, 93, 93, 42, 42, 33, 33, 0, 0)
+       
+       qvs <- QualityValues()
+       for (v in values) 
+               qvs$push_back(QualityValue(v))
+       
+       assertEqual(fastqString, qvs$Fastq())
+})
+
+test_case("QualityValues_FromFastq", { 
+       
+       fastqString <- "~~~KKBB!!"
+       values <- c(93L, 93L, 93L, 42L, 42L, 33L, 33L, 0L, 0L)
+       
+       qvs <- QualityValues(fastqString)
+       assertEqual(nchar(fastqString), qvs$size())
+       assertEqual(length(values),     qvs$size())
+       
+       numValues <- length(values)
+       for ( i in 1:numValues ) {
+               qv <- qvs$'__getitem__'(i-1)
+               assertEqual(values[i], qv$ToInt())
+       }
+})
diff --git a/tests/src/R/tests/utils.R b/tests/src/R/tests/utils.R

new file mode 100644 (file)

index 0000000..ff793a6
--- /dev/null
+++ b/tests/src/R/tests/utils.R
@@ -0,0 +1,307 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+# main test suite runner
+run_test_suite <- function(path) {
+       test_files <- dir(path, "^test.*\\.[rR]$", full.names = TRUE)
+       lapply(test_files, run_test_file)
+       invisible()
+}
+
+run_test_file <- function(filename) {
+       source(filename)
+       invisible()
+}
+
+# main test case definition
+#
+# Example:
+# test_case("name", {
+#     ...tested code here...   
+# })
+#
+test_case <- function(name, code) {
+       test_case_runner(name, substitute(code))
+       invisible()
+}
+
+# main assert definitions
+#
+# assertEqual(expected, actual)
+# assertNotEqual(expected, actual)
+# assertTrue(expr)
+# assertFalse(expr)
+#
+
+assertEqual <- function(expected, actual) {
+       assertHelper(identical(expected, actual), 
+                    TRUE, 
+                                expression_string("expected"), 
+                                deparse(expected), 
+                                expression_string("actual"), 
+                                deparse(actual), 
+                                "==")
+}
+
+assertAllEqual <- function(expected, actual) {
+       assertHelper(all.equal(expected, actual), 
+                    TRUE, 
+                                expression_string("expected"), 
+                                deparse(expected), 
+                                expression_string("actual"), 
+                                deparse(actual), 
+                                "==")
+}
+
+assertNotEqual <- function(expected, actual) {
+       assertHelper(identical(expected, actual), 
+                    FALSE, 
+                                expression_string("expected"), 
+                                deparse(expected), 
+                                expression_string("actual"), 
+                                deparse(actual), 
+                                "!=")
+}
+
+assertTrue <- function(expr) {
+       assertHelper(as.vector(expr), 
+                    TRUE, 
+                                "TRUE", 
+                                "TRUE", 
+                                expression_string("expr"), 
+                                deparse(expr), 
+                                "==")
+}
+
+assertFalse <- function(expr) {
+       assertHelper(as.vector(expr), 
+                    FALSE, 
+                                "FALSE", 
+                                "FALSE", 
+                                expression_string("expr"), 
+                                deparse(expr), 
+                                "==")
+}
+
+# TODO: (if needed) assertLessThan, assertGreaterThan, assertNull, etc
+
+# ------------------------------- #
+# internals 
+# ------------------------------- #
+
+expression_string <- function(name, env = parent.frame()) {
+  subs <- do.call("substitute", list(as.name(name), env))
+  paste0(deparse(subs, width.cutoff = 500), collapse = "\n")
+}
+
+assertHelper <- function(compare, 
+                            to, 
+                            expected_expr, 
+                                                expected_value, 
+                                                actual_expr, 
+                                                actual_value,
+                                                compare_type) 
+{
+       success <- identical(compare, to)
+       
+       result <- make_assert_result(success,
+                                            expected_expr, 
+                                 expected_value, 
+                                                        actual_expr, 
+                                                        actual_value,
+                                                        compare_type)
+       
+       
+       # record result with testCaseCollector
+       testCaseResults <- test_case_results()
+       testCaseResults$add_result(result)
+       invisible()
+}
+
+make_assert_result <- function(success,
+                                  expected_expr, 
+                                                  expected_value, 
+                                                  actual_expr, 
+                                                  actual_value,
+                                                  compare_type)
+{
+       structure(list(
+               success = success,
+               expectedExpression = expected_expr,
+               expectedValue = expected_value,
+               actualExpression = actual_expr,
+               actualValue = actual_value,
+               compareType = compare_type      
+       ))                                              
+}
+
+TestCaseResults <- setRefClass("TestCaseResults",
+
+       fields = list(
+               test = "character",
+               anyFailed = "logical",
+               results = "list"
+       ),
+       methods = list(
+               initialize = function(...) {
+                       test <<- ""
+                       anyFailed <<- FALSE
+               
+                       initFields(...)
+               },
+       
+               start_test = function(name) {
+                       test <<- name
+                       results <<- list()
+               },
+               
+               add_result = function(result) {
+                       if (!result$success) 
+                               anyFailed <<- TRUE
+                       results <<- c(results, list(result))
+               },
+               
+               end_test = function() {
+               
+                       # summarize test case results
+                       testOutput <- format_results()
+               
+                       # report to test collector
+                       suiteResults <- test_suite_results()
+                       suiteResults$add_test_case_result(anyFailed, testOutput)
+               
+                       # reset
+                       test <<- ""
+                       anyFailed <<- FALSE
+                       results <<- list()
+               },
+               
+               format_results = function() {
+                       
+                       lines <- list()
+                       
+                       status <- "OK"
+                       if (anyFailed) 
+                               status <- "FAILED"
+                       
+                       headerLine <- paste("TestCase:", test, "...", status, sep=" ")
+                       lines <- c(lines, list(headerLine))
+                       
+                       for (result in results) {
+                               if (!result$success) {
+                                       valueOfLabel <- paste(result$actualExpression, result$compareType, result$expectedExpression, sep=" ")
+                                       valueOf  <- paste("  Value of:", valueOfLabel,         sep=" ")
+                                       actual   <- paste("    Actual:", result$actualValue,   sep=" ")
+                                       expected <- paste("  Expected:", result$expectedValue, sep=" ")
+                                       lines <- c(lines, valueOf, actual, expected, "")
+                               }
+                       }
+                       invisible(lines)
+               } 
+       )
+)
+
+TestSuiteResults <- setRefClass("TestSuiteResults",
+
+       fields = list(
+               numTests = "integer",
+               numFailed = "integer",
+               results = "list"
+       ),
+       methods = list(
+               initialize = function(...) {
+                       numTests <<- 0L
+                       numFailed <<- 0L
+                       results <<- list()
+               
+                       initFields(...)
+               },
+               
+               add_test_case_result = function(testHasFailures, testOutput) {  #(results)
+                       numTests <<- numTests + 1L
+                       if (testHasFailures)
+                               numFailed <<- numFailed + 1L
+                       results <<- c(results, testOutput)
+               },
+               
+               any_failed = function() {
+                       return (numFailed != 0L)
+               },
+               
+               print_summary = function(...) {
+                               
+                       cat("\n")
+                       cat("-------------------------\n")
+                       cat("Tests Complete\n")
+                       cat("-------------------------\n")
+                       cat("\n")
+                       
+                       for (result in results) {
+                               cat(result)
+                               cat("\n")
+                       }
+                       cat("-------------------------\n")
+                       
+                       if (numFailed == 1L) {
+                               footer <- paste(numFailed, "test failed out of", numTests, sep=" ")
+                       } else {
+                               footer <- paste(numFailed, "tests failed out of", numTests, sep=" ")
+                       }
+                       cat(footer)
+                       cat("\n\n")
+               }
+       )
+)
+
+test_env = new.env()
+test_env$testSuiteResults <- TestSuiteResults$new()
+test_env$testCaseResults <- TestCaseResults$new()
+
+test_suite_results <- function() {
+       test_env$testSuiteResults
+}
+
+test_case_results <- function() {
+       test_env$testCaseResults
+}
+
+test_case_runner <- function(name, code) {
+       testCaseResults <- test_case_results()
+       testCaseResults$start_test(name)
+       eval(code, test_env)
+       testCaseResults$end_test()
+}
diff --git a/tests/src/TestData.h.in b/tests/src/TestData.h.in

new file mode 100644 (file)

index 0000000..b25d262
--- /dev/null
+++ b/tests/src/TestData.h.in
@@ -0,0 +1,58 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef TESTDATA_H
+#define TESTDATA_H
+
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+namespace tests {
+
+const std::string Source_Dir = std::string("@PacBioBAM_TestsDir@");
+const std::string Bin_Dir    = std::string("@CMAKE_CURRENT_BINARY_DIR@");
+const std::string Data_Dir   = std::string("@PacBioBAM_TestsDir@/data");
+const std::string Generated_Dir     = std::string("@GeneratedDir@");
+const std::string GeneratedData_Dir = std::string("@GeneratedTestDataDir@");
+const std::string Bam2Sam    = std::string("@PacBioBAM_BinDir@/bam2sam");
+
+} // namespace tests
+} // namespace BAM
+} // namespace PacBio
+
+#endif // TESTDATA_H
diff --git a/tests/src/cram/bam2sam.t.in b/tests/src/cram/bam2sam.t.in

new file mode 100644 (file)

index 0000000..66645c4
--- /dev/null
+++ b/tests/src/cram/bam2sam.t.in
@@ -0,0 +1,63 @@
+Setup:
+
+  $ BAM2SAM="@PacBioBAM_BinDir@/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+
+Normal:
+
+  $ $BAM2SAM < $DATADIR/phi29.bam | head -n 5
+  @HD\tVN:3.0.0\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\tPU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0 (esc)
+  @PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0 (esc)
+  @PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/2067_4072\t4\t*\t0\t255\t*\t*\t0\t0\tAAGTCATGTATAGAGTTATTGGCTCAGCGGTGGCAAGCAGCCAACTCAGCTCCTTTCGGGCTTGTTAGCAGCCGGATCCACACTCTGAAATTCCTGCAGCTCGAGTTATTTGATAGTAAAAGTGGGTCATCAAACCGCAACTACGCCACCCCGGTACCTGAACAGGCTTCGGTTTCATTTTGAGACGAGAAAAACCCACTTTGAAGTTTTCGAAAATCACTTCCTTTTTTGATTTTGTCCGTCATGCCTGCGCATTTCACAAGAGAACTTGATGTCAGTGTTAGTCGTCAGGAGAGCCTCTACCAGTTTGCCGTCTACTTCTTTCATGTAAATATCCTGGATGTAAGGTTTTCTGACGCAGATATTGTTGGCAACGCTTAAAAAGTGGGATTCGTGTGGCCAGTAGCCCAGGTTATCTTTCGGGTCCACGATGTCTTTGATAACGTCCAGAATTTCAGTACCAGTTCAGGTGAATAGAACGGTATCGCAGTTTCAGATAATACGATCATGACAGCCTGCGCTGGCTGTGATGGTCGTATAGCGTGCCCAGGCCGTGATAAAGACACCCATCCGGGGTATATACAGGGTCTTTCGTTTCCTCCCTAGCCTAAGAAAGAAAACCCAGAGCACCGTTCTCTTCAAGGTATGGCACTTTACCGTAACGTCCGGGTTGGAAAGCGAAATTTACCGTACAGGCCTGGTTCAGCATCAGCTTCCCGCCAGCTGTTTGATGGCGTCCCTTCAGAGGTAGTTTTGATATACGTCCATTTTGTCGATAAAGTCCTTGAACAGGCAAGTGGTTCCTTTGAAACTTCAGACCAAGAGGATAATATTCAACATTGTACAGGTCGTAGTGTCTTTCATCAGTTCCAGATCAAACATTAGAACAGCCACAAGGTCAGCAAATTTCACCGCCGGAAGATTTCAGGTAATTCGTTACGCTTGTAAAGAAATTGGGACGCTTGATCTGGGATGGTCGGGATCTAGCCTCCTTCAGTTCAAATTCACTTAACGAATGTGCTGAATGTGCCAGAGGGTAATCCTCGTCCAAAGACGGTATTTACCCTCGAAACGATCGGCTCGCCGTACGGCAACGCAGGCGAGAGTACATTCTGCAGCTGGGTACAGGGAATTAAACATCAAAAACCATACCTTCGCCGATCTCTTTTTCCTTTAAAACGTCATTCAGCCACGGTGAAGACCAACCACGGTAAGGCATAACGAACTTCCTTGTCCAGACCCAGGCTCAGGGTCCGGGAAAACCTTTTTAAACTTCTTGTGGTGATAATGTCTTTGAAGCCTTTCAGAAGTCAGAACCTGCCGTCATACGATCCAAGACCACTGCTTGAACCTGGATCAGCAGACTTCTGCGATAATCTGAATATCGTTTTTAATGTAGGCAATATTTCTTCCGGGTGATTTTGTAACCGACCGGGCGCTCATTATGATAAGTCGATGTCGCCTTTCAGTAGCCGTCAGTTAAAAGTCTTTCGCAATTTTCTTAACGCGGAAAGGCAAGTTTCTAGAGAGTCGTAAGATCACGTGTGGAATCTTGCGTTTACCCTTGTAACCCAGGCAATACAATCATATAACCACTGGCCCAATGCGAGAAATGATGGTGTTGTAGGTATTTTGGCAGACCATCTGCGGACCGCATTTAAAACGTTACGTTCCAGCCAGTTGATGATGAAATGCGCCATCAAAATTTCAGATTTGTGGAAGTACAGGTCCCAGCCTGAAAACTTTGCAGAAACCCAAGCCATAAAATTCATCCAGGGAGTTACCAATCTTATACTCGGAGTGGTCTTCGATGTTCATTGTAACCGTATGCCCATACGCGGCAATCTTCAAGCCTTGGTGTAGTCTCAAAGTCGCAGGAATACATTTACGTGGCATGTGTTTCATATGTCATTCCTTCTTAAAGTTAAACAAAATTATTTCTAGAGGGGAATTGTTTATCCGCCTCACATTTCCCTATAGTGAGTCGTATTAATTTCGCGGGATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:4072\tqs:i:2067\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+
+Explicit Filename (not stdin):
+
+  $ $BAM2SAM $DATADIR/phi29.bam | head -n 5
+  @HD\tVN:3.0.0\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\tPU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0 (esc)
+  @PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0 (esc)
+  @PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/2067_4072\t4\t*\t0\t255\t*\t*\t0\t0\tAAGTCATGTATAGAGTTATTGGCTCAGCGGTGGCAAGCAGCCAACTCAGCTCCTTTCGGGCTTGTTAGCAGCCGGATCCACACTCTGAAATTCCTGCAGCTCGAGTTATTTGATAGTAAAAGTGGGTCATCAAACCGCAACTACGCCACCCCGGTACCTGAACAGGCTTCGGTTTCATTTTGAGACGAGAAAAACCCACTTTGAAGTTTTCGAAAATCACTTCCTTTTTTGATTTTGTCCGTCATGCCTGCGCATTTCACAAGAGAACTTGATGTCAGTGTTAGTCGTCAGGAGAGCCTCTACCAGTTTGCCGTCTACTTCTTTCATGTAAATATCCTGGATGTAAGGTTTTCTGACGCAGATATTGTTGGCAACGCTTAAAAAGTGGGATTCGTGTGGCCAGTAGCCCAGGTTATCTTTCGGGTCCACGATGTCTTTGATAACGTCCAGAATTTCAGTACCAGTTCAGGTGAATAGAACGGTATCGCAGTTTCAGATAATACGATCATGACAGCCTGCGCTGGCTGTGATGGTCGTATAGCGTGCCCAGGCCGTGATAAAGACACCCATCCGGGGTATATACAGGGTCTTTCGTTTCCTCCCTAGCCTAAGAAAGAAAACCCAGAGCACCGTTCTCTTCAAGGTATGGCACTTTACCGTAACGTCCGGGTTGGAAAGCGAAATTTACCGTACAGGCCTGGTTCAGCATCAGCTTCCCGCCAGCTGTTTGATGGCGTCCCTTCAGAGGTAGTTTTGATATACGTCCATTTTGTCGATAAAGTCCTTGAACAGGCAAGTGGTTCCTTTGAAACTTCAGACCAAGAGGATAATATTCAACATTGTACAGGTCGTAGTGTCTTTCATCAGTTCCAGATCAAACATTAGAACAGCCACAAGGTCAGCAAATTTCACCGCCGGAAGATTTCAGGTAATTCGTTACGCTTGTAAAGAAATTGGGACGCTTGATCTGGGATGGTCGGGATCTAGCCTCCTTCAGTTCAAATTCACTTAACGAATGTGCTGAATGTGCCAGAGGGTAATCCTCGTCCAAAGACGGTATTTACCCTCGAAACGATCGGCTCGCCGTACGGCAACGCAGGCGAGAGTACATTCTGCAGCTGGGTACAGGGAATTAAACATCAAAAACCATACCTTCGCCGATCTCTTTTTCCTTTAAAACGTCATTCAGCCACGGTGAAGACCAACCACGGTAAGGCATAACGAACTTCCTTGTCCAGACCCAGGCTCAGGGTCCGGGAAAACCTTTTTAAACTTCTTGTGGTGATAATGTCTTTGAAGCCTTTCAGAAGTCAGAACCTGCCGTCATACGATCCAAGACCACTGCTTGAACCTGGATCAGCAGACTTCTGCGATAATCTGAATATCGTTTTTAATGTAGGCAATATTTCTTCCGGGTGATTTTGTAACCGACCGGGCGCTCATTATGATAAGTCGATGTCGCCTTTCAGTAGCCGTCAGTTAAAAGTCTTTCGCAATTTTCTTAACGCGGAAAGGCAAGTTTCTAGAGAGTCGTAAGATCACGTGTGGAATCTTGCGTTTACCCTTGTAACCCAGGCAATACAATCATATAACCACTGGCCCAATGCGAGAAATGATGGTGTTGTAGGTATTTTGGCAGACCATCTGCGGACCGCATTTAAAACGTTACGTTCCAGCCAGTTGATGATGAAATGCGCCATCAAAATTTCAGATTTGTGGAAGTACAGGTCCCAGCCTGAAAACTTTGCAGAAACCCAAGCCATAAAATTCATCCAGGGAGTTACCAATCTTATACTCGGAGTGGTCTTCGATGTTCATTGTAACCGTATGCCCATACGCGGCAATCTTCAAGCCTTGGTGTAGTCTCAAAGTCGCAGGAATACATTTACGTGGCATGTGTTTCATATGTCATTCCTTCTTAAAGTTAAACAAAATTATTTCTAGAGGGGAATTGTTTATCCGCCTCACATTTCCCTATAGTGAGTCGTATTAATTTCGCGGGATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:4072\tqs:i:2067\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+
+Header-Only:
+
+  $ $BAM2SAM --header-only < $DATADIR/phi29.bam | head -n 5
+  @HD\tVN:3.0.0\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\tPU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0 (esc)
+  @PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0 (esc)
+  @PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2 (esc)
+
+No-Header:
+
+  $ $BAM2SAM --no-header < $DATADIR/phi29.bam | head -n 5
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/2067_4072\t4\t*\t0\t255\t*\t*\t0\t0\tAAGTCATGTATAGAGTTATTGGCTCAGCGGTGGCAAGCAGCCAACTCAGCTCCTTTCGGGCTTGTTAGCAGCCGGATCCACACTCTGAAATTCCTGCAGCTCGAGTTATTTGATAGTAAAAGTGGGTCATCAAACCGCAACTACGCCACCCCGGTACCTGAACAGGCTTCGGTTTCATTTTGAGACGAGAAAAACCCACTTTGAAGTTTTCGAAAATCACTTCCTTTTTTGATTTTGTCCGTCATGCCTGCGCATTTCACAAGAGAACTTGATGTCAGTGTTAGTCGTCAGGAGAGCCTCTACCAGTTTGCCGTCTACTTCTTTCATGTAAATATCCTGGATGTAAGGTTTTCTGACGCAGATATTGTTGGCAACGCTTAAAAAGTGGGATTCGTGTGGCCAGTAGCCCAGGTTATCTTTCGGGTCCACGATGTCTTTGATAACGTCCAGAATTTCAGTACCAGTTCAGGTGAATAGAACGGTATCGCAGTTTCAGATAATACGATCATGACAGCCTGCGCTGGCTGTGATGGTCGTATAGCGTGCCCAGGCCGTGATAAAGACACCCATCCGGGGTATATACAGGGTCTTTCGTTTCCTCCCTAGCCTAAGAAAGAAAACCCAGAGCACCGTTCTCTTCAAGGTATGGCACTTTACCGTAACGTCCGGGTTGGAAAGCGAAATTTACCGTACAGGCCTGGTTCAGCATCAGCTTCCCGCCAGCTGTTTGATGGCGTCCCTTCAGAGGTAGTTTTGATATACGTCCATTTTGTCGATAAAGTCCTTGAACAGGCAAGTGGTTCCTTTGAAACTTCAGACCAAGAGGATAATATTCAACATTGTACAGGTCGTAGTGTCTTTCATCAGTTCCAGATCAAACATTAGAACAGCCACAAGGTCAGCAAATTTCACCGCCGGAAGATTTCAGGTAATTCGTTACGCTTGTAAAGAAATTGGGACGCTTGATCTGGGATGGTCGGGATCTAGCCTCCTTCAGTTCAAATTCACTTAACGAATGTGCTGAATGTGCCAGAGGGTAATCCTCGTCCAAAGACGGTATTTACCCTCGAAACGATCGGCTCGCCGTACGGCAACGCAGGCGAGAGTACATTCTGCAGCTGGGTACAGGGAATTAAACATCAAAAACCATACCTTCGCCGATCTCTTTTTCCTTTAAAACGTCATTCAGCCACGGTGAAGACCAACCACGGTAAGGCATAACGAACTTCCTTGTCCAGACCCAGGCTCAGGGTCCGGGAAAACCTTTTTAAACTTCTTGTGGTGATAATGTCTTTGAAGCCTTTCAGAAGTCAGAACCTGCCGTCATACGATCCAAGACCACTGCTTGAACCTGGATCAGCAGACTTCTGCGATAATCTGAATATCGTTTTTAATGTAGGCAATATTTCTTCCGGGTGATTTTGTAACCGACCGGGCGCTCATTATGATAAGTCGATGTCGCCTTTCAGTAGCCGTCAGTTAAAAGTCTTTCGCAATTTTCTTAACGCGGAAAGGCAAGTTTCTAGAGAGTCGTAAGATCACGTGTGGAATCTTGCGTTTACCCTTGTAACCCAGGCAATACAATCATATAACCACTGGCCCAATGCGAGAAATGATGGTGTTGTAGGTATTTTGGCAGACCATCTGCGGACCGCATTTAAAACGTTACGTTCCAGCCAGTTGATGATGAAATGCGCCATCAAAATTTCAGATTTGTGGAAGTACAGGTCCCAGCCTGAAAACTTTGCAGAAACCCAAGCCATAAAATTCATCCAGGGAGTTACCAATCTTATACTCGGAGTGGTCTTCGATGTTCATTGTAACCGTATGCCCATACGCGGCAATCTTCAAGCCTTGGTGTAGTCTCAAAGTCGCAGGAATACATTTACGTGGCATGTGTTTCATATGTCATTCCTTCTTAAAGTTAAACAAAATTATTTCTAGAGGGGAATTGTTTATCCGCCTCACATTTCCCTATAGTGAGTCGTATTAATTTCGCGGGATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:4072\tqs:i:2067\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/4151_6183\t4\t*\t0\t255\t*\t*\t0\t0\tGATCCCGCGAATTAATTACGACTCACTATAGGGGAATTGTGAGCGGATAACAATTCCCGCCTCTAGAAATAATTTTGTTTAAACTTTTAAGAAAGGAGATATTACATATGAAACACAGCCACGTAAAATGTATTCCTGCGACTTGGAGACTACCACCAAGGTGAAGATTTGCCGCGTAATGGGCATACGGTTTACATGAAACATCGAAGAACAAACTCGAGTATAAGATTGGTAACTCCCCTGGATGAATTATGGCTTGGGTTACTGAAAGTTCGAGGTCTGACCTGTACTTCGCACAAATCTGAAAATTTGATGGCCGCAAATTTCAATTCATCACTGGCTGGAACGTAAACGGTTTTAAATGGTCCGCAGATCGGTCTGTGCCAAATACCCTGATCAACACATCATTTCTTCGCAATGCGGCCAGTGTAATGATTGATATCTTGCCCTGGGTTGACAAGGGGTAAACGCAAGATCCACACCTGTGATCTACGACCTCTCTGAAGAAAACTGCGTTTCCGGTTAAGAAAATTGCGAAAGACTTTAAGCTGAACGGTACTGAAAAGCGACATGACTATCATAATGAGCGCCCGGTCGTTACAAAATCACCCCGGAAAGAATATGCCTACATTTAAAAACGATATTCAGATTATCGCAAGAACTCTGCTGATCAGTTCAAGCAAGGGTCTGGATCGTAAATGACGGCAGGTTCTGACTCTCCTGAAAGGCTTCAAAAGACATTATCACCACCTAAAAGAAGTTTAAAAAGGTTTTTTCACCGACCCTGAGCCTAGGGCTGGACAAGGAAAGTTGTTAATGCCCATACCGTGGTGGTTTCACCTGGCTGAAAGACCGTTTTAAAGAAAAAGAGATCGGCGAAGGTATGGTTTTTGATGTTAATTCCCTGTAACCAAGCCTTCAATGTACTCTCGCCTGCTTGCCGTCACACGGGCGAGCGACGTATTCGAAAGGGTAAAATACGTTCTGGGACGGAGGATTTACCCTCTGCAATTCGGCACATTCCGTTGTGAATTTGGAACTGAAAGGAAGGCTTAGATCCCGACCATCCCAGATCAAGCGTTCCCATTTCTAACAAAGGGTAACGAATACCTGAAATCTTCCAGGCGGTGAAATTGCTGACCTGTGGCTGTCTAAATGTTTGATCTTGGAAACTGATGAAAGAGCACTACGACCTGGTACAATGTTGAATATATCTCTGGTCTGAAGTTCAAAGCAACCACTGGCCTGTTCAAGGACTTTATCGACAATGGACGTATATCAAAAGACTACCTCTGAAGACGCCATCAAACAGCTGGCGAAGCTGATGCTGACAAGCCTGTACGGTAAATTCGCGTCCCACCCGGACGTTTACCGGGTAAAGTGCCATATGCTGAAAGAGAAAGCGGTGCTCTGGTTTTTCGTCTAGGTGGAAGGAGGAAACGAAGACACTGTATATACCGCCGAATGGGTGTCTTTATCCAAGCGGCCTGGCACGCTATACGACCATCACAGGCAAGCGCAGGCTTTTGTTAATGATCGTATTATCTACTGCGATTACCGATTCTACTTCACTGACTGGTACTGAAATCTGGACGTTATCAAAGACATCGTAGACCCGAAGAAACTGGGCTACTGGCACACGAATTCCACTTTAAGCGTGCAAAATATCTGCGTCAGAAAACCTACATCCCAGGATATTTACATGAAAGAAGTAGACGGCAAACTGGTAGAGGGCTCTCCGTGACGACTACACTGACATCAAGTTCTCTGTGAAATGCGCAGGCAAATGACGGCACAAAATCCAAAAAAGGAAGTGACTTTCGAAAACTTCAAAAGTGGGTTCTCGTAAAATGAAACCGAAAGCTGTTCAGGTTTAAACCCGGGTGGCGTAGTGCCTGGTTGATGAACACTTTTTACTATCAAAATAACTTCGAAAGCTGCAGGAATTCAAGCTGATCCGGCTGCTAACAAAGCCCGAAGGAAGCTGAGTTGGCTGCTGCCACCGTGAGCAATACTCTAAATACATGACTCT\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:6183\tqs:i:4151\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/6234_8214\t4\t*\t0\t255\t*\t*\t0\t0\tAGAGTCATGTATAAGAGTTATTGCTCAGCGGTGGCAGCAGACAACTCAGCTTCCTTTCGGGCCTTTGTTAGCAGCCGGATCCAAGCTTGAATTCCTGCAAGCTCGAGTTATTTGATAGTAAAAGTGTCATCAAACCAGCACTACGGCCGAACCCGGTACCTGAACAGATTCGTTTCATTTTACGAGAAAAACCCACTTTGAAGTTTTGCCGAAAGTCACTTCTTTTTGATTTGTCCGTCATGCTGCGCATTTCACAGAGACTTGAATGTCAGTGTAGTCGTCATCGGGGGGGGGAAGAGCCCTCTACCAGTTTTGCCGTCTACTTCTTTCATGTAAATATCTGGATGTAGGTTTTCTGAACGCAGATATTTGCAGCTTAAAAGTGGATTCGTGTGCCCAGTAGCCCGTTTTCTTCGGGTCCTACGATGTCTTTTGATAACGTCCAGAATTTCAGTACCAGTCAGGTGAATAGAATCGGTATCGCAGTAGATAAATACGATCATAACAAGCCTGCGCTGCCTGTTGATGGTCGTATAGCGTGCCCAGGCCCGTGATAAAGAACCATCGGGGTATATAACAGGGTCTTTCGTTCCTCCTCACCTAGACGAAAAACCCAGAGCACCGTTCTCTTTCAAGGTATGGCCTTTACCGGTAACGTCCGGGTTGGACGCGAATTTAAGCCGTAACAGGCTGTCTCAGCATACAGCTTTCGCCAGCCTGTTTGATGGCCGTCTTCAGAGGTAGTTTTGATATACGTTCCATTTGTCGATAAAGTCCCTTGAGCAGGCCCAGTGGGTTGGCTTTGAACTCAGACGCAGAATATATTCAACATTGTAACAGGTCGTAGTGCTCTTTCATCAGTTCAGATTCAACATTAGACAAGCCACAGGTCAGCATTTCACCGCCGGGAAGAATTTCAAGGTATTCGTTTACCCTTGGTAGAAATGGAACGCTTGTAATCTGGATGGTCGGGATCTAGCCTTTTCAGTTCAAAATTCACACGAATGTGCTGAATGTGCAGAGGGTAATACCTCGTCCCAGACGTATTTACCCTCGAATAAGCGAATCGGCTCGCCGTATCGCAGCAGGCGAGAGTAAACATTTGAGCTGGGTAACAGGGAATTACATCCAAAACCATACCCTTTCGCACGATCTCTTTTTCTTTAAAACGTCATTCAGCCAGGTGAAACCACCAGGTAGGCATAACGAACTTCCTGTCCAGACCCAAGGCTCAGGTCGGGAAAACTTTTTAAACTTCCTTGTGGTGATAATGTCTTTTGAAGCTTTCAGAGAGTCAGAACCTGCGTCATACGATCCAGACCGCTGCTTGAAGCTGGATCAGCAAGCTTCTGCGATAATCTGAATATCGTTTATTAATTAGGCATATTCTTCCGGGTGATTTTGTAACCGACCGGGCGCTCATTATGATAGTCGATGTCGCCCCTTTCAAGTACCGTCAGCTAAAGTCTTTCGCATTTTCTTACCGGAAACGGCAGTTTCTTCAGAGAGTCGTAAGATCACGTGTGGATCTTGCGTTTACCGCTTGTAACCCAGGCAAATATCAATCATATACCACTGGCCATGCGAGAAATGATGGTGTTGTAGGTTATTGGCAGACCATCTGCGGACCATTTTAAAACCGTTACGTTCAGCCAGTTGATGAATGAATGCGCCATGCAAATTTCAGATTGTGGAAGTACAAGGTCAGCCTGACTTTCAGAACCCAAAGCCATAAATTCATCCAGGGAGTTACCATCTTATACTCCGGAGTTGGTCTTCGATGTTCATGTAACCGTATGCCCATACGCGGCAAATCTTCAACCTTGGTGTGTAGTCCTCAAAGTCGCAGGAATACATTTACGTGGCATGTGTTTTCATAATGTATATCTCCTTCTTAAAGTTAAACAAAATTATTTCTAGAGGGGAATTGTTATCCCGCTCACAATCCCCTATCAGTGAGTCGTATTAATTTCGCGGATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:8214\tqs:i:6234\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/8294_10277\t4\t*\t0\t255\t*\t*\t0\t0\tGATTCCCGCGAAATTAATACGAATCACTATAAGGGGAATTGTGAGCGGATAACAATTCCCCTCTAGAAATAATTTTGTTTAACTTTAAGAGGGACGATATACATATGAACACATGCCTACGTAAAATGTATTCCTGCGAACTGTTGAGACTACCACCAAGGTTGAAGATTTGCCGCGTAATGGGCATACGGTTACATGAACATCGAAGACCACTCCGATATGAAGATTGGTTAACCCCTGGATGAATTTATGGCTTGGGTTCTGAAAGTTCAGGCTGACCTGTACTTCACAATCTGAAATTTGATGGCCGCATTCATCAATCACTGGCTGGAACGTAAAACGGTTTAAAAATGGTCCCGCAGATGGTCTGACAAATTAACTACAACACCATCATTTCTCGCATGGGCCCAGTGGTATATGAAATTGATATTTGCCTGGGTTACAAGGAGGTAAACGCAAGATCCACACGTGGATCTACGACTCTTCTGAAGAAACCTGGCCGTTTCCGTTAAGAAAATGCGAAAGAACTTAAGCTGACGGTAACTGAAAGGCGACATCGACTATCATATAATGAAGCGCCCGTCGTTACAAAATCACCCCGGAAGAATATGCCTTACATTAAAAAACGATATTCAGATTTCGCAGAAGCTCTGCTGATCCAGTTCAAAGCAGGGTCCTGGATCGTAATGACGGCAGGTTCTGACTCTCTGAAAGGCTTCAAAGAACATTATCACCCACCAAGAAGTTTAAAAAGGTTTTCCCGACAACTGAGCCTGGGTCTGGACAAGGAAGTTTCGTTTGCCTACCGTGGTGGTTTTCAACCTGCTGACTGAACCGTTTTAAAAGAAAATAGAGATCGGCGGAAAGGTATGGTTTTTGATGTTAATTCCTGTAACCAGCCTCAAAATGTACTCTCGCCTGCTGCCGTACGGCGGCCGATCGTATTCGAAGGGTAAATACGTCTGGGACCGAGGATAGCCCTCTGCACATTCAGCACATTCGTTGTGAAATTTGAACTGAAGGAAGCTGATCCCGACGCATCCAGATCAAGCGTTCCCATTTTCTACAAGGTAACGAATACCTGAAATCTTCCCGGCGGTGAAATTGCTGCCTGTGGCTGTCTAATGTTGATCTGGAAACTGATGAAAGAGCACTACGAGACCTGTACAATGTTGAATATATCTCTGGTCTGAAGTTCAAAGCAACCACTGGCCTGTTCAAGGACTTTATCGACAAATGGCGTATTATCAAAACTACCTCTGAAGACGCCATCAAACAGCTGGCGAAGCTGATGCTGACAGCCTGTACGGTAAATTCGCGTCGCAACCCGGACGTTTCCGTAAAGTGCCCATACCTGAAAGAGAAACGGTGCTCTGGGTTTTCGTCTAGGTGAGGAGGAAACGAAAGACCCTGTAATATACCCGATGGTGTCTTTTATCACGGCCTGGGCACGCTAGTACGACCAATCACAGCAGCGCAGGCTTGTTATGATCGTATTTCTACTGCGGATACCGATTCTATTCCACCTGACTGGTACTGAAATTCTGGAACGTTATCAAAGACATCGTAGACCCGAAGAAACTGGGCTACTGGGGCACCACGAATCCACTTTTAAGCGTGGCAAAATATCTGACGTCAGAAAACCTACATCCAGGATATTTACATGAAAGAAGTAGACGGCAACTGTAGAGGGCTCTTCCTGACGAACCTACACTGACATCAAGTTCTCTGTGAAATGCGCAGGCATGACGGACCAAAATCAAAAAGGAAGTGAACTTTTCGAAAACTTCAAAGTGGGTTTTCTCGTAAAATGAAACCGAAGCCTGTCAGGTACCGGGTGGCGTAGTGCTGGTTGATCGGACACTTTACTATCAATAACTCGAGCTGCAGAATTCCAAGCTTGGATTCCGGCTGCTAACAAAGCCCGAAAGGAAGCTGAGTTGGCTGCTGCACCGCTGAGCAATAACTCTATACATGACTCAT\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:10277\tqs:i:8294\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/10327_12283\t4\t*\t0\t255\t*\t*\t0\t0\tAGAGTCATGTATAGAGTTATTGCTCAGCGGTGGCAGCACCAACTCAGCTTCCTTTCGGCTTTGTTAGCAGCCGATCCAAGCTTGAATTCCTGCAGCTCGGAGTTATTTGATAGTAAAAGTTGTCATCCAAACGCAGCACTACGCCCACCCGTACCTGAACAGGCTTTCGGTTTCATTTTACGAGAAAAACACTTTTGAAAGTTTTCGAAAGTCACTTCCTTTTTTGATTTTGTCCGTCATGCCTGCGCATTTCACAGAGAACTTGATGTCAGTGTAGTCGTCAGGAGAGCCCTCTACCAGTTTGCCGTCTACTTCTTTCATGTAAATATCCTGGAATGTAGGTTTTTCTGACGCAGATTATTTTGCACGCTTAAAAGTGGATTCGTGTGGCCCCAGTAGCCCAGTTTCTTCGGTCTACGATGTCTTTGATACGTCCAGAATTTCAGTAAACAGTCAGGTGAATAGAAATCCGGTATCGCAGTAGAATAATACGATCATAACAACCTGCGCTGCTGTGTGGTCGTATAGCGTGCCCAGGCCGTGATAACAGACACCTCGGGGTAATATACAGGGTCTTTCCGTTCCTCCTCAACCTAGACGAAACCCAGAGCACCGTTCTCTTTTCAGGTATGGCACTTTAACCGGTACGTCCGGGTTGGACGCGAATTTACCGTAGCAGGCTGTTCAGCATCAGCTTTCGCCAGCCTGTTTGATGGCGCTCTTCAGAGGTAGTTTGAATATACGTCCATTTGTCGAATAAAGTCCTTGGAACAGGCCCAGTGGTTGCTTTGAACTTCCAGACCAGAGATATATTTCAACATTGTACAGGTCGTAGTGCTCTTTCCACTCAGTTCCAGATCAACATTAAGACAGCCACAGGTCAGATTTCCCCGCCGGAAGATTCAGGTAATTCTAGTTACCCTTGTAGAAATGGCGACGCTTGATCTGGATGGTCGGGATCCTAGCTTCCCTTCAGTTCAAATTCACAACGAATGTTGCTGAATCTGTGCAGAGGGTAATCCTCGGTCCAGACGTATTTACCCTCGAATACGATGCTCGCCGTACGGCAGCAGCGAGAGTACATTTGAGCTGGTACAGGGAATTAACATCAAAAAACATACTTCGCCGATCTCTTTTTCTTTAAAACGGTCATTCAGCCAGGTGAAACCACCACGGTAGGCATAACGAAACTTCCTGTCCAGACCCAGGCTCAGGTCGGAAAACTTGTTAAACTTCTTGGTGGTGATAATGTCTTTGAAAGCCTTTCAGGAAGTCAGAACCATGCCGTCATCCGATCCAGACCCCTGCTTTGAACTGGAATCAGCAGAGGCTCTGCGATAATCGAATATCGTTTTTAAATGTAGGCATATTTTCTTCGGGGTGATTTGTAACGCGACCGGGCGCTCATTATGATAGTCGATGTCGCCTTTCAGTACCGTCAGCTTAAAGTCTTTCGCAATTTTCTTAACCGACGGCAGTTTCTTCAGAGAGGTCGTAGATCACGGTGTGGATCTTGCGTTTACCCTTGTAACCAGGCAAATATCAATCATATACCACTGGCCCATGCGAGAATGATGGTGTTGTAGGTATTTGGCAGACGCATCTGCGGACCATTTAAACCGTTACGTTCCAGCCAGTTGATGATGAATGCGCCCATCATTTCAGATTTGTGGAAGGTACAGGTCAGCCTGAACTTGTCAGAAACCCAAGCCATAAATTCATCCAGGGAGTACATCTTATAATCTCGAAGTGGTCTTCGATGTTCATGTAACCGTATGCCCATACGCGCAATCTTCACCTTGGTGGTAGTCTGCAGTCGCAGAATAATTTTACGTGGCATGTGTTTCATATGTTATTAGTCTCCTTCTTAAAGTTAAACAAAATTATTTTTAGAAGGGGAATTGTTATCCGCTCACAATTCCCCTATAGTGGAGTCGTATTAATTTCGCGGGTATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:12283\tqs:i:10327\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+
+Invalid-Args:
+
+  $ $BAM2SAM --header-only --no-header < $DATADIR/phi29.bam 
+  
+  ERROR: conflicting arguments requested: --no-header and --header-only
+  
+  Usage: bam2sam [options] [input]
+  
+  bam2sam converts a BAM file to SAM. It is essentially a stripped-down 'samtools
+  view', mostly useful for testing/debugging without requiring samtools. Input BAM
+  file is read from a file or stdin, and SAM output is written to stdout.
+  
+  Options:
+    -h, --help            show this help message and exit
+    --version             show program's version number and exit
+  
+    Options:
+      input               Input BAM file. If not provided, stdin will be used as input.
+      --no-header         Omit header from output.
+      --header-only       Print only the header (no records).
+  [1]
+
diff --git a/tests/src/cram/pbindexdump_cpp.t.in b/tests/src/cram/pbindexdump_cpp.t.in

new file mode 100644 (file)

index 0000000..18a210c
--- /dev/null
+++ b/tests/src/cram/pbindexdump_cpp.t.in
@@ -0,0 +1,39 @@
+Setup:
+
+  $ PBINDEXDUMP="@PacBioBAM_BinDir@/pbindexdump" && export PBINDEXDUMP
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+
+Normal C++:
+
+  $ $PBINDEXDUMP --format=cpp $DATADIR/polymerase/production_hq.hqregion.bam.pbi
+  PbiRawData rawData;
+  rawData.Version(PbiFile::Version_3_0_1);
+  rawData.FileSections(PbiFile::BASIC);
+  rawData.NumReads(1);
+  
+  PbiRawBasicData& basicData = rawData.BasicData();
+  basicData.rgId_       = {-898246524};
+  basicData.qStart_     = {2659};
+  basicData.qEnd_       = {7034};
+  basicData.holeNumber_ = {0};
+  basicData.readQual_   = {0.01};
+  basicData.ctxtFlag_   = {0};
+  basicData.fileOffset_ = {20054016};
+  
+  
+--(leave the blank lines above this)--
+
+Request C++, with JSON options (stdout includes usage/help, so we just want to check stderr):
+
+  $ $PBINDEXDUMP --format=cpp --json-indent-level=2 $DATADIR/polymerase/production_hq.hqregion.bam.pbi > /dev/null
+  
+  ERROR: JSON formatting options not valid on non-JSON output
+  
+  [1]
+
+  $ $PBINDEXDUMP --format=cpp --json-raw $DATADIR/polymerase/production_hq.hqregion.bam.pbi > /dev/null
+  
+  ERROR: JSON formatting options not valid on non-JSON output
+  
+  [1]
diff --git a/tests/src/cram/pbindexdump_json.t.in b/tests/src/cram/pbindexdump_json.t.in

new file mode 100644 (file)

index 0000000..0c1cbcd
--- /dev/null
+++ b/tests/src/cram/pbindexdump_json.t.in
@@ -0,0 +1,83 @@
+Setup:
+
+  $ PBINDEXDUMP="@PacBioBAM_BinDir@/pbindexdump" && export PBINDEXDUMP
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+
+Default settings (JSON):
+
+  $ $PBINDEXDUMP $DATADIR/polymerase/production_hq.hqregion.bam.pbi
+  {
+      "fileSections": [
+          "BasicData"
+      ],
+      "numReads": 1,
+      "reads": [
+          {
+              "contextFlag": 0,
+              "fileOffset": 20054016,
+              "holeNumber": 0,
+              "qEnd": 7034,
+              "qStart": 2659,
+              "readQuality": 0.00999999977648258,
+              "rgId": -898246524
+          }
+      ],
+      "version": "3.0.1"
+  }
+
+JSON indent level(2):
+
+  $ $PBINDEXDUMP --json-indent-level=2 $DATADIR/polymerase/production_hq.hqregion.bam.pbi
+  {
+    "fileSections": [
+      "BasicData"
+    ],
+    "numReads": 1,
+    "reads": [
+      {
+        "contextFlag": 0,
+        "fileOffset": 20054016,
+        "holeNumber": 0,
+        "qEnd": 7034,
+        "qStart": 2659,
+        "readQuality": 0.00999999977648258,
+        "rgId": -898246524
+      }
+    ],
+    "version": "3.0.1"
+  }
+
+JSON raw:
+
+  $ $PBINDEXDUMP --json-raw $DATADIR/polymerase/production_hq.hqregion.bam.pbi
+  {
+      "basicData": {
+          "ctxtFlag": [
+              0
+          ],
+          "fileOffset": [
+              20054016
+          ],
+          "holeNumber": [
+              0
+          ],
+          "qEnd": [
+              7034
+          ],
+          "qStart": [
+              2659
+          ],
+          "readQual": [
+              0.00999999977648258
+          ],
+          "rgId": [
+              -898246524
+          ]
+      },
+      "fileSections": [
+          "BasicData"
+      ],
+      "numReads": 1,
+      "version": "3.0.1"
+  }
diff --git a/tests/src/cram/pbmerge_aligned_ordering.t.in b/tests/src/cram/pbmerge_aligned_ordering.t.in

new file mode 100644 (file)

index 0000000..58171bb
--- /dev/null
+++ b/tests/src/cram/pbmerge_aligned_ordering.t.in
@@ -0,0 +1,197 @@
+Setup:
+
+  $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN
+  $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE
+  $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+  $ INPUT_1="$DATADIR/dataset/bam_mapping_1.bam" && export INPUT_1
+  $ INPUT_2="$DATADIR/dataset/bam_mapping_2.bam" && export INPUT_2
+
+  $ MERGED_BAM="@GeneratedTestDataDir@/aligned_ordering_merged.bam" && export MERGED_BAM
+  $ MERGED_BAM_PBI="@GeneratedTestDataDir@/aligned_ordering_merged.bam.pbi" && export MERGED_BAM_PBI
+
+Sanity Check:
+
+  $ $BAM2SAM --header-only $INPUT_1
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tDS:READTYPE=SUBREAD;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;InsertionQV=iq;DeletionQV=dq;SubstitutionQV=sq;MergeQV=mq;SubstitutionTag=st;DeletionTag=dt\tPL:PACBIO\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377 (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+
+  $ $BAM2SAM --no-header $INPUT_1 | cut -f 1,3,4 | head -n 10
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+
+  $ $BAM2SAM --header-only $INPUT_2
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tDS:READTYPE=SUBREAD;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;InsertionQV=iq;DeletionQV=dq;SubstitutionQV=sq;MergeQV=mq;SubstitutionTag=st;DeletionTag=dt\tPL:PACBIO\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377 (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+
+  $ $BAM2SAM --no-header $INPUT_2 | cut -f 1,3,4 | head -n 10
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc)
+
+Normal Merge:
+
+  $ $PBMERGE $INPUT_1 $INPUT_2 > $MERGED_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc)
+
+  $ rm $MERGED_BAM
+
+Shuffle Input:
+
+  $ $PBMERGE $INPUT_2 $INPUT_2 > $MERGED_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7046_7293\tlambda_NEB3011\t5136 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/38025/6255_7894\tlambda_NEB3011\t5427 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5311_5508\tlambda_NEB3011\t5943 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/36363/899_1197\tlambda_NEB3011\t6258 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/36363/605_853\tlambda_NEB3011\t6312 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/31174/0_1029\tlambda_NEB3011\t6487 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/31174/1075_1271\tlambda_NEB3011\t6499 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/38025/5743_6211\tlambda_NEB3011\t6606 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/50257/6944_7361\tlambda_NEB3011\t6942 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/50257/6546_6903\tlambda_NEB3011\t7010 (esc)
+
+  $ rm $MERGED_BAM
+
+Explicit Output Filename (also enables PBI):
+
+  $ $PBMERGE -o $MERGED_BAM $INPUT_1 $INPUT_2
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc)
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Found
+
+  $ rm $MERGED_BAM
+  $ rm $MERGED_BAM_PBI
+
+Explicit Output Filename (with disabled PBI):
+
+  $ $PBMERGE -o $MERGED_BAM --no-pbi $INPUT_1 $INPUT_2
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc)
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Not found
+
+  $ rm $MERGED_BAM
diff --git a/tests/src/cram/pbmerge_dataset.t.in b/tests/src/cram/pbmerge_dataset.t.in

new file mode 100644 (file)

index 0000000..1c7cb7a
--- /dev/null
+++ b/tests/src/cram/pbmerge_dataset.t.in
@@ -0,0 +1,144 @@
+Setup:
+
+  $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN
+  $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE
+  $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+  $ INPUT_XML="$DATADIR/polymerase/consolidate.subread.dataset.xml" && export INPUT_XML
+  $ BAM_1="$DATADIR/polymerase/production.subreads.bam" && export BAM_1
+  $ BAM_2="$DATADIR/polymerase/production.scraps.bam" && export BAM_2
+
+  $ MERGED_BAM="@GeneratedTestDataDir@/merged.bam" && export MERGED_BAM
+  $ MERGED_BAM_PBI="@GeneratedTestDataDir@/merged.bam.pbi" && export MERGED_BAM_PBI
+
+Sanity Check:
+
+  $ $BAM2SAM --no-header $BAM_1 | cut -f 1
+  ArminsFakeMovie/0/2659_3025
+  ArminsFakeMovie/0/3116_3628
+  ArminsFakeMovie/0/3722_4267
+  ArminsFakeMovie/0/4356_4864
+  ArminsFakeMovie/0/4960_5477
+  ArminsFakeMovie/0/5571_6087
+  ArminsFakeMovie/0/6199_6719
+  ArminsFakeMovie/0/6812_7034
+
+  $ $BAM2SAM --no-header $BAM_2  | cut -f 1
+  ArminsFakeMovie/0/0_2659
+  ArminsFakeMovie/0/3025_3047
+  ArminsFakeMovie/0/3047_3095
+  ArminsFakeMovie/0/3095_3116
+  ArminsFakeMovie/0/3628_3650
+  ArminsFakeMovie/0/3650_3700
+  ArminsFakeMovie/0/3700_3722
+  ArminsFakeMovie/0/4267_4289
+  ArminsFakeMovie/0/4289_4335
+  ArminsFakeMovie/0/4335_4356
+  ArminsFakeMovie/0/4864_4888
+  ArminsFakeMovie/0/4888_4939
+  ArminsFakeMovie/0/4939_4960
+  ArminsFakeMovie/0/5477_5498
+  ArminsFakeMovie/0/5498_5546
+  ArminsFakeMovie/0/5546_5571
+  ArminsFakeMovie/0/6087_6116
+  ArminsFakeMovie/0/6116_6173
+  ArminsFakeMovie/0/6173_6199
+  ArminsFakeMovie/0/6719_6740
+  ArminsFakeMovie/0/6740_6790
+  ArminsFakeMovie/0/6790_6812
+  ArminsFakeMovie/0/7034_7035
+
+Normal Merge from XML:
+
+  $ $PBMERGE -o $MERGED_BAM $INPUT_XML
+
+  $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found"
+  Found
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Found
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:8aaede36\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:BAZ_FORMAT\tVN:0.3.0 (esc)
+  @PG\tID:PPA-BAZ2BAM\tVN:0.1.0 (esc)
+  @PG\tID:PPA-BAZWRITER\tVN:0.2.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/0/4267_4289
+  ArminsFakeMovie/0/4289_4335
+  ArminsFakeMovie/0/4335_4356
+  ArminsFakeMovie/0/4356_4864
+  ArminsFakeMovie/0/4864_4888
+  ArminsFakeMovie/0/4888_4939
+  ArminsFakeMovie/0/4939_4960
+  ArminsFakeMovie/0/4960_5477
+
+  $ rm $MERGED_BAM
+  $ rm $MERGED_BAM_PBI
+
+Normal Merge from XML (disabled PBI):
+
+  $ $PBMERGE --no-pbi -o $MERGED_BAM $INPUT_XML
+
+  $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found"
+  Found
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Not found
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:8aaede36\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:BAZ_FORMAT\tVN:0.3.0 (esc)
+  @PG\tID:PPA-BAZ2BAM\tVN:0.1.0 (esc)
+  @PG\tID:PPA-BAZWRITER\tVN:0.2.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/0/4267_4289
+  ArminsFakeMovie/0/4289_4335
+  ArminsFakeMovie/0/4335_4356
+  ArminsFakeMovie/0/4356_4864
+  ArminsFakeMovie/0/4864_4888
+  ArminsFakeMovie/0/4888_4939
+  ArminsFakeMovie/0/4939_4960
+  ArminsFakeMovie/0/4960_5477
+
+  $ rm $MERGED_BAM
+
+Write to stdout:
+
+  $ $PBMERGE --no-pbi $INPUT_XML > $MERGED_BAM
+
+  $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found"
+  Found
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Not found
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:8aaede36\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:BAZ_FORMAT\tVN:0.3.0 (esc)
+  @PG\tID:PPA-BAZ2BAM\tVN:0.1.0 (esc)
+  @PG\tID:PPA-BAZWRITER\tVN:0.2.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/0/4267_4289
+  ArminsFakeMovie/0/4289_4335
+  ArminsFakeMovie/0/4335_4356
+  ArminsFakeMovie/0/4356_4864
+  ArminsFakeMovie/0/4864_4888
+  ArminsFakeMovie/0/4888_4939
+  ArminsFakeMovie/0/4939_4960
+  ArminsFakeMovie/0/4960_5477
+
+  $ rm $MERGED_BAM
diff --git a/tests/src/cram/pbmerge_fofn.t.in b/tests/src/cram/pbmerge_fofn.t.in

new file mode 100644 (file)

index 0000000..34e9af6
--- /dev/null
+++ b/tests/src/cram/pbmerge_fofn.t.in
@@ -0,0 +1,120 @@
+Setup:
+
+  $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN
+  $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE
+  $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+  $ INPUT_FOFN="$DATADIR/merge.fofn" && export INPUT_FOFN
+  $ INPUT_1="$DATADIR/aligned.bam" && export INPUT_1
+  $ INPUT_2="$DATADIR/aligned2.bam" && export INPUT_2
+
+  $ MERGED_BAM="@GeneratedTestDataDir@/aligned_ordering_merged.bam" && export MERGED_BAM
+  $ MERGED_BAM_PBI="@GeneratedTestDataDir@/aligned_ordering_merged.bam.pbi" && export MERGED_BAM_PBI
+
+Sanity Check:
+
+  $ $BAM2SAM --header-only $INPUT_1
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.3 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:0d7b28fa\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\tPU:singleInsertion\tPM:SEQUEL (esc)
+  @PG\tID:bwa\tPN:bwa\tVN:0.7.10-r1017-dirty\tCL:bwa mem lambdaNEB.fa singleInsertion.fasta (esc)
+
+  $ $BAM2SAM --no-header $INPUT_1 | cut -f 1,3,4 | head -n 10
+  singleInsertion/100/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/200/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+
+  $ $BAM2SAM --header-only $INPUT_2
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.3 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:b89a4406\tDS:READTYPE=SUBREAD;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;FRAMERATEHZ=100;InsertionQV=iq;DeletionQV=dq;SubstitutionQV=sq;MergeQV=mq;SubstitutionTag=st;DeletionTag=dt\tPL:PACBIO\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+
+  $ $BAM2SAM --no-header $INPUT_2 | cut -f 1,3,4 | head -n 10
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+
+Normal Merge from FOFN:
+
+  $ $PBMERGE -o $MERGED_BAM $INPUT_FOFN
+
+  $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found"
+  Found
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Found
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.3 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:0d7b28fa\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\tPU:singleInsertion\tPM:SEQUEL (esc)
+  @RG\tID:b89a4406\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;FRAMERATEHZ=100\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:bwa\tPN:bwa\tVN:0.7.10-r1017-dirty\tCL:bwa mem lambdaNEB.fa singleInsertion.fasta (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  singleInsertion/100/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/200/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+
+  $ rm $MERGED_BAM
+  $ rm $MERGED_BAM_PBI
+
+Normal Merge from FOFN (disabled PBI):
+
+  $ $PBMERGE --no-pbi -o $MERGED_BAM $INPUT_FOFN
+
+  $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found"
+  Found
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Not found
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.3 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:0d7b28fa\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\tPU:singleInsertion\tPM:SEQUEL (esc)
+  @RG\tID:b89a4406\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;FRAMERATEHZ=100\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:bwa\tPN:bwa\tVN:0.7.10-r1017-dirty\tCL:bwa mem lambdaNEB.fa singleInsertion.fasta (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  singleInsertion/100/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/200/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+
+  $ rm $MERGED_BAM
diff --git a/tests/src/cram/pbmerge_mixed_ordering.t.in b/tests/src/cram/pbmerge_mixed_ordering.t.in

new file mode 100644 (file)

index 0000000..6f1f3f9
--- /dev/null
+++ b/tests/src/cram/pbmerge_mixed_ordering.t.in
@@ -0,0 +1,57 @@
+Setup:
+
+  $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN
+  $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE
+  $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+  $ UNALIGNED_BAM="$DATADIR/polymerase/internal.hqregions.bam" && export UNALIGNED_BAM
+  $ ALIGNED_BAM="$DATADIR/dataset/bam_mapping_1.bam" && export ALIGNED_BAM
+
+  $ MERGED_BAM="@GeneratedTestDataDir@/mixed_ordering_merged.bam" && export MERGED_BAM
+
+Sanity Check:
+
+  $ $BAM2SAM --header-only $UNALIGNED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+
+  $ $BAM2SAM --no-header $UNALIGNED_BAM | cut -f 1
+  ArminsFakeMovie/100000/2659_7034
+
+  $ $BAM2SAM --header-only $ALIGNED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tDS:READTYPE=SUBREAD;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;InsertionQV=iq;DeletionQV=dq;SubstitutionQV=sq;MergeQV=mq;SubstitutionTag=st;DeletionTag=dt\tPL:PACBIO\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377 (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+
+  $ $BAM2SAM --no-header $ALIGNED_BAM | cut -f 1,3,4 | head -n 10
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+
+Normal Merge - should fail:
+
+  $ $PBMERGE $UNALIGNED_BAM $ALIGNED_BAM > $MERGED_BAM
+  ERROR: BAM file sort orders do not match, aborting merge
+  [1]
+
+Shuffle Input - should fail:
+
+  $ $PBMERGE $ALIGNED_BAM $UNALIGNED_BAM > $MERGED_BAM
+  ERROR: BAM file sort orders do not match, aborting merge
+  [1]
+
+Cleanup:
+
+  $ rm $MERGED_BAM
diff --git a/tests/src/cram/pbmerge_pacbio_ordering.t.in b/tests/src/cram/pbmerge_pacbio_ordering.t.in

new file mode 100644 (file)

index 0000000..f52759f
--- /dev/null
+++ b/tests/src/cram/pbmerge_pacbio_ordering.t.in
@@ -0,0 +1,457 @@
+Setup:
+
+  $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN
+  $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE
+  $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+  $ HQREGION_BAM="$DATADIR/polymerase/internal.hqregions.bam" && export HQREGION_BAM
+  $ SCRAPS_BAM="$DATADIR/polymerase/internal.scraps.bam" && export SCRAPS_BAM
+
+  $ MERGED_BAM="@GeneratedTestDataDir@/pacbio_ordering_merged.bam" && export MERGED_BAM
+  $ MERGED_BAM_PBI="@GeneratedTestDataDir@/pacbio_ordering_merged.bam.pbi" && export MERGED_BAM_PBI
+
+Sanity Check:
+
+  $ $BAM2SAM --header-only $HQREGION_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+
+  $ $BAM2SAM --no-header $HQREGION_BAM | cut -f 1
+  ArminsFakeMovie/100000/2659_7034
+
+  $ $BAM2SAM --header-only $SCRAPS_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+
+  $ $BAM2SAM --no-header $SCRAPS_BAM | cut -f 1
+  ArminsFakeMovie/100000/0_2659
+  ArminsFakeMovie/100000/3025_3047
+  ArminsFakeMovie/100000/3047_3095
+  ArminsFakeMovie/100000/3095_3116
+  ArminsFakeMovie/100000/3628_3650
+  ArminsFakeMovie/100000/3650_3700
+  ArminsFakeMovie/100000/3700_3722
+  ArminsFakeMovie/100000/4267_4289
+  ArminsFakeMovie/100000/4289_4335
+  ArminsFakeMovie/100000/4335_4356
+  ArminsFakeMovie/100000/4864_4888
+  ArminsFakeMovie/100000/4888_4939
+  ArminsFakeMovie/100000/4939_4960
+  ArminsFakeMovie/100000/5477_5498
+  ArminsFakeMovie/100000/5498_5546
+  ArminsFakeMovie/100000/5546_5571
+  ArminsFakeMovie/100000/6087_6116
+  ArminsFakeMovie/100000/6116_6173
+  ArminsFakeMovie/100000/6173_6199
+  ArminsFakeMovie/100000/6719_6740
+  ArminsFakeMovie/100000/6740_6790
+  ArminsFakeMovie/100000/6790_6812
+  ArminsFakeMovie/100000/7034_7035
+  ArminsFakeMovie/200000/0_2659
+  ArminsFakeMovie/200000/3025_3047
+  ArminsFakeMovie/200000/3047_3095
+  ArminsFakeMovie/200000/3095_3116
+  ArminsFakeMovie/200000/3628_3650
+  ArminsFakeMovie/200000/3650_3700
+  ArminsFakeMovie/200000/3700_3722
+  ArminsFakeMovie/200000/4267_4289
+  ArminsFakeMovie/200000/4289_4335
+  ArminsFakeMovie/200000/4335_4356
+  ArminsFakeMovie/200000/4864_4888
+  ArminsFakeMovie/200000/4888_4939
+  ArminsFakeMovie/200000/4939_4960
+  ArminsFakeMovie/200000/5477_5498
+  ArminsFakeMovie/200000/5498_5546
+  ArminsFakeMovie/200000/5546_5571
+  ArminsFakeMovie/200000/6087_6116
+  ArminsFakeMovie/200000/6116_6173
+  ArminsFakeMovie/200000/6173_6199
+  ArminsFakeMovie/200000/6719_6740
+  ArminsFakeMovie/200000/6740_6790
+  ArminsFakeMovie/200000/6790_6812
+  ArminsFakeMovie/200000/7034_7035
+  ArminsFakeMovie/300000/0_2659
+  ArminsFakeMovie/300000/3025_3047
+  ArminsFakeMovie/300000/3047_3095
+  ArminsFakeMovie/300000/3095_3116
+  ArminsFakeMovie/300000/3628_3650
+  ArminsFakeMovie/300000/3650_3700
+  ArminsFakeMovie/300000/3700_3722
+  ArminsFakeMovie/300000/4267_4289
+  ArminsFakeMovie/300000/4289_4335
+  ArminsFakeMovie/300000/4335_4356
+  ArminsFakeMovie/300000/4864_4888
+  ArminsFakeMovie/300000/4888_4939
+  ArminsFakeMovie/300000/4939_4960
+  ArminsFakeMovie/300000/5477_5498
+  ArminsFakeMovie/300000/5498_5546
+  ArminsFakeMovie/300000/5546_5571
+  ArminsFakeMovie/300000/6087_6116
+  ArminsFakeMovie/300000/6116_6173
+  ArminsFakeMovie/300000/6173_6199
+  ArminsFakeMovie/300000/6719_6740
+  ArminsFakeMovie/300000/6740_6790
+  ArminsFakeMovie/300000/6790_6812
+  ArminsFakeMovie/300000/7034_7035
+
+Normal Merge:
+
+  $ $PBMERGE $HQREGION_BAM $SCRAPS_BAM > $MERGED_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/100000/0_2659
+  ArminsFakeMovie/100000/2659_7034
+  ArminsFakeMovie/100000/3025_3047
+  ArminsFakeMovie/100000/3047_3095
+  ArminsFakeMovie/100000/3095_3116
+  ArminsFakeMovie/100000/3628_3650
+  ArminsFakeMovie/100000/3650_3700
+  ArminsFakeMovie/100000/3700_3722
+  ArminsFakeMovie/100000/4267_4289
+  ArminsFakeMovie/100000/4289_4335
+  ArminsFakeMovie/100000/4335_4356
+  ArminsFakeMovie/100000/4864_4888
+  ArminsFakeMovie/100000/4888_4939
+  ArminsFakeMovie/100000/4939_4960
+  ArminsFakeMovie/100000/5477_5498
+  ArminsFakeMovie/100000/5498_5546
+  ArminsFakeMovie/100000/5546_5571
+  ArminsFakeMovie/100000/6087_6116
+  ArminsFakeMovie/100000/6116_6173
+  ArminsFakeMovie/100000/6173_6199
+  ArminsFakeMovie/100000/6719_6740
+  ArminsFakeMovie/100000/6740_6790
+  ArminsFakeMovie/100000/6790_6812
+  ArminsFakeMovie/100000/7034_7035
+  ArminsFakeMovie/200000/0_2659
+  ArminsFakeMovie/200000/3025_3047
+  ArminsFakeMovie/200000/3047_3095
+  ArminsFakeMovie/200000/3095_3116
+  ArminsFakeMovie/200000/3628_3650
+  ArminsFakeMovie/200000/3650_3700
+  ArminsFakeMovie/200000/3700_3722
+  ArminsFakeMovie/200000/4267_4289
+  ArminsFakeMovie/200000/4289_4335
+  ArminsFakeMovie/200000/4335_4356
+  ArminsFakeMovie/200000/4864_4888
+  ArminsFakeMovie/200000/4888_4939
+  ArminsFakeMovie/200000/4939_4960
+  ArminsFakeMovie/200000/5477_5498
+  ArminsFakeMovie/200000/5498_5546
+  ArminsFakeMovie/200000/5546_5571
+  ArminsFakeMovie/200000/6087_6116
+  ArminsFakeMovie/200000/6116_6173
+  ArminsFakeMovie/200000/6173_6199
+  ArminsFakeMovie/200000/6719_6740
+  ArminsFakeMovie/200000/6740_6790
+  ArminsFakeMovie/200000/6790_6812
+  ArminsFakeMovie/200000/7034_7035
+  ArminsFakeMovie/300000/0_2659
+  ArminsFakeMovie/300000/3025_3047
+  ArminsFakeMovie/300000/3047_3095
+  ArminsFakeMovie/300000/3095_3116
+  ArminsFakeMovie/300000/3628_3650
+  ArminsFakeMovie/300000/3650_3700
+  ArminsFakeMovie/300000/3700_3722
+  ArminsFakeMovie/300000/4267_4289
+  ArminsFakeMovie/300000/4289_4335
+  ArminsFakeMovie/300000/4335_4356
+  ArminsFakeMovie/300000/4864_4888
+  ArminsFakeMovie/300000/4888_4939
+  ArminsFakeMovie/300000/4939_4960
+  ArminsFakeMovie/300000/5477_5498
+  ArminsFakeMovie/300000/5498_5546
+  ArminsFakeMovie/300000/5546_5571
+  ArminsFakeMovie/300000/6087_6116
+  ArminsFakeMovie/300000/6116_6173
+  ArminsFakeMovie/300000/6173_6199
+  ArminsFakeMovie/300000/6719_6740
+  ArminsFakeMovie/300000/6740_6790
+  ArminsFakeMovie/300000/6790_6812
+  ArminsFakeMovie/300000/7034_7035
+
+  $ rm $MERGED_BAM
+
+Shuffle Input:
+
+  $ $PBMERGE $SCRAPS_BAM $HQREGION_BAM  > $MERGED_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/100000/0_2659
+  ArminsFakeMovie/100000/2659_7034
+  ArminsFakeMovie/100000/3025_3047
+  ArminsFakeMovie/100000/3047_3095
+  ArminsFakeMovie/100000/3095_3116
+  ArminsFakeMovie/100000/3628_3650
+  ArminsFakeMovie/100000/3650_3700
+  ArminsFakeMovie/100000/3700_3722
+  ArminsFakeMovie/100000/4267_4289
+  ArminsFakeMovie/100000/4289_4335
+  ArminsFakeMovie/100000/4335_4356
+  ArminsFakeMovie/100000/4864_4888
+  ArminsFakeMovie/100000/4888_4939
+  ArminsFakeMovie/100000/4939_4960
+  ArminsFakeMovie/100000/5477_5498
+  ArminsFakeMovie/100000/5498_5546
+  ArminsFakeMovie/100000/5546_5571
+  ArminsFakeMovie/100000/6087_6116
+  ArminsFakeMovie/100000/6116_6173
+  ArminsFakeMovie/100000/6173_6199
+  ArminsFakeMovie/100000/6719_6740
+  ArminsFakeMovie/100000/6740_6790
+  ArminsFakeMovie/100000/6790_6812
+  ArminsFakeMovie/100000/7034_7035
+  ArminsFakeMovie/200000/0_2659
+  ArminsFakeMovie/200000/3025_3047
+  ArminsFakeMovie/200000/3047_3095
+  ArminsFakeMovie/200000/3095_3116
+  ArminsFakeMovie/200000/3628_3650
+  ArminsFakeMovie/200000/3650_3700
+  ArminsFakeMovie/200000/3700_3722
+  ArminsFakeMovie/200000/4267_4289
+  ArminsFakeMovie/200000/4289_4335
+  ArminsFakeMovie/200000/4335_4356
+  ArminsFakeMovie/200000/4864_4888
+  ArminsFakeMovie/200000/4888_4939
+  ArminsFakeMovie/200000/4939_4960
+  ArminsFakeMovie/200000/5477_5498
+  ArminsFakeMovie/200000/5498_5546
+  ArminsFakeMovie/200000/5546_5571
+  ArminsFakeMovie/200000/6087_6116
+  ArminsFakeMovie/200000/6116_6173
+  ArminsFakeMovie/200000/6173_6199
+  ArminsFakeMovie/200000/6719_6740
+  ArminsFakeMovie/200000/6740_6790
+  ArminsFakeMovie/200000/6790_6812
+  ArminsFakeMovie/200000/7034_7035
+  ArminsFakeMovie/300000/0_2659
+  ArminsFakeMovie/300000/3025_3047
+  ArminsFakeMovie/300000/3047_3095
+  ArminsFakeMovie/300000/3095_3116
+  ArminsFakeMovie/300000/3628_3650
+  ArminsFakeMovie/300000/3650_3700
+  ArminsFakeMovie/300000/3700_3722
+  ArminsFakeMovie/300000/4267_4289
+  ArminsFakeMovie/300000/4289_4335
+  ArminsFakeMovie/300000/4335_4356
+  ArminsFakeMovie/300000/4864_4888
+  ArminsFakeMovie/300000/4888_4939
+  ArminsFakeMovie/300000/4939_4960
+  ArminsFakeMovie/300000/5477_5498
+  ArminsFakeMovie/300000/5498_5546
+  ArminsFakeMovie/300000/5546_5571
+  ArminsFakeMovie/300000/6087_6116
+  ArminsFakeMovie/300000/6116_6173
+  ArminsFakeMovie/300000/6173_6199
+  ArminsFakeMovie/300000/6719_6740
+  ArminsFakeMovie/300000/6740_6790
+  ArminsFakeMovie/300000/6790_6812
+  ArminsFakeMovie/300000/7034_7035
+
+  $ rm $MERGED_BAM
+
+Explicit Output Filename (also enables PBI):
+
+  $ $PBMERGE -o $MERGED_BAM $HQREGION_BAM $SCRAPS_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/100000/0_2659
+  ArminsFakeMovie/100000/2659_7034
+  ArminsFakeMovie/100000/3025_3047
+  ArminsFakeMovie/100000/3047_3095
+  ArminsFakeMovie/100000/3095_3116
+  ArminsFakeMovie/100000/3628_3650
+  ArminsFakeMovie/100000/3650_3700
+  ArminsFakeMovie/100000/3700_3722
+  ArminsFakeMovie/100000/4267_4289
+  ArminsFakeMovie/100000/4289_4335
+  ArminsFakeMovie/100000/4335_4356
+  ArminsFakeMovie/100000/4864_4888
+  ArminsFakeMovie/100000/4888_4939
+  ArminsFakeMovie/100000/4939_4960
+  ArminsFakeMovie/100000/5477_5498
+  ArminsFakeMovie/100000/5498_5546
+  ArminsFakeMovie/100000/5546_5571
+  ArminsFakeMovie/100000/6087_6116
+  ArminsFakeMovie/100000/6116_6173
+  ArminsFakeMovie/100000/6173_6199
+  ArminsFakeMovie/100000/6719_6740
+  ArminsFakeMovie/100000/6740_6790
+  ArminsFakeMovie/100000/6790_6812
+  ArminsFakeMovie/100000/7034_7035
+  ArminsFakeMovie/200000/0_2659
+  ArminsFakeMovie/200000/3025_3047
+  ArminsFakeMovie/200000/3047_3095
+  ArminsFakeMovie/200000/3095_3116
+  ArminsFakeMovie/200000/3628_3650
+  ArminsFakeMovie/200000/3650_3700
+  ArminsFakeMovie/200000/3700_3722
+  ArminsFakeMovie/200000/4267_4289
+  ArminsFakeMovie/200000/4289_4335
+  ArminsFakeMovie/200000/4335_4356
+  ArminsFakeMovie/200000/4864_4888
+  ArminsFakeMovie/200000/4888_4939
+  ArminsFakeMovie/200000/4939_4960
+  ArminsFakeMovie/200000/5477_5498
+  ArminsFakeMovie/200000/5498_5546
+  ArminsFakeMovie/200000/5546_5571
+  ArminsFakeMovie/200000/6087_6116
+  ArminsFakeMovie/200000/6116_6173
+  ArminsFakeMovie/200000/6173_6199
+  ArminsFakeMovie/200000/6719_6740
+  ArminsFakeMovie/200000/6740_6790
+  ArminsFakeMovie/200000/6790_6812
+  ArminsFakeMovie/200000/7034_7035
+  ArminsFakeMovie/300000/0_2659
+  ArminsFakeMovie/300000/3025_3047
+  ArminsFakeMovie/300000/3047_3095
+  ArminsFakeMovie/300000/3095_3116
+  ArminsFakeMovie/300000/3628_3650
+  ArminsFakeMovie/300000/3650_3700
+  ArminsFakeMovie/300000/3700_3722
+  ArminsFakeMovie/300000/4267_4289
+  ArminsFakeMovie/300000/4289_4335
+  ArminsFakeMovie/300000/4335_4356
+  ArminsFakeMovie/300000/4864_4888
+  ArminsFakeMovie/300000/4888_4939
+  ArminsFakeMovie/300000/4939_4960
+  ArminsFakeMovie/300000/5477_5498
+  ArminsFakeMovie/300000/5498_5546
+  ArminsFakeMovie/300000/5546_5571
+  ArminsFakeMovie/300000/6087_6116
+  ArminsFakeMovie/300000/6116_6173
+  ArminsFakeMovie/300000/6173_6199
+  ArminsFakeMovie/300000/6719_6740
+  ArminsFakeMovie/300000/6740_6790
+  ArminsFakeMovie/300000/6790_6812
+  ArminsFakeMovie/300000/7034_7035
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Found
+
+  $ rm $MERGED_BAM
+  $ rm $MERGED_BAM_PBI
+
+Explicit Output Filename (with disabled PBI):
+
+  $ $PBMERGE -o $MERGED_BAM --no-pbi $HQREGION_BAM $SCRAPS_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/100000/0_2659
+  ArminsFakeMovie/100000/2659_7034
+  ArminsFakeMovie/100000/3025_3047
+  ArminsFakeMovie/100000/3047_3095
+  ArminsFakeMovie/100000/3095_3116
+  ArminsFakeMovie/100000/3628_3650
+  ArminsFakeMovie/100000/3650_3700
+  ArminsFakeMovie/100000/3700_3722
+  ArminsFakeMovie/100000/4267_4289
+  ArminsFakeMovie/100000/4289_4335
+  ArminsFakeMovie/100000/4335_4356
+  ArminsFakeMovie/100000/4864_4888
+  ArminsFakeMovie/100000/4888_4939
+  ArminsFakeMovie/100000/4939_4960
+  ArminsFakeMovie/100000/5477_5498
+  ArminsFakeMovie/100000/5498_5546
+  ArminsFakeMovie/100000/5546_5571
+  ArminsFakeMovie/100000/6087_6116
+  ArminsFakeMovie/100000/6116_6173
+  ArminsFakeMovie/100000/6173_6199
+  ArminsFakeMovie/100000/6719_6740
+  ArminsFakeMovie/100000/6740_6790
+  ArminsFakeMovie/100000/6790_6812
+  ArminsFakeMovie/100000/7034_7035
+  ArminsFakeMovie/200000/0_2659
+  ArminsFakeMovie/200000/3025_3047
+  ArminsFakeMovie/200000/3047_3095
+  ArminsFakeMovie/200000/3095_3116
+  ArminsFakeMovie/200000/3628_3650
+  ArminsFakeMovie/200000/3650_3700
+  ArminsFakeMovie/200000/3700_3722
+  ArminsFakeMovie/200000/4267_4289
+  ArminsFakeMovie/200000/4289_4335
+  ArminsFakeMovie/200000/4335_4356
+  ArminsFakeMovie/200000/4864_4888
+  ArminsFakeMovie/200000/4888_4939
+  ArminsFakeMovie/200000/4939_4960
+  ArminsFakeMovie/200000/5477_5498
+  ArminsFakeMovie/200000/5498_5546
+  ArminsFakeMovie/200000/5546_5571
+  ArminsFakeMovie/200000/6087_6116
+  ArminsFakeMovie/200000/6116_6173
+  ArminsFakeMovie/200000/6173_6199
+  ArminsFakeMovie/200000/6719_6740
+  ArminsFakeMovie/200000/6740_6790
+  ArminsFakeMovie/200000/6790_6812
+  ArminsFakeMovie/200000/7034_7035
+  ArminsFakeMovie/300000/0_2659
+  ArminsFakeMovie/300000/3025_3047
+  ArminsFakeMovie/300000/3047_3095
+  ArminsFakeMovie/300000/3095_3116
+  ArminsFakeMovie/300000/3628_3650
+  ArminsFakeMovie/300000/3650_3700
+  ArminsFakeMovie/300000/3700_3722
+  ArminsFakeMovie/300000/4267_4289
+  ArminsFakeMovie/300000/4289_4335
+  ArminsFakeMovie/300000/4335_4356
+  ArminsFakeMovie/300000/4864_4888
+  ArminsFakeMovie/300000/4888_4939
+  ArminsFakeMovie/300000/4939_4960
+  ArminsFakeMovie/300000/5477_5498
+  ArminsFakeMovie/300000/5498_5546
+  ArminsFakeMovie/300000/5546_5571
+  ArminsFakeMovie/300000/6087_6116
+  ArminsFakeMovie/300000/6116_6173
+  ArminsFakeMovie/300000/6173_6199
+  ArminsFakeMovie/300000/6719_6740
+  ArminsFakeMovie/300000/6740_6790
+  ArminsFakeMovie/300000/6790_6812
+  ArminsFakeMovie/300000/7034_7035
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Not found
+
+  $ rm $MERGED_BAM
diff --git a/tests/src/python/check_swig.py b/tests/src/python/check_swig.py

new file mode 100755 (executable)

index 0000000..9a17f7e
--- /dev/null
+++ b/tests/src/python/check_swig.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+try:
+    import PacBioBam as bam
+    header = bam.BamHeader()
+    print "\nPython wrapper OK.\n"
+except ImportError:
+    print "\nPython wrapper failed!\n"
+
diff --git a/tests/src/python/test/__init__.py b/tests/src/python/test/__init__.py

new file mode 100755 (executable)

index 0000000..fb74df1
--- /dev/null
+++ b/tests/src/python/test/__init__.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+# EMPTY ON PURPOSE. 
+# This file just needs to exist for unit test discovery.
diff --git a/tests/src/python/test/config.py.in b/tests/src/python/test/config.py.in

new file mode 100644 (file)

index 0000000..c2b900e
--- /dev/null
+++ b/tests/src/python/test/config.py.in
@@ -0,0 +1,43 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+class TestData(object):
+    def __init__(self):
+        
+        # Main test data directory
+        self.directory = "@PacBioBAM_TestsDir@/data"
+        
+\ No newline at end of file
diff --git a/tests/src/python/test/test_Accuracy.py b/tests/src/python/test/test_Accuracy.py

new file mode 100755 (executable)

index 0000000..a8b2112
--- /dev/null
+++ b/tests/src/python/test/test_Accuracy.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+import PacBioBam
+import config 
+import unittest
+
+class AccuracyTest(unittest.TestCase):
+    
+    # ------------ SETUP --------------
+    
+    def runTest(self):
+        self.test_clamp()
+        
+    # ------------ TESTS --------------
+        
+    def test_clamp(self):
+        a_zero     = PacBioBam.Accuracy(0.0)
+        a_neg      = PacBioBam.Accuracy(-0.5)
+        a_min      = PacBioBam.Accuracy(0.0)
+        a_normal   = PacBioBam.Accuracy(0.9)
+        a_max      = PacBioBam.Accuracy(1.0)
+        a_tooLarge = PacBioBam.Accuracy(1.1)
+        
+        self.assertAlmostEqual(float(0.0), float(a_zero))
+        self.assertAlmostEqual(float(0.0), float(a_neg))
+        self.assertAlmostEqual(float(0.0), float(a_min))
+        self.assertAlmostEqual(float(0.9), float(a_normal))
+        self.assertAlmostEqual(float(1.0), float(a_max))
+        self.assertAlmostEqual(float(1.0), float(a_tooLarge))
+        
+\ No newline at end of file
diff --git a/tests/src/python/test/test_BamFile.py b/tests/src/python/test/test_BamFile.py

new file mode 100755 (executable)

index 0000000..26062c6
--- /dev/null
+++ b/tests/src/python/test/test_BamFile.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+import PacBioBam
+import config 
+import unittest
+
+class BamFileTest(unittest.TestCase):
+    
+    # ------------ SETUP --------------
+    
+    def setUp(self):
+        self.data = config.TestData()
+        self.bamFn = self.data.directory + "/aligned.bam"
+    
+    def runTest(self):
+        self.test_ctor()
+        self.test_nonExistentFile()
+    
+    # ------------ TESTS --------------
+        
+    def test_ctor(self):
+        f = PacBioBam.BamFile(self.bamFn)
+        
+    def test_nonExistentFile(self):
+        with self.assertRaises(RuntimeError):
+            f = PacBioBam.BamFile("non_existent_file.bam")
+            
diff --git a/tests/src/python/test/test_BamHeader.py b/tests/src/python/test/test_BamHeader.py

new file mode 100755 (executable)

index 0000000..065eee3
--- /dev/null
+++ b/tests/src/python/test/test_BamHeader.py
@@ -0,0 +1,157 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+import PacBioBam
+import config 
+import unittest
+
+class BamHeaderTest(unittest.TestCase):
+    
+    # ------------ SETUP --------------
+    
+    def runTest(self):
+        self.test_defaultCtor()
+        self.test_decode()
+        self.test_encode()
+        
+    # ------------ TESTS --------------
+    
+    def test_defaultCtor(self):   
+             
+        header = PacBioBam.BamHeader()
+
+        self.assertFalse(header.Version())
+        self.assertFalse(header.SortOrder())
+        self.assertEqual(0, len(header.ReadGroups()))
+        self.assertEqual(0, len(header.Sequences()))
+        self.assertEqual(0, len(header.Programs()))
+        self.assertEqual(0, len(header.Comments()))
+        
+        with self.assertRaises(RuntimeError):
+            pg = header.Program("foo")
+            rg = header.ReadGroup("foo")
+            sq = header.SequenceId("foo")
+            sl = header.SequenceLength(42)
+            sn = header.SequenceName(42)
+        
+        
+    def test_decode(self):
+        
+        text = ("@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"
+               "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n"
+               "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n"
+               "@RG\tID:rg1\tSM:control\n"
+               "@RG\tID:rg2\tSM:condition1\n"
+               "@RG\tID:rg3\tSM:condition1\n"
+               "@PG\tID:_foo_\tPN:ide\n"
+               "@CO\tipsum and so on\n"
+               "@CO\tcitation needed\n")
+
+        header = PacBioBam.BamHeader(text)
+
+        self.assertEqual("1.1",       header.Version())
+        self.assertEqual("queryname", header.SortOrder())
+        self.assertEqual("3.0.1",     header.PacBioBamVersion())
+
+        self.assertEqual(3, len(header.ReadGroups()))
+        self.assertTrue(header.HasReadGroup("rg1"))
+        self.assertTrue(header.HasReadGroup("rg2"))
+        self.assertTrue(header.HasReadGroup("rg3"))
+        self.assertEqual("control",    header.ReadGroup("rg1").Sample())
+        self.assertEqual("condition1", header.ReadGroup("rg2").Sample())
+        self.assertEqual("condition1", header.ReadGroup("rg3").Sample())
+
+        self.assertEqual(2, len(header.Sequences()))
+        self.assertTrue(header.HasSequence("chr1"))
+        self.assertTrue(header.HasSequence("chr2"))
+        self.assertEqual("chocobo", header.Sequence("chr1").Species())
+        self.assertEqual("chocobo", header.Sequence("chr2").Species())
+        self.assertEqual("2038",    header.Sequence("chr1").Length())
+        self.assertEqual("3042",    header.Sequence("chr2").Length())
+
+        self.assertEqual(1, len(header.Programs()))
+        self.assertTrue(header.HasProgram("_foo_"))
+        self.assertEqual("ide", header.Program("_foo_").Name())
+
+        self.assertEqual(2, len(header.Comments()))
+        self.assertEqual("ipsum and so on", header.Comments()[0])
+        self.assertEqual("citation needed", header.Comments()[1])
+        
+    def test_encode(self):
+        
+        expectedText = ("@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"
+                        "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n"
+                        "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n"
+                        "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL\n"
+                        "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+                        "@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+                        "@PG\tID:_foo_\tPN:ide\n"
+                        "@CO\tipsum and so on\n"
+                        "@CO\tcitation needed\n")
+
+        rg1 = PacBioBam.ReadGroupInfo("rg1")
+        rg1.Sample("control")
+        rg2 = PacBioBam.ReadGroupInfo("rg2")
+        rg2.Sample("condition1")
+        rg3 = PacBioBam.ReadGroupInfo("rg3")
+        rg3.Sample("condition1")
+
+        seq1 = PacBioBam.SequenceInfo("chr1")
+        seq1.Length("2038")
+        seq1.Species("chocobo")
+        seq2 = PacBioBam.SequenceInfo("chr2")
+        seq2.Length("3042")
+        seq2.Species("chocobo")
+
+        prog1 = PacBioBam.ProgramInfo("_foo_")
+        prog1.Name("ide")
+
+        header = PacBioBam.BamHeader()
+        header.Version("1.1")
+        header.SortOrder("queryname")
+        header.PacBioBamVersion("3.0.1")
+        header.AddReadGroup(rg1)
+        header.AddReadGroup(rg2)
+        header.AddReadGroup(rg3)
+        header.AddSequence(seq1)
+        header.AddSequence(seq2)
+        header.AddProgram(prog1)
+        header.AddComment("ipsum and so on")
+        header.AddComment("citation needed")
+
+        self.assertEqual(expectedText, header.ToSam())
+        
diff --git a/tests/src/python/test/test_Cigar.py b/tests/src/python/test/test_Cigar.py

new file mode 100755 (executable)

index 0000000..7b3f1df
--- /dev/null
+++ b/tests/src/python/test/test_Cigar.py
@@ -0,0 +1,214 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+import PacBioBam
+import config 
+import unittest
+
+class CigarTest(unittest.TestCase):
+    
+    # ------------ SETUP --------------
+    
+    def runTest(self):
+        self.test_typeToChar()
+        self.test_charToType()
+        self.test_setType()
+        self.test_setChar()
+        self.test_cigarOpCtors()
+        self.test_fromEmptyString()
+        self.test_fromString()
+        self.test_toEmptyString()
+        self.test_toString()
+        
+    # ------------ TESTS --------------
+    
+    def test_typeToChar(self):
+        self.assertEqual('M', PacBioBam.CigarOperation.TypeToChar(PacBioBam.ALIGNMENT_MATCH))
+        self.assertEqual('I', PacBioBam.CigarOperation.TypeToChar(PacBioBam.INSERTION))
+        self.assertEqual('D', PacBioBam.CigarOperation.TypeToChar(PacBioBam.DELETION))
+        self.assertEqual('N', PacBioBam.CigarOperation.TypeToChar(PacBioBam.REFERENCE_SKIP))
+        self.assertEqual('S', PacBioBam.CigarOperation.TypeToChar(PacBioBam.SOFT_CLIP))
+        self.assertEqual('H', PacBioBam.CigarOperation.TypeToChar(PacBioBam.HARD_CLIP))
+        self.assertEqual('P', PacBioBam.CigarOperation.TypeToChar(PacBioBam.PADDING))
+        self.assertEqual('=', PacBioBam.CigarOperation.TypeToChar(PacBioBam.SEQUENCE_MATCH))
+        self.assertEqual('X', PacBioBam.CigarOperation.TypeToChar(PacBioBam.SEQUENCE_MISMATCH))
+        
+    def test_charToType(self):
+        self.assertEqual(PacBioBam.ALIGNMENT_MATCH,   PacBioBam.CigarOperation.CharToType('M'))
+        self.assertEqual(PacBioBam.INSERTION,         PacBioBam.CigarOperation.CharToType('I'))
+        self.assertEqual(PacBioBam.DELETION,          PacBioBam.CigarOperation.CharToType('D'))
+        self.assertEqual(PacBioBam.REFERENCE_SKIP,    PacBioBam.CigarOperation.CharToType('N'))
+        self.assertEqual(PacBioBam.SOFT_CLIP,         PacBioBam.CigarOperation.CharToType('S'))
+        self.assertEqual(PacBioBam.HARD_CLIP,         PacBioBam.CigarOperation.CharToType('H'))
+        self.assertEqual(PacBioBam.PADDING,           PacBioBam.CigarOperation.CharToType('P'))
+        self.assertEqual(PacBioBam.SEQUENCE_MATCH,    PacBioBam.CigarOperation.CharToType('='))
+        self.assertEqual(PacBioBam.SEQUENCE_MISMATCH, PacBioBam.CigarOperation.CharToType('X'))
+        
+    def test_setType(self):
+        m = PacBioBam.CigarOperation()
+        i = PacBioBam.CigarOperation()
+        d = PacBioBam.CigarOperation()
+        n = PacBioBam.CigarOperation()
+        s = PacBioBam.CigarOperation()
+        h = PacBioBam.CigarOperation()
+        p = PacBioBam.CigarOperation()
+        e = PacBioBam.CigarOperation()
+        x = PacBioBam.CigarOperation()
+        
+        m.Type(PacBioBam.ALIGNMENT_MATCH)
+        i.Type(PacBioBam.INSERTION)
+        d.Type(PacBioBam.DELETION)
+        n.Type(PacBioBam.REFERENCE_SKIP)
+        s.Type(PacBioBam.SOFT_CLIP)
+        h.Type(PacBioBam.HARD_CLIP)
+        p.Type(PacBioBam.PADDING)
+        e.Type(PacBioBam.SEQUENCE_MATCH)
+        x.Type(PacBioBam.SEQUENCE_MISMATCH)
+        
+        self.assertEqual('M', m.Char())
+        self.assertEqual('I', i.Char())
+        self.assertEqual('D', d.Char())
+        self.assertEqual('N', n.Char())
+        self.assertEqual('S', s.Char())
+        self.assertEqual('H', h.Char())
+        self.assertEqual('P', p.Char())
+        self.assertEqual('=', e.Char())
+        self.assertEqual('X', x.Char())
+        
+    def test_setChar(self):
+        m = PacBioBam.CigarOperation()
+        i = PacBioBam.CigarOperation()
+        d = PacBioBam.CigarOperation()
+        n = PacBioBam.CigarOperation()
+        s = PacBioBam.CigarOperation()
+        h = PacBioBam.CigarOperation()
+        p = PacBioBam.CigarOperation()
+        e = PacBioBam.CigarOperation()
+        x = PacBioBam.CigarOperation()
+        
+        m.Char('M')
+        i.Char('I')
+        d.Char('D')
+        n.Char('N')
+        s.Char('S')
+        h.Char('H')
+        p.Char('P')
+        e.Char('=')
+        x.Char('X')
+        
+        self.assertEqual(PacBioBam.ALIGNMENT_MATCH,   m.Type())
+        self.assertEqual(PacBioBam.INSERTION,         i.Type())
+        self.assertEqual(PacBioBam.DELETION,          d.Type())
+        self.assertEqual(PacBioBam.REFERENCE_SKIP,    n.Type())
+        self.assertEqual(PacBioBam.SOFT_CLIP,         s.Type())
+        self.assertEqual(PacBioBam.HARD_CLIP,         h.Type())
+        self.assertEqual(PacBioBam.PADDING,           p.Type())
+        self.assertEqual(PacBioBam.SEQUENCE_MATCH,    e.Type())
+        self.assertEqual(PacBioBam.SEQUENCE_MISMATCH, x.Type())
+        
+    def test_cigarOpCtors(self):
+        c1 = PacBioBam.CigarOperation('S', 10)
+        c2 = PacBioBam.CigarOperation(PacBioBam.SOFT_CLIP, 10)
+        
+        self.assertEqual('S', c1.Char())
+        self.assertEqual('S', c2.Char())
+        self.assertEqual(PacBioBam.SOFT_CLIP, c1.Type())
+        self.assertEqual(PacBioBam.SOFT_CLIP, c2.Type())
+        self.assertEqual(10, c1.Length())
+        self.assertEqual(10, c2.Length())
+        
+    def test_fromEmptyString(self):
+        s = ""
+        cigar = PacBioBam.Cigar(s)
+        self.assertEqual(0, len(cigar))
+        
+    def test_fromString(self):
+        singleCigarString = "100="
+        multiCigarString  = "100=2D34I6=6X6="
+        
+        singleCigar = PacBioBam.Cigar(singleCigarString)
+        multiCigar  = PacBioBam.Cigar(multiCigarString)
+        
+        self.assertEqual(1, len(singleCigar))
+        c = singleCigar[0]
+        self.assertEqual('=', c.Char())
+        self.assertEqual(100, c.Length())
+
+        self.assertEqual(6, len(multiCigar))
+        op0 = multiCigar[0]
+        op1 = multiCigar[1]
+        op2 = multiCigar[2]
+        op3 = multiCigar[3]
+        op4 = multiCigar[4]
+        op5 = multiCigar[5]
+
+        self.assertEqual('=', op0.Char())
+        self.assertEqual('D', op1.Char())
+        self.assertEqual('I', op2.Char())
+        self.assertEqual('=', op3.Char())
+        self.assertEqual('X', op4.Char())
+        self.assertEqual('=', op5.Char())
+        self.assertEqual(100, op0.Length())
+        self.assertEqual(2,   op1.Length())
+        self.assertEqual(34,  op2.Length())
+        self.assertEqual(6,   op3.Length())
+        self.assertEqual(6,   op4.Length())
+        self.assertEqual(6,   op5.Length())
+        
+    def test_toEmptyString(self):
+        cigar = PacBioBam.Cigar()
+        self.assertFalse(cigar.ToStdString())
+
+    def test_toString(self):
+        
+        singleCigarString = "100="
+        multiCigarString  = "100=2D34I6=6X6="
+        
+        singleCigar = PacBioBam.Cigar()
+        singleCigar.append(PacBioBam.CigarOperation(PacBioBam.SEQUENCE_MATCH, 100))
+        
+        multiCigar = PacBioBam.Cigar()
+        multiCigar.append(PacBioBam.CigarOperation(PacBioBam.SEQUENCE_MATCH, 100))
+        multiCigar.append(PacBioBam.CigarOperation(PacBioBam.DELETION,         2))
+        multiCigar.append(PacBioBam.CigarOperation(PacBioBam.INSERTION,       34))
+        multiCigar.append(PacBioBam.CigarOperation(PacBioBam.SEQUENCE_MATCH,   6))
+        multiCigar.append(PacBioBam.CigarOperation(PacBioBam.SEQUENCE_MISMATCH,6))
+        multiCigar.append(PacBioBam.CigarOperation(PacBioBam.SEQUENCE_MATCH,   6))
+        
+        self.assertEqual(singleCigarString, singleCigar.ToStdString())
+        self.assertEqual(multiCigarString,  multiCigar.ToStdString())
+        
diff --git a/tests/src/python/test/test_EndToEnd.py b/tests/src/python/test/test_EndToEnd.py

new file mode 100755 (executable)

index 0000000..90a76c2
--- /dev/null
+++ b/tests/src/python/test/test_EndToEnd.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+import PacBioBam
+import config 
+
+import os
+import unittest
+
+class EndToEndTest(unittest.TestCase):
+        
+    def originalNames(self):
+        # loop over original file, store names, write to generated file
+        try:
+            file = PacBioBam.BamFile(self.ex2BamFn)
+            writer = PacBioBam.BamWriter(self.generatedBamFn, file.Header())
+            
+            dataset = PacBioBam.DataSet(self.ex2BamFn)
+            entireFile = PacBioBam.EntireFileQuery(dataset)
+         
+            names_in = []
+            for record in PacBioBam.Iterate(entireFile):
+                names_in.append(record.FullName())
+                writer.Write(record)
+            return names_in
+            
+        except RuntimeError:
+            self.assertTrue(False) # should not throw
+        
+    def generatedNames(self):
+        try:   
+            # open dataset on generated BAM file, read in names
+            dataset = PacBioBam.DataSet(self.generatedBamFn)
+            entireFile = PacBioBam.EntireFileQuery(dataset)
+            names_out = []
+            for record in PacBioBam.Iterate(entireFile):
+                names_out.append(record.FullName())
+            return names_out
+                
+        except RuntimeError:
+            self.assertTrue(False) # should not throw
+        
+    def runTest(self):
+        
+        self.testData = config.TestData()
+        self.ex2BamFn = self.testData.directory + "/aligned.bam"
+        self.generatedBamFn = self.testData.directory + "/generated.bam"
+        
+        # compare input records to generated copy's records
+        names_in = self.originalNames()
+        names_out = self.generatedNames()
+        self.assertEqual(names_in, names_out)
+        
+        # clean up
+        os.remove(self.generatedBamFn)
diff --git a/tests/src/python/test/test_Frames.py b/tests/src/python/test/test_Frames.py

new file mode 100755 (executable)

index 0000000..7855bbb
--- /dev/null
+++ b/tests/src/python/test/test_Frames.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+import PacBioBam
+import config 
+import unittest
+
+class FramesTest(unittest.TestCase):
+    
+    # ------------ SETUP --------------
+    
+    def setUp(self):
+        self.testframes = [
+            0,  8,  140, 0,   0,  7,   4,  0,  85, 2,
+            1,  3,  2,   10,  1,  20,  47, 10, 9,  60,
+            20, 3,  12,  5,   13, 165, 6,  14, 22, 12,
+            2,  4,  9,   218, 27, 3,   15, 2,  17, 2,
+            45, 24, 89,  10,  7,  1,   11, 15, 0,  7,
+            0,  28, 17,  12,  6,  10,  37, 0,  12, 52,
+            0,  7,  1,   14,  3,  26,  12, 0,  20, 17,
+            2,  13, 2,   9,   13, 7,   15, 29, 3,   6,
+            2,  1,  28,  10,  3,  14,  7,  1,  22, 1,
+            6,  6,  0,   19,  31, 6,   2,  14, 0,  0,
+            1000, 947, 948
+        ]
+
+        self.encoded  = [
+            0,     8,  102,   0,   0,   7,   4,   0,  75,   2,   1,   3,   2,
+            10,    1,   20,  47,  10,   9,  60,  20,   3,  12,   5,  13, 115,
+            6,    14,   22,  12,   2,   4,   9, 135,  27,   3,  15,   2,  17,
+            2,    45,   24,  77,  10,   7,   1,  11,  15,   0,   7,   0,  28,
+            17,   12,    6,  10,  37,   0,  12,  52,   0,   7,   1,  14,   3,
+            26,   12,    0,  20,  17,   2,  13,   2,   9,  13,   7,  15,  29,
+            3,     6,    2,   1,  28,  10,   3,  14,   7,   1,  22,   1,   6,
+            6,     0,   19,  31,   6,   2,  14,   0,   0,
+            255, 254,  255
+        ]
+        
+    def runTest(self):
+        self.test_ctors()
+        self.test_encode()
+        
+    # ------------ TESTS --------------
+    
+    def test_ctors(self):
+        f = PacBioBam.Frames()
+        self.assertEqual(0, len(f.Data()))
+
+        f2 = PacBioBam.Frames(self.testframes)
+        d  = f2.Data()
+        self.assertEqual(len(self.testframes), len(d))
+        for i, v in enumerate(d):
+            self.assertEqual(int(self.testframes[i]), int(v))
+    
+    def test_encode(self):
+        f = PacBioBam.Frames(self.testframes)
+        e = f.Encode()
+        self.assertEqual(len(self.encoded), len(e))
+        for i, v in enumerate(e):
+            self.assertEqual(int(self.encoded[i]), int(v))
+        
+\ No newline at end of file
diff --git a/tests/src/python/test/test_Intervals.py b/tests/src/python/test/test_Intervals.py

new file mode 100755 (executable)

index 0000000..9e484eb
--- /dev/null
+++ b/tests/src/python/test/test_Intervals.py
@@ -0,0 +1,348 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+import PacBioBam
+import config 
+import unittest
+
+class IntervalsTest(unittest.TestCase):
+    
+    # ------------ SETUP --------------
+    
+    def runTest(self):
+        self.test_unmappedPosition()
+        self.test_ctors()
+        self.test_equality()
+        self.test_copy()
+        self.test_modifiers()
+        self.test_cover()
+        self.test_intersect()
+        self.test_validity()
+        self.test_length()
+        
+    # ------------ TESTS --------------
+    
+    def test_unmappedPosition(self):
+        self.assertEqual(-1, PacBioBam.UnmappedPosition)
+    
+    def test_ctors(self):
+        empty  = PacBioBam.PositionInterval()
+        single = PacBioBam.PositionInterval(4)
+        normal = PacBioBam.PositionInterval(5, 8)
+        
+        self.assertEqual(0, empty.Start())
+        self.assertEqual(0, empty.Stop())
+        self.assertEqual(4, single.Start())
+        self.assertEqual(5, single.Stop())
+        self.assertEqual(5, normal.Start())
+        self.assertEqual(8, normal.Stop())
+        
+    def test_equality(self):
+        
+        empty           = PacBioBam.PositionInterval()
+        empty2          = PacBioBam.PositionInterval()
+        singleton       = PacBioBam.PositionInterval(4)
+        sameAsSingleton = PacBioBam.PositionInterval(4, 5)
+        normal          = PacBioBam.PositionInterval(5, 8)
+        sameAsNormal    = PacBioBam.PositionInterval(5, 8)
+        different       = PacBioBam.PositionInterval(20, 40)
+        
+        # self-equality
+        self.assertEqual(empty, empty)
+        self.assertEqual(singleton, singleton)
+        self.assertEqual(normal, normal)
+        self.assertEqual(different, different)
+
+        # same values
+        self.assertEqual(empty, empty2)
+        self.assertEqual(singleton, sameAsSingleton)
+        self.assertEqual(normal, sameAsNormal)
+        
+        # different values
+        self.assertNotEqual(empty, singleton)
+        self.assertNotEqual(empty, normal)
+        self.assertNotEqual(empty, different)
+        self.assertNotEqual(singleton, normal)
+        self.assertNotEqual(normal, different)
+        
+    def test_copy(self):
+        interval1 = PacBioBam.PositionInterval(5,8)
+        interval2 = PacBioBam.PositionInterval(interval1)
+        interval3 = interval1
+        
+        self.assertEqual(interval1, interval1)
+        self.assertEqual(interval1, interval2)
+        self.assertEqual(interval1, interval3)
+        
+    def test_modifiers(self):
+        
+        interval1 = PacBioBam.PositionInterval(5,8)
+        interval2 = PacBioBam.PositionInterval(interval1)
+        interval2.Start(2)
+        interval2.Stop(10)
+        
+        self.assertNotEqual(interval1, interval2)
+        self.assertEqual(2,  interval2.Start())
+        self.assertEqual(10, interval2.Stop())
+        
+    def test_cover(self):
+        
+        a = PacBioBam.PositionInterval(2,4)
+        b = PacBioBam.PositionInterval(3,5)
+        c = PacBioBam.PositionInterval(6,8)
+        d = PacBioBam.PositionInterval(1,7)
+        e = PacBioBam.PositionInterval(5,8)
+        
+        #   0123456789  
+        # a   --
+        # b    --
+        # c       --
+        # d  ------
+        # e      ---
+
+        # self-cover
+        self.assertTrue(a.Covers(a)) 
+        self.assertTrue(a.CoveredBy(a))
+
+        # basic covers/covered
+        self.assertTrue(b.CoveredBy(d))
+        self.assertTrue(d.Covers(b))
+        self.assertNotEqual(b, d)
+        self.assertFalse(b.Covers(d))
+ 
+        # completely disjoint
+        self.assertFalse(b.Covers(c))
+        self.assertFalse(c.Covers(b))
+        self.assertFalse(b.CoveredBy(c))
+        self.assertFalse(c.CoveredBy(b))
+        
+        # b.stop == e.start
+        self.assertFalse(b.Covers(e))
+        self.assertFalse(b.CoveredBy(e))
+
+        # shared endpoint, start contained
+        self.assertTrue(e.Covers(c))
+        self.assertTrue(c.CoveredBy(e))
+        
+    def test_intersect(self):
+        
+        a = PacBioBam.PositionInterval(2,4)
+        b = PacBioBam.PositionInterval(3,5)
+        c = PacBioBam.PositionInterval(6,8)
+        d = PacBioBam.PositionInterval(1,7)
+        e = PacBioBam.PositionInterval(5,8)
+        
+        #   0123456789  
+        # a   --
+        # b    --
+        # c       --
+        # d  ------
+        # e      ---
+        
+        # self-intersection
+        self.assertTrue(a.Intersects(a))
+        
+        # intersection is commutative
+        self.assertTrue(a.Intersects(b))
+        self.assertTrue(b.Intersects(a))
+        
+        # covered implies intersection
+        self.assertTrue(d.Covers(a))
+        self.assertTrue(a.Intersects(d))
+        self.assertTrue(d.Intersects(a))
+
+        # c.start > b.stop (obvious disjoint)
+        self.assertFalse(b.Intersects(c))
+        
+        # b.stop == e.start (intervals are right-open, so disjoint)
+        self.assertFalse(b.Intersects(e))
+
+    def test_validity(self):
+        
+        a = PacBioBam.PositionInterval()     # default ctor
+        b = PacBioBam.PositionInterval(0,0)  # start == stop (zero)
+        c = PacBioBam.PositionInterval(4,4)  # start == stop (nonzero)
+        d = PacBioBam.PositionInterval(0,1)  # start < stop  (start is zero)
+        e = PacBioBam.PositionInterval(4,5)  # start < stop  (start is nonzero)
+        f = PacBioBam.PositionInterval(5,4)  # start > stop
+        
+        self.assertFalse(a.IsValid())
+        self.assertFalse(b.IsValid())
+        self.assertFalse(c.IsValid())
+        self.assertTrue(d.IsValid())
+        self.assertTrue(e.IsValid())
+        self.assertFalse(f.IsValid())
+        
+    def test_length(self):
+        
+        a = PacBioBam.PositionInterval(2,4)
+        b = PacBioBam.PositionInterval(3,5)
+        c = PacBioBam.PositionInterval(6,8)
+        d = PacBioBam.PositionInterval(1,7)
+        e = PacBioBam.PositionInterval(5,8)
+        
+        self.assertEqual(2, a.Length())
+        self.assertEqual(2, b.Length())
+        self.assertEqual(2, c.Length())
+        self.assertEqual(6, d.Length())
+        self.assertEqual(3, e.Length())
+        
+class GenomicIntervalsTest(unittest.TestCase):
+    
+    # ------------ SETUP --------------
+    
+    def runTest(self):
+        self.test_ctors()
+        self.test_copy()
+        self.test_modifiers()
+        self.test_cover()
+        self.test_validity()
+    
+    # ------------ TESTS --------------
+    
+    def test_ctors(self):
+        
+        empty  = PacBioBam.GenomicInterval()
+        normal = PacBioBam.GenomicInterval("foo", 100, 200)
+        
+        self.assertEqual("", empty.Name())
+        self.assertEqual(0,  empty.Start())
+        self.assertEqual(0,  empty.Stop())
+        
+        self.assertEqual("foo", normal.Name())
+        self.assertEqual(100,   normal.Start())
+        self.assertEqual(200,   normal.Stop())
+
+        
+    def test_copy(self):
+        
+        a = PacBioBam.GenomicInterval("foo", 10, 20)
+        b = PacBioBam.GenomicInterval(a)
+        c = a
+        
+        self.assertEqual(a, a)
+        self.assertEqual(a, b)
+        self.assertEqual(a, c)
+        
+    def test_modifiers(self):
+        
+        a = PacBioBam.GenomicInterval("foo", 10, 20)
+        
+        b = PacBioBam.GenomicInterval(a)
+        b.Name("bar").Start(2).Stop(10)
+        
+        c = PacBioBam.GenomicInterval(a)
+        c.Interval(b.Interval())
+        
+        self.assertNotEqual(a, b)
+        self.assertEqual("bar",  b.Name())
+        self.assertEqual(2,  b.Start())
+        self.assertEqual(10, b.Stop())        
+        self.assertEqual(a.Name(), c.Name())
+        self.assertEqual(b.Interval(), c.Interval())
+        
+    def test_cover(self):
+        
+        a = PacBioBam.GenomicInterval("foo",2,4)
+        b = PacBioBam.GenomicInterval("foo",3,5)
+        c = PacBioBam.GenomicInterval("foo",6,8)
+        d = PacBioBam.GenomicInterval("foo",1,7)
+        e = PacBioBam.GenomicInterval("foo",5,8)
+        f = PacBioBam.GenomicInterval("bar",3,5)  # same as b, different ref
+        
+        #   0123456789  
+        # a   --
+        # b    --
+        # c       --
+        # d  ------
+        # e      ---
+        
+        # self-cover
+        self.assertTrue(a.Covers(a))
+        self.assertTrue(a.CoveredBy(a))
+        
+        # basic covers/covered
+        self.assertTrue(b.CoveredBy(d))
+        self.assertTrue(d.Covers(b))
+        self.assertNotEqual(b, d)
+        self.assertFalse(b.Covers(d))
+        
+        # same coords as b, but different ref
+        self.assertFalse(f.CoveredBy(d))
+        self.assertFalse(d.Covers(f))
+        self.assertNotEqual(f, d)
+        self.assertFalse(f.Covers(d))
+        
+        # obvious disjoint
+        self.assertFalse(b.Covers(c))
+        self.assertFalse(c.Covers(b))
+        self.assertFalse(b.CoveredBy(c))
+        self.assertFalse(c.CoveredBy(b))
+        
+        # b.stop == e.start (intervals are right-open, so disjoint)
+        self.assertFalse(b.Covers(e))
+        self.assertFalse(b.CoveredBy(e))
+        
+        # shared endpoint, start contained
+        self.assertTrue(e.Covers(c))
+        self.assertTrue(c.CoveredBy(e)) 
+        
+    def test_validity(self):
+        
+        a = PacBioBam.GenomicInterval()       # default
+        b = PacBioBam.GenomicInterval("foo",0,0)  # valid id, start == stop (zero)
+        c = PacBioBam.GenomicInterval("foo",4,4)  # valid id, start == stop (non-zero)
+        d = PacBioBam.GenomicInterval("foo",0,1)  # valid id, start <  stop (start == zero)     OK
+        e = PacBioBam.GenomicInterval("foo",4,5)  # valid id, start <  stop (start >  zero)     OK
+        f = PacBioBam.GenomicInterval("foo",5,4)  # valid id, start >  stop 
+        g = PacBioBam.GenomicInterval("",0,0) # invalid id, start == stop (zero)
+        h = PacBioBam.GenomicInterval("",4,4) # invalid id, start == stop (non-zero)
+        i = PacBioBam.GenomicInterval("",0,1) # invalid id, start <  stop (start == zero)
+        j = PacBioBam.GenomicInterval("",4,5) # invalid id, start <  stop (start >  zero)
+        k = PacBioBam.GenomicInterval("",5,4) # invalid id, start >  stop 
+             
+        self.assertTrue(d.IsValid())
+        self.assertTrue(e.IsValid())
+        self.assertFalse(a.IsValid())
+        self.assertFalse(b.IsValid())
+        self.assertFalse(c.IsValid())
+        self.assertFalse(f.IsValid())
+        self.assertFalse(g.IsValid())
+        self.assertFalse(h.IsValid())
+        self.assertFalse(i.IsValid())
+        self.assertFalse(j.IsValid())
+        self.assertFalse(k.IsValid())
diff --git a/tests/src/python/test/test_PolymeraseStitching.py b/tests/src/python/test/test_PolymeraseStitching.py

new file mode 100755 (executable)

index 0000000..13ee448
--- /dev/null
+++ b/tests/src/python/test/test_PolymeraseStitching.py
@@ -0,0 +1,358 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+import PacBioBam
+import config 
+import unittest
+
+class PolymeraseStitchingTest(unittest.TestCase):
+    
+    # ------------ SETUP --------------
+    
+    def setUp(self):
+        self.data = config.TestData()
+    
+    def runTest(self):
+        self.test_virtualRegions()
+        self.test_internalSubreadsToOriginal()
+        self.test_internalHqToOriginal()
+        self.test_productionSubreadsToOriginal()
+        self.test_productionHqToOriginal()
+        
+    # ------------ TESTS --------------
+    
+    def test_virtualRegions(self):
+
+        subreadBam = self.data.directory + "/polymerase/internal.subreads.bam"
+        scrapsBam  = self.data.directory + "/polymerase/internal.scraps.bam"
+        vpr = PacBioBam.VirtualPolymeraseReader(subreadBam, scrapsBam)
+    
+        virtualRecord = vpr.Next()
+        
+        # NOTE: this method is disabled 
+        #
+        # Any attempt to retrive this value resulted in several 
+        #   "swig/python detected a memory leak of type 'unknown', no destructor found."
+        # errors (& an empty dictionary result). The same info is available via the 
+        # VirtualRegionsTable(regionType) method, though a bit clunkier if you just want 
+        # to iterate. But access to region info for specific types are available & correct, 
+        # so I'm just going to leave this one out for now. - DB
+        #
+        # regionMap = virtualRecord.VirtualRegionsMap();
+    
+        # ADAPTER
+        adapter = virtualRecord.VirtualRegionsTable(PacBioBam.VirtualRegionType_ADAPTER)
+        self.assertEqual(7, len(adapter))
+        self.assertEqual(3047, adapter[0].beginPos);
+        self.assertEqual(3095, adapter[0].endPos);
+        self.assertEqual(3650, adapter[1].beginPos);
+        self.assertEqual(3700, adapter[1].endPos);
+        self.assertEqual(4289, adapter[2].beginPos);
+        self.assertEqual(4335, adapter[2].endPos);
+        self.assertEqual(4888, adapter[3].beginPos);
+        self.assertEqual(4939, adapter[3].endPos);
+        self.assertEqual(5498, adapter[4].beginPos);
+        self.assertEqual(5546, adapter[4].endPos);
+        self.assertEqual(6116, adapter[5].beginPos);
+        self.assertEqual(6173, adapter[5].endPos);
+        self.assertEqual(6740, adapter[6].beginPos);
+        self.assertEqual(6790, adapter[6].endPos);
+    
+        # BARCODE
+        barcode = virtualRecord.VirtualRegionsTable(PacBioBam.VirtualRegionType_BARCODE)
+        self.assertEqual(14, len(barcode))
+        self.assertEqual(3025, barcode[0].beginPos);
+        self.assertEqual(3047, barcode[0].endPos);
+        self.assertEqual(3095, barcode[1].beginPos);
+        self.assertEqual(3116, barcode[1].endPos);
+        self.assertEqual(3628, barcode[2].beginPos);
+        self.assertEqual(3650, barcode[2].endPos);
+        self.assertEqual(3700, barcode[3].beginPos);
+        self.assertEqual(3722, barcode[3].endPos);
+        self.assertEqual(4267, barcode[4].beginPos);
+        self.assertEqual(4289, barcode[4].endPos);
+        self.assertEqual(4335, barcode[5].beginPos);
+        self.assertEqual(4356, barcode[5].endPos);
+        self.assertEqual(4864, barcode[6].beginPos);
+        self.assertEqual(4888, barcode[6].endPos);
+        self.assertEqual(4939, barcode[7].beginPos);
+        self.assertEqual(4960, barcode[7].endPos);
+        self.assertEqual(5477, barcode[8].beginPos);
+        self.assertEqual(5498, barcode[8].endPos);
+        self.assertEqual(5546, barcode[9].beginPos);
+        self.assertEqual(5571, barcode[9].endPos);
+        self.assertEqual(6087, barcode[10].beginPos);
+        self.assertEqual(6116, barcode[10].endPos);
+        self.assertEqual(6173, barcode[11].beginPos);
+        self.assertEqual(6199, barcode[11].endPos);
+        self.assertEqual(6719, barcode[12].beginPos);
+        self.assertEqual(6740, barcode[12].endPos);
+        self.assertEqual(6790, barcode[13].beginPos);
+        self.assertEqual(6812, barcode[13].endPos);
+    
+        # HQREGION
+        hqregion = virtualRecord.VirtualRegionsTable(PacBioBam.VirtualRegionType_HQREGION)
+        self.assertEqual(1, len(hqregion))
+        
+        self.assertEqual(2659, hqregion[0].beginPos);
+        self.assertEqual(7034, hqregion[0].endPos);
+    
+        # LQREGION
+        lqregion = virtualRecord.VirtualRegionsTable(PacBioBam.VirtualRegionType_LQREGION)
+        self.assertEqual(2, len(lqregion))
+        
+        self.assertEqual(0,    lqregion[0].beginPos);
+        self.assertEqual(2659, lqregion[0].endPos);
+        self.assertEqual(7034, lqregion[1].beginPos);
+        self.assertEqual(7035, lqregion[1].endPos);
+    
+        # SUBREAD
+        subread = virtualRecord.VirtualRegionsTable(PacBioBam.VirtualRegionType_SUBREAD)
+        self.assertEqual(8, len(subread)) 
+    
+    def test_internalSubreadsToOriginal(self):
+        
+        # stitch virtual polymerase record
+        subreadsBam = self.data.directory + "/polymerase/internal.subreads.bam"
+        scrapsBam   = self.data.directory + "/polymerase/internal.scraps.bam"
+        vpr = PacBioBam.VirtualPolymeraseReader(subreadsBam, scrapsBam)
+        self.assertTrue(vpr.HasNext())
+        virtualRecord = vpr.Next()
+
+        # fetch original polymerase record
+        polyBam   = PacBioBam.DataSet(self.data.directory + "/polymerase/internal.polymerase.bam")
+        polyQuery = PacBioBam.EntireFileQuery(polyBam)
+        polyIter = polyQuery.begin()
+        polyEnd  = polyQuery.end()
+        self.assertTrue(polyIter != polyEnd)
+        polyRecord = polyIter.value()
+
+        # compare
+        self.compare(polyRecord, virtualRecord)
+
+    def test_internalHqToOriginal(self):
+        
+        # stitch virtual polymerase record
+        hqRegionsBam = self.data.directory + "/polymerase/internal.hqregions.bam"
+        lqRegionsBam = self.data.directory + "/polymerase/internal.lqregions.bam"
+        vpr = PacBioBam.VirtualPolymeraseReader(hqRegionsBam, lqRegionsBam)
+        self.assertTrue(vpr.HasNext())
+        virtualRecord = vpr.Next()
+
+        # fetch original polymerase record
+        polyBam   = PacBioBam.DataSet(self.data.directory + "/polymerase/internal.polymerase.bam")
+        polyQuery = PacBioBam.EntireFileQuery(polyBam)
+        polyIter = polyQuery.begin()
+        polyEnd  = polyQuery.end()
+        self.assertTrue(polyIter != polyEnd)
+        polyRecord = polyIter.value()
+       
+        # # compare
+        self.compare(polyRecord, virtualRecord)
+
+    def test_productionSubreadsToOriginal(self):
+        
+        # stitch virtual polymerase record
+        subreadsBam = self.data.directory + "/polymerase/production.subreads.bam"
+        scrapsBam   = self.data.directory + "/polymerase/production.scraps.bam"
+        vpr = PacBioBam.VirtualPolymeraseReader(subreadsBam, scrapsBam)
+        self.assertTrue(vpr.HasNext())
+        virtualRecord = vpr.Next()
+
+        # fetch original polymerase record
+        polyBam   = PacBioBam.DataSet(self.data.directory + "/polymerase/production.polymerase.bam")
+        polyQuery = PacBioBam.EntireFileQuery(polyBam)
+        polyIter = polyQuery.begin()
+        polyEnd  = polyQuery.end()
+        self.assertTrue(polyIter != polyEnd)
+        polyRecord = polyIter.value()
+        
+        # compare
+        self.assertEqual(polyRecord.FullName(),        virtualRecord.FullName());
+        self.assertEqual(polyRecord.HoleNumber(),      virtualRecord.HoleNumber());
+        self.assertEqual(polyRecord.NumPasses(),       virtualRecord.NumPasses());
+        self.assertEqual(polyRecord.Sequence(),        virtualRecord.Sequence());
+        self.assertEqual(polyRecord.DeletionTag(),     virtualRecord.DeletionTag());
+        self.assertEqual(polyRecord.SubstitutionTag(), virtualRecord.SubstitutionTag());
+        self.assertEqual(polyRecord.IPD(),             virtualRecord.IPDV1Frames());
+        self.assertEqual(polyRecord.ReadGroup(),       virtualRecord.ReadGroup());
+        
+        self.assertAlmostEqual(float(polyRecord.ReadAccuracy()), float(virtualRecord.ReadAccuracy()));
+        
+        self.assertEqual(polyRecord.Qualities().Fastq(),       virtualRecord.Qualities().Fastq());
+        self.assertEqual(polyRecord.DeletionQV().Fastq(),      virtualRecord.DeletionQV().Fastq());
+        self.assertEqual(polyRecord.InsertionQV().Fastq(),     virtualRecord.InsertionQV().Fastq());
+        self.assertEqual(polyRecord.MergeQV().Fastq(),         virtualRecord.MergeQV().Fastq());
+        self.assertEqual(polyRecord.SubstitutionQV().Fastq(),  virtualRecord.SubstitutionQV().Fastq());
+
+    def test_productionHqToOriginal(self):
+        
+        # stitch virtual polymerase record
+        hqRegionsBam = self.data.directory + "/polymerase/production_hq.hqregion.bam"
+        lqRegionsBam = self.data.directory + "/polymerase/production_hq.scraps.bam"
+        vpr = PacBioBam.VirtualPolymeraseReader(hqRegionsBam, lqRegionsBam)
+        self.assertTrue(vpr.HasNext())
+        virtualRecord = vpr.Next()
+
+        # fetch original polymerase record
+        polyBam   = PacBioBam.DataSet(self.data.directory + "/polymerase/production.polymerase.bam")
+        polyQuery = PacBioBam.EntireFileQuery(polyBam)
+        polyIter = polyQuery.begin()
+        polyEnd  = polyQuery.end()
+        self.assertTrue(polyIter != polyEnd)
+        polyRecord = polyIter.value()
+        
+        # compare        
+        self.assertFalse(polyRecord.HasPulseCall());
+        self.assertFalse(virtualRecord.HasPulseCall());
+        
+        self.assertEqual(polyRecord.FullName(),        virtualRecord.FullName());
+        self.assertEqual(polyRecord.HoleNumber(),      virtualRecord.HoleNumber());
+        self.assertEqual(polyRecord.NumPasses(),       virtualRecord.NumPasses());
+        self.assertEqual(polyRecord.Sequence(),        virtualRecord.Sequence());
+        self.assertEqual(polyRecord.DeletionTag(),     virtualRecord.DeletionTag());
+        self.assertEqual(polyRecord.SubstitutionTag(), virtualRecord.SubstitutionTag());
+        self.assertEqual(polyRecord.IPD(),             virtualRecord.IPDV1Frames());
+        self.assertEqual(polyRecord.ReadGroup(),       virtualRecord.ReadGroup());
+        
+        self.assertAlmostEqual(float(polyRecord.ReadAccuracy()), float(virtualRecord.ReadAccuracy()));
+        
+        self.assertEqual(polyRecord.Qualities().Fastq(),       virtualRecord.Qualities().Fastq());
+        self.assertEqual(polyRecord.DeletionQV().Fastq(),      virtualRecord.DeletionQV().Fastq());
+        self.assertEqual(polyRecord.InsertionQV().Fastq(),     virtualRecord.InsertionQV().Fastq());
+        self.assertEqual(polyRecord.MergeQV().Fastq(),         virtualRecord.MergeQV().Fastq());
+        self.assertEqual(polyRecord.SubstitutionQV().Fastq(),  virtualRecord.SubstitutionQV().Fastq());
+        
+        self.assertTrue(polyRecord.HasDeletionQV());
+        self.assertTrue(polyRecord.HasDeletionTag());
+        self.assertTrue(polyRecord.HasInsertionQV());
+        self.assertTrue(polyRecord.HasMergeQV());
+        self.assertTrue(polyRecord.HasSubstitutionQV());
+        self.assertTrue(polyRecord.HasSubstitutionTag());
+        self.assertTrue(polyRecord.HasIPD());
+        self.assertFalse(polyRecord.HasLabelQV());
+        self.assertFalse(polyRecord.HasAltLabelQV());
+        self.assertFalse(polyRecord.HasAltLabelTag());
+        self.assertFalse(polyRecord.HasPkmean());
+        self.assertFalse(polyRecord.HasPkmid());
+        self.assertFalse(polyRecord.HasPulseCall());
+        self.assertFalse(polyRecord.HasPulseWidth());
+        self.assertFalse(polyRecord.HasPrePulseFrames());
+        self.assertFalse(polyRecord.HasPulseCallWidth());
+        
+        self.assertTrue(virtualRecord.HasDeletionQV());
+        self.assertTrue(virtualRecord.HasDeletionTag());
+        self.assertTrue(virtualRecord.HasInsertionQV());
+        self.assertTrue(virtualRecord.HasMergeQV());
+        self.assertTrue(virtualRecord.HasSubstitutionQV());
+        self.assertTrue(virtualRecord.HasSubstitutionTag());
+        self.assertTrue(virtualRecord.HasIPD());
+        self.assertFalse(virtualRecord.HasLabelQV());
+        self.assertFalse(virtualRecord.HasAltLabelQV());
+        self.assertFalse(virtualRecord.HasAltLabelTag());
+        self.assertFalse(virtualRecord.HasPkmean());
+        self.assertFalse(virtualRecord.HasPkmid());
+        self.assertFalse(virtualRecord.HasPulseCall());
+        self.assertFalse(virtualRecord.HasPulseWidth());
+        self.assertFalse(virtualRecord.HasPrePulseFrames());
+        self.assertFalse(virtualRecord.HasPulseCallWidth());   
+    
+    # ------------ HELPERS --------------
+    
+    def compare(self, b1, b2):
+    
+        self.assertTrue(b1.HasDeletionQV());
+        self.assertTrue(b1.HasDeletionTag());
+        self.assertTrue(b1.HasInsertionQV());
+        self.assertTrue(b1.HasMergeQV());
+        self.assertTrue(b1.HasSubstitutionQV());
+        self.assertTrue(b1.HasSubstitutionTag());
+        self.assertTrue(b1.HasLabelQV());
+        self.assertTrue(b1.HasAltLabelQV());
+        self.assertTrue(b1.HasAltLabelTag());
+        self.assertTrue(b1.HasPkmean());
+        self.assertTrue(b1.HasPkmid());
+        self.assertTrue(b1.HasPulseCall());
+        self.assertTrue(b1.HasIPD());
+        self.assertTrue(b1.HasPulseWidth());
+        self.assertTrue(b1.HasPrePulseFrames());
+        self.assertTrue(b1.HasPulseCallWidth());
+        self.assertTrue(b1.HasPulseMergeQV());
+
+        self.assertTrue(b2.HasDeletionQV());
+        self.assertTrue(b2.HasDeletionTag());
+        self.assertTrue(b2.HasInsertionQV());
+        self.assertTrue(b2.HasMergeQV());
+        self.assertTrue(b2.HasSubstitutionQV());
+        self.assertTrue(b2.HasSubstitutionTag());
+        self.assertTrue(b2.HasLabelQV());
+        self.assertTrue(b2.HasAltLabelQV());
+        self.assertTrue(b2.HasAltLabelTag());
+        self.assertTrue(b2.HasPkmean());
+        self.assertTrue(b2.HasPkmid());
+        self.assertTrue(b2.HasPulseCall());
+        self.assertTrue(b2.HasIPD());
+        self.assertTrue(b2.HasPulseWidth());
+        self.assertTrue(b2.HasPrePulseFrames());
+        self.assertTrue(b2.HasPulseCallWidth());
+        self.assertTrue(b2.HasPulseMergeQV());
+    
+        self.assertEqual(b1.FullName(),        b2.FullName());
+        self.assertEqual(b1.HoleNumber(),      b2.HoleNumber());
+        self.assertEqual(b1.NumPasses(),       b2.NumPasses());
+        self.assertEqual(b1.Sequence(),        b2.Sequence());
+        self.assertEqual(b1.DeletionTag(),     b2.DeletionTag());
+        self.assertEqual(b1.SubstitutionTag(), b2.SubstitutionTag());
+        self.assertEqual(b1.AltLabelTag(),     b2.AltLabelTag());
+        self.assertEqual(b1.Pkmean(),          b2.Pkmean());
+        self.assertEqual(b1.Pkmid(),           b2.Pkmid());
+        self.assertEqual(b1.PulseCall(),       b2.PulseCall());
+        self.assertEqual(b1.IPD(),             b2.IPD());
+        self.assertEqual(b1.PulseWidth(),      b2.PulseWidth());
+        self.assertEqual(b1.PrePulseFrames(),  b2.PrePulseFrames());
+        self.assertEqual(b1.PulseCallWidth(),  b2.PulseCallWidth());
+        self.assertEqual(b1.ReadGroup(),       b2.ReadGroup());
+        
+        self.assertEqual(b1.Qualities().Fastq(),       b2.Qualities().Fastq());
+        self.assertEqual(b1.DeletionQV().Fastq(),      b2.DeletionQV().Fastq());
+        self.assertEqual(b1.InsertionQV().Fastq(),     b2.InsertionQV().Fastq());
+        self.assertEqual(b1.MergeQV().Fastq(),         b2.MergeQV().Fastq());
+        self.assertEqual(b1.SubstitutionQV().Fastq(),  b2.SubstitutionQV().Fastq());
+        self.assertEqual(b1.PulseMergeQV().Fastq(),    b2.PulseMergeQV().Fastq());
+        self.assertEqual(b1.LabelQV().Fastq(),         b2.LabelQV().Fastq());
+        self.assertEqual(b1.AltLabelQV().Fastq(),      b2.AltLabelQV().Fastq());
+        
diff --git a/tests/src/python/test/test_QualityValues.py b/tests/src/python/test/test_QualityValues.py

new file mode 100755 (executable)

index 0000000..41bf8c3
--- /dev/null
+++ b/tests/src/python/test/test_QualityValues.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+import PacBioBam
+import config 
+import unittest
+
+class QualityValueTest(unittest.TestCase):
+    
+    # ------------ SETUP --------------
+    
+    def runTest(self):
+        self.test_defaults()
+        self.test_fromNumber()
+        self.test_fromFastq()
+        
+    # ------------ TESTS --------------
+    
+    def test_defaults(self):
+        value = PacBioBam.QualityValue()    
+        self.assertEqual(0,  int(value))
+        self.assertEqual('!', value.Fastq())
+        
+    def test_fromNumber(self):
+        
+        zero        = PacBioBam.QualityValue(0)
+        thirtythree = PacBioBam.QualityValue(33)
+        normal      = PacBioBam.QualityValue(42)
+        maxQV       = PacBioBam.QualityValue(93)
+        tooHigh     = PacBioBam.QualityValue(94)
+        max8bit     = PacBioBam.QualityValue(126)
+        
+        self.assertEqual(0,  int(zero))
+        self.assertEqual(33, int(thirtythree))
+        self.assertEqual(42, int(normal))
+        self.assertEqual(93, int(maxQV))
+        self.assertEqual(93, int(tooHigh))
+        self.assertEqual(93, int(max8bit))
+
+        self.assertEqual('!', zero.Fastq())
+        self.assertEqual('B', thirtythree.Fastq())
+        self.assertEqual('K', normal.Fastq())
+        self.assertEqual('~', maxQV.Fastq())
+        self.assertEqual('~', tooHigh.Fastq())
+        self.assertEqual('~', max8bit.Fastq())
+
+    def test_fromFastq(self):
+        
+        zero        = PacBioBam.QualityValue.FromFastq('!')
+        thirtythree = PacBioBam.QualityValue.FromFastq('B')
+        normal      = PacBioBam.QualityValue.FromFastq('K')
+        maxQV       = PacBioBam.QualityValue.FromFastq('~')
+        
+        self.assertEqual(0,  int(zero))
+        self.assertEqual(33, int(thirtythree))
+        self.assertEqual(42, int(normal))
+        self.assertEqual(93, int(maxQV))
+        
+class QualityValuesTest(unittest.TestCase):
+    
+    # ------------ SETUP --------------
+    
+    def runTest(self):
+        self.test_defaults()
+        self.test_fromNumbers()
+        self.test_fromFastq()
+        
+    # ------------ TESTS --------------
+    
+    def test_defaults(self):
+        values = PacBioBam.QualityValues()    
+        self.assertFalse(values.Fastq()) 
+        
+    def test_fromNumbers(self):
+        
+        fastqString = "~~~KKBB!!"
+        values = [ 93, 93, 93, 42, 42, 33, 33, 0, 0 ]
+        
+        qvs = PacBioBam.QualityValues()
+        for value in values:
+            qvs.append(PacBioBam.QualityValue(value))
+            
+        self.assertEqual(fastqString, qvs.Fastq())
+
+    def test_fromFastq(self):
+        
+        fastqString = "~~~KKBB!!"
+        values = [ 93, 93, 93, 42, 42, 33, 33, 0, 0 ]
+        
+        qvs = PacBioBam.QualityValues.FromFastq(fastqString)
+        
+        self.assertEqual(len(fastqString), len(qvs))
+        self.assertEqual(len(values),      len(qvs))
+        
+        for i, v in enumerate(values):
+            self.assertEqual(v, int(qvs[i]))
+    
+\ No newline at end of file
diff --git a/tests/src/python/test_pbbam.py b/tests/src/python/test_pbbam.py

new file mode 100755 (executable)

index 0000000..8ca3c90
--- /dev/null
+++ b/tests/src/python/test_pbbam.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# Author: Derek Barnett
+
+import sys
+import unittest
+
+if __name__ == "__main__":
+    suite  = unittest.TestLoader().discover('.', pattern = "test_*.py")
+    result = unittest.TextTestRunner(verbosity=2).run(suite)
+    if result.wasSuccessful():
+        sys.exit(0)
+    else:
+        sys.exit(1)
+    
+\ No newline at end of file
diff --git a/tests/src/test_Accuracy.cpp b/tests/src/test_Accuracy.cpp

new file mode 100644 (file)

index 0000000..9750dd4
--- /dev/null
+++ b/tests/src/test_Accuracy.cpp
@@ -0,0 +1,63 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/Accuracy.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+TEST(AccuracyTest, ClampValues)
+{
+    Accuracy a_zero(0.0);
+    Accuracy a_neg(-0.5);
+    Accuracy a_min(0.0);
+    Accuracy a_normal(0.9);
+    Accuracy a_max(1.0);
+    Accuracy a_tooLarge(1.1);
+
+    EXPECT_FLOAT_EQ(0.0, a_zero);
+    EXPECT_FLOAT_EQ(0.0, a_neg);
+    EXPECT_FLOAT_EQ(0.0, a_min);
+    EXPECT_FLOAT_EQ(0.9, a_normal);
+    EXPECT_FLOAT_EQ(1.0, a_max);
+    EXPECT_FLOAT_EQ(1.0, a_tooLarge);
+}
diff --git a/tests/src/test_AlignmentPrinter.cpp b/tests/src/test_AlignmentPrinter.cpp

new file mode 100644 (file)

index 0000000..89ec98a
--- /dev/null
+++ b/tests/src/test_AlignmentPrinter.cpp
@@ -0,0 +1,152 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+
+#include <pbbam/AlignmentPrinter.h>
+#include <pbbam/BamFile.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/IndexedFastaReader.h>
+#include <string>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+const string lambdaFasta = tests::Data_Dir + "/lambdaNEB.fa";
+const string singleInsertionBam = tests::Data_Dir + "/aligned.bam";
+
+TEST(AlignmentPrinterTest, Print)
+{
+    IndexedFastaReader r(lambdaFasta);
+    AlignmentPrinter pretty(r);
+
+    BamFile bamFile(singleInsertionBam);
+    EntireFileQuery bamQuery(bamFile);
+    auto it = bamQuery.begin();
+
+    // funky formatting used to format alignments
+    auto expected = string
+    {
+        "Read        : singleInsertion/100/0_49\n"
+        "Reference   : lambda_NEB3011\n"
+        "\n"
+        "Read-length : 49\n"
+        "Concordance : 0.96\n"
+        "\n"
+        "5210 : GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGG : 5249\n"
+        "       \x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||| |\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||| ||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||||\n"
+        "   0 : GGCTGCAG-GTACAGCGGTCAGGAGGCCAATTGATGCCGG :   39\n"
+        "\n"
+        "5249 : ACTGGCTGAT : 5259\n"
+        "       |\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||||\n"
+        "  39 : ACTGGCTGAT :   49\n"
+        "\n"
+    };
+
+    auto record = *it++;
+    EXPECT_EQ(expected, pretty.Print(record, Orientation::GENOMIC));
+
+    expected = string 
+    {
+        "Read        : singleInsertion/200/0_49\n"
+        "Reference   : lambda_NEB3011\n"
+        "\n"
+        "Read-length : 49\n"
+        "Concordance : 0.96\n"
+        "\n"
+        "5210 : GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGG : 5249\n"
+        "       \x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||| |\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||| ||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||||\n"
+        "   0 : GGCTGCAG-GTACAGCGGTCAGGAGGCCAATTGATGCCGG :   39\n"
+        "\n"
+        "5249 : ACTGGCTGAT : 5259\n"
+        "       |\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||||\n"
+        "  39 : ACTGGCTGAT :   49\n"
+        "\n"
+    };
+
+    record = *it++;
+    EXPECT_EQ(expected, pretty.Print(record, Orientation::GENOMIC));
+
+    expected = string 
+    {
+        "Read        : singleInsertion/100/0_111\n"
+        "Reference   : lambda_NEB3011\n"
+        "\n"
+        "Read-length : 59\n"
+        "Concordance : 0.951\n"
+        "\n"
+        "9377 : AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCG : 9417\n"
+        "       |||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||  |\n"
+        "   0 : AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGA--G :   38\n"
+        "\n"
+        "9417 : CAGCACGGT-AACAGCGGCAA : 9437\n"
+        "       |||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||| ||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||\n"
+        "  38 : CAGCACGGTAAACAGCGGCAA :   59\n"
+        "\n"
+    };
+
+    record = *it++;
+    EXPECT_EQ(expected, pretty.Print(record, Orientation::GENOMIC));
+
+    expected = string 
+    {
+        "Read        : singleInsertion/100/0_111\n"
+        "Reference   : lambda_NEB3011\n"
+        "\n"
+        "Read-length : 59\n"
+        "Concordance : 0.951\n"
+        "\n"
+        "9377 : AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCG : 9417\n"
+        "       |||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||  |\n"
+        "   0 : AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGA--G :   38\n"
+        "\n"
+        "9417 : CAGCACGGT-AACAGCGGCAA : 9437\n"
+        "       |||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||| ||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||\n"
+        "  38 : CAGCACGGTAAACAGCGGCAA :   59\n"
+        "\n"
+    };
+
+    record = *it++;
+    EXPECT_EQ(expected, pretty.Print(record, Orientation::GENOMIC));
+}
diff --git a/tests/src/test_BamFile.cpp b/tests/src/test_BamFile.cpp

new file mode 100644 (file)

index 0000000..674a471
--- /dev/null
+++ b/tests/src/test_BamFile.cpp
@@ -0,0 +1,142 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <pbbam/BamFile.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/../../src/FileUtils.h>
+#include <stdexcept>
+#include <cstdlib>
+#include <unistd.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace tests {
+
+template<typename T>
+void CheckFile(const T& input, const size_t expectedCount)
+{
+    size_t observedCount = 0;
+    EntireFileQuery entireFile(input);
+    for (const BamRecord& r : entireFile) {
+        (void)r;
+        ++observedCount;
+    }
+    EXPECT_EQ(expectedCount, observedCount);
+}
+
+} // namespace tests
+} // namespace BAM
+} // namespace PacBio
+
+TEST(BamFileTest, NonExistentFileThrows)
+{
+    EXPECT_THROW(BamFile{ "does_not_exist.bam" }, std::runtime_error);
+}
+
+TEST(BamFileTest, NonBamFileThrows)
+{
+    EXPECT_THROW(BamFile { tests::Data_Dir + "/lambdaNEB.fa.fai" }, std::runtime_error);
+}
+
+TEST(BamFileTest, RelativePathBamOk)
+{
+    // cache current working directory, then drill down so we can point to
+    // BAMs using relative path
+    const string cwd = internal::FileUtils::CurrentWorkingDirectory();
+    ASSERT_EQ(0, chdir(tests::Data_Dir.c_str()));
+    ASSERT_EQ(0, chdir("relative/a"));
+
+    // BamFile from relative BAM fn
+    tests::CheckFile(BamFile{ "../b/test1.bam" }, 3);
+
+    // dataset from relative BAM fn
+    tests::CheckFile(DataSet{ "../b/test1.bam" }, 3);
+
+    // dataset from BamFile object (itself from relative BAM fn)
+    {
+        auto file = BamFile{"../b/test1.bam"};
+        tests::CheckFile(DataSet{ file }, 3);
+    }
+
+    // restore working directory
+    ASSERT_EQ(0, chdir(cwd.c_str()));
+}
+
+TEST(BamFileTest, RelativePathXmlOk)
+{
+    // cache current working directory, then drill down so we can point to
+    // BAMs using relative path
+    const string cwd = internal::FileUtils::CurrentWorkingDirectory();
+    ASSERT_EQ(0, chdir(tests::Data_Dir.c_str()));
+
+    // dataset from XML containing relative paths
+    tests::CheckFile(DataSet{ "relative/relative.xml" }, 9);
+
+    // restore working directory
+    ASSERT_EQ(0, chdir(cwd.c_str()));
+}
+
+TEST(BamFileTest, RelativePathFofnOk)
+{
+    // cache current working directory, then drill down so we can point to
+    // BAMs using relative path
+    const string cwd = internal::FileUtils::CurrentWorkingDirectory();
+    ASSERT_EQ(0, chdir(tests::Data_Dir.c_str()));
+
+    // dataset from FOFN containing relative paths
+    tests::CheckFile(DataSet{ "relative/relative.fofn" }, 9);
+
+    // NOTE: doesn't yet support a FOFN containing an XML with relative paths
+//       tests::CheckFile(DataSet{ "relative/relative2.fofn" }, 60);
+
+    // restore working directory
+    ASSERT_EQ(0, chdir(cwd.c_str()));
+}
+
+TEST(BamFileTest, TruncatedFileThrowsOk)
+{
+    EXPECT_THROW(BamFile{ tests::GeneratedData_Dir + "/truncated.bam" }, std::runtime_error);
+}
diff --git a/tests/src/test_BamHeader.cpp b/tests/src/test_BamHeader.cpp

new file mode 100644 (file)

index 0000000..4c49b95
--- /dev/null
+++ b/tests/src/test_BamHeader.cpp
@@ -0,0 +1,436 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <htslib/sam.h>
+#include <pbbam/BamHeader.h>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace tests {
+
+struct BamHdrDeleter
+{
+    void operator()(bam_hdr_t* hdr) {
+        if (hdr)
+            bam_hdr_destroy(hdr);
+        hdr = nullptr;
+    }
+};
+
+} // namespace tests
+} // namespace BAM
+} // namespace PacBio
+
+TEST(BamHeaderTest, DefaultConstruction)
+{
+    BamHeader header;
+    EXPECT_TRUE(header.Version().empty());
+    EXPECT_TRUE(header.SortOrder().empty()); // default to unknown ?
+    EXPECT_TRUE(header.ReadGroups().empty());
+    EXPECT_TRUE(header.Sequences().empty());
+    EXPECT_TRUE(header.Programs().empty());
+    EXPECT_TRUE(header.Comments().empty());
+
+    EXPECT_THROW(header.Program("foo"),     std::exception);
+    EXPECT_THROW(header.ReadGroup("foo"),   std::exception);
+    EXPECT_THROW(header.SequenceId("foo"),  std::exception);
+    EXPECT_THROW(header.SequenceLength(42), std::exception);
+    EXPECT_THROW(header.SequenceName(42),   std::exception);
+}
+
+TEST(BamHeaderTest, DecodeTest)
+{
+    const string& text = "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"
+                         "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n"
+                         "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n"
+                         "@RG\tID:rg1\tSM:control\n"
+                         "@RG\tID:rg2\tSM:condition1\n"
+                         "@RG\tID:rg3\tSM:condition1\n"
+                         "@PG\tID:_foo_\tPN:ide\n"
+                         "@CO\tipsum and so on\n"
+                         "@CO\tcitation needed\n";
+
+    BamHeader header = BamHeader(text);
+
+    EXPECT_EQ(string("1.1"),       header.Version());
+    EXPECT_EQ(string("queryname"), header.SortOrder());
+    EXPECT_EQ(string("3.0.1"),     header.PacBioBamVersion());
+
+    EXPECT_EQ(3, header.ReadGroups().size());
+    EXPECT_TRUE(header.HasReadGroup("rg1"));
+    EXPECT_TRUE(header.HasReadGroup("rg2"));
+    EXPECT_TRUE(header.HasReadGroup("rg3"));
+
+    EXPECT_EQ(string("control"),    header.ReadGroup("rg1").Sample());
+    EXPECT_EQ(string("condition1"), header.ReadGroup("rg2").Sample());
+    EXPECT_EQ(string("condition1"), header.ReadGroup("rg3").Sample());
+
+    EXPECT_EQ(2, header.Sequences().size());
+    EXPECT_TRUE(header.HasSequence("chr1"));
+    EXPECT_TRUE(header.HasSequence("chr2"));
+    EXPECT_EQ(string("chocobo"), header.Sequence("chr1").Species());
+    EXPECT_EQ(string("chocobo"), header.Sequence("chr2").Species());
+    EXPECT_EQ(string("2038"), header.Sequence("chr1").Length());
+    EXPECT_EQ(string("3042"), header.Sequence("chr2").Length());
+
+    EXPECT_EQ(1, header.Programs().size());
+    EXPECT_TRUE(header.HasProgram("_foo_"));
+    EXPECT_EQ(string("ide"), header.Program("_foo_").Name());
+
+    EXPECT_EQ(2, header.Comments().size());
+    EXPECT_EQ(string("ipsum and so on"), header.Comments().at(0));
+    EXPECT_EQ(string("citation needed"), header.Comments().at(1));
+}
+
+TEST(BamHeaderTest, VersionCheckOk)
+{
+    auto expectFail = [](string&& label, string&& text)
+    {
+        SCOPED_TRACE(label);
+        EXPECT_THROW(BamHeader{ text }, std::runtime_error);
+    };
+    expectFail("empty version",        "@HD\tVN:1.1\tSO:queryname\tpb:\n");
+    expectFail("old beta version",     "@HD\tVN:1.1\tSO:queryname\tpb:3.0b3\n");
+    expectFail("old beta version",     "@HD\tVN:1.1\tSO:queryname\tpb:3.0b7\n");
+    expectFail("invalid value",        "@HD\tVN:1.1\tSO:queryname\tpb:3.0.should_not_work\n");
+    expectFail("earlier than minimum", "@HD\tVN:1.1\tSO:queryname\tpb:3.0.0\n");
+
+    // correct version syntax, number
+    EXPECT_NO_THROW(BamHeader{ "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n" });
+}
+
+TEST(BamHeaderTest, EncodeTest)
+{
+    ReadGroupInfo rg1("rg1");
+    rg1.Sample("control");
+    ReadGroupInfo rg2("rg2");
+    rg2.Sample("condition1");
+    ReadGroupInfo rg3("rg3");
+    rg3.Sample("condition1");
+
+    SequenceInfo seq1("chr1");
+    seq1.Length("2038").Species("chocobo");
+    SequenceInfo seq2("chr2");
+    seq2.Length("3042").Species("chocobo");
+
+    ProgramInfo prog1("_foo_");
+    prog1.Name("ide");
+
+    BamHeader header;
+    header.Version("1.1")
+          .SortOrder("queryname")
+          .PacBioBamVersion("3.0.1")
+          .AddReadGroup(rg1)
+          .AddReadGroup(rg2)
+          .AddReadGroup(rg3)
+          .AddSequence(seq1)
+          .AddSequence(seq2)
+          .AddProgram(prog1)
+          .AddComment("ipsum and so on")
+          .AddComment("citation needed");
+
+    const string& expectedText = "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"
+                                 "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n"
+                                 "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n"
+                                 "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL\n"
+                                 "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+                                 "@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+                                 "@PG\tID:_foo_\tPN:ide\n"
+                                 "@CO\tipsum and so on\n"
+                                 "@CO\tcitation needed\n";
+
+    const string& text = header.ToSam();
+    EXPECT_EQ(expectedText, text);
+}
+
+TEST(BamHeaderTest, ConvertToRawDataOk)
+{
+    ReadGroupInfo rg1("rg1");
+    rg1.Sample("control");
+    ReadGroupInfo rg2("rg2");
+    rg2.Sample("condition1");
+    ReadGroupInfo rg3("rg3");
+    rg3.Sample("condition1");
+
+    SequenceInfo seq1("chr1");
+    seq1.Length("2038").Species("chocobo");
+    SequenceInfo seq2("chr2");
+    seq2.Length("3042").Species("chocobo");
+
+    ProgramInfo prog1("_foo_");
+    prog1.Name("ide");
+
+    BamHeader header;
+    header.Version("1.1")
+          .SortOrder("queryname")
+          .PacBioBamVersion("3.0.1")
+          .AddReadGroup(rg1)
+          .AddReadGroup(rg2)
+          .AddReadGroup(rg3)
+          .AddSequence(seq1)
+          .AddSequence(seq2)
+          .AddProgram(prog1)
+          .AddComment("ipsum and so on")
+          .AddComment("citation needed");
+
+    const string& expectedText = "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"
+                                 "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n"
+                                 "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n"
+                                 "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL\n"
+                                 "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+                                 "@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+                                 "@PG\tID:_foo_\tPN:ide\n"
+                                 "@CO\tipsum and so on\n"
+                                 "@CO\tcitation needed\n";
+
+
+    const string& text = header.ToSam();
+    PBBAM_SHARED_PTR<bam_hdr_t> rawData(sam_hdr_parse(text.size(), text.c_str()), tests::BamHdrDeleter());
+    rawData->ignore_sam_err = 0;
+    rawData->cigar_tab = NULL;
+    rawData->l_text = text.size();
+    rawData->text = (char*)calloc(rawData->l_text + 1, 1);
+    memcpy(rawData->text, text.c_str(), rawData->l_text);
+
+    const string& rawText = string(rawData->text, rawData->l_text);
+    EXPECT_EQ(expectedText, rawText);
+}
+
+TEST(BamHeaderTest, ExtractFromRawDataOk)
+{
+    ReadGroupInfo rg1("rg1");
+    rg1.Sample("control");
+    ReadGroupInfo rg2("rg2");
+    rg2.Sample("condition1");
+    ReadGroupInfo rg3("rg3");
+    rg3.Sample("condition1");
+
+    SequenceInfo seq1("chr1");
+    seq1.Length("2038").Species("chocobo");
+    SequenceInfo seq2("chr2");
+    seq2.Length("3042").Species("chocobo");
+
+    ProgramInfo prog1("_foo_");
+    prog1.Name("ide");
+
+    BamHeader header;
+    header.Version("1.1")
+          .SortOrder("queryname")
+          .PacBioBamVersion("3.0.1")
+          .AddReadGroup(rg1)
+          .AddReadGroup(rg2)
+          .AddReadGroup(rg3)
+          .AddSequence(seq1)
+          .AddSequence(seq2)
+          .AddProgram(prog1)
+          .AddComment("ipsum and so on")
+          .AddComment("citation needed");
+
+    const string& expectedText = "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"
+                                 "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n"
+                                 "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n"
+                                 "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL\n"
+                                 "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+                                 "@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+                                 "@PG\tID:_foo_\tPN:ide\n"
+                                 "@CO\tipsum and so on\n"
+                                 "@CO\tcitation needed\n";
+
+
+    string text = header.ToSam();
+    PBBAM_SHARED_PTR<bam_hdr_t> rawData(sam_hdr_parse(text.size(), text.c_str()), tests::BamHdrDeleter());
+    rawData->ignore_sam_err = 0;
+    rawData->cigar_tab = NULL;
+    rawData->l_text = text.size();
+    rawData->text = (char*)calloc(rawData->l_text + 1, 1);
+    memcpy(rawData->text, text.c_str(), rawData->l_text);
+
+    const BamHeader newHeader = BamHeader(string(rawData->text, rawData->l_text));
+
+    EXPECT_EQ(header.Version(),          newHeader.Version());
+    EXPECT_EQ(header.SortOrder(),        newHeader.SortOrder());
+    EXPECT_EQ(header.PacBioBamVersion(), newHeader.PacBioBamVersion());
+
+    text = newHeader.ToSam();
+    EXPECT_EQ(expectedText, text);
+}
+
+TEST(BamHeaderTest, MergeOk)
+{
+    const string hdrText1 = {
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+            "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+            "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\t"
+            "PU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0\t"
+            "PM:SEQUEL\n"
+        "@PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0\n"
+        "@PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\n"
+        "@CO\tcomment1\n"
+    };
+
+    const string hdrText2 = {
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;"
+            "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;"
+            "PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;"
+            "PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;"
+            "BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;"
+            "FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\t"
+            "PM:SEQUEL\n"
+        "@PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0\n"
+        "@PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0\n"
+        "@PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0\n"
+        "@CO\tcomment2\n"
+    };
+
+    const string mergedText = {
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+            "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+            "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\t"
+            "PU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0\t"
+            "PM:SEQUEL\n"
+        "@RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;"
+            "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;"
+            "PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;"
+            "PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;"
+            "BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;"
+            "FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\t"
+            "PM:SEQUEL\n"
+        "@PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0\n"
+        "@PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\n"
+        "@PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0\n"
+        "@PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0\n"
+        "@PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0\n"
+        "@CO\tcomment1\n"
+        "@CO\tcomment2\n"
+    };
+
+    { // operator+
+
+        const BamHeader header1(hdrText1);
+        const BamHeader header2(hdrText2);
+        const BamHeader merged = header1 + header2;
+        EXPECT_EQ(mergedText, merged.ToSam());
+
+        // also make sure inputs not changed
+        EXPECT_EQ(hdrText1, header1.ToSam());
+        EXPECT_EQ(hdrText2, header2.ToSam());
+    }
+
+    { // operator+=
+
+        BamHeader header1(hdrText1);
+        header1 += BamHeader(hdrText2);
+        EXPECT_EQ(mergedText, header1.ToSam());
+    }
+}
+
+TEST(BamHeaderTest, MergeHandlesDuplicateReadGroups)
+{
+    const string hdrText = {
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+            "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+            "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\t"
+            "PU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0\tPM:SEQUEL\n"
+        "@PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0\n"
+        "@PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\n"
+    };
+
+    // duplicate @RG:IDs handled ok (i.e. not duplicated in output)
+    const BamHeader header1(hdrText);
+    const BamHeader header2(hdrText);
+    const BamHeader merged = header1 + header2;
+    EXPECT_EQ(hdrText, merged.ToSam());
+}
+
+TEST(BamHeaderTest, MergeCompatibilityOk)
+{
+    {   // different @HD:VN - this IS allowed (as of SAT-465, pbbam v0.7.2)
+        const string hdrText1 = { "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n" };
+        const string hdrText2 = { "@HD\tVN:1.0\tSO:unknown\tpb:3.0.1\n" };
+        const BamHeader header1(hdrText1);
+        const BamHeader header2(hdrText2);
+        EXPECT_NO_THROW(header1 + header2);
+    }
+
+    {   // different @HD:SO
+        const string hdrText1 = { "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n" };
+        const string hdrText2 = { "@HD\tVN:1.1\tSO:coordinate\tpb:3.0.1\n" };
+        const BamHeader header1(hdrText1);
+        const BamHeader header2(hdrText2);
+        EXPECT_THROW(header1 + header2, std::runtime_error);
+    }
+
+    {   // different @HD:pb - this IS allowed (as of SAT-529, pbbam 0.7.4)
+        const string hdrText1 = { "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n" };
+        const string hdrText2 = { "@HD\tVN:1.1\tSO:unknown\tpb:3.0.3\n" };
+        const BamHeader header1(hdrText1);
+        const BamHeader header2(hdrText2);
+        EXPECT_NO_THROW(header1 + header2);
+    }
+
+    {   // @SQ list clash
+        const string hdrText1 = {
+            "@HD\tVN:1.1\tSO:coordinate\tpb:3.0.1\n"
+            "@SQ\tSN:foo\tLN:42\n"
+            "@SQ\tSN:bar\tLN:24\n"
+        };
+        const string hdrText2 = {
+            "@HD\tVN:1.1\tSO:coordinate\tpb:3.0.1\n"
+            "@SQ\tSN:foo\tLN:42\n"
+            "@SQ\tSN:baz\tLN:99\n"
+        };
+        const BamHeader header1(hdrText1);
+        const BamHeader header2(hdrText2);
+        EXPECT_THROW(header1 + header2, std::runtime_error);
+    }
+}
diff --git a/tests/src/test_BamRecord.cpp b/tests/src/test_BamRecord.cpp

new file mode 100644 (file)

index 0000000..753ab6b
--- /dev/null
+++ b/tests/src/test_BamRecord.cpp
@@ -0,0 +1,2708 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/BamTagCodec.h>
+#include <array>
+#include <initializer_list>
+#include <string>
+#include <vector>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace tests {
+
+static
+BamRecordImpl CreateBamImpl(void)
+{
+    TagCollection tags;
+    tags["HX"] = string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = vector<uint8_t>({34, 5, 125});
+    tags["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl bam;
+    bam.Bin(42);
+    bam.Flag(42);
+    bam.InsertSize(42);
+    bam.MapQuality(42);
+    bam.MatePosition(42);
+    bam.MateReferenceId(42);
+    bam.Position(42);
+    bam.ReferenceId(42);
+    bam.Tags(tags);
+    return bam;
+}
+
+static inline
+BamRecord CreateBam(void)
+{ return BamRecord{ CreateBamImpl() }; }
+
+static
+void CheckRawData(const BamRecordImpl& bam)
+{
+    // ensure raw data (lengths at least) matches API-facing data
+    const uint32_t expectedNameLength  = bam.Name().size() + 1;
+    const uint32_t expectedNumCigarOps = bam.CigarData().size();
+    const int32_t  expectedSeqLength   = bam.Sequence().length();
+    const size_t   expectedTagsLength  = BamTagCodec::Encode(bam.Tags()).size();
+
+    //  Name        CIGAR         Sequence       Quals      Tags
+    // l_qname + (n_cigar * 4) + (l_qseq+1)/2 + l_qseq + << TAGS >>
+    const int expectedTotalDataLength = expectedNameLength +
+                                        (expectedNumCigarOps * 4) +
+                                        (expectedSeqLength+1)/2 +
+                                         expectedSeqLength +
+                                         expectedTagsLength;
+    EXPECT_TRUE((bool)bam.d_);
+    EXPECT_EQ(expectedNameLength,      bam.d_->core.l_qname);
+    EXPECT_EQ(expectedNumCigarOps,     bam.d_->core.n_cigar);
+    EXPECT_EQ(expectedSeqLength,       bam.d_->core.l_qseq);
+    EXPECT_EQ(expectedTotalDataLength, bam.d_->l_data);
+}
+
+static inline
+void CheckRawData(const BamRecord& bam)
+{ CheckRawData(bam.impl_); }
+
+static
+BamRecordImpl MakeCigaredImpl(const string& seq,
+                              const string& cigar,
+                              const Strand strand)
+{
+    BamRecordImpl impl;
+    impl.SetMapped(true).ReferenceId(0).Position(0).MapQuality(0);
+    impl.CigarData(Cigar::FromStdString(cigar));
+    impl.MateReferenceId(-1).MatePosition(-1).InsertSize(0);
+    impl.SetSequenceAndQualities(seq, string(seq.size(), '*'));
+    impl.SetReverseStrand(strand == Strand::REVERSE);
+    return impl;
+}
+
+static inline
+BamRecord MakeCigaredRecord(const string& seq,
+                            const string& cigar,
+                            const Strand strand)
+{ return BamRecord{ MakeCigaredImpl(seq, cigar, strand) }; }
+
+static
+BamRecord MakeCigaredBaseRecord(const string& bases,
+                                const string& cigar,
+                                const Strand strand)
+{
+    TagCollection tags;
+    tags["dt"] = bases;
+    tags["st"] = bases;
+
+    const string seq = string(bases.size(), 'N');
+    BamRecordImpl impl = MakeCigaredImpl(seq, cigar, strand);
+    impl.Tags(tags);
+    return BamRecord(std::move(impl));
+}
+
+static
+BamRecord MakeCigaredFrameRecord(const vector<uint16_t>& frames,
+                                 const string& cigar,
+                                 const Strand strand)
+{
+    TagCollection tags;
+    tags["ip"] = frames;
+    tags["pw"] = frames;
+
+    const string seq = string(frames.size(), 'N');
+    BamRecordImpl impl = MakeCigaredImpl(seq, cigar, strand);
+    impl.Tags(tags);
+    return BamRecord(std::move(impl));
+}
+
+static
+BamRecord MakeCigaredQualRecord(const string& quals,
+                                const string& cigar,
+                                const Strand strand)
+{
+    TagCollection tags;
+    tags["dq"] = quals;
+    tags["iq"] = quals;
+    tags["mq"] = quals;
+    tags["sq"] = quals;
+
+    const string seq = string(quals.size(), 'N');
+    BamRecordImpl impl = MakeCigaredImpl(seq, cigar, strand);
+    impl.Tags(tags);
+    return BamRecord(std::move(impl));
+}
+
+static
+BamRecord MakeCigaredPulseBaseRecord(const string& seqBases,
+                                     const string& pulseCalls,
+                                     const string& pulseBases,
+                                     const string& cigar,
+                                     const Strand strand)
+{
+    TagCollection tags;
+    tags["pc"] = pulseCalls; // PulseCall
+    tags["pt"] = pulseBases; // AltLabelTag
+
+    BamRecordImpl impl = MakeCigaredImpl(seqBases, cigar, strand);
+    impl.Tags(tags);
+    return BamRecord(std::move(impl));
+}
+
+static
+BamRecord MakeCigaredPulseQualRecord(const string& seqBases,
+                                     const string& pulseCalls,
+                                     const string& pulseQuals,
+                                     const string& cigar,
+                                     const Strand strand)
+{
+    TagCollection tags;
+    tags["pc"] = pulseCalls;
+    tags["pv"] = pulseQuals; // AltLabelQV
+    tags["pq"] = pulseQuals; // LabelQV
+    tags["pg"] = pulseQuals; // PulseMergeQV
+
+    BamRecordImpl impl = MakeCigaredImpl(seqBases, cigar, strand);
+    impl.Tags(tags);
+    return BamRecord(std::move(impl));
+}
+
+static
+BamRecord MakeCigaredPulseFrameRecord(const string& seqBases,
+                                     const string& pulseCalls,
+                                     const vector<uint16_t>& pulseFrames,
+                                     const string& cigar,
+                                     const Strand strand)
+{
+    TagCollection tags;
+    tags["pc"] = pulseCalls;
+    tags["pd"] = pulseFrames; // PrePulseFrames
+    tags["px"] = pulseFrames; // PulseCallWidth
+
+    BamRecordImpl impl = MakeCigaredImpl(seqBases, cigar, strand);
+    impl.Tags(tags);
+    return BamRecord(std::move(impl));
+}
+
+static
+BamRecord MakeCigaredPulseUIntRecord(const string& seqBases,
+                                     const string& pulseCalls,
+                                     const vector<uint32_t>& pulseUInts,
+                                     const string& cigar,
+                                     const Strand strand)
+{
+    TagCollection tags;
+    tags["pc"] = pulseCalls;
+    tags["sf"] = pulseUInts; // StartFrame
+
+    BamRecordImpl impl = MakeCigaredImpl(seqBases, cigar, strand);
+    impl.Tags(tags);
+    return BamRecord(std::move(impl));
+}
+
+// ----------------------------------------------------------
+// helper structs and methods for checking combinations of:
+//   aligned strand, orientation requested, alignment, clipping
+// ----------------------------------------------------------
+
+// generic result holder for various requested states
+template<typename T>
+struct ExpectedResult
+{
+public:
+    ExpectedResult(std::initializer_list<T> init)
+        : d_(init)
+    {
+        assert(12 == init.size());
+    }
+
+    T ForwardGenomic(void) const               { return d_.at(0); }
+    T ForwardNative(void) const                { return d_.at(1); }
+    T ForwardGenomicAligned(void) const        { return d_.at(2); }
+    T ForwardNativeAligned(void) const         { return d_.at(3); }
+    T ForwardGenomicAlignedClipped(void) const { return d_.at(4); }
+    T ForwardNativeAlignedClipped(void) const  { return d_.at(5); }
+    T ReverseGenomic(void) const               { return d_.at(6); }
+    T ReverseNative(void) const                { return d_.at(7); }
+    T ReverseGenomicAligned(void) const        { return d_.at(8); }
+    T ReverseNativeAligned(void) const         { return d_.at(9); }
+    T ReverseGenomicAlignedClipped(void) const { return d_.at(10); }
+    T ReverseNativeAlignedClipped(void) const  { return d_.at(11); }
+
+private:
+    vector<T> d_;
+};
+
+// generic data type checker on the various requested states
+template<typename DataType, typename MakeRecordType, typename FetchDataType>
+void CheckAlignAndClip(const string& cigar,
+                       const DataType& input,
+                       const tests::ExpectedResult<DataType>& e,
+                       const MakeRecordType& makeRecord,
+                       const FetchDataType& fetchData)
+{
+    {   // map to forward strand
+        const BamRecord b = makeRecord(input, cigar, Strand::FORWARD);
+        EXPECT_EQ(e.ForwardGenomic(),               fetchData(b, Orientation::GENOMIC, false, false));
+        EXPECT_EQ(e.ForwardNative(),                fetchData(b, Orientation::NATIVE,  false, false));
+        EXPECT_EQ(e.ForwardGenomicAligned(),        fetchData(b, Orientation::GENOMIC, true,  false));
+        EXPECT_EQ(e.ForwardNativeAligned(),         fetchData(b, Orientation::NATIVE,  true,  false));
+        EXPECT_EQ(e.ForwardGenomicAlignedClipped(), fetchData(b, Orientation::GENOMIC, true,  true));
+        EXPECT_EQ(e.ForwardNativeAlignedClipped(),  fetchData(b, Orientation::NATIVE,  true,  true));
+    }
+    {   // map to reverse strand
+        const BamRecord b = makeRecord(input, cigar, Strand::REVERSE);
+        EXPECT_EQ(e.ReverseGenomic(),               fetchData(b, Orientation::GENOMIC, false, false));
+        EXPECT_EQ(e.ReverseNative(),                fetchData(b, Orientation::NATIVE,  false, false));
+        EXPECT_EQ(e.ReverseGenomicAligned(),        fetchData(b, Orientation::GENOMIC, true,  false));
+        EXPECT_EQ(e.ReverseNativeAligned(),         fetchData(b, Orientation::NATIVE,  true,  false));
+        EXPECT_EQ(e.ReverseGenomicAlignedClipped(), fetchData(b, Orientation::GENOMIC, true,  true));
+        EXPECT_EQ(e.ReverseNativeAlignedClipped(),  fetchData(b, Orientation::NATIVE,  true,  true));
+    }
+}
+
+template<typename DataType, typename MakeRecordType, typename FetchDataType>
+void CheckPulseDataAlignAndClip(const string& cigar,
+                                const string& seqBases,
+                                const string& pulseCalls,
+                                const DataType& input,
+                                const tests::ExpectedResult<DataType>& allPulses,
+                                const tests::ExpectedResult<DataType>& basecallsOnly,
+                                const MakeRecordType& makeRecord,
+                                const FetchDataType& fetchData)
+{
+    {   // map to forward strand
+        const BamRecord b = makeRecord(seqBases, pulseCalls, input, cigar, Strand::FORWARD);
+
+        EXPECT_EQ(allPulses.ForwardGenomic(),               fetchData(b, Orientation::GENOMIC, false, false, PulseBehavior::ALL));
+        EXPECT_EQ(allPulses.ForwardNative(),                fetchData(b, Orientation::NATIVE,  false, false, PulseBehavior::ALL));
+        // no align/clipping operations available on ALL pulses
+
+        EXPECT_EQ(basecallsOnly.ForwardGenomic(),               fetchData(b, Orientation::GENOMIC, false, false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ForwardNative(),                fetchData(b, Orientation::NATIVE,  false, false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ForwardGenomicAligned(),        fetchData(b, Orientation::GENOMIC, true,  false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ForwardNativeAligned(),         fetchData(b, Orientation::NATIVE,  true,  false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ForwardGenomicAlignedClipped(), fetchData(b, Orientation::GENOMIC, true,  true,  PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ForwardNativeAlignedClipped(),  fetchData(b, Orientation::NATIVE,  true,  true,  PulseBehavior::BASECALLS_ONLY));
+    }
+    {   // map to reverse strand
+        const BamRecord b = makeRecord(seqBases, pulseCalls, input, cigar, Strand::REVERSE);
+
+        EXPECT_EQ(allPulses.ReverseGenomic(),               fetchData(b, Orientation::GENOMIC, false, false, PulseBehavior::ALL));
+        EXPECT_EQ(allPulses.ReverseNative(),                fetchData(b, Orientation::NATIVE,  false, false, PulseBehavior::ALL));
+        // no align/clipping operations available on ALL pulses
+
+        EXPECT_EQ(basecallsOnly.ReverseGenomic(),               fetchData(b, Orientation::GENOMIC, false, false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ReverseNative(),                fetchData(b, Orientation::NATIVE,  false, false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ReverseGenomicAligned(),        fetchData(b, Orientation::GENOMIC, true,  false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ReverseNativeAligned(),         fetchData(b, Orientation::NATIVE,  true,  false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ReverseGenomicAlignedClipped(), fetchData(b, Orientation::GENOMIC, true,  true,  PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ReverseNativeAlignedClipped(),  fetchData(b, Orientation::NATIVE,  true,  true,  PulseBehavior::BASECALLS_ONLY));
+    }
+}
+
+static
+void CheckBaseTagsClippedAndAligned(const string& cigar,
+                                    const string& input,
+                                    const ExpectedResult<string>& e)
+{
+    // aligned record + DeletionTag, SubstitutionTag
+    auto makeRecord = [](const string& bases,
+                         const string& cigar,
+                         const Strand strand)
+    { return MakeCigaredBaseRecord(bases, cigar, strand); };
+
+    // DeletionTag
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.DeletionTag(orientation, aligned, exciseSoftClips); }
+    );
+
+    // SubstitutionTag
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.SubstitutionTag(orientation, aligned, exciseSoftClips); }
+    );
+}
+
+static
+void CheckFrameTagsClippedAndAligned(const string& cigar,
+                                     const vector<uint16_t>& input,
+                                     const ExpectedResult<vector<uint16_t> >& e)
+{
+
+    // aligned record + IPD, PulseWidth
+    auto makeRecord = [](const vector<uint16_t>& frames,
+                         const string& cigar,
+                         const Strand strand)
+    { return tests::MakeCigaredFrameRecord(frames, cigar, strand); };
+
+    // IPD
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.IPD(orientation, aligned, exciseSoftClips).Data(); }
+    );
+
+    // PulseWidth
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.PulseWidth(orientation, aligned, exciseSoftClips).Data(); }
+    );
+}
+
+static
+void CheckQualityTagsClippedAndAligned(const string& cigar,
+                                       const string& input,
+                                       const ExpectedResult<string>& e)
+{
+    // aligned record + DeletionQV, InsertionQV, MergeQV, SubstitutionQV
+    auto makeRecord = [](const string& quals,
+                         const string& cigar,
+                         const Strand strand)
+    { return tests::MakeCigaredQualRecord(quals, cigar, strand); };
+
+    // DeletionQV
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.DeletionQV(orientation, aligned, exciseSoftClips).Fastq(); }
+    );
+
+    // InsertionQV
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.InsertionQV(orientation, aligned, exciseSoftClips).Fastq(); }
+    );
+
+    // MergeQV
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.MergeQV(orientation, aligned, exciseSoftClips).Fastq(); }
+    );
+
+    // SubstitutionQV
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.SubstitutionQV(orientation, aligned, exciseSoftClips).Fastq(); }
+    );
+}
+
+static
+void CheckQualitiesClippedAndAligned(const string& cigar,
+                                     const string& input,
+                                     const ExpectedResult<string>& e)
+{
+    // aligned record w/ dummy SEQ & QUALs under test
+    auto makeRecord = [](const string& quals,
+                         const string& cigar,
+                         const Strand strand)
+    {
+        const string seq = string(quals.size(), 'N');
+        auto record = tests::MakeCigaredRecord(seq, cigar, strand);
+        record.Impl().SetSequenceAndQualities(seq, quals);
+        return record;
+    };
+
+    // QUAL
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.Qualities(orientation, aligned, exciseSoftClips).Fastq(); }
+    );
+}
+
+static
+void CheckSequenceClippedAndAligned(const string& cigar,
+                                    const string& input,
+                                    const ExpectedResult<string>& e)
+{
+    // aligned record w/ SEQ
+    auto makeRecord = [](const string& seq,
+                         const string& cigar,
+                         const Strand strand)
+    { return tests::MakeCigaredRecord(seq, cigar, strand); };
+
+    // SEQ
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.Sequence(orientation, aligned, exciseSoftClips); }
+    );
+}
+
+static
+void CheckPulseBaseTags(const string& cigar,
+                        const string& seqBases,
+                        const string& pulseCalls,
+                        const string& pulseBases,
+                        const ExpectedResult<string>& allPulses,
+                        const ExpectedResult<string>& basecallsOnly)
+{
+    // aligned record + AltLabelTag
+    auto makeRecord = [](const string& seqBases,
+                         const string& pulseCalls,
+                         const string& pulseBases,
+                         const string& cigar,
+                         const Strand strand)
+    { return MakeCigaredPulseBaseRecord(seqBases, pulseCalls, pulseBases, cigar, strand); };
+
+    // AltLabelTag
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseBases, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.AltLabelTag(orientation, aligned, exciseSoftClips, pulseBehavior); }
+    );
+    // PulseCall
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseBases, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.PulseCall(orientation, aligned, exciseSoftClips, pulseBehavior); }
+    );
+}
+
+static
+void CheckPulseFrameTags(const string& cigar,
+                         const string& seqBases,
+                         const string& pulseCalls,
+                         const vector<uint16_t>& pulseFrames,
+                         const ExpectedResult<vector<uint16_t>>& allPulses,
+                         const ExpectedResult<vector<uint16_t>>& basecallsOnly)
+{
+    // aligned record + PrePulseFrames
+    auto makeRecord = [](const string& seqBases,
+                         const string& pulseCalls,
+                         const vector<uint16_t>& pulseFrames,
+                         const string& cigar,
+                         const Strand strand)
+    { return MakeCigaredPulseFrameRecord(seqBases, pulseCalls, pulseFrames, cigar, strand); };
+
+    // PrePulseFrame
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseFrames, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.PrePulseFrames(orientation, aligned, exciseSoftClips, pulseBehavior).Data(); }
+    );
+    // PulseCallWidth
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseFrames, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.PulseCallWidth(orientation, aligned, exciseSoftClips, pulseBehavior).Data(); }
+    );
+}
+
+/*
+
+    { BamRecordTag::PKMEAN,            {"pa", true}  },   photons (vector<float>
+    { BamRecordTag::PKMEAN_2,          {"ps", true}  },   photons
+    { BamRecordTag::PKMID,             {"pm", true}  },   photons
+    { BamRecordTag::PKMID_2,           {"pi", true}  },   photons
+*/
+
+static
+void CheckPulseQualityTags(const string& cigar,
+                           const string& seqBases,
+                           const string& pulseCalls,
+                           const string& pulseQuals,
+                           const ExpectedResult<string>& allPulses,
+                           const ExpectedResult<string>& basecallsOnly)
+{
+    // aligned record + AltLabelQV
+    auto makeRecord = [](const string& seqBases,
+                         const string& pulseCalls,
+                         const string& pulseQuals,
+                         const string& cigar,
+                         const Strand strand)
+    { return MakeCigaredPulseQualRecord(seqBases, pulseCalls, pulseQuals, cigar, strand); };
+
+    // AltLabelQV
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseQuals, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.AltLabelQV(orientation, aligned, exciseSoftClips, pulseBehavior).Fastq(); }
+    );
+    // LabelQV
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseQuals, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.LabelQV(orientation, aligned, exciseSoftClips, pulseBehavior).Fastq(); }
+    );
+    // PulseMergeQV
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseQuals, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.PulseMergeQV(orientation, aligned, exciseSoftClips, pulseBehavior).Fastq(); }
+    );
+}
+
+static
+void CheckPulseUIntTags(const string& cigar,
+                        const string& seqBases,
+                        const string& pulseCalls,
+                        const vector<uint32_t>& startFrames,
+                        const ExpectedResult<vector<uint32_t>>& allPulses,
+                        const ExpectedResult<vector<uint32_t>>& basecallsOnly)
+{
+   // aligned record + StartFrame
+   auto makeRecord = [](const string& seqBases,
+                        const string& pulseCalls,
+                        const vector<uint32_t>& startFrames,
+                        const string& cigar,
+                        const Strand strand)
+   { return MakeCigaredPulseUIntRecord(seqBases, pulseCalls, startFrames, cigar, strand); };
+
+   // StartFrame
+   CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, startFrames, allPulses, basecallsOnly, makeRecord,
+                             [](const BamRecord& b,
+                                Orientation orientation,
+                                bool aligned,
+                                bool exciseSoftClips,
+                                PulseBehavior pulseBehavior)
+                             { return b.StartFrame(orientation, aligned, exciseSoftClips, pulseBehavior); }
+   );
+}
+
+
+
+} // namespace tests
+
+TEST(BamRecordTest, DefaultValues)
+{
+    BamRecord bam;
+    const string emptyString;
+
+    // BamRecordImpl data
+    EXPECT_EQ(0, bam.impl_.Bin());
+    EXPECT_EQ(BamRecordImpl::UNMAPPED, bam.impl_.Flag());  // forced init unmapped
+    EXPECT_EQ(0, bam.impl_.InsertSize());
+    EXPECT_EQ(255, bam.impl_.MapQuality());
+    EXPECT_EQ(-1, bam.impl_.MateReferenceId());
+    EXPECT_EQ(-1, bam.impl_.MatePosition());
+    EXPECT_EQ(-1, bam.impl_.Position());
+    EXPECT_EQ(-1, bam.impl_.ReferenceId());
+    EXPECT_EQ(0, bam.impl_.Tags().size());
+
+    EXPECT_FALSE(bam.impl_.IsDuplicate());
+    EXPECT_FALSE(bam.impl_.IsFailedQC());
+    EXPECT_FALSE(bam.impl_.IsFirstMate());
+    EXPECT_FALSE(bam.impl_.IsMapped());             // forced init unmapped
+    EXPECT_TRUE(bam.impl_.IsMateMapped());
+    EXPECT_FALSE(bam.impl_.IsMateReverseStrand());
+    EXPECT_FALSE(bam.impl_.IsPaired());
+    EXPECT_TRUE(bam.impl_.IsPrimaryAlignment());
+    EXPECT_FALSE(bam.impl_.IsProperPair());
+    EXPECT_FALSE(bam.impl_.IsReverseStrand());
+    EXPECT_FALSE(bam.impl_.IsSecondMate());
+    EXPECT_FALSE(bam.impl_.IsSupplementaryAlignment());
+
+    EXPECT_EQ(emptyString, bam.impl_.Name());
+    EXPECT_EQ(emptyString, bam.impl_.CigarData().ToStdString());
+    EXPECT_EQ(emptyString, bam.impl_.Sequence());
+    EXPECT_EQ(emptyString, bam.impl_.Qualities().Fastq());
+
+    // PacBio data
+    EXPECT_EQ(-1, bam.AlignedStart());
+    EXPECT_EQ(-1, bam.AlignedEnd());
+
+    EXPECT_FALSE(bam.HasHoleNumber());
+    EXPECT_FALSE(bam.HasNumPasses());
+    EXPECT_FALSE(bam.HasQueryEnd());
+    EXPECT_FALSE(bam.HasQueryStart());
+    EXPECT_FALSE(bam.HasReadAccuracy());
+
+    EXPECT_THROW(bam.HoleNumber(), std::exception);
+    EXPECT_THROW(bam.NumPasses(), std::exception);
+    EXPECT_EQ(Position(0), bam.QueryEnd());
+    EXPECT_EQ(Position(0), bam.QueryStart());
+    EXPECT_THROW(bam.ReadAccuracy(), std::exception);
+
+    EXPECT_FALSE(bam.HasDeletionQV());
+    EXPECT_FALSE(bam.HasDeletionTag());
+    EXPECT_FALSE(bam.HasInsertionQV());
+    EXPECT_FALSE(bam.HasMergeQV());
+    EXPECT_FALSE(bam.HasSubstitutionQV());
+    EXPECT_FALSE(bam.HasSubstitutionTag());
+
+    EXPECT_THROW(bam.DeletionQV(),      std::exception);
+    EXPECT_THROW(bam.DeletionTag(),     std::exception);
+    EXPECT_THROW(bam.InsertionQV(),     std::exception);
+    EXPECT_THROW(bam.MergeQV(),         std::exception);
+    EXPECT_THROW(bam.SubstitutionQV(),  std::exception);
+    EXPECT_THROW(bam.SubstitutionTag(), std::exception);
+
+    // raw data
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordTest, FromBamRecordImpl)
+{
+    // check generic data
+    BamRecordImpl genericBam = tests::CreateBamImpl();
+
+    EXPECT_EQ(42, genericBam.Bin());
+    EXPECT_EQ(42, genericBam.Flag());
+    EXPECT_EQ(42, genericBam.InsertSize());
+    EXPECT_EQ(42, genericBam.MapQuality());
+    EXPECT_EQ(42, genericBam.MateReferenceId());
+    EXPECT_EQ(42, genericBam.MatePosition());
+    EXPECT_EQ(42, genericBam.Position());
+    EXPECT_EQ(42, genericBam.ReferenceId());
+
+    const TagCollection& genericTags = genericBam.Tags();
+    EXPECT_TRUE(genericTags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(string("1abc75"), genericTags.at("HX").ToString());
+    EXPECT_EQ(static_cast<int32_t>(-42), genericTags.at("XY").ToInt32());
+    EXPECT_EQ(vector<uint8_t>({34, 5, 125}), genericTags.at("CA").ToUInt8Array());
+
+    // copy ctor
+    BamRecord bam1(genericBam);
+
+    EXPECT_EQ(42, bam1.impl_.Bin());
+    EXPECT_EQ(42, bam1.impl_.Flag());
+    EXPECT_EQ(42, bam1.impl_.InsertSize());
+    EXPECT_EQ(42, bam1.impl_.MapQuality());
+    EXPECT_EQ(42, bam1.impl_.MateReferenceId());
+    EXPECT_EQ(42, bam1.impl_.MatePosition());
+    EXPECT_EQ(42, bam1.impl_.Position());
+    EXPECT_EQ(42, bam1.impl_.ReferenceId());
+
+    const TagCollection& bam1Tags = bam1.impl_.Tags();
+    EXPECT_TRUE(bam1Tags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(string("1abc75"), bam1Tags.at("HX").ToString());
+    EXPECT_EQ(static_cast<int32_t>(-42), bam1Tags.at("XY").ToInt32());
+    EXPECT_EQ(vector<uint8_t>({34, 5, 125}), bam1Tags.at("CA").ToUInt8Array());
+
+    // copy assignment
+    BamRecord bam2;
+    bam2 = genericBam;
+
+    EXPECT_EQ(42, bam2.impl_.Bin());
+    EXPECT_EQ(42, bam2.impl_.Flag());
+    EXPECT_EQ(42, bam2.impl_.InsertSize());
+    EXPECT_EQ(42, bam2.impl_.MapQuality());
+    EXPECT_EQ(42, bam2.impl_.MateReferenceId());
+    EXPECT_EQ(42, bam2.impl_.MatePosition());
+    EXPECT_EQ(42, bam2.impl_.Position());
+    EXPECT_EQ(42, bam2.impl_.ReferenceId());
+
+    const TagCollection& bam2Tags = bam2.impl_.Tags();
+    EXPECT_TRUE(bam2Tags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(string("1abc75"), bam2Tags.at("HX").ToString());
+    EXPECT_EQ(static_cast<int32_t>(-42), bam2Tags.at("XY").ToInt32());
+    EXPECT_EQ(vector<uint8_t>({34, 5, 125}), bam2Tags.at("CA").ToUInt8Array());
+
+    // change genericBam, make sure we deep copied bam1 & bam2
+    genericBam.Position(2000);
+
+    EXPECT_EQ(2000, genericBam.Position());
+    EXPECT_EQ(42, bam1.impl_.Position());
+    EXPECT_EQ(42, bam2.impl_.Position());
+
+    // move ctor
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    BamRecord bam3(move(tests::CreateBamImpl()));
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+    EXPECT_EQ(42, bam3.impl_.Bin());
+    EXPECT_EQ(42, bam3.impl_.Flag());
+    EXPECT_EQ(42, bam3.impl_.InsertSize());
+    EXPECT_EQ(42, bam3.impl_.MapQuality());
+    EXPECT_EQ(42, bam3.impl_.MateReferenceId());
+    EXPECT_EQ(42, bam3.impl_.MatePosition());
+    EXPECT_EQ(42, bam3.impl_.Position());
+    EXPECT_EQ(42, bam3.impl_.ReferenceId());
+
+    const TagCollection& bam3Tags = bam3.impl_.Tags();
+    EXPECT_TRUE(bam3Tags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(string("1abc75"), bam3Tags.at("HX").ToString());
+    EXPECT_EQ(static_cast<int32_t>(-42), bam3Tags.at("XY").ToInt32());
+    EXPECT_EQ(vector<uint8_t>({34, 5, 125}), bam3Tags.at("CA").ToUInt8Array());
+
+    // move assignment
+    BamRecord bam4;
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    bam4 = move(tests::CreateBamImpl());
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+    EXPECT_EQ(42, bam4.impl_.Bin());
+    EXPECT_EQ(42, bam4.impl_.Flag());
+    EXPECT_EQ(42, bam4.impl_.InsertSize());
+    EXPECT_EQ(42, bam4.impl_.MapQuality());
+    EXPECT_EQ(42, bam4.impl_.MateReferenceId());
+    EXPECT_EQ(42, bam4.impl_.MatePosition());
+    EXPECT_EQ(42, bam4.impl_.Position());
+    EXPECT_EQ(42, bam4.impl_.ReferenceId());
+
+    const TagCollection& bam4Tags = bam4.impl_.Tags();
+    EXPECT_TRUE(bam4Tags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(string("1abc75"), bam4Tags.at("HX").ToString());
+    EXPECT_EQ(static_cast<int32_t>(-42), bam4Tags.at("XY").ToInt32());
+    EXPECT_EQ(vector<uint8_t>({34, 5, 125}), bam4Tags.at("CA").ToUInt8Array());
+}
+
+TEST(BamRecordTest, SelfAssignmentTolerated)
+{
+    BamRecord bam1;
+    bam1.impl_.Bin(42);
+    bam1.impl_.Flag(42);
+    bam1.impl_.InsertSize(42);
+    bam1.impl_.MapQuality(42);
+    bam1.impl_.MatePosition(42);
+    bam1.impl_.MateReferenceId(42);
+    bam1.impl_.Position(42);
+    bam1.impl_.ReferenceId(42);
+
+    TagCollection tags;
+    tags["HX"] = string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = vector<uint8_t>({34, 5, 125});
+    tags["XY"] = static_cast<int32_t>(-42);
+    bam1.impl_.Tags(tags);
+
+    bam1 = bam1;
+
+    EXPECT_EQ(42, bam1.impl_.Bin());
+    EXPECT_EQ(42, bam1.impl_.Flag());
+    EXPECT_EQ(42, bam1.impl_.InsertSize());
+    EXPECT_EQ(42, bam1.impl_.MapQuality());
+    EXPECT_EQ(42, bam1.impl_.MateReferenceId());
+    EXPECT_EQ(42, bam1.impl_.MatePosition());
+    EXPECT_EQ(42, bam1.impl_.Position());
+    EXPECT_EQ(42, bam1.impl_.ReferenceId());
+
+    const TagCollection& fetchedTags1 = bam1.impl_.Tags();
+    EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(string("1abc75"), fetchedTags1.at("HX").ToString());
+    EXPECT_EQ(static_cast<int32_t>(-42), fetchedTags1.at("XY").ToInt32());
+    EXPECT_EQ(vector<uint8_t>({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array());
+
+    tests::CheckRawData(bam1);
+}
+
+TEST(BamRecordTest, CoreSetters)
+{
+    // create basic BAM with (generic) data
+    BamRecord bam = tests::CreateBam();
+
+    QualityValues testQVs;
+    testQVs.push_back(0);
+    testQVs.push_back(1);
+
+    const string testTags = "GATTACA";
+
+    // now set PacBio data
+//    bam.AlignedStart(42);
+//    bam.AlignedEnd(42);
+//    bam.DeletionQVs(testQVs);
+//    bam.DeletionTags(testTags);
+//    bam.HoleNumber(42);
+//    bam.InsertionQVs(testQVs);
+//    bam.MergeQVs(testQVs);
+//    bam.NumPasses(42);
+//    bam.QueryEnd(42);
+//    bam.QueryStart(42);
+//    bam.ReadAccuracy(42);
+//    bam.ReferenceEnd(42);
+//    bam.ReferenceStart(42);
+//    bam.SubstitutionQVs(testQVs);
+//    bam.SubstitutionTags(testTags);
+
+    // check generic data
+    EXPECT_EQ(42, bam.impl_.Bin());
+    EXPECT_EQ(42, bam.impl_.Flag());
+    EXPECT_EQ(42, bam.impl_.InsertSize());
+    EXPECT_EQ(42, bam.impl_.MapQuality());
+    EXPECT_EQ(42, bam.impl_.MateReferenceId());
+    EXPECT_EQ(42, bam.impl_.MatePosition());
+    EXPECT_EQ(42, bam.impl_.Position());
+    EXPECT_EQ(42, bam.impl_.ReferenceId());
+
+    // check PacBio data
+//    EXPECT_EQ(42, bam.AlignedStart());
+//    EXPECT_EQ(42, bam.AlignedEnd());
+//    EXPECT_EQ(testQVs, bam.DeletionQVs());
+//    EXPECT_EQ(testTags, bam.DeletionTags());
+//    EXPECT_EQ(42, bam.HoleNumber());
+//    EXPECT_EQ(testQVs, bam.InsertionQVs());
+//    EXPECT_EQ(testQVs, bam.MergeQVs());
+
+//    EXPECT_EQ(42, bam.NumPasses());
+//    EXPECT_EQ(42, bam.QueryEnd());
+//    EXPECT_EQ(42, bam.QueryStart());
+//    EXPECT_EQ(42, bam.ReadAccuracy());
+//    EXPECT_EQ(42, bam.ReferenceEnd());
+//    EXPECT_EQ(42, bam.ReferenceStart());
+//    EXPECT_EQ(testQVs, bam.SubstitutionQVs());
+//    EXPECT_EQ(testTags, bam.SubstitutionTags());
+
+    // check tags
+    const TagCollection& fetchedTags = bam.impl_.Tags();
+    EXPECT_TRUE(fetchedTags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(string("1abc75"), fetchedTags.at("HX").ToString());
+    EXPECT_EQ(static_cast<int32_t>(-42), fetchedTags.at("XY").ToInt32());
+    EXPECT_EQ(vector<uint8_t>({34, 5, 125}), fetchedTags.at("CA").ToUInt8Array());
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordTest, SequenceOrientation)
+{
+    {
+        SCOPED_TRACE("Simple CIGAR Sequence");
+        tests::CheckSequenceClippedAndAligned(
+            "13=",                  // CIGAR
+            "ATATATCCCGGCG",        // input
+            {
+                "ATATATCCCGGCG",    // forward strand, genomic
+                "ATATATCCCGGCG",    // forward strand, native
+                "ATATATCCCGGCG",    // forward strand, genomic, aligned
+                "ATATATCCCGGCG",    // forward strand, native,  aligned
+                "ATATATCCCGGCG",    // forward strand, genomic, aligned + clipped
+                "ATATATCCCGGCG",    // forward strand, native,  aligned + clipped
+                "ATATATCCCGGCG",    // reverse strand, genomic
+                "CGCCGGGATATAT",    // reverse strand, native
+                "ATATATCCCGGCG",    // reverse strand, genomic, aligned
+                "CGCCGGGATATAT",    // reverse strand, native,  aligned
+                "ATATATCCCGGCG",    // reverse strand, genomic, aligned + clipped
+                "CGCCGGGATATAT"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, QualitiesOrientation)
+{
+    {
+        SCOPED_TRACE("Simple CIGAR Qualities");
+        tests::CheckQualitiesClippedAndAligned(
+            "13=",                  // CIGAR
+            "?]?]?]?]?]?]*",        // input
+            {
+                "?]?]?]?]?]?]*",    // forward strand, genomic
+                "?]?]?]?]?]?]*",    // forward strand, native
+                "?]?]?]?]?]?]*",    // forward strand, genomic, aligned
+                "?]?]?]?]?]?]*",    // forward strand, native,  aligned
+                "?]?]?]?]?]?]*",    // forward strand, genomic, aligned + clipped
+                "?]?]?]?]?]?]*",    // forward strand, native,  aligned + clipped
+                "?]?]?]?]?]?]*",    // reverse strand, genomic
+                "*]?]?]?]?]?]?",    // reverse strand, native
+                "?]?]?]?]?]?]*",    // reverse strand, genomic, aligned
+                "*]?]?]?]?]?]?",    // reverse strand, native,  aligned
+                "?]?]?]?]?]?]*",    // reverse strand, genomic, aligned + clipped
+                "*]?]?]?]?]?]?"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, SequenceTagsOrientation)
+{
+    {
+        SCOPED_TRACE("Simple CIGAR Base Tags");
+        tests::CheckBaseTagsClippedAndAligned(
+            "13=",                  // CIGAR
+            "ATATATCCCGGCG",        // input
+            {
+                "ATATATCCCGGCG",    // forward strand, genomic
+                "ATATATCCCGGCG",    // forward strand, native
+                "ATATATCCCGGCG",    // forward strand, genomic, aligned
+                "ATATATCCCGGCG",    // forward strand, native, aligned
+                "ATATATCCCGGCG",    // forward strand, genomic, aligned, clipped
+                "ATATATCCCGGCG",    // forward strand, native, aligned, clipped
+                "CGCCGGGATATAT",    // reverse strand, genomic
+                "ATATATCCCGGCG",    // reverse strand, native
+                "CGCCGGGATATAT",    // reverse strand, genomic, aligned
+                "ATATATCCCGGCG",    // reverse strand, native, aligned
+                "CGCCGGGATATAT",    // reverse strand, genomic, aligned, clipped
+                "ATATATCCCGGCG"     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, FrameTagsOrientation)
+{
+    {
+        SCOPED_TRACE("Simple CIGAR Frames");
+        tests::CheckFrameTagsClippedAndAligned(
+            "5=",                   // CIGAR
+            { 0, 1, 2, 3, 4 },      // input
+            {
+                { 0, 1, 2, 3, 4 },  // forward strand, genomic
+                { 0, 1, 2, 3, 4 },  // forward strand, native
+                { 0, 1, 2, 3, 4 },  // forward strand, genomic, aligned
+                { 0, 1, 2, 3, 4 },  // forward strand, native, aligned
+                { 0, 1, 2, 3, 4 },  // forward strand, genomic, aligned, clipped
+                { 0, 1, 2, 3, 4 },  // forward strand, native, aligned, clipped
+                { 4, 3, 2, 1, 0 },  // reverse strand, genomic
+                { 0, 1, 2, 3, 4 },  // reverse strand, native
+                { 4, 3, 2, 1, 0 },  // reverse strand, genomic, aligned
+                { 0, 1, 2, 3, 4 },  // reverse strand, native, aligned
+                { 4, 3, 2, 1, 0 },  // reverse strand, genomic, aligned, clipped
+                { 0, 1, 2, 3, 4 }   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, QualityTagsOrientation)
+{
+    {
+        SCOPED_TRACE("Simple CIGAR Quality Tags");
+        tests::CheckQualityTagsClippedAndAligned(
+            "13=",                  // CIGAR
+            "?]?]?]?]?]?]*",        // input
+            {
+                "?]?]?]?]?]?]*",    // forward strand, genomic
+                "?]?]?]?]?]?]*",    // forward strand, native
+                "?]?]?]?]?]?]*",    // forward strand, genomic, aligned
+                "?]?]?]?]?]?]*",    // forward strand, native,  aligned
+                "?]?]?]?]?]?]*",    // forward strand, genomic, aligned + clipped
+                "?]?]?]?]?]?]*",    // forward strand, native,  aligned + clipped
+                "*]?]?]?]?]?]?",    // reverse strand, genomic
+                "?]?]?]?]?]?]*",    // reverse strand, native
+                "*]?]?]?]?]?]?",    // reverse strand, genomic, aligned
+                "?]?]?]?]?]?]*",    // reverse strand, native,  aligned
+                "*]?]?]?]?]?]?",    // reverse strand, genomic, aligned + clipped
+                "?]?]?]?]?]?]*"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, SequenceClippedAndAligned)
+{
+    {
+        SCOPED_TRACE("CIGAR: 10=");
+        tests::CheckSequenceClippedAndAligned(
+            "10=",              // CIGAR
+            "ATCCGCGGTT",       // input
+            {
+                "ATCCGCGGTT",   // forward strand, genomic
+                "ATCCGCGGTT",   // forward strand, native
+                "ATCCGCGGTT",   // forward strand, genomic, aligned
+                "ATCCGCGGTT",   // forward strand, native,  aligned
+                "ATCCGCGGTT",   // forward strand, genomic, aligned + clipped
+                "ATCCGCGGTT",   // forward strand, native,  aligned + clipped
+                "ATCCGCGGTT",   // reverse strand, genomic
+                "AACCGCGGAT",   // reverse strand, native
+                "ATCCGCGGTT",   // reverse strand, genomic, aligned
+                "AACCGCGGAT",   // reverse strand, native,  aligned
+                "ATCCGCGGTT",   // reverse strand, genomic, aligned + clipped
+                "AACCGCGGAT"    // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 3=4N3=");
+        tests::CheckSequenceClippedAndAligned(
+            "3=4N3=",       // CIGAR
+            "ACGTTT",        // input
+            {
+                "ACGTTT",    // forward strand, genomic
+                "ACGTTT",    // forward strand, native
+                "ACGTTT",    // forward strand, genomic, aligned
+                "ACGTTT",    // forward strand, native,  aligned
+                "ACGTTT",    // forward strand, genomic, aligned + clipped
+                "ACGTTT",    // forward strand, native,  aligned + clipped
+                "ACGTTT",    // reverse strand, genomic
+                "AAACGT",    // reverse strand, native
+                "ACGTTT",    // reverse strand, genomic, aligned
+                "AAACGT",    // reverse strand, native,  aligned
+                "ACGTTT",    // reverse strand, genomic, aligned + clipped
+                "AAACGT"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 1S8=1S");
+        tests::CheckSequenceClippedAndAligned(
+            "1S8=1S",           // CIGAR
+            "ACCCGCGGTT",       // input
+            {
+                "ACCCGCGGTT",   // forward strand, genomic
+                "ACCCGCGGTT",   // forward strand, native
+                "ACCCGCGGTT",   // forward strand, genomic, aligned
+                "ACCCGCGGTT",   // forward strand, native,  aligned
+                "CCCGCGGT",     // forward strand, genomic, aligned + clipped
+                "CCCGCGGT",     // forward strand, native,  aligned + clipped
+                "ACCCGCGGTT",   // reverse strand, genomic
+                "AACCGCGGGT",   // reverse strand, native
+                "ACCCGCGGTT",   // reverse strand, genomic, aligned
+                "AACCGCGGGT",   // reverse strand, native,  aligned
+                "CCCGCGGT",     // reverse strand, genomic, aligned + clipped
+                "ACCGCGGG"      // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 1H8=1H");
+        tests::CheckSequenceClippedAndAligned(
+            "1H8=1H",           // CIGAR
+            "ATCGCGGT",         // input
+            {
+                "ATCGCGGT",     // forward strand, genomic
+                "ATCGCGGT",     // forward strand, native
+                "ATCGCGGT",     // forward strand, genomic, aligned
+                "ATCGCGGT",     // forward strand, native,  aligned
+                "ATCGCGGT",     // forward strand, genomic, aligned + clipped
+                "ATCGCGGT",     // forward strand, native,  aligned + clipped
+                "ATCGCGGT",     // reverse strand, genomic
+                "ACCGCGAT",     // reverse strand, native
+                "ATCGCGGT",     // reverse strand, genomic, aligned
+                "ACCGCGAT",     // reverse strand, native,  aligned
+                "ATCGCGGT",     // reverse strand, genomic, aligned + clipped
+                "ACCGCGAT"      // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2S6=2S");
+        tests::CheckSequenceClippedAndAligned(
+            "2S6=2S",           // CIGAR
+            "AGCCGCGGTT",       // input
+            {
+                "AGCCGCGGTT",   // forward strand, genomic
+                "AGCCGCGGTT",   // forward strand, native
+                "AGCCGCGGTT",   // forward strand, genomic, aligned
+                "AGCCGCGGTT",   // forward strand, native,  aligned
+                "CCGCGG",       // forward strand, genomic, aligned + clipped
+                "CCGCGG",       // forward strand, native,  aligned + clipped
+                "AGCCGCGGTT",   // reverse strand, genomic
+                "AACCGCGGCT",   // reverse strand, native
+                "AGCCGCGGTT",   // reverse strand, genomic, aligned
+                "AACCGCGGCT",   // reverse strand, native,  aligned
+                "CCGCGG",       // reverse strand, genomic, aligned + clipped
+                "CCGCGG"        // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2S3=2I3=2S");
+        tests::CheckSequenceClippedAndAligned(
+            "2S3=2I3=2S",           // CIGAR
+            "ATCCGNNCGGTT",         // input
+            {
+                "ATCCGNNCGGTT",     // forward strand, genomic
+                "ATCCGNNCGGTT",     // forward strand, native
+                "ATCCGNNCGGTT",     // forward strand, genomic, aligned
+                "ATCCGNNCGGTT",     // forward strand, native,  aligned
+                "CCGNNCGG",         // forward strand, genomic, aligned + clipped
+                "CCGNNCGG",         // forward strand, native,  aligned + clipped
+                "ATCCGNNCGGTT",     // reverse strand, genomic
+                "AACCGNNCGGAT",     // reverse strand, native
+                "ATCCGNNCGGTT",     // reverse strand, genomic, aligned
+                "AACCGNNCGGAT",     // reverse strand, native,  aligned
+                "CCGNNCGG",         // reverse strand, genomic, aligned + clipped
+                "CCGNNCGG"          // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H6=2H");
+        tests::CheckSequenceClippedAndAligned(
+            "2H6=2H",       // CIGAR
+            "CAGCGG",       // input
+            {
+                "CAGCGG",   // forward strand, genomic
+                "CAGCGG",   // forward strand, native
+                "CAGCGG",   // forward strand, genomic, aligned
+                "CAGCGG",   // forward strand, native,  aligned
+                "CAGCGG",   // forward strand, genomic, aligned + clipped
+                "CAGCGG",   // forward strand, native,  aligned + clipped
+                "CAGCGG",   // reverse strand, genomic
+                "CCGCTG",   // reverse strand, native
+                "CAGCGG",   // reverse strand, genomic, aligned
+                "CCGCTG",   // reverse strand, native,  aligned
+                "CAGCGG",   // reverse strand, genomic, aligned + clipped
+                "CCGCTG"    // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, ClippingOrientationAndAlignment)
+{
+    {
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        tests::CheckSequenceClippedAndAligned(
+            "4=3D4=",           // CIGAR
+            "AACCGTTA",         // input
+            {
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native,  aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned + clipped
+                "AACC---GTTA",  // forward strand, native,  aligned + clipped
+                "AACCGTTA",     // reverse strand, genomic
+                "TAACGGTT",     // reverse strand, native
+                "AACC---GTTA",  // reverse strand, genomic, aligned
+                "TAAC---GGTT",  // reverse strand, native,  aligned
+                "AACC---GTTA",  // reverse strand, genomic, aligned + clipped
+                "TAAC---GGTT"   // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        tests::CheckSequenceClippedAndAligned(
+            "4=1D2I2D4=",           // CIGAR
+            "ATCCTAGGTT",           // input
+            {
+                "ATCCTAGGTT",       // forward strand, genomic
+                "ATCCTAGGTT",       // forward strand, native
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned
+                "ATCC-TA--GGTT",    // forward strand, native,  aligned
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned + clipped
+                "ATCC-TA--GGTT",    // forward strand, native,  aligned + clipped
+                "ATCCTAGGTT",       // reverse strand, genomic
+                "AACCTAGGAT",       // reverse strand, native
+                "ATCC-TA--GGTT",    // reverse strand, genomic, aligned
+                "AACC--TA-GGAT",    // reverse strand, native,  aligned
+                "ATCC-TA--GGTT",    // reverse strand, genomic, aligned + clipped
+                "AACC--TA-GGAT"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        tests::CheckSequenceClippedAndAligned(
+            "4=1D2P2I2P2D4=",           // CIGAR
+            "ATCCTAGGTT",               // input
+            {
+                "ATCCTAGGTT",           // forward strand, genomic
+                "ATCCTAGGTT",           // forward strand, native
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned
+                "ATCC-**TA**--GGTT",    // forward strand, native,  aligned
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned + clipped
+                "ATCC-**TA**--GGTT",    // forward strand, native,  aligned + clipped
+                "ATCCTAGGTT",           // reverse strand, genomic
+                "AACCTAGGAT",           // reverse strand, native
+                "ATCC-**TA**--GGTT",    // reverse strand, genomic, aligned
+                "AACC--**TA**-GGAT",    // reverse strand, native,  aligned
+                "ATCC-**TA**--GGTT",    // reverse strand, genomic, aligned + clipped
+                "AACC--**TA**-GGAT"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2S4=3D4=3S");
+        tests::CheckSequenceClippedAndAligned(
+            "2S4=3D4=3S",               // CIGAR
+            "TTAACCGTTACCG",            // input
+            {
+                "TTAACCGTTACCG",        // forward strand, genomic
+                "TTAACCGTTACCG",        // forward strand, native
+                "TTAACC---GTTACCG",     // forward strand, genomic, aligned
+                "TTAACC---GTTACCG",     // forward strand, native,  aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned + clipped
+                "AACC---GTTA",          // forward strand, native,  aligned + clipped
+                "TTAACCGTTACCG",        // reverse strand, genomic
+                "CGGTAACGGTTAA",        // reverse strand, native
+                "TTAACC---GTTACCG",     // reverse strand, genomic, aligned
+                "CGGTAAC---GGTTAA",     // reverse strand, native,  aligned
+                "AACC---GTTA",          // reverse strand, genomic, aligned + clipped
+                "TAAC---GGTT"           // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        tests::CheckSequenceClippedAndAligned(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // input
+            {
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native,  aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned + clipped
+                "AACC---GTTA",  // forward strand, native,  aligned + clipped
+                "AACCGTTA",     // reverse strand, genomic
+                "TAACGGTT",     // reverse strand, native
+                "AACC---GTTA",  // reverse strand, genomic, aligned
+                "TAAC---GGTT",  // reverse strand, native,  aligned
+                "AACC---GTTA",  // reverse strand, genomic, aligned + clipped
+                "TAAC---GGTT"   // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H2S4=3D4=3S3H");
+        tests::CheckSequenceClippedAndAligned(
+            "2H2S4=3D4=3S3H",           // CIGAR
+            "TTAACCGTTACCG",            // input
+            {
+                "TTAACCGTTACCG",        // forward strand, genomic
+                "TTAACCGTTACCG",        // forward strand, native
+                "TTAACC---GTTACCG",     // forward strand, genomic, aligned
+                "TTAACC---GTTACCG",     // forward strand, native,  aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned + clipped
+                "AACC---GTTA",          // forward strand, native,  aligned + clipped
+                "TTAACCGTTACCG",        // reverse strand, genomic
+                "CGGTAACGGTTAA",        // reverse strand, native
+                "TTAACC---GTTACCG",     // reverse strand, genomic, aligned
+                "CGGTAAC---GGTTAA",     // reverse strand, native,  aligned
+                "AACC---GTTA",          // reverse strand, genomic, aligned + clipped
+                "TAAC---GGTT"           // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, QualityTagsClippedAndAligned)
+{
+    {
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        tests::CheckQualityTagsClippedAndAligned(
+            "4=3D4=",           // CIGAR
+            "?]?]?]?@",         // input
+            {
+                "?]?]?]?@",     // forward strand, genomic
+                "?]?]?]?@",     // forward strand, native
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned
+                "?]?]!!!?]?@",  // forward strand, native,  aligned
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned + clipped
+                "?]?]!!!?]?@",  // forward strand, native,  aligned + clipped
+                "@?]?]?]?",     // reverse strand, genomic
+                "?]?]?]?@",     // reverse strand, native
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned
+                "?]?]!!!?]?@",  // reverse strand, native,  aligned
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned + clipped
+                "?]?]!!!?]?@"   // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        tests::CheckQualityTagsClippedAndAligned(
+            "4=1D2I2D4=",           // CIGAR
+            "?]?]87?]?@",           // input
+            {
+                "?]?]87?]?@",       // forward strand, genomic
+                "?]?]87?]?@",       // forward strand, native
+                "?]?]!87!!?]?@",    // forward strand, genomic, aligned
+                "?]?]!87!!?]?@",    // forward strand, native,  aligned
+                "?]?]!87!!?]?@",    // forward strand, genomic, aligned + clipped
+                "?]?]!87!!?]?@",    // forward strand, native,  aligned + clipped
+                "@?]?78]?]?",       // reverse strand, genomic
+                "?]?]87?]?@",       // reverse strand, native
+                "@?]?!78!!]?]?",    // reverse strand, genomic, aligned
+                "?]?]!!87!?]?@",    // reverse strand, native,  aligned
+                "@?]?!78!!]?]?",    // reverse strand, genomic, aligned + clipped
+                "?]?]!!87!?]?@"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        tests::CheckQualityTagsClippedAndAligned(
+            "4=1D2P2I2P2D4=",       // CIGAR
+            "?]?]87?]?@",           // input
+        {
+            "?]?]87?]?@",           // forward strand, genomic
+            "?]?]87?]?@",           // forward strand, native
+            "?]?]!!!87!!!!?]?@",    // forward strand, genomic, aligned
+            "?]?]!!!87!!!!?]?@",    // forward strand, native,  aligned
+            "?]?]!!!87!!!!?]?@",    // forward strand, genomic, aligned + clipped
+            "?]?]!!!87!!!!?]?@",    // forward strand, native,  aligned + clipped
+            "@?]?78]?]?",           // reverse strand, genomic
+            "?]?]87?]?@",           // reverse strand, native
+            "@?]?!!!78!!!!]?]?",    // reverse strand, genomic, aligned
+            "?]?]!!!!87!!!?]?@",    // reverse strand, native,  aligned
+            "@?]?!!!78!!!!]?]?",    // reverse strand, genomic, aligned + clipped
+            "?]?]!!!!87!!!?]?@"     // reverse strand, native,  aligned + clipped
+        }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        tests::CheckQualityTagsClippedAndAligned(
+            "3S4=3D4=3S",               // CIGAR
+            "vvv?]?]?]?@xxx",           // input
+            {
+                "vvv?]?]?]?@xxx",       // forward strand, genomic
+                "vvv?]?]?]?@xxx",       // forward strand, native
+                "vvv?]?]!!!?]?@xxx",    // forward strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // forward strand, native, aligned
+                "?]?]!!!?]?@",          // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",          // forward strand, native, aligned, clipped
+                "xxx@?]?]?]?vvv",       // reverse strand, genomic
+                "vvv?]?]?]?@xxx",       // reverse strand, native
+                "xxx@?]?!!!]?]?vvv",    // reverse strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // reverse strand, native, aligned
+                "@?]?!!!]?]?",          // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        tests::CheckQualityTagsClippedAndAligned(
+            "2H4=3D4=3H",       // CIGAR
+            "?]?]?]?@",         // input
+            {
+                "?]?]?]?@",     // forward strand, genomic
+                "?]?]?]?@",     // forward strand, native
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned
+                "?]?]!!!?]?@",  // forward strand, native, aligned
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",  // forward strand, native, aligned, clipped
+                "@?]?]?]?",     // reverse strand, genomic
+                "?]?]?]?@",     // reverse strand, native
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned
+                "?]?]!!!?]?@",  // reverse strand, native, aligned
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        tests::CheckQualityTagsClippedAndAligned(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "vvv?]?]?]?@xxx",           // input
+            {
+                "vvv?]?]?]?@xxx",       // forward strand, genomic
+                "vvv?]?]?]?@xxx",       // forward strand, native
+                "vvv?]?]!!!?]?@xxx",    // forward strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // forward strand, native, aligned
+                "?]?]!!!?]?@",          // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",          // forward strand, native, aligned, clipped
+                "xxx@?]?]?]?vvv",       // reverse strand, genomic
+                "vvv?]?]?]?@xxx",       // reverse strand, native
+                "xxx@?]?!!!]?]?vvv",    // reverse strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // reverse strand, native, aligned
+                "@?]?!!!]?]?",          // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, BaseTagsClippedAndAligned)
+{
+    {
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        tests::CheckBaseTagsClippedAndAligned(
+            "4=3D4=",           // CIGAR
+            "AACCGTTA",         // input
+            {
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native, aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",  // forward strand, native, aligned, clipped
+                "TAACGGTT",     // reverse strand, genomic
+                "AACCGTTA",     // reverse strand, native
+                "TAAC---GGTT",  // reverse strand, genomic, aligned
+                "AACC---GTTA",  // reverse strand, native, aligned
+                "TAAC---GGTT",  // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        tests::CheckBaseTagsClippedAndAligned(
+            "4=1D2I2D4=",           // CIGAR
+            "ATCCTAGGTT",           // input
+            {
+                "ATCCTAGGTT",       // forward strand, genomic
+                "ATCCTAGGTT",       // forward strand, native
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned
+                "ATCC-TA--GGTT",    // forward strand, native, aligned
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned, clipped
+                "ATCC-TA--GGTT",    // forward strand, native, aligned, clipped
+                "AACCTAGGAT",       // reverse strand, genomic
+                "ATCCTAGGTT",       // reverse strand, native
+                "AACC-TA--GGAT",    // reverse strand, genomic, aligned
+                "ATCC--TA-GGTT",    // reverse strand, native, aligned
+                "AACC-TA--GGAT",    // reverse strand, genomic, aligned, clipped
+                "ATCC--TA-GGTT"     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        tests::CheckBaseTagsClippedAndAligned(
+            "4=1D2P2I2P2D4=",           // CIGAR
+            "ATCCTAGGTT",               // input
+            {
+                "ATCCTAGGTT",           // forward strand, genomic
+                "ATCCTAGGTT",           // forward strand, native
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned
+                "ATCC-**TA**--GGTT",    // forward strand, native, aligned
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned, clipped
+                "ATCC-**TA**--GGTT",    // forward strand, native, aligned, clipped
+                "AACCTAGGAT",           // reverse strand, genomic
+                "ATCCTAGGTT",           // reverse strand, native
+                "AACC-**TA**--GGAT",    // reverse strand, genomic, aligned
+                "ATCC--**TA**-GGTT",    // reverse strand, native, aligned
+                "AACC-**TA**--GGAT",    // reverse strand, genomic, aligned, clipped
+                "ATCC--**TA**-GGTT"     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        tests::CheckBaseTagsClippedAndAligned(
+            "3S4=3D4=3S",               // CIGAR
+            "TTTAACCGTTACCG",           // input
+            {
+                "TTTAACCGTTACCG",       // forward strand, genomic
+                "TTTAACCGTTACCG",       // forward strand, native
+                "TTTAACC---GTTACCG",    // forward strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // forward strand, native, aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",          // forward strand, native, aligned, clipped
+                "CGGTAACGGTTAAA",       // reverse strand, genomic
+                "TTTAACCGTTACCG",       // reverse strand, native
+                "CGGTAAC---GGTTAAA",    // reverse strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // reverse strand, native, aligned
+                "TAAC---GGTT",          // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        tests::CheckBaseTagsClippedAndAligned(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // input
+            {
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native, aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",  // forward strand, native, aligned, clipped
+                "TAACGGTT",     // reverse strand, genomic
+                "AACCGTTA",     // reverse strand, native
+                "TAAC---GGTT",  // reverse strand, genomic, aligned
+                "AACC---GTTA",  // reverse strand, native, aligned
+                "TAAC---GGTT",  // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        tests::CheckBaseTagsClippedAndAligned(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "TTTAACCGTTACCG",           // input
+            {
+                "TTTAACCGTTACCG",       // forward strand, genomic
+                "TTTAACCGTTACCG",       // forward strand, native
+                "TTTAACC---GTTACCG",    // forward strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // forward strand, native, aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",          // forward strand, native, aligned, clipped
+                "CGGTAACGGTTAAA",       // reverse strand, genomic
+                "TTTAACCGTTACCG",       // reverse strand, native
+                "CGGTAAC---GGTTAAA",    // reverse strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // reverse strand, native, aligned
+                "TAAC---GGTT",          // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, FrameTagsClippedAndAligned)
+{
+    {
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        tests::CheckFrameTagsClippedAndAligned(
+            "4=3D4=",                                           // CIGAR
+            { 10, 20, 10, 20, 10, 20, 10, 30 },                 // input
+            {
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        tests::CheckFrameTagsClippedAndAligned(
+            "4=1D2I2D4=",                                               // CIGAR
+            { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                 // input
+            {
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        tests::CheckFrameTagsClippedAndAligned(
+            "4=1D2P2I2P2D4=",                                                   // CIGAR
+            { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // input
+        {
+            { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, genomic
+            { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, native
+            { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+            { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+            { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+            { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+            { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },                         // reverse strand, genomic
+            { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // reverse strand, native
+            { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+            { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+            { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+            { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+        }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        tests::CheckFrameTagsClippedAndAligned(
+            "3S4=3D4=3S",                                                               // CIGAR
+            { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },                 // input
+            {
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        tests::CheckFrameTagsClippedAndAligned(
+            "2H4=3D4=3H",                                       // CIGAR
+            { 10, 20, 10, 20, 10, 20, 10, 30 },                 // input
+            {
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        tests::CheckFrameTagsClippedAndAligned(
+            "2H3S4=3D4=3S3H",                                                           // CIGAR
+            { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },                 // input
+            {
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, PulseBaseTags)
+{
+    {
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        tests::CheckPulseBaseTags(
+            "4=3D4=",           // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls
+            "AAaaCCGggTTA",     // tag data
+
+            {   // all pulses
+
+                "AAaaCCGggTTA",     // forward strand, genomic
+                "AAaaCCGggTTA",     // forward strand, native
+                "",  // forward strand, genomic, aligned
+                "",  // forward strand, native, aligned
+                "",  // forward strand, genomic, aligned, clipped
+                "",  // forward strand, native, aligned, clipped
+                "TAAccCGGttTT",     // reverse strand, genomic
+                "AAaaCCGggTTA",     // reverse strand, native
+                "",  // reverse strand, genomic, aligned
+                "",  // reverse strand, native, aligned
+                "",  // reverse strand, genomic, aligned, clipped
+                ""   // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native, aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",  // forward strand, native, aligned, clipped
+                "TAACGGTT",     // reverse strand, genomic
+                "AACCGTTA",     // reverse strand, native
+                "TAAC---GGTT",  // reverse strand, genomic, aligned
+                "AACC---GTTA",  // reverse strand, native, aligned
+                "TAAC---GGTT",  // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        tests::CheckPulseBaseTags(
+            "4=1D2I2D4=",       // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            "ATttCCTtAGGggTT",  // tag data
+
+            {   // all pulses
+
+                "ATttCCTtAGGggTT",       // forward strand, genomic
+                "ATttCCTtAGGggTT",       // forward strand, native
+                "",    // forward strand, genomic, aligned
+                "",    // forward strand, native, aligned
+                "",    // forward strand, genomic, aligned, clipped
+                "",    // forward strand, native, aligned, clipped
+                "AAccCCTaAGGaaAT",       // reverse strand, genomic
+                "ATttCCTtAGGggTT",       // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native, aligned
+                "",    // reverse strand, genomic, aligned, clipped
+                ""     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "ATCCTAGGTT",       // forward strand, genomic
+                "ATCCTAGGTT",       // forward strand, native
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned
+                "ATCC-TA--GGTT",    // forward strand, native, aligned
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned, clipped
+                "ATCC-TA--GGTT",    // forward strand, native, aligned, clipped
+                "AACCTAGGAT",       // reverse strand, genomic
+                "ATCCTAGGTT",       // reverse strand, native
+                "AACC-TA--GGAT",    // reverse strand, genomic, aligned
+                "ATCC--TA-GGTT",    // reverse strand, native, aligned
+                "AACC-TA--GGAT",    // reverse strand, genomic, aligned, clipped
+                "ATCC--TA-GGTT"     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        tests::CheckPulseBaseTags(
+            "4=1D2P2I2P2D4=",   // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            "ATttCCTtAGGggTT",  // tag data
+            {
+                "ATttCCTtAGGggTT",           // forward strand, genomic
+                "ATttCCTtAGGggTT",           // forward strand, native
+                "",    // forward strand, genomic, aligned
+                "",    // forward strand, native, aligned
+                "",    // forward strand, genomic, aligned, clipped
+                "",    // forward strand, native, aligned, clipped
+                "AAccCCTaAGGaaAT",           // reverse strand, genomic
+                "ATttCCTtAGGggTT",           // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native, aligned
+                "",    // reverse strand, genomic, aligned, clipped
+                ""     // reverse strand, native, aligned, clipped
+            },
+            {
+                "ATCCTAGGTT",           // forward strand, genomic
+                "ATCCTAGGTT",           // forward strand, native
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned
+                "ATCC-**TA**--GGTT",    // forward strand, native, aligned
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned, clipped
+                "ATCC-**TA**--GGTT",    // forward strand, native, aligned, clipped
+                "AACCTAGGAT",           // reverse strand, genomic
+                "ATCCTAGGTT",           // reverse strand, native
+                "AACC-**TA**--GGAT",    // reverse strand, genomic, aligned
+                "ATCC--**TA**-GGTT",    // reverse strand, native, aligned
+                "AACC-**TA**--GGAT",    // reverse strand, genomic, aligned, clipped
+                "ATCC--**TA**-GGTT"     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        tests::CheckPulseBaseTags(
+            "3S4=3D4=3S",               // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            "TTTttAACCccGTTAaaCCG",     // tag data
+
+            {   // all pulses
+
+                "TTTttAACCccGTTAaaCCG",       // forward strand, genomic
+                "TTTttAACCccGTTAaaCCG",       // forward strand, native
+                "",         // forward strand, genomic, aligned
+                "",         // forward strand, native, aligned
+                "",          // forward strand, genomic, aligned, clipped
+                "",          // forward strand, native, aligned, clipped
+                "CGGttTAACggGGTTaaAAA",       // reverse strand, genomic
+                "TTTttAACCccGTTAaaCCG",       // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native, aligned
+                "",     // reverse strand, genomic, aligned, clipped
+                ""           // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "TTTAACCGTTACCG",       // forward strand, genomic
+                "TTTAACCGTTACCG",       // forward strand, native
+                "TTTAACC---GTTACCG",    // forward strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // forward strand, native, aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",          // forward strand, native, aligned, clipped
+                "CGGTAACGGTTAAA",       // reverse strand, genomic
+                "TTTAACCGTTACCG",       // reverse strand, native
+                "CGGTAAC---GGTTAAA",    // reverse strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // reverse strand, native, aligned
+                "TAAC---GGTT",          // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        tests::CheckPulseBaseTags(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls
+            "AAaaCCGggTTA",     // tag data
+
+            {   // all pulses
+
+                "AAaaCCGggTTA",     // forward strand, genomic
+                "AAaaCCGggTTA",     // forward strand, native
+                "",  // forward strand, genomic, aligned
+                "",  // forward strand, native, aligned
+                "",  // forward strand, genomic, aligned, clipped
+                "",  // forward strand, native, aligned, clipped
+                "TAAccCGGttTT",     // reverse strand, genomic
+                "AAaaCCGggTTA",     // reverse strand, native
+                "",  // reverse strand, genomic, aligned
+                "",  // reverse strand, native, aligned
+                "",  // reverse strand, genomic, aligned, clipped
+                ""   // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native, aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",  // forward strand, native, aligned, clipped
+                "TAACGGTT",     // reverse strand, genomic
+                "AACCGTTA",     // reverse strand, native
+                "TAAC---GGTT",  // reverse strand, genomic, aligned
+                "AACC---GTTA",  // reverse strand, native, aligned
+                "TAAC---GGTT",  // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        tests::CheckPulseBaseTags(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            "TTTttAACCccGTTAaaCCG",     // tag data
+
+            {   // all pulses
+
+                "TTTttAACCccGTTAaaCCG",       // forward strand, genomic
+                "TTTttAACCccGTTAaaCCG",       // forward strand, native
+                "",         // forward strand, genomic, aligned
+                "",         // forward strand, native, aligned
+                "",          // forward strand, genomic, aligned, clipped
+                "",          // forward strand, native, aligned, clipped
+                "CGGttTAACggGGTTaaAAA",       // reverse strand, genomic
+                "TTTttAACCccGTTAaaCCG",       // reverse strand, native
+                "",         // reverse strand, genomic, aligned
+                "",         // reverse strand, native, aligned
+                "",          // reverse strand, genomic, aligned, clipped
+                ""           // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "TTTAACCGTTACCG",       // forward strand, genomic
+                "TTTAACCGTTACCG",       // forward strand, native
+                "TTTAACC---GTTACCG",    // forward strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // forward strand, native, aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",          // forward strand, native, aligned, clipped
+                "CGGTAACGGTTAAA",       // reverse strand, genomic
+                "TTTAACCGTTACCG",       // reverse strand, native
+                "CGGTAAC---GGTTAAA",    // reverse strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // reverse strand, native, aligned
+                "TAAC---GGTT",          // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, PulseQualityTags)
+{
+    {
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        tests::CheckPulseQualityTags(
+            "4=3D4=",           // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls
+            "?]!!?]?!!]?@",     // tag data
+
+            {   // all pulses
+
+                "?]!!?]?!!]?@",     // forward strand, genomic
+                "?]!!?]?!!]?@",     // forward strand, native
+                "",  // forward strand, genomic, aligned
+                "",  // forward strand, native,  aligned
+                "",  // forward strand, genomic, aligned + clipped
+                "",  // forward strand, native,  aligned + clipped
+                "@?]!!?]?!!]?",     // reverse strand, genomic
+                "?]!!?]?!!]?@",     // reverse strand, native
+                "",  // reverse strand, genomic, aligned
+                "",  // reverse strand, native,  aligned
+                "",  // reverse strand, genomic, aligned + clipped
+                ""   // reverse strand, native,  aligned + clipped
+            },
+            {   // basecalls only
+
+                "?]?]?]?@",     // forward strand, genomic
+                "?]?]?]?@",     // forward strand, native
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned
+                "?]?]!!!?]?@",  // forward strand, native,  aligned
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned + clipped
+                "?]?]!!!?]?@",  // forward strand, native,  aligned + clipped
+                "@?]?]?]?",     // reverse strand, genomic
+                "?]?]?]?@",     // reverse strand, native
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned
+                "?]?]!!!?]?@",  // reverse strand, native,  aligned
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned + clipped
+                "?]?]!!!?]?@"   // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        tests::CheckPulseQualityTags(
+            "4=1D2I2D4=",       // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            "?]!!?]8!7?]!!?@",  // tag data
+
+            {   // all pulses
+
+                "?]!!?]8!7?]!!?@",       // forward strand, genomic
+                "?]!!?]8!7?]!!?@",       // forward strand, native
+                "",    // forward strand, genomic, aligned
+                "",    // forward strand, native,  aligned
+                "",    // forward strand, genomic, aligned + clipped
+                "",    // forward strand, native,  aligned + clipped
+                "@?!!]?7!8]?!!]?",       // reverse strand, genomic
+                "?]!!?]8!7?]!!?@",       // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native,  aligned
+                "",    // reverse strand, genomic, aligned + clipped
+                ""     // reverse strand, native,  aligned + clipped
+            },
+            {   // basecalls only
+
+                "?]?]87?]?@",       // forward strand, genomic
+                "?]?]87?]?@",       // forward strand, native
+                "?]?]!87!!?]?@",    // forward strand, genomic, aligned
+                "?]?]!87!!?]?@",    // forward strand, native,  aligned
+                "?]?]!87!!?]?@",    // forward strand, genomic, aligned + clipped
+                "?]?]!87!!?]?@",    // forward strand, native,  aligned + clipped
+                "@?]?78]?]?",       // reverse strand, genomic
+                "?]?]87?]?@",       // reverse strand, native
+                "@?]?!78!!]?]?",    // reverse strand, genomic, aligned
+                "?]?]!!87!?]?@",    // reverse strand, native,  aligned
+                "@?]?!78!!]?]?",    // reverse strand, genomic, aligned + clipped
+                "?]?]!!87!?]?@"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        tests::CheckPulseQualityTags(
+            "4=1D2P2I2P2D4=",   // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            "?]!!?]8!7?]!!?@",  // tag data
+        {
+            "?]!!?]8!7?]!!?@",           // forward strand, genomic
+            "?]!!?]8!7?]!!?@",           // forward strand, native
+            "",    // forward strand, genomic, aligned
+            "",    // forward strand, native,  aligned
+            "",    // forward strand, genomic, aligned + clipped
+            "",    // forward strand, native,  aligned + clipped
+            "@?!!]?7!8]?!!]?",           // reverse strand, genomic
+            "?]!!?]8!7?]!!?@",           // reverse strand, native
+            "",    // reverse strand, genomic, aligned
+            "",    // reverse strand, native,  aligned
+            "",    // reverse strand, genomic, aligned + clipped
+            ""     // reverse strand, native,  aligned + clipped
+        },
+        {
+            "?]?]87?]?@",           // forward strand, genomic
+            "?]?]87?]?@",           // forward strand, native
+            "?]?]!!!87!!!!?]?@",    // forward strand, genomic, aligned
+            "?]?]!!!87!!!!?]?@",    // forward strand, native,  aligned
+            "?]?]!!!87!!!!?]?@",    // forward strand, genomic, aligned + clipped
+            "?]?]!!!87!!!!?]?@",    // forward strand, native,  aligned + clipped
+            "@?]?78]?]?",           // reverse strand, genomic
+            "?]?]87?]?@",           // reverse strand, native
+            "@?]?!!!78!!!!]?]?",    // reverse strand, genomic, aligned
+            "?]?]!!!!87!!!?]?@",    // reverse strand, native,  aligned
+            "@?]?!!!78!!!!]?]?",    // reverse strand, genomic, aligned + clipped
+            "?]?]!!!!87!!!?]?@"     // reverse strand, native,  aligned + clipped
+        }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        tests::CheckPulseQualityTags(
+            "3S4=3D4=3S",               // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            "vvv!!?]?]!!?]?@!!xxx",     // tag data
+
+            {   // all pulses
+
+                "vvv!!?]?]!!?]?@!!xxx",       // forward strand, genomic
+                "vvv!!?]?]!!?]?@!!xxx",       // forward strand, native
+                "",    // forward strand, genomic, aligned
+                "",    // forward strand, native, aligned
+                "",          // forward strand, genomic, aligned, clipped
+                "",          // forward strand, native, aligned, clipped
+                "xxx!!@?]?!!]?]?!!vvv",       // reverse strand, genomic
+                "vvv!!?]?]!!?]?@!!xxx",       // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native, aligned
+                "",          // reverse strand, genomic, aligned, clipped
+                ""           // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "vvv?]?]?]?@xxx",       // forward strand, genomic
+                "vvv?]?]?]?@xxx",       // forward strand, native
+                "vvv?]?]!!!?]?@xxx",    // forward strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // forward strand, native, aligned
+                "?]?]!!!?]?@",          // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",          // forward strand, native, aligned, clipped
+                "xxx@?]?]?]?vvv",       // reverse strand, genomic
+                "vvv?]?]?]?@xxx",       // reverse strand, native
+                "xxx@?]?!!!]?]?vvv",    // reverse strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // reverse strand, native, aligned
+                "@?]?!!!]?]?",          // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        tests::CheckPulseQualityTags(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls
+            "?]!!?]?!!]?@",     // tag data
+
+            {   // all pulses
+
+                "?]!!?]?!!]?@",     // forward strand, genomic
+                "?]!!?]?!!]?@",     // forward strand, native
+                "",  // forward strand, genomic, aligned
+                "",  // forward strand, native, aligned
+                "",  // forward strand, genomic, aligned, clipped
+                "",  // forward strand, native, aligned, clipped
+                "@?]!!?]?!!]?",     // reverse strand, genomic
+                "?]!!?]?!!]?@",     // reverse strand, native
+                "",  // reverse strand, genomic, aligned
+                "",  // reverse strand, native, aligned
+                "",  // reverse strand, genomic, aligned, clipped
+                ""   // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "?]?]?]?@",     // forward strand, genomic
+                "?]?]?]?@",     // forward strand, native
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned
+                "?]?]!!!?]?@",  // forward strand, native, aligned
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",  // forward strand, native, aligned, clipped
+                "@?]?]?]?",     // reverse strand, genomic
+                "?]?]?]?@",     // reverse strand, native
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned
+                "?]?]!!!?]?@",  // reverse strand, native, aligned
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        tests::CheckPulseQualityTags(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            "vvv!!?]?]!!?]?@!!xxx",     // tag data
+
+            {   // all pulses
+
+                "vvv!!?]?]!!?]?@!!xxx",       // forward strand, genomic
+                "vvv!!?]?]!!?]?@!!xxx",       // forward strand, native
+                "",    // forward strand, genomic, aligned
+                "",    // forward strand, native, aligned
+                "",          // forward strand, genomic, aligned, clipped
+                "",          // forward strand, native, aligned, clipped
+                "xxx!!@?]?!!]?]?!!vvv",       // reverse strand, genomic
+                "vvv!!?]?]!!?]?@!!xxx",       // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native, aligned
+                "",          // reverse strand, genomic, aligned, clipped
+                ""           // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "vvv?]?]?]?@xxx",       // forward strand, genomic
+                "vvv?]?]?]?@xxx",       // forward strand, native
+                "vvv?]?]!!!?]?@xxx",    // forward strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // forward strand, native, aligned
+                "?]?]!!!?]?@",          // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",          // forward strand, native, aligned, clipped
+                "xxx@?]?]?]?vvv",       // reverse strand, genomic
+                "vvv?]?]?]?@xxx",       // reverse strand, native
+                "xxx@?]?!!!]?]?vvv",    // reverse strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // reverse strand, native, aligned
+                "@?]?!!!]?]?",          // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, PulseFrameTags)
+{
+    {
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        tests::CheckPulseFrameTags(
+            "4=3D4=",       // CIGAR
+            "AACCGTTA",     // seqBases
+            "AAaaCCGggTTA", // pulseCalls
+            { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },   // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        tests::CheckPulseFrameTags(
+            "4=1D2I2D4=",       // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 0, 0, 20, 10, 70, 0, 80, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        tests::CheckPulseFrameTags(
+            "4=1D2P2I2P2D4=",   // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 0, 0, 20, 10, 70, 0, 80, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },                         // reverse strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        tests::CheckPulseFrameTags(
+            "3S4=3D4=3S",               // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },   // tag data
+
+            {   // all pulses
+
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 50, 50, 50, 0, 0, 30, 10, 20, 10, 0, 0, 20, 10, 20, 10, 0, 0, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        tests::CheckPulseFrameTags(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        tests::CheckPulseFrameTags(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },                 // tag data
+
+            {   // all pulses
+
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 50, 50, 50, 0, 0, 30, 10, 20, 10, 0, 0, 20, 10, 20, 10, 0, 0, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, PulseUIntTags)
+{
+    {
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        tests::CheckPulseUIntTags(
+            "4=3D4=",       // CIGAR
+            "AACCGTTA",     // seqBases
+            "AAaaCCGggTTA", // pulseCalls
+            { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },   // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        tests::CheckPulseUIntTags(
+            "4=1D2I2D4=",       // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 0, 0, 20, 10, 70, 0, 80, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        tests::CheckPulseUIntTags(
+            "4=1D2P2I2P2D4=",   // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 0, 0, 20, 10, 70, 0, 80, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },                         // reverse strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        tests::CheckPulseUIntTags(
+            "3S4=3D4=3S",               // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },   // tag data
+
+            {   // all pulses
+
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 50, 50, 50, 0, 0, 30, 10, 20, 10, 0, 0, 20, 10, 20, 10, 0, 0, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        tests::CheckPulseUIntTags(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        tests::CheckPulseUIntTags(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },                 // tag data
+
+            {   // all pulses
+
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 50, 50, 50, 0, 0, 30, 10, 20, 10, 0, 0, 20, 10, 20, 10, 0, 0, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
diff --git a/tests/src/test_BamRecordBuilder.cpp b/tests/src/test_BamRecordBuilder.cpp

new file mode 100644 (file)

index 0000000..d1287d5
--- /dev/null
+++ b/tests/src/test_BamRecordBuilder.cpp
@@ -0,0 +1,274 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/BamRecordBuilder.h>
+#include <pbbam/BamTagCodec.h>
+#include <chrono>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace tests {
+
+static
+void CheckRawData(const BamRecordImpl& bam)
+{
+    // ensure raw data (lengths at least) matches API-facing data
+
+    const uint32_t expectedNameLength  = bam.Name().size() + 1;
+    const uint32_t expectedNumCigarOps = bam.CigarData().size();
+    const int32_t  expectedSeqLength   = bam.Sequence().length();
+    const size_t   expectedTagsLength  = BamTagCodec::Encode(bam.Tags()).size();
+
+    //  Name        CIGAR         Sequence       Quals      Tags
+    // l_qname + (n_cigar * 4) + (l_qseq+1)/2 + l_qseq + << TAGS >>
+
+    const int expectedTotalDataLength = expectedNameLength +
+                                        (expectedNumCigarOps * 4) +
+                                        (expectedSeqLength+1)/2 +
+                                         expectedSeqLength +
+                                         expectedTagsLength;
+
+    EXPECT_TRUE((bool)bam.d_);
+    EXPECT_EQ(expectedNameLength,      bam.d_->core.l_qname);
+    EXPECT_EQ(expectedNumCigarOps,     bam.d_->core.n_cigar);
+    EXPECT_EQ(expectedSeqLength,       bam.d_->core.l_qseq);
+    EXPECT_EQ(expectedTotalDataLength, bam.d_->l_data);
+}
+
+static
+void CheckRawData(const BamRecord& bam)
+{ CheckRawData(bam.impl_); }
+
+} // namespace tests
+
+TEST(BamRecordBuilderTest, DefaultValues)
+{
+    BamRecordBuilder builder;
+    BamRecord bam = builder.Build();
+
+    const PBBAM_SHARED_PTR<bam1_t> rawData = bam.impl_.d_;
+    ASSERT_TRUE((bool)rawData);
+
+    // fixed-length (core) data
+    EXPECT_EQ(0, rawData->core.tid);
+    EXPECT_EQ(0, rawData->core.pos);
+    EXPECT_EQ(0, rawData->core.bin);
+    EXPECT_EQ(0, rawData->core.qual);
+    EXPECT_EQ(1, rawData->core.l_qname);    // initialized w/ NULL-term
+    EXPECT_EQ(0, rawData->core.flag);
+    EXPECT_EQ(0, rawData->core.n_cigar);
+    EXPECT_EQ(0, rawData->core.l_qseq);
+    EXPECT_EQ(0, rawData->core.mtid);
+    EXPECT_EQ(0, rawData->core.mpos);
+    EXPECT_EQ(0, rawData->core.isize);
+
+    // variable length data
+    EXPECT_TRUE(rawData->data != nullptr);
+    EXPECT_EQ(1, rawData->l_data);
+    EXPECT_EQ((int)0x800, rawData->m_data);  // check this if we change or tune later
+
+    // -------------------------------
+    // check data via API calls
+    // -------------------------------
+
+    EXPECT_EQ(0, bam.impl_.Bin());
+    EXPECT_EQ(0, bam.impl_.Flag());
+    EXPECT_EQ(0, bam.impl_.InsertSize());
+    EXPECT_EQ(0, bam.impl_.MapQuality());
+    EXPECT_EQ(0, bam.impl_.MateReferenceId());
+    EXPECT_EQ(0, bam.impl_.MatePosition());
+    EXPECT_EQ(0, bam.impl_.Position());
+    EXPECT_EQ(0, bam.impl_.ReferenceId());
+    EXPECT_EQ(0, bam.impl_.Tags().size());
+
+    EXPECT_FALSE(bam.impl_.IsDuplicate());
+    EXPECT_FALSE(bam.impl_.IsFailedQC());
+    EXPECT_FALSE(bam.impl_.IsFirstMate());
+    EXPECT_TRUE(bam.impl_.IsMapped());
+    EXPECT_TRUE(bam.impl_.IsMateMapped());
+    EXPECT_FALSE(bam.impl_.IsMateReverseStrand());
+    EXPECT_FALSE(bam.impl_.IsPaired());
+    EXPECT_TRUE(bam.impl_.IsPrimaryAlignment());
+    EXPECT_FALSE(bam.impl_.IsProperPair());
+    EXPECT_FALSE(bam.impl_.IsReverseStrand());
+    EXPECT_FALSE(bam.impl_.IsSecondMate());
+    EXPECT_FALSE(bam.impl_.IsSupplementaryAlignment());
+
+    const std::string emptyString = "";
+    EXPECT_EQ(emptyString, bam.impl_.Name());
+    EXPECT_EQ(emptyString, bam.impl_.CigarData().ToStdString());
+    EXPECT_EQ(emptyString, bam.impl_.Sequence());
+    EXPECT_EQ(emptyString, bam.impl_.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordBuilderTest, CheckSetters)
+{
+    // should be 28 bytes, encoded
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordBuilder builder;
+    builder.Bin(42)
+           .Flag(42)
+           .InsertSize(42)
+           .MapQuality(42)
+           .MatePosition(42)
+           .MateReferenceId(42)
+           .Position(42)
+           .ReferenceId(42)
+           .Tags(tags);
+
+    BamRecord bam = builder.Build();
+
+    // -------------------------------
+    // check raw data
+    // -------------------------------
+
+    const PBBAM_SHARED_PTR<bam1_t> rawData = bam.impl_.d_;
+    ASSERT_TRUE((bool)rawData);
+
+    // fixed-length (core) data
+    EXPECT_EQ(42, rawData->core.tid);
+    EXPECT_EQ(42, rawData->core.pos);
+    EXPECT_EQ(42, rawData->core.bin);
+    EXPECT_EQ(42, rawData->core.qual);
+    EXPECT_EQ(1,  rawData->core.l_qname);    // initialized w/ NULL-term
+    EXPECT_EQ(42, rawData->core.flag);
+    EXPECT_EQ(0,  rawData->core.n_cigar);
+    EXPECT_EQ(0,  rawData->core.l_qseq);
+    EXPECT_EQ(42, rawData->core.mtid);
+    EXPECT_EQ(42, rawData->core.mpos);
+    EXPECT_EQ(42, rawData->core.isize);
+
+    // variable length data
+    EXPECT_TRUE(rawData->data != nullptr);
+    EXPECT_EQ(29, rawData->l_data);         // NULL-term qname + tags
+    EXPECT_EQ((int)0x800, rawData->m_data); // check this if we change or tune later
+
+    // -------------------------------
+    // check data via API calls
+    // -------------------------------
+
+    EXPECT_EQ(42, bam.impl_.Bin());
+    EXPECT_EQ(42, bam.impl_.Flag());
+    EXPECT_EQ(42, bam.impl_.InsertSize());
+    EXPECT_EQ(42, bam.impl_.MapQuality());
+    EXPECT_EQ(42, bam.impl_.MateReferenceId());
+    EXPECT_EQ(42, bam.impl_.MatePosition());
+    EXPECT_EQ(42, bam.impl_.Position());
+    EXPECT_EQ(42, bam.impl_.ReferenceId());
+
+    const TagCollection& fetchedTags = bam.impl_.Tags();
+
+    EXPECT_TRUE(fetchedTags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags.at("HX").ToString());
+    EXPECT_EQ(static_cast<int32_t>(-42), fetchedTags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags.at("CA").ToUInt8Array());
+}
+
+//#define SEQ_LENGTH  7000
+//#define NUM_RECORDS 1000
+
+//const std::string& TEST_SEQUENCE  = std::string(SEQ_LENGTH, 'G');
+//const std::string& TEST_QUALITIES = std::string(SEQ_LENGTH, '=');
+//const std::string& TEST_NAME      = std::string(SEQ_LENGTH, '/');
+//const std::string& TEST_TAGDATA   = std::string(SEQ_LENGTH, '2');
+
+//TEST(BamRecordBuilderTest, JustDoingSomeTimings_BamRecordBuilder)
+//{
+
+//    BamRecordBuilder builder;
+
+//    TagCollection tags;
+//    tags["aa"] = TEST_TAGDATA;
+//    tags["bb"] = TEST_TAGDATA;
+//    tags["cc"] = TEST_TAGDATA;
+//    tags["dd"] = TEST_TAGDATA;
+//    tags["ee"] = TEST_TAGDATA;
+//    tags["ff"] = TEST_TAGDATA;
+
+//    auto start = std::chrono::steady_clock::now();
+
+//    BamRecord record;
+//    for (size_t i = 0; i < NUM_RECORDS; ++i) {
+//        builder.Sequence(TEST_SEQUENCE)
+//               .Qualities(TEST_QUALITIES)
+//               .Name(TEST_NAME)
+//               .Tags(tags)
+//               .BuildInPlace(record);
+//    }
+//    auto end = std::chrono::steady_clock::now();
+//    (void)record;
+//    auto diff = end - start;
+//    std::cout << std::chrono::duration <double, std::milli>(diff).count() << " ms" << std::endl;
+//}
+
+
+//TEST(BamRecordBuilderTest, JustDoingSomeTimings_BamRecordOnly)
+//{
+//    TagCollection tags;
+//    tags["aa"] = TEST_TAGDATA;
+//    tags["bb"] = TEST_TAGDATA;
+//    tags["cc"] = TEST_TAGDATA;
+//    tags["dd"] = TEST_TAGDATA;
+//    tags["ee"] = TEST_TAGDATA;
+//    tags["ff"] = TEST_TAGDATA;
+
+//    auto start = std::chrono::steady_clock::now();
+
+//    BamRecord record;
+//    for (size_t i = 0; i < NUM_RECORDS; ++i) {
+//        record.SetSequenceAndQualities(TEST_SEQUENCE, TEST_QUALITIES);
+//        record.Name(TEST_NAME);
+//        record.Tags(tags);
+//    }
+//    auto end = std::chrono::steady_clock::now();
+//    (void)record;
+//    auto diff = end - start;
+//    std::cout << std::chrono::duration <double, std::milli>(diff).count() << " ms" << std::endl;
+//}
+
diff --git a/tests/src/test_BamRecordClipping.cpp b/tests/src/test_BamRecordClipping.cpp

new file mode 100644 (file)

index 0000000..8a5ddb2
--- /dev/null
+++ b/tests/src/test_BamRecordClipping.cpp
@@ -0,0 +1,1731 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/BamRecordView.h>
+#include <pbbam/BamTagCodec.h>
+#include <chrono>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+typedef vector<uint16_t> f_data;
+
+namespace tests {
+
+static
+BamRecord MakeRecord(const Position qStart,
+                     const Position qEnd,
+                     const string& seq,
+                     const string& quals,
+                     const string& tagBases,
+                     const string& tagQuals,
+                     const f_data& frames,
+                     const string& pulseCall = "",
+                     const string& pulseBases = "",
+                     const string& pulseQuals = "",
+                     const f_data& pulseFrames = f_data())
+{
+    BamRecordImpl impl;
+    impl.SetSequenceAndQualities(seq, quals);
+
+    TagCollection tags;
+    tags["qs"] = qStart;        // qStart
+    tags["qe"] = qEnd;          // qEnd
+    tags["dt"] = tagBases;      // deletionTag
+    tags["st"] = tagBases;      // substitutionTag
+    tags["dq"] = tagQuals;      // deletionQV
+    tags["iq"] = tagQuals;      // insertionQV
+    tags["mq"] = tagQuals;      // mergeQV
+    tags["sq"] = tagQuals;      // substitutionQV
+    tags["ip"] = frames;        // IPD
+    tags["pw"] = frames;        // pulseWidth
+    tags["pc"] = pulseCall;     // pulseCall
+    tags["pt"] = pulseBases;    // altLabelTag
+    tags["pq"] = pulseQuals;    // labelQV
+    tags["pv"] = pulseQuals;    // altLabelQV
+    tags["pg"] = pulseQuals;    // pulseMergeQV
+    tags["pa"] = pulseFrames;   // pkmean
+    tags["pm"] = pulseFrames;   // pkmid
+    impl.Tags(tags);
+
+    return BamRecord(std::move(impl));
+}
+
+static
+BamRecord MakeCCSRecord(const string& seq,
+                        const string& quals,
+                        const string& tagBases,
+                        const string& tagQuals,
+                        const f_data& frames,
+                        const string& pulseCall = "",
+                        const string& pulseBases = "",
+                        const string& pulseQuals = "",
+                        const f_data& pulseFrames = f_data())
+{
+    BamRecordImpl impl;
+    impl.Name("movie/42/ccs");
+    impl.SetSequenceAndQualities(seq, quals);
+
+    TagCollection tags;
+    tags["dt"] = tagBases;      // deletionTag
+    tags["st"] = tagBases;      // substitutionTag
+    tags["dq"] = tagQuals;      // deletionQV
+    tags["iq"] = tagQuals;      // insertionQV
+    tags["mq"] = tagQuals;      // mergeQV
+    tags["sq"] = tagQuals;      // substitutionQV
+    tags["ip"] = frames;        // IPD
+    tags["pw"] = frames;        // pulseWidth
+    tags["pc"] = pulseCall;     // pulseCall
+    tags["pt"] = pulseBases;    // altLabelTag
+    tags["pq"] = pulseQuals;    // labelQV
+    tags["pv"] = pulseQuals;    // altLabelQV
+    tags["pg"] = pulseQuals;    // pulseMergeQV
+    tags["pa"] = pulseFrames;   // pkmean
+    tags["pm"] = pulseFrames;   // pkmid
+    impl.Tags(tags);
+
+    return BamRecord(std::move(impl));
+}
+
+} // namespace tests
+
+TEST(BamRecordClippingTest, ClipToQuery_Basic)
+{
+    const Position qStart  = 500;
+    const Position qEnd    = 510;
+    const string seq       = "AACCGTTAGC";
+    const string quals     = "?]?]?]?]?*";
+    const string tagBases  = "AACCGTTAGC";
+    const string tagQuals  = "?]?]?]?]?*";
+    const f_data frames    = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    const string pulseCall   = "ttAaAtaCCGggatTTAcatGCt";
+    const string pulseBases  = pulseCall;
+    const string pulseQuals  = "==?=]==?]?====]?]===?*=";
+    const f_data pulseFrames = { 0,0,10,0,10,0,0,20,20,30,0,0,0,0,40,40,10,0,0,0,30,20,0 };
+
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+
+    const Position clipStart = 502;
+    const Position clipEnd   = 509;
+
+    const string seq_clipped      = "CCGTTAG";
+    const string quals_clipped    = "?]?]?]?";
+    const string tagBases_clipped = "CCGTTAG";
+    const string tagQuals_clipped = "?]?]?]?";
+    const f_data frames_clipped   = { 20, 20, 30, 40, 40, 10, 30 };
+
+    const string pulseCall_clipped = "CCGggatTTAcatG";
+    const string pulseQuals_clipped = "?]?====]?]===?";
+    const f_data pulseFrames_clipped = { 20,20,30,0,0,0,0,40,40,10,0,0,0,30 };
+
+    const string seq_rev       = "GCTAACGGTT";
+    const string pulseCall_rev = "aGCatgTAAatccCGGtaTtTaa";
+    const string quals_rev     = "*?]?]?]?]?";
+    const string tagQuals_rev  = quals_rev;
+    const f_data frames_rev    = { 20, 30, 10, 40, 40, 30, 20, 20, 10, 10 };
+
+    const string seq_rev_clipped   = "CTAACGG";
+    const string quals_rev_clipped = "?]?]?]?";
+    const string tagBases_rev_clipped = seq_rev_clipped;
+    const string tagQuals_rev_clipped = quals_rev_clipped;
+    const f_data frames_rev_clipped = { 30, 10, 40, 40, 30, 20, 20 };
+
+    const string pulseCall_rev_clipped = "CatgTAAatccCGG";
+    const string pulseQuals_rev_clipped    = "?===]?]====?]?";
+    const f_data pulseFrames_rev_clipped = { 30,0,0,0,10,40,40,0,0,0,0,30,20,20 };
+
+    const string s1_cigar = "10=";
+    const string s2_cigar = "5=3D5=";
+    const string s3_cigar = "4=1D2I2D4=";
+
+    const string s1_cigar_clipped = "7=";
+    const string s2_cigar_clipped = "3=3D4=";
+    const string s3_cigar_clipped = "2=1D2I2D3=";
+
+    const BamRecord prototype = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                                  pulseCall, pulseBases, pulseQuals, pulseFrames);
+
+    BamRecord s0 = prototype; // unmapped record
+    BamRecord s1 = prototype.Mapped(tId, tPos, Strand::FORWARD, s1_cigar, mapQual);
+    BamRecord s2 = prototype.Mapped(tId, tPos, Strand::FORWARD, s2_cigar, mapQual);
+    BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+    BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual);
+    BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual);
+    BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual);
+
+    s0.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s1.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s2.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s3.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s1_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s2_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s3_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+    {   // s0
+
+        EXPECT_FALSE(s0.IsMapped());
+        EXPECT_EQ(clipStart, s0.QueryStart());
+        EXPECT_EQ(clipEnd,   s0.QueryEnd());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.AlignedStart());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.AlignedEnd());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.ReferenceStart());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.ReferenceEnd());
+
+        const BamRecordView view
+        {
+            s0,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,       view.Sequence());
+        EXPECT_EQ(quals_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_clipped, view.PulseCalls());
+    }
+
+    {   // s1 - FORWARD
+
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(clipStart, s1.QueryStart());
+        EXPECT_EQ(clipEnd,   s1.QueryEnd());
+        EXPECT_EQ(clipStart, s1.AlignedStart());   // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s1.AlignedEnd());     // alignStart + seqLength
+        EXPECT_EQ(102, s1.ReferenceStart());       // 100 + startOffset
+        EXPECT_EQ(109, s1.ReferenceEnd());         // RefStart + 7=
+
+        EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,       view.Sequence());
+        EXPECT_EQ(quals_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_clipped, view.PulseCalls());
+    }
+
+    {   // s1 - REVERSE
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s1_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s1_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s1_rev.AlignedStart());    // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s1_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(102, s1_rev.ReferenceStart());        // 100 + startOffset
+        EXPECT_EQ(109, s1_rev.ReferenceEnd());          // RefStart + 7=
+
+        EXPECT_EQ(s1_cigar_clipped, s1_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_rev_clipped,       view.Sequence());
+        EXPECT_EQ(quals_rev_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_rev_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls());
+    }
+
+    {   // s2 - FORWARD
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(clipStart, s2.QueryStart());
+        EXPECT_EQ(clipEnd,   s2.QueryEnd());
+        EXPECT_EQ(clipStart, s2.AlignedStart());   // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s2.AlignedEnd());     // alignStart + seqLength
+        EXPECT_EQ(102, s2.ReferenceStart());       // 100 + startOffset
+        EXPECT_EQ(112, s2.ReferenceEnd());         // RefStart + 7= + 3D
+
+        EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,      view.Sequence());
+        EXPECT_EQ(quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - REVERSE
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s2_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s2_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s2_rev.AlignedStart());    // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s2_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(102, s2_rev.ReferenceStart());        // 100 + startOffset
+        EXPECT_EQ(112, s2_rev.ReferenceEnd());          // RefStart + 7= + 3D
+
+        EXPECT_EQ(s2_cigar_clipped, s2_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_rev_clipped,       view.Sequence());
+        EXPECT_EQ(quals_rev_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_rev_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls());
+    }
+
+    {   // s3 - FORWARD
+
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(clipStart, s3.QueryStart());
+        EXPECT_EQ(clipEnd,   s3.QueryEnd());
+        EXPECT_EQ(clipStart, s3.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s3.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(102, s3.ReferenceStart());         // 100 + startOffset
+        EXPECT_EQ(110, s3.ReferenceEnd());           // RefStart + 5= + 3D
+
+        EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,       view.Sequence());
+        EXPECT_EQ(quals_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_clipped, view.PulseCalls());
+    }
+
+    {   // s3 - REVERSE
+
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s3_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s3_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s3_rev.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s3_rev.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(102, s3_rev.ReferenceStart());         // 100 + startOffset
+        EXPECT_EQ(110, s3_rev.ReferenceEnd());           // RefStart + 5= + 3D
+
+        EXPECT_EQ(s3_cigar_clipped, s3_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_rev_clipped,       view.Sequence());
+        EXPECT_EQ(quals_rev_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_rev_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls());
+    }
+}
+
+TEST(BamRecordClippingTest, ClipToQuery_WithSoftClips)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 515;
+    const string seq      = "TTAACCGTTAGCAAA";
+    const string seq_rev  = "TTTGCTAACGGTTAA";
+    const string quals    = "--?]?]?]?]?*+++";
+    const string tagBases = "TTAACCGTTAGCAAA";
+    const string tagQuals = "--?]?]?]?]?*+++";
+    const string tagQuals_rev = "+++*?]?]?]?]?--";
+    const f_data frames   = { 40, 40, 10, 10, 20, 20, 30, 40, 40, 10, 30, 20, 10, 10, 10 };
+    const f_data frames_rev = { 10, 10, 10, 20, 30, 10, 40, 40, 30, 20, 20, 10, 10, 40, 40 };
+
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+
+    const Position clipStart = 502;
+    const Position clipEnd   = 509;
+
+    const string s1_cigar = "2S10=3S";
+    const string s1_cigar_clipped = "7=";
+    const string s1_seq_clipped      = "AACCGTT";
+    const string s1_quals_clipped    = "?]?]?]?";
+    const string s1_tagBases_clipped = s1_seq_clipped;
+    const string s1_tagQuals_clipped = s1_quals_clipped;
+    const f_data s1_frames_clipped   = { 10, 10, 20, 20, 30, 40, 40 };
+    const string s1_seq_rev_clipped   = "AACGGTT";
+    const string s1_quals_rev_clipped = "?]?]?]?";
+    const string s1_tagBases_rev_clipped = s1_seq_rev_clipped;
+    const string s1_tagQuals_rev_clipped = s1_quals_rev_clipped;
+    const f_data s1_frames_rev_clipped = { 40, 40, 30, 20, 20, 10, 10 };
+
+    const string s2_cigar = "2S5=3D5=3S";
+    const string s2_cigar_clipped = "5=3D2=";
+    const string s2_seq_clipped      = "AACCGTT";
+    const string s2_quals_clipped    = "?]?]?]?";
+    const string s2_tagBases_clipped = s2_seq_clipped;
+    const string s2_tagQuals_clipped = s2_quals_clipped;
+    const f_data s2_frames_clipped   = { 10, 10, 20, 20, 30, 40, 40 };
+    const string s2_seq_rev_clipped   = "AACGGTT";
+    const string s2_quals_rev_clipped = "?]?]?]?";
+    const string s2_tagBases_rev_clipped = s2_seq_rev_clipped;
+    const string s2_tagQuals_rev_clipped = s2_quals_rev_clipped;
+    const f_data s2_frames_rev_clipped = { 40, 40, 30, 20, 20, 10, 10 };
+
+    const string s3_cigar = "2S4=1D2I2D4=3S";
+    const string s3_cigar_clipped = "4=1D2I2D1=";
+    const string s3_seq_clipped      = "AACCGTT";
+    const string s3_quals_clipped    = "?]?]?]?";
+    const string s3_tagBases_clipped = s3_seq_clipped;
+    const string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 10, 10, 20, 20, 30, 40, 40 };
+    const string s3_seq_rev_clipped   = "AACGGTT";
+    const string s3_quals_rev_clipped = "?]?]?]?";
+    const string s3_tagBases_rev_clipped = s3_seq_rev_clipped;
+    const string s3_tagQuals_rev_clipped = s3_quals_rev_clipped;
+    const f_data s3_frames_rev_clipped = { 40, 40, 30, 20, 20, 10, 10 };
+
+    const BamRecord prototype = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                                  seq, tagBases, tagQuals, frames);
+    BamRecord s1 = prototype.Mapped(tId, tPos, Strand::FORWARD, s1_cigar, mapQual);
+    BamRecord s2 = prototype.Mapped(tId, tPos, Strand::FORWARD, s2_cigar, mapQual);
+    BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+    BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual);
+    BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual);
+    BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual);
+
+    // sanity checks before clipping
+    EXPECT_TRUE(s1.IsMapped());
+    EXPECT_EQ(tPos, s1.ReferenceStart());
+    EXPECT_EQ(tPos + 10, s1.ReferenceEnd()); // 10=
+
+    EXPECT_TRUE(s1_rev.IsMapped());
+    EXPECT_EQ(tPos, s1_rev.ReferenceStart());
+    EXPECT_EQ(tPos + 10, s1_rev.ReferenceEnd()); // 10=
+
+    EXPECT_TRUE(s2.IsMapped());
+    EXPECT_EQ(tPos, s2.ReferenceStart());
+    EXPECT_EQ(tPos + 13, s2.ReferenceEnd());   // 5= + 3D + 5=
+
+    EXPECT_TRUE(s2_rev.IsMapped());
+    EXPECT_EQ(tPos, s2_rev.ReferenceStart());
+    EXPECT_EQ(tPos + 13, s2_rev.ReferenceEnd());   // 5= + 3D + 5=
+
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(tPos, s3.ReferenceStart());
+    EXPECT_EQ(tPos + 11, s3.ReferenceEnd());   // 4= + 1D + 2D + 4=
+
+    EXPECT_TRUE(s3_rev.IsMapped());
+    EXPECT_EQ(tPos, s3_rev.ReferenceStart());
+    EXPECT_EQ(tPos + 11, s3_rev.ReferenceEnd());   // 4= + 1D + 2D + 4=
+
+    s1.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s2.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s3.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s1_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s2_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s3_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+    {   // s1 - FORWARD
+
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(clipStart, s1.QueryStart());
+        EXPECT_EQ(clipEnd,   s1.QueryEnd());
+        EXPECT_EQ(clipStart, s1.AlignedStart());    // queryStart (no soft clips left)
+        EXPECT_EQ(clipEnd,   s1.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s1.ReferenceStart());  // tPos
+        EXPECT_EQ(tPos + 7,  s1.ReferenceEnd());    // RefStart + 7=
+
+        EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s1 - REVERSE
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s1_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s1_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s1_rev.AlignedStart());    // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s1_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s1_rev.ReferenceStart());  // 100 + startOffset
+        EXPECT_EQ(tPos + 7,  s1_rev.ReferenceEnd());    // RefStart + 7=
+
+        EXPECT_EQ(s1_cigar_clipped, s1_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_frames_rev_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - FORWARD
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(clipStart, s2.QueryStart());
+        EXPECT_EQ(clipEnd,   s2.QueryEnd());
+        EXPECT_EQ(clipStart, s2.AlignedStart());    // queryStart (no soft clips left)
+        EXPECT_EQ(clipEnd,   s2.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s2.ReferenceStart());  // 100 + startOffset
+        EXPECT_EQ(tPos + 10, s2.ReferenceEnd());    // RefStart + 5=3D2=
+
+        EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - REVERSE
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s2_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s2_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s2_rev.AlignedStart());    // queryStart (no soft clips left)
+        EXPECT_EQ(clipEnd,   s2_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s2_rev.ReferenceStart());  // 100 + startOffset
+        EXPECT_EQ(tPos + 10, s2_rev.ReferenceEnd());    // RefStart + 5=3D2=
+
+        EXPECT_EQ(s2_cigar_clipped, s2_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s2_frames_rev_clipped,   view.IPD().Data());
+    }
+
+    {   // s3 - FORWARD
+
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(clipStart, s3.QueryStart());
+        EXPECT_EQ(clipEnd,   s3.QueryEnd());
+        EXPECT_EQ(clipStart, s3.AlignedStart());    // queryStart (no soft clips left)
+        EXPECT_EQ(clipEnd,   s3.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s3.ReferenceStart());  // 100 + startOffset
+        EXPECT_EQ(tPos + 8,  s3.ReferenceEnd());    // RefStart + 4=1D2D1=
+
+        EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s3 - REVERSE
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s3_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s3_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s3_rev.AlignedStart());    // queryStart (no soft clips left)
+        EXPECT_EQ(clipEnd,   s3_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s3_rev.ReferenceStart());  // 100 + startOffset
+        EXPECT_EQ(tPos + 8,  s3_rev.ReferenceEnd());    // RefStart + 4=1D2D1=
+
+        EXPECT_EQ(s3_cigar_clipped, s3_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_rev_clipped,   view.IPD().Data());
+    }
+}
+
+TEST(BamRecordClippingTest, ClipToReference_Basic)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const string seq      = "AACCGTTAGC";
+    const string quals    = "?]?]?]?]?*";
+    const string tagBases = "AACCGTTAGC";
+    const string tagQuals = "?]?]?]?]?*";
+    const string tagQuals_rev = "*?]?]?]?]?";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+
+    const Position clipStart = 102;
+    const Position clipEnd   = 107;
+
+    const string s1_cigar = "10=";
+    const string s1_cigar_clipped = "5=";
+    const string s1_seq_clipped      = "CCGTT";
+    const string s1_quals_clipped    = "?]?]?";
+    const string s1_tagBases_clipped = s1_seq_clipped;
+    const string s1_tagQuals_clipped = s1_quals_clipped;
+    const f_data s1_frames_clipped   = { 20, 20, 30, 40, 40 };
+    const string s1_seq_rev_clipped   = "TAACG";
+    const string s1_quals_rev_clipped = "]?]?]";
+    const string s1_tagBases_rev_clipped = s1_seq_rev_clipped;
+    const string s1_tagQuals_rev_clipped = s1_quals_rev_clipped;
+    const f_data s1_frames_rev_clipped = { 10, 40, 40, 30, 20 };
+
+    const string s2_cigar = "5=3D5=";
+    const string s2_cigar_clipped = "3=2D";
+    const string s2_seq_clipped      = "CCG";
+    const string s2_quals_clipped    = "?]?";
+    const string s2_tagBases_clipped = s2_seq_clipped;
+    const string s2_tagQuals_clipped = s2_quals_clipped;
+    const f_data s2_frames_clipped   = { 20, 20, 30 };
+    const string s2_seq_rev_clipped   = "TAA";
+    const string s2_quals_rev_clipped = "]?]";
+    const string s2_tagBases_rev_clipped = s2_seq_rev_clipped;
+    const string s2_tagQuals_rev_clipped = s2_quals_rev_clipped;
+    const f_data s2_frames_rev_clipped = { 10, 40, 40 };
+
+    const string s3_cigar = "4=1D2I2D4=";
+    const string s3_cigar_clipped = "2=1D2I2D";
+    const string s3_seq_clipped      = "CCGT";
+    const string s3_quals_clipped    = "?]?]";
+    const string s3_tagBases_clipped = s3_seq_clipped;
+    const string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 20, 20, 30, 40 };
+    const string s3_seq_rev_clipped   = "TAAC";
+    const string s3_quals_rev_clipped = "]?]?";
+    const string s3_tagBases_rev_clipped = s3_seq_rev_clipped;
+    const string s3_tagQuals_rev_clipped = s3_quals_rev_clipped;
+    const f_data s3_frames_rev_clipped = { 10, 40, 40, 30};
+
+    const BamRecord prototype = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                                  seq, tagBases, tagQuals, frames);
+    BamRecord s0 = prototype;
+    BamRecord s1 = prototype.Mapped(tId, tPos, Strand::FORWARD, s1_cigar, mapQual);
+    BamRecord s2 = prototype.Mapped(tId, tPos, Strand::FORWARD, s2_cigar, mapQual);
+    BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+    BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual);
+    BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual);
+    BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual);
+
+    s0.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s1.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s2.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s3.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s1_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s2_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s3_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+
+    {   // s0 - no clipping should have been done to unmapped record
+
+        EXPECT_FALSE(s0.IsMapped());
+        EXPECT_EQ(prototype.QueryStart(),     s0.QueryStart());
+        EXPECT_EQ(prototype.QueryEnd(),       s0.QueryEnd());
+        EXPECT_EQ(prototype.AlignedStart(),   s0.AlignedStart());
+        EXPECT_EQ(prototype.AlignedEnd(),     s0.AlignedEnd());
+        EXPECT_EQ(prototype.ReferenceStart(), s0.ReferenceStart());
+        EXPECT_EQ(prototype.ReferenceEnd(),   s0.ReferenceEnd());
+
+        const BamRecordView protoView
+        {
+            prototype,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        const BamRecordView view
+        {
+            s0,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(protoView.Sequence(),       view.Sequence());
+        EXPECT_EQ(protoView.Qualities(),      view.Qualities());
+        EXPECT_EQ(protoView.DeletionTags(),    view.DeletionTags());
+        EXPECT_EQ(protoView.DeletionQVs(),     view.DeletionQVs());
+        EXPECT_EQ(protoView.LabelQVs(),        view.LabelQVs());
+        EXPECT_EQ(protoView.AltLabelQVs(),     view.AltLabelQVs());
+        EXPECT_EQ(protoView.IPD(),            view.IPD());
+    }
+
+    {   // s1 - FORWARD
+
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(502,   s1.QueryStart());
+        EXPECT_EQ(507,   s1.QueryEnd());
+        EXPECT_EQ(502,   s1.AlignedStart());       // queryStart (no soft clips)
+        EXPECT_EQ(507,   s1.AlignedEnd());         // alignStart + seqLength
+        EXPECT_EQ(clipStart, s1.ReferenceStart()); // clipStart
+        EXPECT_EQ(clipEnd,   s1.ReferenceEnd());   // clipEnd
+
+        EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s1_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s1 - REVERSE
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(503, s1_rev.QueryStart());
+        EXPECT_EQ(508, s1_rev.QueryEnd());
+        EXPECT_EQ(503, s1_rev.AlignedStart());          // queryStart (no soft clips)
+        EXPECT_EQ(508, s1_rev.AlignedEnd());            // alignStart + seqLength
+        EXPECT_EQ(clipStart, s1_rev.ReferenceStart());  // clipStart
+        EXPECT_EQ(clipEnd,   s1_rev.ReferenceEnd());    // clipEnd
+
+        EXPECT_EQ(s1_cigar_clipped, s1_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s1_frames_rev_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - FORWARD
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(502, s2.QueryStart());
+        EXPECT_EQ(505, s2.QueryEnd());
+        EXPECT_EQ(502, s2.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(505, s2.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(clipStart, s2.ReferenceStart());   // clipStart
+        EXPECT_EQ(clipEnd,   s2.ReferenceEnd());     // clipEnd
+
+        EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s2_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - REVERSE
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(505, s2_rev.QueryStart());
+        EXPECT_EQ(508, s2_rev.QueryEnd());
+        EXPECT_EQ(505, s2_rev.AlignedStart());    // queryStart (no soft clips)
+        EXPECT_EQ(508, s2_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(clipStart, s2_rev.ReferenceStart());  // clipStart
+        EXPECT_EQ(clipEnd,   s2_rev.ReferenceEnd());    // clipEnd
+
+        EXPECT_EQ(s2_cigar_clipped, s2_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s2_frames_rev_clipped,   view.IPD().Data());
+    }
+
+    {   // s3 - FORWARD
+
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(502, s3.QueryStart());
+        EXPECT_EQ(506, s3.QueryEnd());
+        EXPECT_EQ(502, s3.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(506, s3.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(clipStart, s3.ReferenceStart());   // clipStart
+        EXPECT_EQ(clipEnd,   s3.ReferenceEnd());     // clipEnd
+
+        EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s3 - REVERSE
+
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(504, s3_rev.QueryStart());
+        EXPECT_EQ(508, s3_rev.QueryEnd());
+        EXPECT_EQ(504, s3_rev.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(508, s3_rev.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(clipStart, s3_rev.ReferenceStart());  // clipStart
+        EXPECT_EQ(clipEnd,   s3_rev.ReferenceEnd());    // clipEnd
+
+        EXPECT_EQ(s3_cigar_clipped, s3_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_rev_clipped,   view.IPD().Data());
+    }
+}
+
+TEST(BamRecordClippingTest, ClipToReference_WithSoftClips)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 515;
+    const string seq      = "TTAACCGTTAGCAAA";
+    const string quals    = "--?]?]?]?]?*+++";
+    const string tagBases = "TTAACCGTTAGCAAA";
+    const string tagQuals = "--?]?]?]?]?*+++";
+    const string tagQuals_rev = "+++*?]?]?]?]?--";
+    const f_data frames   = { 40, 40, 10, 10, 20, 20, 30, 40, 40, 10, 30, 20, 10, 10, 10 };
+
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+
+    const Position clipStart = 102;
+    const Position clipEnd   = 107;
+
+    const string seq_rev      = "TTTGCTAACGGTTAA";
+    const string quals_rev    = "+++*?]?]?]?]?--";
+    const f_data frames_rev   = { 10, 10, 10, 20, 30, 10, 40, 40, 30, 20, 20, 10, 10, 40, 40 };
+
+    const string s1_cigar = "2S10=3S";
+    const string s1_cigar_clipped = "5=";
+    const string s1_seq_clipped      = "CCGTT";
+    const string s1_quals_clipped    = "?]?]?";
+    const string s1_tagBases_clipped = s1_seq_clipped;
+    const string s1_tagQuals_clipped = s1_quals_clipped;
+    const f_data s1_frames_clipped   = { 20, 20, 30, 40, 40 };
+    const string s1_seq_rev_clipped   = "CTAAC";
+    const string s1_quals_rev_clipped = "?]?]?";
+    const string s1_tagBases_rev_clipped = s1_seq_rev_clipped;
+    const string s1_tagQuals_rev_clipped = s1_quals_rev_clipped;
+    const f_data s1_frames_rev_clipped = { 30, 10, 40, 40, 30 };
+
+    const string s2_cigar = "2S5=3D5=3S";
+    const string s2_cigar_clipped = "3=2D";
+    const string s2_seq_clipped      = "CCG";
+    const string s2_quals_clipped    = "?]?";
+    const string s2_tagBases_clipped = s2_seq_clipped;
+    const string s2_tagQuals_clipped = s2_quals_clipped;
+    const f_data s2_frames_clipped   = { 20, 20, 30 };
+    const string s2_seq_rev_clipped   = "CTA";
+    const string s2_quals_rev_clipped = "?]?";
+    const string s2_tagBases_rev_clipped = s2_seq_rev_clipped;
+    const string s2_tagQuals_rev_clipped = s2_quals_rev_clipped;
+    const f_data s2_frames_rev_clipped = { 30, 10, 40 };
+
+    const string s3_cigar = "2S4=1D2I2D4=3S";
+    const string s3_cigar_clipped = "2=1D2I2D";
+    const string s3_seq_clipped      = "CCGT";
+    const string s3_quals_clipped    = "?]?]";
+    const string s3_tagBases_clipped = s3_seq_clipped;
+    const string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 20, 20, 30, 40 };
+    const string s3_seq_rev_clipped   = "CTAA";
+    const string s3_quals_rev_clipped = "?]?]";
+    const string s3_tagBases_rev_clipped = s3_seq_rev_clipped;
+    const string s3_tagQuals_rev_clipped = s3_quals_rev_clipped;
+    const f_data s3_frames_rev_clipped = { 30, 10, 40, 40 };
+
+    const BamRecord prototype = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                                  seq, tagBases, tagQuals, frames);
+    BamRecord s0 = prototype;
+    BamRecord s1 = prototype.Mapped(tId, tPos, Strand::FORWARD, s1_cigar, mapQual);
+    BamRecord s2 = prototype.Mapped(tId, tPos, Strand::FORWARD, s2_cigar, mapQual);
+    BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+    BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual);
+    BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual);
+    BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual);
+
+    // sanity checks before clipping
+    EXPECT_FALSE(s0.IsMapped());
+
+    EXPECT_TRUE(s1.IsMapped());
+    EXPECT_EQ(500,       s1.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s1.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(502,       s1.AlignedStart());    // queryStart + 2S
+    EXPECT_EQ(512,       s1.AlignedEnd());      // alignedStart + 10=
+    EXPECT_EQ(tPos,      s1.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 10, s1.ReferenceEnd());    // tPos + 10=
+
+    EXPECT_TRUE(s1_rev.IsMapped());
+    EXPECT_EQ(500,       s1_rev.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s1_rev.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(503,       s1_rev.AlignedStart());    // queryStart + 3S
+    EXPECT_EQ(513,       s1_rev.AlignedEnd());      // alignedStart + 10=
+    EXPECT_EQ(tPos,      s1_rev.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 10, s1_rev.ReferenceEnd());    // tPos + 10=
+
+    EXPECT_TRUE(s2.IsMapped());
+    EXPECT_EQ(500,       s2.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s2.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(502,       s2.AlignedStart());    // queryStart + 2S
+    EXPECT_EQ(512,       s2.AlignedEnd());      // alignedStart + 5=5=
+    EXPECT_EQ(tPos,      s2.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 13, s2.ReferenceEnd());    // tPos + 5=3D5=
+
+    EXPECT_TRUE(s2_rev.IsMapped());
+    EXPECT_EQ(500,       s2_rev.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s2_rev.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(503,       s2_rev.AlignedStart());    // queryStart + S
+    EXPECT_EQ(513,       s2_rev.AlignedEnd());      // alignedStart + 5=5=
+    EXPECT_EQ(tPos,      s2_rev.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 13, s2_rev.ReferenceEnd());    // tPos + 5=3D5=
+
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(500,       s3.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s3.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(502,       s3.AlignedStart());    // queryStart + 2S
+    EXPECT_EQ(512,       s3.AlignedEnd());      // alignedStart + 4=2I4=
+    EXPECT_EQ(tPos,      s3.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 11, s3.ReferenceEnd());    // tPos + 4=1D2D4=
+
+    EXPECT_TRUE(s3_rev.IsMapped());
+    EXPECT_EQ(500,       s3_rev.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s3_rev.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(503,       s3_rev.AlignedStart());    // queryStart + 2S
+    EXPECT_EQ(513,       s3_rev.AlignedEnd());      // alignedStart + 4=2I4=
+    EXPECT_EQ(tPos,      s3_rev.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 11, s3_rev.ReferenceEnd());    // tPos + 4=1D2D4=
+
+    s0.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s1.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s2.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s3.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s1_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s2_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s3_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+
+    {   // s0 - no clipping should have been done to unmapped record
+
+        EXPECT_FALSE(s0.IsMapped());
+        EXPECT_EQ(prototype.QueryStart(),     s0.QueryStart());
+        EXPECT_EQ(prototype.QueryEnd(),       s0.QueryEnd());
+        EXPECT_EQ(prototype.AlignedStart(),   s0.AlignedStart());
+        EXPECT_EQ(prototype.AlignedEnd(),     s0.AlignedEnd());
+        EXPECT_EQ(prototype.ReferenceStart(), s0.ReferenceStart());
+        EXPECT_EQ(prototype.ReferenceEnd(),   s0.ReferenceEnd());
+
+        const BamRecordView protoView
+        {
+            prototype,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        const BamRecordView view
+        {
+            s0,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(protoView.Sequence(),      view.Sequence());
+        EXPECT_EQ(protoView.Qualities(),     view.Qualities());
+        EXPECT_EQ(protoView.DeletionTags(),  view.DeletionTags());
+        EXPECT_EQ(protoView.DeletionQVs(),   view.DeletionQVs());
+        EXPECT_EQ(protoView.LabelQVs(),      view.LabelQVs());
+        EXPECT_EQ(protoView.AltLabelQVs(),   view.AltLabelQVs());
+        EXPECT_EQ(protoView.IPD(),           view.IPD());
+    }
+
+    {   // s1 - FORWARD
+
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(504,   s1.QueryStart());         // new queryStart
+        EXPECT_EQ(509,   s1.QueryEnd());           // queryStart + new seqLength
+        EXPECT_EQ(504,   s1.AlignedStart());       // queryStart (no soft clips remaining)
+        EXPECT_EQ(509,   s1.AlignedEnd());         // alignStart + new seqLength
+        EXPECT_EQ(clipStart, s1.ReferenceStart()); // clipStart
+        EXPECT_EQ(clipEnd,   s1.ReferenceEnd());   // clipEnd
+
+        EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s1_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s1 - REVERSE
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(506,   s1_rev.QueryStart());         // new queryStart
+        EXPECT_EQ(511,   s1_rev.QueryEnd());           // queryStart + new seqLength
+        EXPECT_EQ(506,   s1_rev.AlignedStart());       // queryStart (no soft clips remaining)
+        EXPECT_EQ(511,   s1_rev.AlignedEnd());         // alignStart + new seqLength
+        EXPECT_EQ(clipStart, s1_rev.ReferenceStart()); // clipStart
+        EXPECT_EQ(clipEnd,   s1_rev.ReferenceEnd());   // clipEnd
+
+        EXPECT_EQ(s1_cigar_clipped, s1_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s1_frames_rev_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - FORWARD
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(504, s2.QueryStart());
+        EXPECT_EQ(507, s2.QueryEnd());
+        EXPECT_EQ(504, s2.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(507, s2.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(clipStart, s2.ReferenceStart());   // clipStart
+        EXPECT_EQ(clipEnd,   s2.ReferenceEnd());     // clipEnd
+
+        EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s2_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - REVERSE
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(508,   s2_rev.QueryStart());         // new queryStart
+        EXPECT_EQ(511,   s2_rev.QueryEnd());           // queryStart + new seqLength
+        EXPECT_EQ(508,   s2_rev.AlignedStart());       // queryStart (no soft clips remaining)
+        EXPECT_EQ(511,   s2_rev.AlignedEnd());         // alignStart + new seqLength
+        EXPECT_EQ(clipStart, s2_rev.ReferenceStart()); // clipStart
+        EXPECT_EQ(clipEnd,   s2_rev.ReferenceEnd());   // clipEnd
+
+        EXPECT_EQ(s2_cigar_clipped, s2_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s2_frames_rev_clipped,   view.IPD().Data());
+    }
+
+    {   // s3 - FORWARD
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(504, s3.QueryStart());
+        EXPECT_EQ(508, s3.QueryEnd());
+        EXPECT_EQ(504, s3.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(508, s3.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(clipStart, s3.ReferenceStart());   // clipStart
+        EXPECT_EQ(clipEnd,   s3.ReferenceEnd());     // clipEnd
+
+        EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s3 - REVERSE
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(507,   s3_rev.QueryStart());         // new queryStart
+        EXPECT_EQ(511,   s3_rev.QueryEnd());           // queryStart + new seqLength
+        EXPECT_EQ(507,   s3_rev.AlignedStart());       // queryStart (no soft clips remaining)
+        EXPECT_EQ(511,   s3_rev.AlignedEnd());         // alignStart + new seqLength
+        EXPECT_EQ(clipStart, s3_rev.ReferenceStart()); // clipStart
+        EXPECT_EQ(clipEnd,   s3_rev.ReferenceEnd());   // clipEnd
+
+        EXPECT_EQ(s3_cigar_clipped, s3_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_rev_clipped,   view.IPD().Data());
+    }
+}
+
+TEST(BamRecordClippingTest, ClippedToQueryCopy)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 502;
+    const Position clipEnd   = 509;
+
+    const string seq      = "AACCGTTAGC";
+    const string quals    = "?]?]?]?]?*";
+    const string tagBases = "AACCGTTAGC";
+    const string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    const string seq_clipped      = "CCGTTAG";
+    const string quals_clipped    = "?]?]?]?";
+    const string tagBases_clipped = "CCGTTAG";
+    const string tagQuals_clipped = "?]?]?]?";
+    const f_data frames_clipped   = { 20, 20, 30, 40, 40, 10, 30 };
+
+    const string s3_cigar = "4=1D2I2D4=";
+    const string s3_cigar_clipped = "2=1D2I2D3=";
+
+    BamRecord prototype = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                            seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    BamRecord s3 = prototype.Clipped(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(clipStart, s3.QueryStart());
+    EXPECT_EQ(clipEnd,   s3.QueryEnd());
+    EXPECT_EQ(clipStart, s3.AlignedStart());     // queryStart (no soft clips)
+    EXPECT_EQ(clipEnd,   s3.AlignedEnd());       // alignStart + seqLength
+    EXPECT_EQ(102, s3.ReferenceStart());         // 100 + startOffset
+    EXPECT_EQ(110, s3.ReferenceEnd());           // RefStart + 5= + 3D
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(seq_clipped,      view.Sequence());
+    EXPECT_EQ(quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(frames_clipped,   view.IPD().Data());
+}
+
+TEST(BamRecordClippingTest, ClippedToReferenceCopy)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const string seq      = "AACCGTTAGC";
+    const string quals    = "?]?]?]?]?*";
+    const string tagBases = "AACCGTTAGC";
+    const string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 102;
+    const Position clipEnd   = 107;
+
+    const string s3_cigar = "4=1D2I2D4=";
+    const string s3_cigar_clipped = "2=1D2I2D";
+    const string s3_seq_clipped      = "CCGT";
+    const string s3_quals_clipped    = "?]?]";
+    const string s3_tagBases_clipped = s3_seq_clipped;
+    const string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 20, 20, 30, 40 };
+
+    BamRecord prototype = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                            seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    const BamRecord s3 = prototype.Clipped(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+
+    // s3 - FORWARD
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(502, s3.QueryStart());
+    EXPECT_EQ(506, s3.QueryEnd());
+    EXPECT_EQ(502, s3.AlignedStart());     // queryStart (no soft clips)
+    EXPECT_EQ(506, s3.AlignedEnd());       // alignStart + seqLength
+    EXPECT_EQ(clipStart, s3.ReferenceStart());   // clipStart
+    EXPECT_EQ(clipEnd,   s3.ReferenceEnd());     // clipEnd
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+    EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+}
+
+TEST(BamRecordClippingTest, StaticClippedToQuery)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 502;
+    const Position clipEnd   = 509;
+
+    const string seq      = "AACCGTTAGC";
+    const string quals    = "?]?]?]?]?*";
+    const string tagBases = "AACCGTTAGC";
+    const string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    const string seq_clipped      = "CCGTTAG";
+    const string quals_clipped    = "?]?]?]?";
+    const string tagBases_clipped = "CCGTTAG";
+    const string tagQuals_clipped = "?]?]?]?";
+    const f_data frames_clipped   = { 20, 20, 30, 40, 40, 10, 30 };
+
+    const string s3_cigar = "4=1D2I2D4=";
+    const string s3_cigar_clipped = "2=1D2I2D3=";
+
+    BamRecord prototype = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                            seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    const BamRecord s3 = BamRecord::Clipped(prototype, ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(clipStart, s3.QueryStart());
+    EXPECT_EQ(clipEnd,   s3.QueryEnd());
+    EXPECT_EQ(clipStart, s3.AlignedStart());     // queryStart (no soft clips)
+    EXPECT_EQ(clipEnd,   s3.AlignedEnd());       // alignStart + seqLength
+    EXPECT_EQ(102, s3.ReferenceStart());         // 100 + startOffset
+    EXPECT_EQ(110, s3.ReferenceEnd());           // RefStart + 5= + 3D
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(seq_clipped,      view.Sequence());
+    EXPECT_EQ(quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(frames_clipped,   view.IPD().Data());
+}
+
+TEST(BamRecordClippingTest, StaticClippedToReference)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const string seq      = "AACCGTTAGC";
+    const string quals    = "?]?]?]?]?*";
+    const string tagBases = "AACCGTTAGC";
+    const string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 102;
+    const Position clipEnd   = 107;
+
+    const string s3_cigar = "4=1D2I2D4=";
+    const string s3_cigar_clipped = "2=1D2I2D";
+    const string s3_seq_clipped      = "CCGT";
+    const string s3_quals_clipped    = "?]?]";
+    const string s3_tagBases_clipped = s3_seq_clipped;
+    const string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 20, 20, 30, 40 };
+
+    BamRecord prototype = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                            seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    const BamRecord s3 = BamRecord::Clipped(prototype, ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+
+    // s3 - FORWARD
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(502, s3.QueryStart());
+    EXPECT_EQ(506, s3.QueryEnd());
+    EXPECT_EQ(502, s3.AlignedStart());     // queryStart (no soft clips)
+    EXPECT_EQ(506, s3.AlignedEnd());       // alignStart + seqLength
+    EXPECT_EQ(clipStart, s3.ReferenceStart());   // clipStart
+    EXPECT_EQ(clipEnd,   s3.ReferenceEnd());     // clipEnd
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+    EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+}
+
+TEST(BamRecordTest, ClipCigarData)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 515;
+    const string seq      = "TTAACCGTTAGCAAA";
+    const string quals    = "--?]?]?]?]?*+++";
+    const string tagBases = "TTAACCGTTAGCAAA";
+    const string tagQuals = "--?]?]?]?]?*+++";
+    const f_data frames   = { 40, 40, 10, 10, 20, 20, 30, 40, 40, 10, 30, 20, 10, 10, 10 };
+    const uint8_t mapQual = 80;
+    BamRecord s3 = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                     seq, tagBases, tagQuals, frames);
+    BamRecord s3_rev = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                         seq, tagBases, tagQuals, frames);
+
+    const string s3_cigar = "5H2S4=1D2I2D4=3S7H";
+    s3.Map(0, 100, Strand::FORWARD, s3_cigar, mapQual);
+    s3_rev.Map(0, 100, Strand::REVERSE, s3_cigar, mapQual);
+
+    const Cigar s3_cigar_raw     = s3.CigarData();
+    const Cigar s3_cigar_clipped = s3.CigarData(true);
+
+    EXPECT_EQ(s3_cigar, s3_cigar_raw.ToStdString());
+    EXPECT_EQ(string("4=1D2I2D4="), s3_cigar_clipped.ToStdString());
+}
+
+TEST(BamRecordTest, CCS_ClipToQuery)
+{
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 2;
+    const Position clipEnd   = 9;
+
+    const string seq      = "AACCGTTAGC";
+    const string quals    = "?]?]?]?]?*";
+    const string tagBases = "AACCGTTAGC";
+    const string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    const string seq_clipped      = "CCGTTAG";
+    const string quals_clipped    = "?]?]?]?";
+    const string tagBases_clipped = "CCGTTAG";
+    const string tagQuals_clipped = "?]?]?]?";
+    const f_data frames_clipped   = { 20, 20, 30, 40, 40, 10, 30 };
+
+    const string s3_cigar = "4=1D2I2D4=";
+    const string s3_cigar_clipped = "2=1D2I2D3=";
+
+    BamRecord prototype = tests::MakeCCSRecord(seq, quals, tagBases, tagQuals, frames,
+                                               seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    BamRecord s3 = prototype.Clipped(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(0,   s3.AlignedStart());     // record start (no soft clips)
+    EXPECT_EQ(7,   s3.AlignedEnd());       // alignStart + clipped seqLength
+    EXPECT_EQ(102, s3.ReferenceStart());         // 100 + startOffset
+    EXPECT_EQ(110, s3.ReferenceEnd());           // RefStart + 5= + 3D
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(seq_clipped,      view.Sequence());
+    EXPECT_EQ(quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(frames_clipped,   view.IPD().Data());
+}
+
+TEST(BamRecordTest, CCS_ClipToReference)
+{
+    const string seq      = "AACCGTTAGC";
+    const string quals    = "?]?]?]?]?*";
+    const string tagBases = "AACCGTTAGC";
+    const string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 102;
+    const Position clipEnd   = 107;
+
+    const string s3_cigar = "4=1D2I2D4=";
+    const string s3_cigar_clipped = "2=1D2I2D";
+    const string s3_seq_clipped      = "CCGT";
+    const string s3_quals_clipped    = "?]?]";
+    const string s3_tagBases_clipped = s3_seq_clipped;
+    const string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 20, 20, 30, 40 };
+
+    BamRecord prototype = tests::MakeCCSRecord(seq, quals, tagBases, tagQuals, frames,
+                                               seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    const BamRecord s3 = BamRecord::Clipped(prototype, ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(0, s3.AlignedStart());     // record tart (no soft clips)
+    EXPECT_EQ(4, s3.AlignedEnd());       // alignStart + clipped seqLength (4)
+    EXPECT_EQ(clipStart, s3.ReferenceStart());   // clipStart
+    EXPECT_EQ(clipEnd,   s3.ReferenceEnd());     // clipEnd
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+    EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+}
diff --git a/tests/src/test_BamRecordImplCore.cpp b/tests/src/test_BamRecordImplCore.cpp

new file mode 100644 (file)

index 0000000..30226c2
--- /dev/null
+++ b/tests/src/test_BamRecordImplCore.cpp
@@ -0,0 +1,627 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/BamRecordImpl.h>
+#include <pbbam/BamTagCodec.h>
+#include <pbbam/Tag.h>
+#include <pbbam/TagCollection.h>
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace tests {
+
+struct Bam1Deleter
+{
+    void operator()(bam1_t* b) {
+        if (b)
+            bam_destroy1(b);
+        b = nullptr;
+    }
+};
+
+static
+BamRecordImpl CreateBamImpl(void)
+{
+    BamRecordImpl bam;
+    bam.Bin(42);
+    bam.Flag(42);
+    bam.InsertSize(42);
+    bam.MapQuality(42);
+    bam.MatePosition(42);
+    bam.MateReferenceId(42);
+    bam.Position(42);
+    bam.ReferenceId(42);
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = static_cast<int32_t>(-42);
+    bam.Tags(tags);
+
+    return bam;
+}
+
+static
+void CheckRawData(const BamRecordImpl& bam)
+{
+    // ensure raw data (lengths at least) matches API-facing data
+
+    const uint32_t expectedNameLength  = bam.Name().size() + 1;
+    const uint32_t expectedNumCigarOps = bam.CigarData().size();
+    const int32_t  expectedSeqLength   = bam.Sequence().length();
+    const size_t   expectedTagsLength  = BamTagCodec::Encode(bam.Tags()).size();
+
+    //  Name        CIGAR         Sequence       Quals      Tags
+    // l_qname + (n_cigar * 4) + (l_qseq+1)/2 + l_qseq + << TAGS >>
+
+    const int expectedTotalDataLength = expectedNameLength +
+                                        (expectedNumCigarOps * 4) +
+                                        (expectedSeqLength+1)/2 +
+                                         expectedSeqLength +
+                                         expectedTagsLength;
+
+    EXPECT_TRUE((bool)bam.d_);
+    EXPECT_EQ(expectedNameLength,      bam.d_->core.l_qname);
+    EXPECT_EQ(expectedNumCigarOps,     bam.d_->core.n_cigar);
+    EXPECT_EQ(expectedSeqLength,       bam.d_->core.l_qseq);
+    EXPECT_EQ(expectedTotalDataLength, bam.d_->l_data);
+}
+
+} // namespace tests
+
+TEST(BamRecordImplCoreTest, RawDataDefaultValues)
+{
+    PBBAM_SHARED_PTR<bam1_t> rawData(bam_init1(), tests::Bam1Deleter());
+    ASSERT_TRUE((bool)rawData);
+
+    // fixed-length (core) data
+    EXPECT_EQ(0, rawData->core.tid);
+    EXPECT_EQ(0, rawData->core.pos);
+    EXPECT_EQ(0, rawData->core.bin);
+    EXPECT_EQ(0, rawData->core.qual);
+    EXPECT_EQ(0, rawData->core.l_qname);
+    EXPECT_EQ(0, rawData->core.flag);
+    EXPECT_EQ(0, rawData->core.n_cigar);
+    EXPECT_EQ(0, rawData->core.l_qseq);
+    EXPECT_EQ(0, rawData->core.mtid);
+    EXPECT_EQ(0, rawData->core.mpos);
+    EXPECT_EQ(0, rawData->core.isize);
+
+    // variable length data
+    EXPECT_EQ(0, rawData->data);
+    EXPECT_EQ(0, rawData->l_data);
+    EXPECT_EQ(0, rawData->m_data);
+}
+
+TEST(BamRecordImplCoreTest, DefaultValues)
+{
+    BamRecordImpl bam;
+
+    // -------------------------------
+    // check raw data
+    // -------------------------------
+
+    const PBBAM_SHARED_PTR<bam1_t> rawData = bam.d_;
+    ASSERT_TRUE((bool)rawData);
+
+    // fixed-length (core) data
+    // (forced init unmapped, with NULL-term as QNAME)
+    EXPECT_EQ(-1, rawData->core.tid);
+    EXPECT_EQ(-1, rawData->core.pos);
+    EXPECT_EQ(0, rawData->core.bin);
+    EXPECT_EQ(255, rawData->core.qual);
+    EXPECT_EQ(1, rawData->core.l_qname);
+    EXPECT_EQ(BamRecordImpl::UNMAPPED, rawData->core.flag);
+    EXPECT_EQ(0, rawData->core.n_cigar);
+    EXPECT_EQ(0, rawData->core.l_qseq);
+    EXPECT_EQ(-1, rawData->core.mtid);
+    EXPECT_EQ(-1, rawData->core.mpos);
+    EXPECT_EQ(0, rawData->core.isize);
+
+    // variable length data
+    EXPECT_TRUE(rawData->data != nullptr);
+    EXPECT_EQ(1, rawData->l_data);
+    EXPECT_EQ((int)0x800, rawData->m_data);  // check this if we change or tune later
+
+    // -------------------------------
+    // check data via API calls
+    // -------------------------------
+
+    EXPECT_EQ(0, bam.Bin());
+    EXPECT_EQ(BamRecordImpl::UNMAPPED, bam.Flag());
+    EXPECT_EQ(0, bam.InsertSize());
+    EXPECT_EQ(255, bam.MapQuality());
+    EXPECT_EQ(-1, bam.MateReferenceId());
+    EXPECT_EQ(-1, bam.MatePosition());
+    EXPECT_EQ(-1, bam.Position());
+    EXPECT_EQ(-1, bam.ReferenceId());
+    EXPECT_EQ(0, bam.Tags().size());
+
+    EXPECT_FALSE(bam.IsDuplicate());
+    EXPECT_FALSE(bam.IsFailedQC());
+    EXPECT_FALSE(bam.IsFirstMate());
+    EXPECT_FALSE(bam.IsMapped());
+    EXPECT_TRUE(bam.IsMateMapped());
+    EXPECT_FALSE(bam.IsMateReverseStrand());
+    EXPECT_FALSE(bam.IsPaired());
+    EXPECT_TRUE(bam.IsPrimaryAlignment());
+    EXPECT_FALSE(bam.IsProperPair());
+    EXPECT_FALSE(bam.IsReverseStrand());
+    EXPECT_FALSE(bam.IsSecondMate());
+    EXPECT_FALSE(bam.IsSupplementaryAlignment());
+
+    const std::string emptyString = "";
+    EXPECT_EQ(emptyString, bam.Name());
+    EXPECT_EQ(emptyString, bam.CigarData().ToStdString());
+    EXPECT_EQ(emptyString, bam.Sequence());
+    EXPECT_EQ(emptyString, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplCoreTest, CoreSetters)
+{
+    BamRecordImpl bam;
+    bam.Bin(42);
+    bam.Flag(42);
+    bam.InsertSize(42);
+    bam.MapQuality(42);
+    bam.MatePosition(42);
+    bam.MateReferenceId(42);
+    bam.Position(42);
+    bam.ReferenceId(42);
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = static_cast<int32_t>(-42);
+    bam.Tags(tags); // (28 bytes encoded)
+
+    // -------------------------------
+    // check raw data
+    // -------------------------------
+
+    const PBBAM_SHARED_PTR<bam1_t> rawData = bam.d_;
+    ASSERT_TRUE((bool)rawData);
+
+    // fixed-length (core) data
+    EXPECT_EQ(42, rawData->core.tid);
+    EXPECT_EQ(42, rawData->core.pos);
+    EXPECT_EQ(42, rawData->core.bin);
+    EXPECT_EQ(42, rawData->core.qual);
+    EXPECT_EQ(1,  rawData->core.l_qname);    // initialized w/ NULL-term
+    EXPECT_EQ(42, rawData->core.flag);
+    EXPECT_EQ(0,  rawData->core.n_cigar);
+    EXPECT_EQ(0,  rawData->core.l_qseq);
+    EXPECT_EQ(42, rawData->core.mtid);
+    EXPECT_EQ(42, rawData->core.mpos);
+    EXPECT_EQ(42, rawData->core.isize);
+
+    // variable length data
+    EXPECT_TRUE(rawData->data != nullptr);
+    EXPECT_EQ(29, rawData->l_data);         // NULL-term qname + tags
+    EXPECT_EQ((int)0x800, rawData->m_data); // check this if we change or tune later
+
+    // -------------------------------
+    // check data via API calls
+    // -------------------------------
+
+    EXPECT_EQ(42, bam.Bin());
+    EXPECT_EQ(42, bam.Flag());
+    EXPECT_EQ(42, bam.InsertSize());
+    EXPECT_EQ(42, bam.MapQuality());
+    EXPECT_EQ(42, bam.MateReferenceId());
+    EXPECT_EQ(42, bam.MatePosition());
+    EXPECT_EQ(42, bam.Position());
+    EXPECT_EQ(42, bam.ReferenceId());
+
+    const TagCollection& fetchedTags = bam.Tags();
+
+    EXPECT_TRUE(fetchedTags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags.at("HX").ToString());
+    EXPECT_EQ(static_cast<int32_t>(-42), fetchedTags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags.at("CA").ToUInt8Array());
+}
+
+TEST(BamRecordImplCoreTest, DeepCopyFromRawData)
+{
+    // init raw data
+    PBBAM_SHARED_PTR<bam1_t> rawData(bam_init1(), tests::Bam1Deleter());
+    ASSERT_TRUE((bool)rawData);
+
+    rawData->core.tid = 42;
+    rawData->core.pos = 42;
+    rawData->core.bin = 42;
+    rawData->core.qual = 42;
+    rawData->core.flag = 42;
+    rawData->core.mtid = 42;
+    rawData->core.mpos = 42;
+    rawData->core.isize = 42;
+
+    const int32_t x = 42;
+    char valueBytes[sizeof x];
+    std::copy(static_cast<const char*>(static_cast<const void*>(&x)),
+              static_cast<const char*>(static_cast<const void*>(&x)) + sizeof x,
+              valueBytes);
+    bam_aux_append(rawData.get(), "XY", 'i', sizeof(x), (uint8_t*)&valueBytes[0]);
+
+    EXPECT_EQ(42, rawData->core.tid);
+    EXPECT_EQ(42, rawData->core.pos);
+    EXPECT_EQ(42, rawData->core.bin);
+    EXPECT_EQ(42, rawData->core.qual);
+    EXPECT_EQ(0,  rawData->core.l_qname);
+    EXPECT_EQ(42, rawData->core.flag);
+    EXPECT_EQ(0,  rawData->core.n_cigar);
+    EXPECT_EQ(0,  rawData->core.l_qseq);
+    EXPECT_EQ(42, rawData->core.mtid);
+    EXPECT_EQ(42, rawData->core.mpos);
+    EXPECT_EQ(42, rawData->core.isize);
+    const int32_t fetchedX = bam_aux2i( bam_aux_get(rawData.get(), "XY") );
+    EXPECT_EQ(42, fetchedX);
+
+    // static "ctor"
+    BamRecordImpl bam = BamRecordImpl::FromRawData(rawData);
+
+    // make sure raw data is still valid
+    EXPECT_EQ(42, rawData->core.tid);
+    EXPECT_EQ(42, rawData->core.pos);
+    EXPECT_EQ(42, rawData->core.bin);
+    EXPECT_EQ(42, rawData->core.qual);
+    EXPECT_EQ(0,  rawData->core.l_qname);
+    EXPECT_EQ(42, rawData->core.flag);
+    EXPECT_EQ(0,  rawData->core.n_cigar);
+    EXPECT_EQ(0,  rawData->core.l_qseq);
+    EXPECT_EQ(42, rawData->core.mtid);
+    EXPECT_EQ(42, rawData->core.mpos);
+    EXPECT_EQ(42, rawData->core.isize);
+    EXPECT_TRUE(rawData->data != nullptr);
+    EXPECT_TRUE(0 != rawData->l_data);
+    EXPECT_TRUE(0 != rawData->m_data);
+
+    // check new record
+    EXPECT_EQ(42, bam.Bin());
+    EXPECT_EQ(42, bam.Flag());
+    EXPECT_EQ(42, bam.InsertSize());
+    EXPECT_EQ(42, bam.MapQuality());
+    EXPECT_EQ(42, bam.MateReferenceId());
+    EXPECT_EQ(42, bam.MatePosition());
+    EXPECT_EQ(42, bam.Position());
+    EXPECT_EQ(42, bam.ReferenceId());
+    EXPECT_EQ(x,  bam.Tags()["XY"].ToInt32());
+
+    EXPECT_TRUE(bam.d_->data != nullptr);
+    EXPECT_TRUE(bam.d_->m_data >= (int)0x800); // check this if we change or tune later
+
+    // tweak raw data, make sure we've done a deep copy (so BamRecordImpl isn't changed)
+    rawData->core.pos = 37;
+    EXPECT_EQ(37, rawData->core.pos);
+    EXPECT_EQ(42, bam.Position());
+    EXPECT_EQ(42, bam.d_->core.pos);
+}
+
+TEST(BamRecordImplCoreTest, CopyAssignment)
+{
+    BamRecordImpl bam1;
+    bam1.Bin(42);
+    bam1.Flag(42);
+    bam1.InsertSize(42);
+    bam1.MapQuality(42);
+    bam1.MatePosition(42);
+    bam1.MateReferenceId(42);
+    bam1.Position(42);
+    bam1.ReferenceId(42);
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = static_cast<int32_t>(-42);
+    bam1.Tags(tags);
+
+    BamRecordImpl bam2;
+    bam2 = bam1;
+
+    EXPECT_EQ(42, bam1.Bin());
+    EXPECT_EQ(42, bam1.Flag());
+    EXPECT_EQ(42, bam1.InsertSize());
+    EXPECT_EQ(42, bam1.MapQuality());
+    EXPECT_EQ(42, bam1.MateReferenceId());
+    EXPECT_EQ(42, bam1.MatePosition());
+    EXPECT_EQ(42, bam1.Position());
+    EXPECT_EQ(42, bam1.ReferenceId());
+
+    const TagCollection& fetchedTags1 = bam1.Tags();
+    EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags1.at("HX").ToString());
+    EXPECT_EQ(static_cast<int32_t>(-42), fetchedTags1.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array());
+
+    EXPECT_EQ(42, bam2.Bin());
+    EXPECT_EQ(42, bam2.Flag());
+    EXPECT_EQ(42, bam2.InsertSize());
+    EXPECT_EQ(42, bam2.MapQuality());
+    EXPECT_EQ(42, bam2.MateReferenceId());
+    EXPECT_EQ(42, bam2.MatePosition());
+    EXPECT_EQ(42, bam2.Position());
+    EXPECT_EQ(42, bam2.ReferenceId());
+
+    const TagCollection& fetchedTags2 = bam2.Tags();
+    EXPECT_TRUE(fetchedTags2.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags2.at("HX").ToString());
+    EXPECT_EQ(static_cast<int32_t>(-42), fetchedTags2.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags2.at("CA").ToUInt8Array());
+
+    tests::CheckRawData(bam1);
+    tests::CheckRawData(bam2);
+}
+
+TEST(BamRecordImplCoreTest, SelfAssignmentTolerated)
+{
+    BamRecordImpl bam1;
+    bam1.Bin(42);
+    bam1.Flag(42);
+    bam1.InsertSize(42);
+    bam1.MapQuality(42);
+    bam1.MatePosition(42);
+    bam1.MateReferenceId(42);
+    bam1.Position(42);
+    bam1.ReferenceId(42);
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = static_cast<int32_t>(-42);
+    bam1.Tags(tags);
+
+    bam1 = bam1;
+
+    EXPECT_EQ(42, bam1.Bin());
+    EXPECT_EQ(42, bam1.Flag());
+    EXPECT_EQ(42, bam1.InsertSize());
+    EXPECT_EQ(42, bam1.MapQuality());
+    EXPECT_EQ(42, bam1.MateReferenceId());
+    EXPECT_EQ(42, bam1.MatePosition());
+    EXPECT_EQ(42, bam1.Position());
+    EXPECT_EQ(42, bam1.ReferenceId());
+
+    const TagCollection& fetchedTags1 = bam1.Tags();
+    EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags1.at("HX").ToString());
+    EXPECT_EQ(static_cast<int32_t>(-42), fetchedTags1.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array());
+
+    tests::CheckRawData(bam1);
+}
+
+TEST(BamRecordImplCoreTest, CopyConstructor)
+{
+    BamRecordImpl bam1;
+    bam1.Bin(42);
+    bam1.Flag(42);
+    bam1.InsertSize(42);
+    bam1.MapQuality(42);
+    bam1.MatePosition(42);
+    bam1.MateReferenceId(42);
+    bam1.Position(42);
+    bam1.ReferenceId(42);
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = static_cast<int32_t>(-42);
+    bam1.Tags(tags);
+
+    BamRecordImpl bam2(bam1);
+
+    EXPECT_EQ(42, bam1.Bin());
+    EXPECT_EQ(42, bam1.Flag());
+    EXPECT_EQ(42, bam1.InsertSize());
+    EXPECT_EQ(42, bam1.MapQuality());
+    EXPECT_EQ(42, bam1.MateReferenceId());
+    EXPECT_EQ(42, bam1.MatePosition());
+    EXPECT_EQ(42, bam1.Position());
+    EXPECT_EQ(42, bam1.ReferenceId());
+
+    const TagCollection& fetchedTags1 = bam1.Tags();
+    EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags1.at("HX").ToString());
+    EXPECT_EQ(static_cast<int32_t>(-42), fetchedTags1.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array());
+
+    EXPECT_EQ(42, bam2.Bin());
+    EXPECT_EQ(42, bam2.Flag());
+    EXPECT_EQ(42, bam2.InsertSize());
+    EXPECT_EQ(42, bam2.MapQuality());
+    EXPECT_EQ(42, bam2.MateReferenceId());
+    EXPECT_EQ(42, bam2.MatePosition());
+    EXPECT_EQ(42, bam2.Position());
+    EXPECT_EQ(42, bam2.ReferenceId());
+
+    const TagCollection& fetchedTags2 = bam2.Tags();
+    EXPECT_TRUE(fetchedTags2.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags2.at("HX").ToString());
+    EXPECT_EQ(static_cast<int32_t>(-42), fetchedTags2.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags2.at("CA").ToUInt8Array());
+
+    tests::CheckRawData(bam1);
+    tests::CheckRawData(bam2);
+}
+
+TEST(BamRecordImplCoreTest, CreateRecord_InternalTest)
+{
+    BamRecordImpl bam = tests::CreateBamImpl();
+
+    EXPECT_EQ(42, bam.Bin());
+    EXPECT_EQ(42, bam.Flag());
+    EXPECT_EQ(42, bam.InsertSize());
+    EXPECT_EQ(42, bam.MapQuality());
+    EXPECT_EQ(42, bam.MateReferenceId());
+    EXPECT_EQ(42, bam.MatePosition());
+    EXPECT_EQ(42, bam.Position());
+    EXPECT_EQ(42, bam.ReferenceId());
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = static_cast<int32_t>(-42);
+    bam.Tags(tags);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplCoreTest, MoveAssignment)
+{
+    BamRecordImpl bam;
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    bam = std::move(tests::CreateBamImpl());
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+    EXPECT_EQ(42, bam.Bin());
+    EXPECT_EQ(42, bam.Flag());
+    EXPECT_EQ(42, bam.InsertSize());
+    EXPECT_EQ(42, bam.MapQuality());
+    EXPECT_EQ(42, bam.MateReferenceId());
+    EXPECT_EQ(42, bam.MatePosition());
+    EXPECT_EQ(42, bam.Position());
+    EXPECT_EQ(42, bam.ReferenceId());
+
+    const TagCollection& fetchedTags1 = bam.Tags();
+    EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags1.at("HX").ToString());
+    EXPECT_EQ(static_cast<int32_t>(-42), fetchedTags1.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array());
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplCoreTest, MoveConstructor)
+{
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif 
+    BamRecordImpl bam(std::move(tests::CreateBamImpl()));
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+    EXPECT_EQ(42, bam.Bin());
+    EXPECT_EQ(42, bam.Flag());
+    EXPECT_EQ(42, bam.InsertSize());
+    EXPECT_EQ(42, bam.MapQuality());
+    EXPECT_EQ(42, bam.MateReferenceId());
+    EXPECT_EQ(42, bam.MatePosition());
+    EXPECT_EQ(42, bam.Position());
+    EXPECT_EQ(42, bam.ReferenceId());
+
+    const TagCollection& fetchedTags1 = bam.Tags();
+    EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags1.at("HX").ToString());
+    EXPECT_EQ(static_cast<int32_t>(-42), fetchedTags1.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array());
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplCoreTest, AlignmentFlags)
+{
+    // same set of flags, different ways of getting there
+
+    // raw number
+    BamRecordImpl bam1;
+    bam1.Flag(1107);
+
+    // enum values
+    BamRecordImpl bam2;
+    bam2.Flag(BamRecordImpl::DUPLICATE |
+              BamRecordImpl::MATE_1 |
+              BamRecordImpl::REVERSE_STRAND |
+              BamRecordImpl::PROPER_PAIR |
+              BamRecordImpl::PAIRED
+             );
+
+    // convenience calls
+    BamRecordImpl bam3;
+    bam3.SetDuplicate(true);
+    bam3.SetFirstMate(true);
+    bam3.SetReverseStrand(true);
+    bam3.SetMapped(true);
+    bam3.SetMateMapped(true);
+    bam3.SetPaired(true);
+    bam3.SetProperPair(true);
+    bam3.SetPrimaryAlignment(true);
+
+    // make sure all are same
+    EXPECT_EQ(1107, bam1.Flag());
+    EXPECT_EQ(1107, bam2.Flag());
+    EXPECT_EQ(1107, bam3.Flag());
+
+    // check API calls
+    EXPECT_TRUE(bam1.IsPaired());
+    EXPECT_TRUE(bam1.IsProperPair());
+    EXPECT_TRUE(bam1.IsMapped());
+    EXPECT_TRUE(bam1.IsMateMapped());
+    EXPECT_TRUE(bam1.IsReverseStrand());
+    EXPECT_FALSE(bam1.IsMateReverseStrand());
+    EXPECT_TRUE(bam1.IsFirstMate());
+    EXPECT_FALSE(bam1.IsSecondMate());
+    EXPECT_TRUE(bam1.IsPrimaryAlignment());
+    EXPECT_FALSE(bam1.IsFailedQC());
+    EXPECT_TRUE(bam1.IsDuplicate());
+    EXPECT_FALSE(bam1.IsSupplementaryAlignment());
+}
diff --git a/tests/src/test_BamRecordImplTags.cpp b/tests/src/test_BamRecordImplTags.cpp

new file mode 100644 (file)

index 0000000..197e11e
--- /dev/null
+++ b/tests/src/test_BamRecordImplTags.cpp
@@ -0,0 +1,214 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/BamRecordImpl.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+// NOTE: these tests check "high-level" tag query/manipulation via BamRecordImpl.
+//       For raw Tag/TagCollection tests, see test_Tags.cpp
+//       For encoding tests, see test_BamRecordImplVariableData.cpp
+
+TEST(BamRecordImplTagsTest, HasTagTest)
+{
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+
+    EXPECT_TRUE(bam.HasTag("HX"));
+    EXPECT_TRUE(bam.HasTag("CA"));
+    EXPECT_TRUE(bam.HasTag("XY"));
+
+    EXPECT_FALSE(bam.HasTag("zz"));
+    EXPECT_FALSE(bam.HasTag(""));
+    EXPECT_FALSE(bam.HasTag("some_too_long_name"));
+
+    const TagCollection& fetchedTags = bam.Tags();
+    EXPECT_TRUE(fetchedTags.Contains("HX"));
+    EXPECT_TRUE(fetchedTags.Contains("CA"));
+    EXPECT_TRUE(fetchedTags.Contains("XY"));
+    EXPECT_FALSE(fetchedTags.Contains("zz"));
+    EXPECT_FALSE(fetchedTags.Contains(""));
+    EXPECT_FALSE(fetchedTags.Contains("some_too_long_name"));
+}
+
+TEST(BamRecordImplTagsTest, SimpleAddTag)
+{
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+
+    EXPECT_TRUE(bam.HasTag("HX"));
+    EXPECT_TRUE(bam.HasTag("CA"));
+    EXPECT_FALSE(bam.HasTag("XY"));
+
+    bam.AddTag("XY", (int32_t)-42);
+
+    EXPECT_TRUE(bam.HasTag("HX"));
+    EXPECT_TRUE(bam.HasTag("CA"));
+    EXPECT_TRUE(bam.HasTag("XY"));
+
+    const TagCollection& fetchedTags = bam.Tags();
+    EXPECT_TRUE(fetchedTags.Contains("HX"));
+    EXPECT_TRUE(fetchedTags.Contains("CA"));
+    EXPECT_TRUE(fetchedTags.Contains("XY"));
+    EXPECT_FALSE(fetchedTags.Contains("zz"));
+    EXPECT_FALSE(fetchedTags.Contains(""));
+    EXPECT_FALSE(fetchedTags.Contains("some_too_long_name"));
+
+    EXPECT_EQ(-42, fetchedTags.at("XY").ToInt32());
+
+    // fail on invalid adds
+    EXPECT_FALSE(bam.AddTag("", (int32_t)-42));
+    EXPECT_FALSE(bam.AddTag("some_too_long_name", (int32_t)-42));
+    EXPECT_FALSE(bam.AddTag("XY", (int32_t)-42));                   // reject duplicate
+}
+
+TEST(BamRecordImplTagsTest, SimpleRemoveTag)
+{
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+
+    EXPECT_TRUE(bam.HasTag("HX"));
+    EXPECT_TRUE(bam.HasTag("CA"));
+    EXPECT_TRUE(bam.HasTag("XY"));
+
+    const bool removedOk = bam.RemoveTag("XY");
+    EXPECT_TRUE(removedOk);
+
+    EXPECT_TRUE(bam.HasTag("HX"));
+    EXPECT_TRUE(bam.HasTag("CA"));
+    EXPECT_FALSE(bam.HasTag("XY"));
+
+    const TagCollection& fetchedTags = bam.Tags();
+    EXPECT_TRUE(fetchedTags.Contains("HX"));
+    EXPECT_TRUE(fetchedTags.Contains("CA"));
+    EXPECT_FALSE(fetchedTags.Contains("XY"));
+    EXPECT_FALSE(fetchedTags.Contains("zz"));
+    EXPECT_FALSE(fetchedTags.Contains(""));
+    EXPECT_FALSE(fetchedTags.Contains("some_too_long_name"));
+
+    // fail on invalid removes
+    EXPECT_FALSE(bam.RemoveTag(""));
+    EXPECT_FALSE(bam.RemoveTag("some_too_long_name"));
+    EXPECT_FALSE(bam.RemoveTag("zz"));                 // reject remove unknown
+}
+
+TEST(BamRecordImplTagsTest, SimpleEditTag)
+{
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+
+    EXPECT_TRUE(bam.HasTag("XY"));
+
+    const TagCollection& fetchedTags = bam.Tags();
+    EXPECT_TRUE(fetchedTags.Contains("HX"));
+    EXPECT_TRUE(fetchedTags.Contains("CA"));
+    EXPECT_TRUE(fetchedTags.Contains("XY"));
+    EXPECT_EQ(-42, fetchedTags.at("XY").ToInt32());
+
+    const bool editedOk = bam.EditTag("XY", (int32_t)500);
+    EXPECT_TRUE(editedOk);
+    EXPECT_TRUE(bam.HasTag("XY"));
+
+    const TagCollection& fetchedTags2 = bam.Tags();
+    EXPECT_TRUE(fetchedTags2.Contains("HX"));
+    EXPECT_TRUE(fetchedTags2.Contains("CA"));
+    EXPECT_TRUE(fetchedTags2.Contains("XY"));
+    EXPECT_EQ(500, fetchedTags2.at("XY").ToInt32());
+
+    // fail on invalid edits
+    EXPECT_FALSE(bam.EditTag("", 500));
+    EXPECT_FALSE(bam.EditTag("some_too_long_name", 500));
+    EXPECT_FALSE(bam.EditTag("zz", 500));                 // reject edit unknown
+}
+
+TEST(BamRecordImplTagsTest, SimpleQueryTag)
+{
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+
+    EXPECT_TRUE(bam.HasTag("XY"));
+    EXPECT_TRUE(bam.HasTag("CA"));
+    EXPECT_TRUE(bam.HasTag("XY"));
+
+    EXPECT_EQ(string("1abc75"),              bam.TagValue("HX").ToString());
+    EXPECT_EQ(vector<uint8_t>({34, 5, 125}), bam.TagValue("CA").ToUInt8Array());
+    EXPECT_EQ((int32_t)-42,                  bam.TagValue("XY").ToInt32());
+
+    EXPECT_FALSE(bam.HasTag("zz"));
+    EXPECT_FALSE(bam.HasTag(""));
+    EXPECT_FALSE(bam.HasTag("some_too_long_name"));
+
+    EXPECT_EQ(Tag(), bam.TagValue("zz"));
+    EXPECT_EQ(Tag(), bam.TagValue(""));
+    EXPECT_EQ(Tag(), bam.TagValue("some_too_long_name"));
+}
+
diff --git a/tests/src/test_BamRecordImplVariableData.cpp b/tests/src/test_BamRecordImplVariableData.cpp

new file mode 100644 (file)

index 0000000..7a63700
--- /dev/null
+++ b/tests/src/test_BamRecordImplVariableData.cpp
@@ -0,0 +1,4542 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/BamRecordImpl.h>
+#include <pbbam/BamTagCodec.h>
+#include <pbbam/SamTagCodec.h>
+#include <pbbam/Tag.h>
+#include <pbbam/TagCollection.h>
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <utility>
+#include <vector>
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+// NOTE: this file has a *TON* of tests. Probably overkill, but I wanted to check
+//       every possible combination of variable data, and then manipulate each
+//       element within each combo to shrink & expand.
+
+namespace tests {
+
+static
+void CheckRawData(const BamRecordImpl& bam)
+{
+    // ensure raw data (lengths at least) matches API-facing data
+
+    const uint32_t expectedNameLength  = bam.Name().size() + 1;
+    const uint32_t expectedNumCigarOps = bam.CigarData().size();
+    const int32_t  expectedSeqLength   = bam.Sequence().length();
+    const size_t   expectedTagsLength  = BamTagCodec::Encode(bam.Tags()).size();
+
+    //  Name        CIGAR         Sequence       Quals      Tags
+    // l_qname + (n_cigar * 4) + (l_qseq+1)/2 + l_qseq + <encoded length>
+
+    const int expectedTotalDataLength = expectedNameLength +
+                                        (expectedNumCigarOps * 4) +
+                                        (expectedSeqLength+1)/2 +
+                                         expectedSeqLength +
+                                         expectedTagsLength;
+
+    EXPECT_EQ(expectedNameLength,      bam.d_->core.l_qname);
+    EXPECT_EQ(expectedNumCigarOps,     bam.d_->core.n_cigar);
+    EXPECT_EQ(expectedSeqLength,       bam.d_->core.l_qseq);
+    EXPECT_EQ(expectedTotalDataLength, bam.d_->l_data);
+}
+
+} // namespace tests
+
+TEST(BamRecordImplVariableDataTest, InitEmpty)
+{
+    BamRecordImpl bam;
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, TagOnly_InitEmpty)
+{
+    BamRecordImpl bam;
+    bam.Tags(TagCollection());
+    EXPECT_EQ(0, bam.Tags().size());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, TagOnly_InitNormal)
+{
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+}
+
+TEST(BamRecordImplVariableDataTest, TagOnly_ThenOverwriteWithLongerTags)
+{
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+}
+
+TEST(BamRecordImplVariableDataTest, TagOnly_ThenOverwriteWithShorterTags)
+{
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+}
+
+TEST(BamRecordImplVariableDataTest, TagOnly_ThenOverwriteWithEmptyTags)
+{
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(0, bam.Tags().size());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_InitEmpty)
+{
+    BamRecordImpl bam;
+    bam.CigarData(std::string());
+    EXPECT_EQ(0, bam.CigarData().size());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_InitNormal_CigarObject)
+{
+    Cigar cigar;
+    cigar.push_back(CigarOperation('=', 100));
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(cigar, bam.CigarData());
+    EXPECT_TRUE("100=" == bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_InitNormal_StdString)
+{
+    const std::string cigar = "100=";
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_ThenOverwriteWithLongerCigar)
+{
+    const std::string cigar       = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+    bam.CigarData(longerCigar);
+
+    EXPECT_EQ(longerCigar, bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_ThenOverwriteWithShorterCigar)
+{
+    const std::string cigar       = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    BamRecordImpl bam;
+    bam.CigarData(longerCigar);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_ThenOverwriteWithEmptyCigar)
+{
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+    bam.CigarData(empty);
+
+    EXPECT_EQ(empty, bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_Init_Normal)
+{
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_Init_EmptyCigar)
+{
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(empty);
+
+    EXPECT_EQ(empty, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_Init_EmptyTag)
+{
+    const std::string cigar = "100=";
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(0,     bam.Tags().size());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithLongerCigar)
+{
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(longerCigar);
+
+    EXPECT_EQ(longerCigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithShorterCigar)
+{
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.CigarData(longerCigar);
+    bam.Tags(tags);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithEmptyCigar)
+{
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(empty);
+
+    EXPECT_EQ(empty, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithLongerTags)
+{
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithShorterTags)
+{
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithEmptyTags)
+{
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(0,     bam.Tags().size());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_Empty)
+{
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(std::string(), std::string());
+    EXPECT_EQ(0, bam.Sequence().size());
+    EXPECT_EQ(0, bam.Qualities().Fastq().size());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_NormalQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_EmptyQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_Preencoded) {
+
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    const size_t encodedLength = static_cast<size_t>((sequence.size()+1)/2);
+    char* encoded = (char*)::calloc(encodedLength, sizeof(char));
+    char* e = encoded;
+
+    uint8_t nucleotideCode;
+    bool useHighWord = true;
+    for (size_t i = 0; i < sequence.size(); ++i) {
+        switch (sequence.at(i)) {
+            case 'A' : nucleotideCode = 1;  break;
+            case 'C' : nucleotideCode = 2;  break;
+            case 'G' : nucleotideCode = 4;  break;
+            case 'T' : nucleotideCode = 8;  break;
+            default:
+                EXPECT_FALSE(true);
+                break;
+        }
+
+        // pack the nucleotide code
+        if (useHighWord) {
+            *e = nucleotideCode << 4;
+            useHighWord = false;
+        } else {
+            *e |= nucleotideCode;
+            ++e;
+            useHighWord = true;
+        }
+    }
+
+    BamRecordImpl bam;
+    bam.SetPreencodedSequenceAndQualities(encoded, sequence.size(), qualities.c_str());
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+
+    if (encoded)
+        free(encoded);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_Preencoded_EmptyQual) {
+
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "";
+
+    const size_t encodedLength = static_cast<size_t>((sequence.size()+1)/2);
+    char* encoded = (char*)::calloc(encodedLength, sizeof(char));
+    char* e = encoded;
+
+    uint8_t nucleotideCode;
+    bool useHighWord = true;
+    for (size_t i = 0; i < sequence.size(); ++i) {
+        switch (sequence.at(i)) {
+            case 'A' : nucleotideCode = 1;  break;
+            case 'C' : nucleotideCode = 2;  break;
+            case 'G' : nucleotideCode = 4;  break;
+            case 'T' : nucleotideCode = 8;  break;
+            default:
+                EXPECT_FALSE(true);
+                break;
+        }
+
+        // pack the nucleotide code
+        if (useHighWord) {
+            *e = nucleotideCode << 4;
+            useHighWord = false;
+        } else {
+            *e |= nucleotideCode;
+            ++e;
+            useHighWord = true;
+        }
+    }
+
+    BamRecordImpl bam;
+    bam.SetPreencodedSequenceAndQualities(encoded, sequence.size(), qualities.c_str());
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+
+    if (encoded)
+        free(encoded);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(shortSeq,  bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(shortSeq,  bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithEmptySeq)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string empty     = "";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.SetSequenceAndQualities(empty, empty);
+
+    EXPECT_EQ(empty, bam.Sequence());
+    EXPECT_EQ(empty, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_Init_Normal)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_Init_EmptySeqQual)
+{
+    const std::string sequence  = "";
+    const std::string qualities = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_Init_EmptyQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_Init_EmptyTag)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(0,         bam.Tags().size());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(shortSeq,  bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(shortSeq,  bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithEmptySeq)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string empty     = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(empty, empty);
+
+    EXPECT_EQ(empty, bam.Sequence());
+    EXPECT_EQ(empty, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithLongerTags)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithShorterTags)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithEmptyTags)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(0,         bam.Tags().size());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_Init_Normal)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_Init_EmptySeqQual)
+{
+    const std::string sequence  = "";
+    const std::string qualities = "";
+    const std::string cigar     = "100=";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_Init_EmptyQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string cigar     = "100=";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_Init_EmptyCigar)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string cigar     = "100=";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    EXPECT_EQ(shortSeq,  bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    EXPECT_EQ(shortSeq,  bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithEmptySeq)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string empty     = "";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(empty, empty);
+
+    EXPECT_EQ(empty, bam.Sequence());
+    EXPECT_EQ(empty, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithLongerCigar)
+{
+    const std::string sequence    = "ACGTACGTACGT";
+    const std::string qualities   = "?]?]?]?]?]?]";
+    const std::string cigar       = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.CigarData(longerCigar);
+
+    EXPECT_EQ(sequence,    bam.Sequence());
+    EXPECT_EQ(qualities,   bam.Qualities().Fastq());
+    EXPECT_EQ(longerCigar, bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithShorterCigar)
+{
+    const std::string sequence    = "ACGTACGTACGT";
+    const std::string qualities   = "?]?]?]?]?]?]";
+    const std::string cigar       = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(longerCigar);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithEmptyCigar)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string empty     = "";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.CigarData(empty);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(empty,     bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_Normal)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_EmptySeqQual)
+{
+    const std::string sequence  = "";
+    const std::string qualities = "";
+    const std::string cigar     = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_EmptyQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string cigar     = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_EmptyCigar)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_EmptyTag)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    EXPECT_EQ(0,         bam.Tags().size());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string cigar     = "100=";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(shortSeq,  bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(shortSeq,  bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithEmptySeq)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string empty     = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(empty, empty);
+
+    EXPECT_EQ(empty, bam.Sequence());
+    EXPECT_EQ(empty, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithLongerCigar)
+{
+    const std::string sequence    = "ACGTACGTACGT";
+    const std::string qualities   = "?]?]?]?]?]?]";
+    const std::string cigar       = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(longerCigar);
+
+    EXPECT_EQ(sequence,    bam.Sequence());
+    EXPECT_EQ(qualities,   bam.Qualities().Fastq());
+    EXPECT_EQ(longerCigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithShorterCigar)
+{
+    const std::string sequence    = "ACGTACGTACGT";
+    const std::string qualities   = "?]?]?]?]?]?]";
+    const std::string cigar       = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(longerCigar);
+    bam.Tags(tags);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithEmptyCigar)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string empty     = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(empty);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(empty,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithLongerTags)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithShorterTags)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithEmptyTags)
+{
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    EXPECT_EQ(0,         bam.Tags().size());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameOnly_InitEmpty)
+{
+    BamRecordImpl bam;
+    bam.Name(std::string());
+    EXPECT_EQ(0, bam.Name().size());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameOnly_InitNormal)
+{
+    const std::string readName = "foo";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+
+    EXPECT_EQ(readName, bam.Name());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameOnly_ThenOverwriteWithLongerName)
+{
+    const std::string readName   = "foo";
+    const std::string longerName = "this is a long read name";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Name(longerName);
+
+    EXPECT_EQ(longerName, bam.Name());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameOnly_ThenOverwriteWithShorterName)
+{
+    const std::string readName   = "foo";
+    const std::string longerName = "this is a long read name";
+
+    BamRecordImpl bam;
+    bam.Name(longerName);
+    bam.Name(readName);
+
+    EXPECT_EQ(readName, bam.Name());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameOnly_ThenOverwriteWithEmptyName)
+{
+    const std::string readName  = "foo";
+    const std::string emptyName = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Name(emptyName);
+
+    EXPECT_EQ(emptyName, bam.Name());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_Init_Normal)
+{
+    const std::string readName = "foo";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_Init_EmptyName)
+{
+    const std::string readName = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_Init_EmptyTag)
+{
+    const std::string readName = "foo";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(0,        bam.Tags().size());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithLongerName)
+{
+    const std::string readName   = "foo";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Tags(tags);
+    bam.Name(longerName);
+
+    EXPECT_EQ(longerName, bam.Name());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithShorterName)
+{
+    const std::string readName   = "foo";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(longerName);
+    bam.Tags(tags);
+    bam.Name(readName);
+
+    EXPECT_EQ(readName, bam.Name());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithEmptyName)
+{
+    const std::string readName = "foo";
+    const std::string empty    = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Tags(tags);
+    bam.Name(empty);
+
+    EXPECT_EQ(empty, bam.Name());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithLongerTags)
+{
+    const std::string readName = "foo";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    EXPECT_EQ(readName, bam.Name());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithShorterTags)
+{
+    const std::string readName = "foo";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithEmptyTags)
+{
+    const std::string readName = "foo";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(0,        bam.Tags().size());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_Init_Normal)
+{
+    const std::string readName = "foo";
+    const std::string cigar   = "100=";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar,    bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_Init_EmptyName)
+{
+    const std::string readName = "";
+    const std::string cigar    = "100=";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar,    bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_Init_EmptyCigar)
+{
+    const std::string readName = "foo";
+    const std::string cigar    = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar,    bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithLongerName)
+{
+    const std::string readName   = "foo";
+    const std::string cigar      = "100=";
+    const std::string longerName = "this is a long read name";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Name(longerName);
+
+    EXPECT_EQ(longerName, bam.Name());
+    EXPECT_EQ(cigar,      bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithShorterName)
+{
+    const std::string readName   = "foo";
+    const std::string cigar      = "100=";
+    const std::string longerName = "this is a long read name";
+
+    BamRecordImpl bam;
+    bam.Name(longerName);
+    bam.CigarData(cigar);
+    bam.Name(readName);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar,    bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithEmptyName)
+{
+    const std::string readName = "foo";
+    const std::string cigar    = "100=";
+    const std::string empty    = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Name(empty);
+
+    EXPECT_EQ(empty, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithLongerCigar)
+{
+    const std::string readName    = "foo";
+    const std::string cigar       = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.CigarData(longerCigar);
+
+    EXPECT_EQ(readName,    bam.Name());
+    EXPECT_EQ(longerCigar, bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithShorterCigar)
+{
+    const std::string readName    = "foo";
+    const std::string cigar       = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(longerCigar);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar,    bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithEmptyCigar)
+{
+    const std::string readName = "foo";
+    const std::string cigar    = "100=";
+    const std::string empty    = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.CigarData(empty);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(empty,    bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_Init_Normal)
+{
+    const std::string readName = "foo";
+    const std::string cigar    = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar,    bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_Init_EmptyName)
+{
+    const std::string readName = "";
+    const std::string cigar    = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar,    bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_Init_EmptyCigar)
+{
+    const std::string readName = "foo";
+    const std::string cigar    = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar,    bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_Init_EmptyTag)
+{
+    const std::string readName = "foo";
+    const std::string cigar    = "100=";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar,    bam.CigarData().ToStdString());
+    EXPECT_EQ(0,        bam.Tags().size());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithLongerName)
+{
+    const std::string readName   = "foo";
+    const std::string cigar      = "100=";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Name(longerName);
+
+    EXPECT_EQ(longerName, bam.Name());
+    EXPECT_EQ(cigar,      bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithShorterName)
+{
+    const std::string readName   = "foo";
+    const std::string cigar      = "100=";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(longerName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Name(readName);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar,    bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithEmptyName)
+{
+    const std::string readName = "foo";
+    const std::string cigar    = "100=";
+    const std::string empty    = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Name(empty);
+
+    EXPECT_EQ(empty, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithLongerCigar)
+{
+    const std::string readName    = "foo";
+    const std::string cigar       = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(longerCigar);
+
+    EXPECT_EQ(readName,    bam.Name());
+    EXPECT_EQ(longerCigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithShorterCigar)
+{
+    const std::string readName    = "foo";
+    const std::string cigar       = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(longerCigar);
+    bam.Tags(tags);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar,    bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithEmptyCigar)
+{
+    const std::string readName = "foo";
+    const std::string cigar    = "100=";
+    const std::string empty    = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(empty);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(empty,    bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithLongerTags)
+{
+    const std::string readName = "foo";
+    const std::string cigar    = "100=";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar,    bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithShorterTags)
+{
+    const std::string readName = "foo";
+    const std::string cigar    = "100=";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar,    bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithEmptyTags)
+{
+    const std::string readName = "foo";
+    const std::string cigar    = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar,    bam.CigarData().ToStdString());
+    EXPECT_EQ(0,        bam.Tags().size());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_Init_Normal)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_Init_EmptySeqQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "";
+    const std::string qualities = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_Init_EmptyQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithLongerName)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string longerName = "this is a long read name";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Name(longerName);
+
+    EXPECT_EQ(longerName, bam.Name());
+    EXPECT_EQ(sequence,   bam.Sequence());
+    EXPECT_EQ(qualities,  bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithShorterName)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string longerName = "this is a long read name";
+
+    BamRecordImpl bam;
+    bam.Name(longerName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Name(readName);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithEmptyName)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string empty     = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Name(empty);
+
+    EXPECT_EQ(empty,     bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(shortSeq,  bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(shortSeq,  bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithEmptySeq)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string empty     = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.SetSequenceAndQualities(empty, empty);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(empty,    bam.Sequence());
+    EXPECT_EQ(empty,    bam.Qualities().Fastq());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_Normal)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_EmptyName)
+{
+    const std::string readName  = "";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_EmptySeqQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "";
+    const std::string qualities = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_EmptyQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_EmptyTag)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(0,         bam.Tags().size());
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithLongerName)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Name(longerName);
+
+    EXPECT_EQ(longerName, bam.Name());
+    EXPECT_EQ(sequence,   bam.Sequence());
+    EXPECT_EQ(qualities,  bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithShorterName)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(longerName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Name(readName);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithEmptyName)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string empty     = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Name(empty);
+
+    EXPECT_EQ(empty,     bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(shortSeq,  bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(shortSeq,  bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithEmptySeq)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string empty     = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(empty, empty);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(empty,    bam.Sequence());
+    EXPECT_EQ(empty,    bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithLongerTags)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithShorterTags)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithEmptyTags)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(0,         bam.Tags().size());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_Normal)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_EmptyName)
+{
+    const std::string readName  = "";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_EmptySeqQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "";
+    const std::string qualities = "";
+    const std::string cigar     = "100=";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_EmptyQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string cigar     = "100=";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_EmptyCigar)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithLongerName)
+{
+    const std::string readName   = "foo";
+    const std::string sequence   = "ACGTACGTACGT";
+    const std::string qualities  = "?]?]?]?]?]?]";
+    const std::string cigar      = "100=";
+    const std::string longerName = "this is a long read name";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Name(longerName);
+
+    EXPECT_EQ(longerName,  bam.Name());
+    EXPECT_EQ(sequence,    bam.Sequence());
+    EXPECT_EQ(qualities,   bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,       bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithShorterName)
+{
+    const std::string readName   = "foo";
+    const std::string sequence   = "ACGTACGTACGT";
+    const std::string qualities  = "?]?]?]?]?]?]";
+    const std::string cigar      = "100=";
+    const std::string longerName = "this is a long read name";
+
+    BamRecordImpl bam;
+    bam.Name(longerName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Name(readName);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,    bam.Sequence());
+    EXPECT_EQ(qualities,   bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,       bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithEmptyName)
+{
+    const std::string readName   = "foo";
+    const std::string sequence   = "ACGTACGTACGT";
+    const std::string qualities  = "?]?]?]?]?]?]";
+    const std::string cigar      = "100=";
+    const std::string empty      = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Name(empty);
+
+    EXPECT_EQ(empty,       bam.Name());
+    EXPECT_EQ(sequence,    bam.Sequence());
+    EXPECT_EQ(qualities,   bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,       bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string readName   = "foo";
+    const std::string sequence   = "ACGTACGTACGT";
+    const std::string qualities  = "?]?]?]?]?]?]";
+    const std::string cigar      = "100=";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    const std::string readName   = "foo";
+    const std::string sequence   = "ACGTACGTACGT";
+    const std::string qualities  = "";
+    const std::string cigar      = "100=";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string readName   = "foo";
+    const std::string sequence   = "ACGTACGTACGT";
+    const std::string qualities  = "?]?]?]?]?]?]";
+    const std::string cigar      = "100=";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(shortSeq,  bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string readName   = "foo";
+    const std::string sequence   = "ACGTACGTACGT";
+    const std::string qualities  = "?]?]?]?]?]?]";
+    const std::string cigar      = "100=";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(shortSeq,  bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithEmptySeq)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string empty     = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(empty, empty);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(empty,    bam.Sequence());
+    EXPECT_EQ(empty,    bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,    bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithLongerCigar)
+{
+    const std::string readName    = "foo";
+    const std::string sequence    = "ACGTACGTACGT";
+    const std::string qualities   = "?]?]?]?]?]?]";
+    const std::string cigar       = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.CigarData(longerCigar);
+
+    EXPECT_EQ(readName,    bam.Name());
+    EXPECT_EQ(sequence,    bam.Sequence());
+    EXPECT_EQ(qualities,   bam.Qualities().Fastq());
+    EXPECT_EQ(longerCigar, bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithShorterCigar)
+{
+    const std::string readName    = "foo";
+    const std::string sequence    = "ACGTACGTACGT";
+    const std::string qualities   = "?]?]?]?]?]?]";
+    const std::string cigar       = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(longerCigar);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithEmptyCigar)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string empty     = "";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.CigarData(empty);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(empty,     bam.CigarData().ToStdString());
+    tests::CheckRawData(bam);
+}
+
+// @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_Normal)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptyName)
+{
+    const std::string readName  = "";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptySeqQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "";
+    const std::string qualities = "";
+    const std::string cigar     = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptyQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string cigar     = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptyCigar)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptyTag)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    EXPECT_EQ(0,         bam.Tags().size());
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerName)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Name(longerName);
+
+    EXPECT_EQ(longerName, bam.Name());
+    EXPECT_EQ(sequence,   bam.Sequence());
+    EXPECT_EQ(qualities,  bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,      bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterName)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(longerName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Name(readName);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithEmptyName)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string empty     = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Name(empty);
+
+    EXPECT_EQ(empty,     bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string cigar     = "100=";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(shortSeq,  bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string shortSeq  = "ACGT";
+    const std::string shortQual = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(shortSeq,  bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithEmptySeq)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string empty     = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(empty, empty);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(empty,    bam.Sequence());
+    EXPECT_EQ(empty,    bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,    bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerCigar)
+{
+    const std::string readName    = "foo";
+    const std::string sequence    = "ACGTACGTACGT";
+    const std::string qualities   = "?]?]?]?]?]?]";
+    const std::string cigar       = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(longerCigar);
+
+    EXPECT_EQ(readName,    bam.Name());
+    EXPECT_EQ(sequence,    bam.Sequence());
+    EXPECT_EQ(qualities,   bam.Qualities().Fastq());
+    EXPECT_EQ(longerCigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterCigar)
+{
+    const std::string readName    = "foo";
+    const std::string sequence    = "ACGTACGTACGT";
+    const std::string qualities   = "?]?]?]?]?]?]";
+    const std::string cigar       = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(longerCigar);
+    bam.Tags(tags);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithEmptyCigar)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+    const std::string empty     = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(empty);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(empty,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerTags)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterTags)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = (int32_t)-42;
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    tests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithEmptyTags)
+{
+    const std::string readName  = "foo";
+    const std::string sequence  = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar     = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = (int32_t)-42;
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName,  bam.Name());
+    EXPECT_EQ(sequence,  bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar,     bam.CigarData().ToStdString());
+    EXPECT_EQ(0,         bam.Tags().size());
+    tests::CheckRawData(bam);
+}
diff --git a/tests/src/test_BamRecordMapping.cpp b/tests/src/test_BamRecordMapping.cpp

new file mode 100644 (file)

index 0000000..d1a4af9
--- /dev/null
+++ b/tests/src/test_BamRecordMapping.cpp
@@ -0,0 +1,745 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/BamRecordView.h>
+#include <pbbam/BamTagCodec.h>
+#include <chrono>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+typedef vector<uint16_t> f_data;
+
+namespace tests {
+
+static
+BamRecord MakeRecord(const Position qStart,
+                     const Position qEnd,
+                     const string& seq,
+                     const string& quals,
+                     const string& tagBases,
+                     const string& tagQuals,
+                     const f_data& frames)
+{
+    BamRecordImpl impl;
+    impl.SetSequenceAndQualities(seq, quals);
+
+    TagCollection tags;
+    tags["qs"] = qStart;
+    tags["qe"] = qEnd;
+    tags["ip"] = frames;
+    tags["pw"] = frames;
+    tags["dt"] = tagBases;
+    tags["st"] = tagBases;
+    tags["dq"] = tagQuals;
+    tags["iq"] = tagQuals;
+    tags["mq"] = tagQuals;
+    tags["sq"] = tagQuals;
+    tags["pq"] = tagQuals;
+    tags["pv"] = tagQuals;
+    impl.Tags(tags);
+
+    return BamRecord(std::move(impl));
+}
+
+} // namespace tests
+
+TEST(BamRecordMappingTest, BasicMap)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const string seq      = "AACCGTTAGC";
+    const string quals    = "?]?]?]?]?*";
+    const string tagBases = "AACCGTTAGC";
+    const string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const uint8_t mapQual = 80;
+
+    const string seq_rev   = "GCTAACGGTT";
+    const string quals_rev = "*?]?]?]?]?";
+    const string tagBases_rev = seq_rev;
+    const string tagQuals_rev = quals_rev;
+    const f_data frames_rev = { 20, 30, 10, 40, 40, 30, 20, 20, 10, 10 };
+
+    const string s1_cigar = "10=";
+    const string s2_cigar = "5=3D5=";
+    const string s3_cigar = "4=1D2I2D4=";
+
+    BamRecord s1 = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s2 = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s3 = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s1_rev = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s2_rev = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s3_rev = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+
+    s1.Map(0, 100, Strand::FORWARD, s1_cigar, mapQual);
+    s2.Map(0, 100, Strand::FORWARD, s2_cigar, mapQual);
+    s3.Map(0, 100, Strand::FORWARD, s3_cigar, mapQual);
+    s1_rev.Map(0, 100, Strand::REVERSE, s1_cigar, mapQual);
+    s2_rev.Map(0, 100, Strand::REVERSE, s2_cigar, mapQual);
+    s3_rev.Map(0, 100, Strand::REVERSE, s3_cigar, mapQual);
+
+    {   // s1 - FORWARD
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(0, s1.ReferenceId());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(mapQual, s1.MapQuality());
+
+        EXPECT_EQ(qStart, s1.QueryStart());
+        EXPECT_EQ(qEnd,   s1.QueryEnd());
+        EXPECT_EQ(500, s1.AlignedStart());
+        EXPECT_EQ(510, s1.AlignedEnd());         // 500 + 10=
+        EXPECT_EQ(100, s1.ReferenceStart());
+        EXPECT_EQ(110, s1.ReferenceEnd());       // 100 + 10=
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq,      view.Sequence());
+        EXPECT_EQ(quals,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases, view.DeletionTags());
+        EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   view.IPD().Data());
+    }
+
+    {   // s1 - REVERSE
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(0, s1_rev.ReferenceId());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(mapQual, s1_rev.MapQuality());
+
+        EXPECT_EQ(qStart, s1_rev.QueryStart());
+        EXPECT_EQ(qEnd,   s1_rev.QueryEnd());
+        EXPECT_EQ(500, s1_rev.AlignedStart());
+        EXPECT_EQ(510, s1_rev.AlignedEnd());         // 500 + 10=
+        EXPECT_EQ(100, s1_rev.ReferenceStart());
+        EXPECT_EQ(110, s1_rev.ReferenceEnd());       // 100 + 10=
+
+        // native
+        const BamRecordView nativeView
+        {
+            s1_rev,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq,      nativeView.Sequence());
+        EXPECT_EQ(quals,    nativeView.Qualities().Fastq());
+        EXPECT_EQ(tagBases, nativeView.DeletionTags());
+        EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   nativeView.IPD().Data());
+
+        // - genomic
+        const BamRecordView genomicView
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq_rev,      genomicView.Sequence());
+        EXPECT_EQ(quals_rev,    genomicView.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev, genomicView.DeletionTags());
+        EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev,   genomicView.IPD().Data());
+    }
+
+    {   // s2 - FORWARD
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(0, s2.ReferenceId());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(mapQual, s2.MapQuality());
+
+        EXPECT_EQ(qStart, s2.QueryStart());
+        EXPECT_EQ(qEnd,   s2.QueryEnd());
+        EXPECT_EQ(500, s2.AlignedStart());
+        EXPECT_EQ(510, s2.AlignedEnd());         // 500 + 10=
+        EXPECT_EQ(100, s2.ReferenceStart());
+        EXPECT_EQ(113, s2.ReferenceEnd());      // 100 + 10= + 3D
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq,      view.Sequence());
+        EXPECT_EQ(quals,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases, view.DeletionTags());
+        EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   view.IPD().Data());
+    }
+
+    {   // s2 - REVERSE
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(0, s2_rev.ReferenceId());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(mapQual, s2_rev.MapQuality());
+
+        EXPECT_EQ(qStart, s2_rev.QueryStart());
+        EXPECT_EQ(qEnd,   s2_rev.QueryEnd());
+        EXPECT_EQ(500, s2_rev.AlignedStart());
+        EXPECT_EQ(510, s2_rev.AlignedEnd());         // 500 + 10=
+        EXPECT_EQ(100, s2_rev.ReferenceStart());
+        EXPECT_EQ(113, s2_rev.ReferenceEnd());      // 100 + 10= + 3D
+
+        // - native
+        const BamRecordView nativeView
+        {
+            s2_rev,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq,      nativeView.Sequence());
+        EXPECT_EQ(quals,    nativeView.Qualities().Fastq());
+        EXPECT_EQ(tagBases, nativeView.DeletionTags());
+        EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   nativeView.IPD().Data());
+
+        // - genomic
+        const BamRecordView genomicView
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq_rev,      genomicView.Sequence());
+        EXPECT_EQ(quals_rev,    genomicView.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev, genomicView.DeletionTags());
+        EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev,   genomicView.IPD().Data());
+    }
+
+    {   // s3 - FORWARD
+
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(0, s3.ReferenceId());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(mapQual, s3.MapQuality());
+
+        EXPECT_EQ(qStart, s3.QueryStart());
+        EXPECT_EQ(qEnd,   s3.QueryEnd());
+        EXPECT_EQ(500, s3.AlignedStart());
+        EXPECT_EQ(510, s3.AlignedEnd());         // 500 + 8= + 2I
+        EXPECT_EQ(100, s3.ReferenceStart());
+        EXPECT_EQ(111, s3.ReferenceEnd());      // 100 + 8= + 3D
+
+        const BamRecordView view
+        {
+            s3,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq,      view.Sequence());
+        EXPECT_EQ(quals,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases, view.DeletionTags());
+        EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   view.IPD().Data());
+    }
+
+    {   // s3 - REVERSE
+
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(0, s3_rev.ReferenceId());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(mapQual, s3_rev.MapQuality());
+
+        EXPECT_EQ(qStart, s3_rev.QueryStart());
+        EXPECT_EQ(qEnd,   s3_rev.QueryEnd());
+        EXPECT_EQ(500, s3_rev.AlignedStart());
+        EXPECT_EQ(510, s3_rev.AlignedEnd());         // 500 + 8= + 2I
+        EXPECT_EQ(100, s3_rev.ReferenceStart());
+        EXPECT_EQ(111, s3_rev.ReferenceEnd());      // 100 + 8= + 3D
+
+        // - native
+        const BamRecordView nativeView
+        {
+            s3_rev,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq,      nativeView.Sequence());
+        EXPECT_EQ(quals,    nativeView.Qualities().Fastq());
+        EXPECT_EQ(tagBases, nativeView.DeletionTags());
+        EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   nativeView.IPD().Data());
+
+        // - genomic
+        const BamRecordView genomicView
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq_rev,      genomicView.Sequence());
+        EXPECT_EQ(quals_rev,    genomicView.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev, genomicView.DeletionTags());
+        EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev,   genomicView.IPD().Data());
+    }
+}
+
+TEST(BamRecordMappingTest, SoftClipMapping)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 515;
+    const string seq      = "TTAACCGTTAGCAAA";
+    const string quals    = "--?]?]?]?]?*+++";
+    const string tagBases = "TTAACCGTTAGCAAA";
+    const string tagQuals = "--?]?]?]?]?*+++";
+    const f_data frames   = { 40, 40, 10, 10, 20, 20, 30, 40, 40, 10, 30, 20, 10, 10, 10 };
+    const uint8_t mapQual = 80;
+
+    const string clipped_seq   = "AACCGTTAGC";
+    const string clipped_quals = "?]?]?]?]?*";
+    const string clipped_tagBases   = "AACCGTTAGC";
+    const string clipped_tagQuals = "?]?]?]?]?*";
+    const f_data clipped_frames = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    const string seq_rev   = "TTTGCTAACGGTTAA";
+    const string quals_rev = "+++*?]?]?]?]?--";
+    const string tagBases_rev = seq_rev;
+    const string tagQuals_rev = quals_rev;
+    const f_data frames_rev = { 10, 10, 10, 20, 30, 10, 40, 40, 30, 20, 20, 10, 10, 40, 40 };
+
+    const string clipped_seq_rev   = "GCTAACGGTT";
+    const string clipped_quals_rev = "*?]?]?]?]?";
+    const string clipped_tagBases_rev = clipped_seq_rev;
+    const string clipped_tagQuals_rev = clipped_quals_rev;
+    const f_data clipped_frames_rev = { 20, 30, 10, 40, 40, 30, 20, 20, 10, 10 };
+
+    const string s1_cigar = "2S10=3S";
+    const string s2_cigar = "2S5=3D5=3S";
+    const string s3_cigar = "2S4=1D2I2D4=3S";
+
+    BamRecord s1 = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s2 = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s3 = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s1_rev = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s2_rev = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s3_rev = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+
+    s1.Map(0, 100, Strand::FORWARD, s1_cigar, mapQual);
+    s2.Map(0, 100, Strand::FORWARD, s2_cigar, mapQual);
+    s3.Map(0, 100, Strand::FORWARD, s3_cigar, mapQual);
+    s1_rev.Map(0, 100, Strand::REVERSE, s1_cigar, mapQual);
+    s2_rev.Map(0, 100, Strand::REVERSE, s2_cigar, mapQual);
+    s3_rev.Map(0, 100, Strand::REVERSE, s3_cigar, mapQual);
+
+    {   // s1 - FORWARD
+
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(0, s1.ReferenceId());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(mapQual, s1.MapQuality());
+
+        EXPECT_EQ(qStart, s1.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s1.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(502, s1.AlignedStart());       // QStart + 2S
+        EXPECT_EQ(512, s1.AlignedEnd());         // AStart + 10=
+        EXPECT_EQ(100, s1.ReferenceStart());     // 100
+        EXPECT_EQ(110, s1.ReferenceEnd());       // RefStart + 10=
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq,      view.Sequence());
+        EXPECT_EQ(quals,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases, view.DeletionTags());
+        EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   view.IPD().Data());
+    }
+
+    {   // s1 - REVERSE
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(0, s1_rev.ReferenceId());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(mapQual, s1_rev.MapQuality());
+
+        EXPECT_EQ(qStart, s1_rev.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s1_rev.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(503, s1_rev.AlignedStart());       // QStart + 3S
+        EXPECT_EQ(513, s1_rev.AlignedEnd());         // AStart + 10=
+        EXPECT_EQ(100, s1_rev.ReferenceStart());     // 100
+        EXPECT_EQ(110, s1_rev.ReferenceEnd());       // RefStart + 10=
+
+        // - native
+        const BamRecordView nativeView
+        {
+            s1_rev,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq,      nativeView.Sequence());
+        EXPECT_EQ(quals,    nativeView.Qualities().Fastq());
+        EXPECT_EQ(tagBases, nativeView.DeletionTags());
+        EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   nativeView.IPD().Data());
+
+        // - genomic
+        const BamRecordView genomicView
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq_rev,      genomicView.Sequence());
+        EXPECT_EQ(quals_rev,    genomicView.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev, genomicView.DeletionTags());
+        EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev,   genomicView.IPD().Data());
+    }
+
+    {   // s2 - FORWARD
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(0, s2.ReferenceId());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(mapQual, s2.MapQuality());
+
+        EXPECT_EQ(qStart, s2.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s2.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(502, s2.AlignedStart());       // QStart + 2S
+        EXPECT_EQ(512, s2.AlignedEnd());         // AStart + 10=
+        EXPECT_EQ(100, s2.ReferenceStart());     // 100
+        EXPECT_EQ(113, s2.ReferenceEnd());       // RefStart + 10= + 3D
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq,      view.Sequence());
+        EXPECT_EQ(quals,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases, view.DeletionTags());
+        EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   view.IPD().Data());
+    }
+
+    {   // s2 - REVERSE
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(0, s2_rev.ReferenceId());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(mapQual, s2_rev.MapQuality());
+
+        EXPECT_EQ(qStart, s2_rev.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s2_rev.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(503, s2_rev.AlignedStart());       // QStart + 3S
+        EXPECT_EQ(513, s2_rev.AlignedEnd());         // AStart + 10=
+        EXPECT_EQ(100, s2_rev.ReferenceStart());     // 100
+        EXPECT_EQ(113, s2_rev.ReferenceEnd());       // RefStart + 10= + 3D
+
+        // - native
+        const BamRecordView nativeView
+        {
+            s2_rev,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq,      nativeView.Sequence());
+        EXPECT_EQ(quals,    nativeView.Qualities().Fastq());
+        EXPECT_EQ(tagBases, nativeView.DeletionTags());
+        EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   nativeView.IPD().Data());
+
+        // - genomic
+        const BamRecordView genomicView
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq_rev,      genomicView.Sequence());
+        EXPECT_EQ(quals_rev,    genomicView.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev, genomicView.DeletionTags());
+        EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev,   genomicView.IPD().Data());
+    }
+
+    {   // s3 - FORWARD
+
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(0, s3.ReferenceId());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(mapQual, s3.MapQuality());
+
+        EXPECT_EQ(qStart, s3.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s3.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(502, s3.AlignedStart());       // QStart + 2S
+        EXPECT_EQ(512, s3.AlignedEnd());         // AStart + 8= + 2I
+        EXPECT_EQ(100, s3.ReferenceStart());     // 100
+        EXPECT_EQ(111, s3.ReferenceEnd());       // RefStart + 8= + 3D
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq,      view.Sequence());
+        EXPECT_EQ(quals,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases, view.DeletionTags());
+        EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   view.IPD().Data());
+    }
+
+    {   // s3 - REVERSE
+
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(0, s3_rev.ReferenceId());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(mapQual, s3_rev.MapQuality());
+
+        EXPECT_EQ(qStart, s3_rev.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s3_rev.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(503, s3_rev.AlignedStart());       // QStart + 3S
+        EXPECT_EQ(513, s3_rev.AlignedEnd());         // AStart + 8= + 2I
+        EXPECT_EQ(100, s3_rev.ReferenceStart());     // 100
+        EXPECT_EQ(111, s3_rev.ReferenceEnd());       // RefStart + 8= + 3D
+
+        // - native
+        const BamRecordView nativeView
+        {
+            s3_rev,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq,      nativeView.Sequence());
+        EXPECT_EQ(quals,    nativeView.Qualities().Fastq());
+        EXPECT_EQ(tagBases, nativeView.DeletionTags());
+        EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   nativeView.IPD().Data());
+
+        // - genomic
+        const BamRecordView genomicView
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq_rev,      genomicView.Sequence());
+        EXPECT_EQ(quals_rev,    genomicView.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev, genomicView.DeletionTags());
+        EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev,   genomicView.IPD().Data());
+    }
+}
+
+TEST(BamRecordMappingTest, MappedCopy)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const string seq      = "AACCGTTAGC";
+    const string quals    = "?]?]?]?]?*";
+    const string tagBases = "AACCGTTAGC";
+    const string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const uint8_t mapQual = 80;
+    const string cigar    = "4=1D2I2D4=";
+
+    const BamRecord orig = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    const BamRecord mapped = orig.Mapped(0, 100, Strand::FORWARD, cigar, mapQual);
+
+    EXPECT_TRUE(mapped.IsMapped());
+    EXPECT_EQ(0, mapped.ReferenceId());
+    EXPECT_EQ(Strand::FORWARD, mapped.AlignedStrand());
+    EXPECT_EQ(mapQual, mapped.MapQuality());
+
+    EXPECT_EQ(500, mapped.QueryStart());      // 500
+    EXPECT_EQ(510, mapped.QueryEnd());        // QStart + seqLength
+    EXPECT_EQ(500, mapped.AlignedStart());    // QStart
+    EXPECT_EQ(510, mapped.AlignedEnd());      // QStart + 8= + 2I
+    EXPECT_EQ(100, mapped.ReferenceStart());  // 100
+    EXPECT_EQ(111, mapped.ReferenceEnd());    // RefStart + 8= + 3D
+
+    const BamRecordView view
+    {
+        mapped,
+        Orientation::NATIVE,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(seq,      view.Sequence());
+    EXPECT_EQ(quals,    view.Qualities().Fastq());
+    EXPECT_EQ(tagBases, view.DeletionTags());
+    EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+    EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+    EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(frames,   view.IPD().Data());
+}
+
+TEST(BamRecordMappingTest, StaticMapped)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const string seq      = "AACCGTTAGC";
+    const string quals    = "?]?]?]?]?*";
+    const string tagBases = "AACCGTTAGC";
+    const string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const uint8_t mapQual = 80;
+    const string cigar    = "4=1D2I2D4=";
+
+    const BamRecord orig = tests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    const BamRecord mapped = BamRecord::Mapped(orig, 0, 100, Strand::FORWARD, cigar, mapQual);
+
+    EXPECT_TRUE(mapped.IsMapped());
+    EXPECT_EQ(0, mapped.ReferenceId());
+    EXPECT_EQ(Strand::FORWARD, mapped.AlignedStrand());
+    EXPECT_EQ(mapQual, mapped.MapQuality());
+
+    EXPECT_EQ(500, mapped.QueryStart());      // 500
+    EXPECT_EQ(510, mapped.QueryEnd());        // QStart + seqLength
+    EXPECT_EQ(500, mapped.AlignedStart());    // QStart
+    EXPECT_EQ(510, mapped.AlignedEnd());      // QStart + 8= + 2I
+    EXPECT_EQ(100, mapped.ReferenceStart());  // 100
+    EXPECT_EQ(111, mapped.ReferenceEnd());    // RefStart + 8= + 3D
+
+    const BamRecordView view
+    {
+        mapped,
+        Orientation::NATIVE,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(seq,      view.Sequence());
+    EXPECT_EQ(quals,    view.Qualities().Fastq());
+    EXPECT_EQ(tagBases, view.DeletionTags());
+    EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+    EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+    EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(frames,   view.IPD().Data());
+}
diff --git a/tests/src/test_BamWriter.cpp b/tests/src/test_BamWriter.cpp

new file mode 100644 (file)

index 0000000..75dffe5
--- /dev/null
+++ b/tests/src/test_BamWriter.cpp
@@ -0,0 +1,127 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <pbbam/BamHeader.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/BamWriter.h>
+#include <pbbam/EntireFileQuery.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+TEST(BamWriterTest, SingleWrite_UserRecord)
+{
+    const string fullName = "test/100/0_5";
+    const string rgId     = "6002b307";
+    const vector<float> expectedSnr = {0.2,0.2,0.2,0.2};
+
+    // setup header
+    const string hdrText = {
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:6002b307\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;"
+             "SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\t"
+             "PU:test\tPM:SEQUEL\n"
+    };
+    BamHeader inputHeader(hdrText);
+
+    // setup record
+    BamRecord bamRecord(inputHeader);
+    bamRecord.impl_.Name(fullName);
+    bamRecord.impl_.SetSequenceAndQualities("ACGTC", 5);
+    bamRecord.impl_.CigarData("");
+    bamRecord.impl_.Bin(0);
+    bamRecord.impl_.Flag(0);
+    bamRecord.impl_.InsertSize(0);
+    bamRecord.impl_.MapQuality(0);
+    bamRecord.impl_.MatePosition(-1);
+    bamRecord.impl_.MateReferenceId(-1);
+    bamRecord.impl_.Position(-1);
+    bamRecord.impl_.ReferenceId(-1);
+    bamRecord.impl_.SetMapped(false);
+
+    TagCollection tags;
+    tags["zm"] = static_cast<int32_t>(100);
+    tags["qs"] = static_cast<Position>(0);
+    tags["qe"] = static_cast<Position>(5);
+    tags["np"] = static_cast<int32_t>(1);
+    tags["rq"] = static_cast<float>(0.6);
+    tags["RG"] = rgId;
+    tags["sn"] = expectedSnr;
+    bamRecord.impl_.Tags(tags);
+
+    // write record to file
+    const string generatedBamFn = tests::GeneratedData_Dir + "/bamwriter_generated.bam";
+    {
+        BamWriter writer(generatedBamFn, inputHeader);
+        writer.Write(bamRecord);
+    }
+
+    // check written header
+    BamFile file(generatedBamFn);
+    const auto header = file.Header();
+    EXPECT_EQ(std::string("1.1"),     header.Version());
+    EXPECT_EQ(std::string("unknown"), header.SortOrder());
+    EXPECT_EQ(std::string("3.0.1"),   header.PacBioBamVersion());
+
+    // check written record
+    EntireFileQuery entireFile(file);
+    auto firstIter = entireFile.begin();
+    auto record = *firstIter;
+    EXPECT_EQ(std::string("ACGTC"),        record.Sequence());
+    EXPECT_EQ(std::string("test/100/0_5"), record.FullName());
+    EXPECT_TRUE(record.HasHoleNumber());
+    EXPECT_TRUE(record.HasNumPasses());
+    EXPECT_TRUE(record.HasQueryEnd());
+    EXPECT_TRUE(record.HasQueryStart());
+    EXPECT_TRUE(record.HasReadAccuracy());
+    EXPECT_TRUE(record.HasSignalToNoise());
+    EXPECT_EQ(100, record.HoleNumber());
+    EXPECT_EQ(1,   record.NumPasses());
+    EXPECT_EQ(0,   record.QueryStart());
+    EXPECT_EQ(5,   record.QueryEnd());
+    EXPECT_EQ(expectedSnr, record.SignalToNoise());
+    EXPECT_EQ(rgId, record.ReadGroupId());
+
+    // clean up
+    remove(generatedBamFn.c_str());
+}
diff --git a/tests/src/test_BarcodeQuery.cpp b/tests/src/test_BarcodeQuery.cpp

new file mode 100644 (file)

index 0000000..6ec02a8
--- /dev/null
+++ b/tests/src/test_BarcodeQuery.cpp
@@ -0,0 +1,53 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <pbbam/BarcodeQuery.h>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+TEST(BarcodeQueryTest, QueryOk)
+{
+    // come back with barcoded data
+}
diff --git a/tests/src/test_Cigar.cpp b/tests/src/test_Cigar.cpp

new file mode 100644 (file)

index 0000000..796720a
--- /dev/null
+++ b/tests/src/test_Cigar.cpp
@@ -0,0 +1,198 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/Cigar.h>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+TEST(CigarTest, TypeToCar)
+{
+    EXPECT_EQ('M', CigarOperation::TypeToChar(CigarOperationType::ALIGNMENT_MATCH) );
+    EXPECT_EQ('I', CigarOperation::TypeToChar(CigarOperationType::INSERTION) );
+    EXPECT_EQ('D', CigarOperation::TypeToChar(CigarOperationType::DELETION) );
+    EXPECT_EQ('N', CigarOperation::TypeToChar(CigarOperationType::REFERENCE_SKIP) );
+    EXPECT_EQ('S', CigarOperation::TypeToChar(CigarOperationType::SOFT_CLIP) );
+    EXPECT_EQ('H', CigarOperation::TypeToChar(CigarOperationType::HARD_CLIP) );
+    EXPECT_EQ('P', CigarOperation::TypeToChar(CigarOperationType::PADDING) );
+    EXPECT_EQ('=', CigarOperation::TypeToChar(CigarOperationType::SEQUENCE_MATCH) );
+    EXPECT_EQ('X', CigarOperation::TypeToChar(CigarOperationType::SEQUENCE_MISMATCH) );
+}
+
+TEST(CigarTest, CharToType)
+{
+    EXPECT_EQ(CigarOperationType::ALIGNMENT_MATCH,   CigarOperation::CharToType('M'));
+    EXPECT_EQ(CigarOperationType::INSERTION,         CigarOperation::CharToType('I'));
+    EXPECT_EQ(CigarOperationType::DELETION,          CigarOperation::CharToType('D'));
+    EXPECT_EQ(CigarOperationType::REFERENCE_SKIP,    CigarOperation::CharToType('N'));
+    EXPECT_EQ(CigarOperationType::SOFT_CLIP,         CigarOperation::CharToType('S'));
+    EXPECT_EQ(CigarOperationType::HARD_CLIP,         CigarOperation::CharToType('H'));
+    EXPECT_EQ(CigarOperationType::PADDING,           CigarOperation::CharToType('P'));
+    EXPECT_EQ(CigarOperationType::SEQUENCE_MATCH,    CigarOperation::CharToType('='));
+    EXPECT_EQ(CigarOperationType::SEQUENCE_MISMATCH, CigarOperation::CharToType('X'));
+}
+
+TEST(CigarTest, SetOperationYieldsCorrectType)
+{
+    CigarOperation c1; c1.Type(CigarOperationType::ALIGNMENT_MATCH);
+    CigarOperation c2; c2.Type(CigarOperationType::INSERTION);
+    CigarOperation c3; c3.Type(CigarOperationType::DELETION);
+    CigarOperation c4; c4.Type(CigarOperationType::REFERENCE_SKIP);
+    CigarOperation c5; c5.Type(CigarOperationType::SOFT_CLIP);
+    CigarOperation c6; c6.Type(CigarOperationType::HARD_CLIP);
+    CigarOperation c7; c7.Type(CigarOperationType::PADDING);
+    CigarOperation c8; c8.Type(CigarOperationType::SEQUENCE_MATCH);
+    CigarOperation c9; c9.Type(CigarOperationType::SEQUENCE_MISMATCH);
+
+    EXPECT_EQ('M', c1.Char());
+    EXPECT_EQ('I', c2.Char());
+    EXPECT_EQ('D', c3.Char());
+    EXPECT_EQ('N', c4.Char());
+    EXPECT_EQ('S', c5.Char());
+    EXPECT_EQ('H', c6.Char());
+    EXPECT_EQ('P', c7.Char());
+    EXPECT_EQ('=', c8.Char());
+    EXPECT_EQ('X', c9.Char());
+}
+
+TEST(CigarTest, SetTypeYieldsCorrectOperation)
+{
+    CigarOperation c1; c1.Char('M');
+    CigarOperation c2; c2.Char('I');
+    CigarOperation c3; c3.Char('D');
+    CigarOperation c4; c4.Char('N');
+    CigarOperation c5; c5.Char('S');
+    CigarOperation c6; c6.Char('H');
+    CigarOperation c7; c7.Char('P');
+    CigarOperation c8; c8.Char('=');
+    CigarOperation c9; c9.Char('X');
+
+    EXPECT_EQ(CigarOperationType::ALIGNMENT_MATCH,   c1.Type());
+    EXPECT_EQ(CigarOperationType::INSERTION,         c2.Type());
+    EXPECT_EQ(CigarOperationType::DELETION,          c3.Type());
+    EXPECT_EQ(CigarOperationType::REFERENCE_SKIP,    c4.Type());
+    EXPECT_EQ(CigarOperationType::SOFT_CLIP,         c5.Type());
+    EXPECT_EQ(CigarOperationType::HARD_CLIP,         c6.Type());
+    EXPECT_EQ(CigarOperationType::PADDING,           c7.Type());
+    EXPECT_EQ(CigarOperationType::SEQUENCE_MATCH,    c8.Type());
+    EXPECT_EQ(CigarOperationType::SEQUENCE_MISMATCH, c9.Type());
+}
+
+TEST(CigarStringTest, FromStdString_Empty)
+{
+    const string emptyCigar = "";
+    Cigar cigar = Cigar::FromStdString(emptyCigar);
+    EXPECT_TRUE(cigar.empty());
+}
+
+TEST(CigarStringTest, FromStdString_SingleOp)
+{
+    const string singleCigar = "100=";
+
+    Cigar cigar = Cigar::FromStdString(singleCigar);
+    ASSERT_TRUE(cigar.size() == 1);
+
+    const CigarOperation& op = cigar.front();
+    EXPECT_TRUE(op.Char()   == '=');
+    EXPECT_TRUE(op.Length() == 100);
+}
+
+TEST(CigarStringTest, FromStdString_MultipleOps)
+{
+    const string multiCigar = "100=2D34I6=6X6=";
+
+    Cigar cigar = Cigar::FromStdString(multiCigar);
+    ASSERT_TRUE(cigar.size() == 6);
+
+    CigarOperation op0 = cigar.at(0);
+    CigarOperation op1 = cigar.at(1);
+    CigarOperation op2 = cigar.at(2);
+    CigarOperation op3 = cigar.at(3);
+    CigarOperation op4 = cigar.at(4);
+    CigarOperation op5 = cigar.at(5);
+
+    EXPECT_TRUE(op0.Char()   == '=');
+    EXPECT_TRUE(op0.Length() == 100);
+    EXPECT_TRUE(op1.Char()   == 'D');
+    EXPECT_TRUE(op1.Length() == 2);
+    EXPECT_TRUE(op2.Char()   == 'I');
+    EXPECT_TRUE(op2.Length() == 34);
+    EXPECT_TRUE(op3.Char()   == '=');
+    EXPECT_TRUE(op3.Length() == 6);
+    EXPECT_TRUE(op4.Char()   == 'X');
+    EXPECT_TRUE(op4.Length() == 6);
+    EXPECT_TRUE(op5.Char()   == '=');
+    EXPECT_TRUE(op5.Length() == 6);
+}
+
+TEST(CigarStringTest, ToStdString_Empty)
+{
+    const string empty;
+    Cigar cigar;
+    EXPECT_EQ(empty, cigar.ToStdString());
+}
+
+TEST(CigarStringTest, ToStdString_SingleOp)
+{
+    const string singleCigar = "100=";
+
+    Cigar cigar;
+    cigar.push_back( CigarOperation(CigarOperationType::SEQUENCE_MATCH, 100) );
+
+    EXPECT_EQ(singleCigar, cigar.ToStdString());
+}
+
+TEST(CigarStringTest, ToStdString_MultipleOps)
+{
+    const string multiCigar = "100=2D34I6=6X6=";
+
+    Cigar cigar;
+    cigar.push_back(CigarOperation(CigarOperationType::SEQUENCE_MATCH,  100));
+    cigar.push_back(CigarOperation(CigarOperationType::DELETION,          2));
+    cigar.push_back(CigarOperation(CigarOperationType::INSERTION,        34));
+    cigar.push_back(CigarOperation(CigarOperationType::SEQUENCE_MATCH,    6));
+    cigar.push_back(CigarOperation(CigarOperationType::SEQUENCE_MISMATCH, 6));
+    cigar.push_back(CigarOperation(CigarOperationType::SEQUENCE_MATCH,    6));
+
+    EXPECT_EQ(multiCigar, cigar.ToStdString());
+}
diff --git a/tests/src/test_Compare.cpp b/tests/src/test_Compare.cpp

new file mode 100644 (file)

index 0000000..e5ed933
--- /dev/null
+++ b/tests/src/test_Compare.cpp
@@ -0,0 +1,739 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/Compare.h>
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <vector>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace tests {
+
+static inline
+BamRecord makeRecordWithTag(const string& tagName,
+                            const Tag& tag)
+{
+    auto r = BamRecord{ };
+    r.Impl().AddTag(tagName, tag);
+    return r;
+}
+
+static
+BamRecord makeRecord(const Position qStart,
+                     const Position qEnd,
+                     const string& seq,
+                     const string& quals,
+                     const string& tagBases,
+                     const string& tagQuals,
+                     const vector<uint16_t>& frames)
+{
+    BamRecordImpl impl;
+    impl.SetSequenceAndQualities(seq, quals);
+
+    TagCollection tags;
+    tags["qs"] = qStart;
+    tags["qe"] = qEnd;
+    tags["ip"] = frames;
+    tags["pw"] = frames;
+    tags["dt"] = tagBases;
+    tags["st"] = tagBases;
+    tags["dq"] = tagQuals;
+    tags["iq"] = tagQuals;
+    tags["mq"] = tagQuals;
+    tags["sq"] = tagQuals;
+    tags["pq"] = tagQuals;
+    tags["pv"] = tagQuals;
+    impl.Tags(tags);
+
+    return BamRecord(std::move(impl));
+}
+
+static
+std::vector<BamRecord> makeMappedRecords(void)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const string seq      = "AACCGTTAGC";
+    const string quals    = "?]?]?]?]?*";
+    const string tagBases = "AACCGTTAGC";
+    const string tagQuals = "?]?]?]?]?*";
+    const vector<uint16_t> frames  = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const uint8_t mapQual = 80;
+
+    const string s1_cigar = "10=";
+    const string s2_cigar = "5=3D5=";
+    const string s3_cigar = "4=1D2I2D2X2=";
+
+    BamRecord s1 = tests::makeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s2 = tests::makeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s3 = tests::makeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s1_rev = tests::makeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s2_rev = tests::makeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s3_rev = tests::makeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+
+    s1.Map(0, 100, Strand::FORWARD, s1_cigar, mapQual);
+    s2.Map(0, 100, Strand::FORWARD, s2_cigar, mapQual);
+    s3.Map(0, 100, Strand::FORWARD, s3_cigar, mapQual);
+    s1_rev.Map(0, 100, Strand::REVERSE, s1_cigar, mapQual);
+    s2_rev.Map(0, 100, Strand::REVERSE, s2_cigar, mapQual);
+    s3_rev.Map(0, 100, Strand::REVERSE, s3_cigar, mapQual);
+
+    return std::vector<BamRecord> { s1, s2, s3, s1_rev, s2_rev, s3_rev };
+}
+
+} // namespace tests
+
+TEST(CompareTest, TypeToNameOk)
+{
+    EXPECT_EQ(string{"Compare::EQUAL"},              Compare::TypeToName(Compare::EQUAL));
+    EXPECT_EQ(string{"Compare::NOT_EQUAL"},          Compare::TypeToName(Compare::NOT_EQUAL));
+    EXPECT_EQ(string{"Compare::LESS_THAN"},          Compare::TypeToName(Compare::LESS_THAN));
+    EXPECT_EQ(string{"Compare::LESS_THAN_EQUAL"},    Compare::TypeToName(Compare::LESS_THAN_EQUAL));
+    EXPECT_EQ(string{"Compare::GREATER_THAN"},       Compare::TypeToName(Compare::GREATER_THAN));
+    EXPECT_EQ(string{"Compare::GREATER_THAN_EQUAL"}, Compare::TypeToName(Compare::GREATER_THAN_EQUAL));
+    EXPECT_EQ(string{"Compare::CONTAINS"},           Compare::TypeToName(Compare::CONTAINS));
+    EXPECT_EQ(string{"Compare::NOT_CONTAINS"},       Compare::TypeToName(Compare::NOT_CONTAINS));
+
+    // invalid type throws
+    EXPECT_THROW(Compare::TypeToName(static_cast<Compare::Type>(42)), std::runtime_error);
+}
+
+TEST(CompareTest, TypeToOperatorOk)
+{
+    { // normal
+        EXPECT_EQ(Compare::TypeToOperator(Compare::EQUAL),              string{"=="});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::NOT_EQUAL),          string{"!="});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::LESS_THAN),          string{"<"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::LESS_THAN_EQUAL),    string{"<="});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::GREATER_THAN),       string{">"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::GREATER_THAN_EQUAL), string{">="});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::CONTAINS),           string{"&"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::NOT_CONTAINS),       string{"~"});
+    }
+
+    { // alpha
+        EXPECT_EQ(Compare::TypeToOperator(Compare::EQUAL, true),              string{"eq"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::NOT_EQUAL, true),          string{"ne"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::LESS_THAN, true),          string{"lt"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::LESS_THAN_EQUAL, true),    string{"lte"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::GREATER_THAN, true),       string{"gt"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::GREATER_THAN_EQUAL, true), string{"gte"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::CONTAINS, true),           string{"and"});
+        EXPECT_EQ(Compare::TypeToOperator(Compare::NOT_CONTAINS, true),       string{"not"});
+    }
+
+    // invalid type throws
+    EXPECT_THROW(Compare::TypeToOperator(static_cast<Compare::Type>(42)), std::runtime_error);
+}
+
+TEST(CompareTest, FromOperatorOk)
+{
+    EXPECT_EQ(Compare::EQUAL,              Compare::TypeFromOperator("=="));
+    EXPECT_EQ(Compare::EQUAL,              Compare::TypeFromOperator("="));
+    EXPECT_EQ(Compare::EQUAL,              Compare::TypeFromOperator("eq"));
+    EXPECT_EQ(Compare::NOT_EQUAL,          Compare::TypeFromOperator("!="));
+    EXPECT_EQ(Compare::NOT_EQUAL,          Compare::TypeFromOperator("ne"));
+    EXPECT_EQ(Compare::LESS_THAN,          Compare::TypeFromOperator("<"));
+    EXPECT_EQ(Compare::LESS_THAN,          Compare::TypeFromOperator("lt"));
+    EXPECT_EQ(Compare::LESS_THAN,          Compare::TypeFromOperator("&lt;"));
+    EXPECT_EQ(Compare::LESS_THAN_EQUAL,    Compare::TypeFromOperator("<="));
+    EXPECT_EQ(Compare::LESS_THAN_EQUAL,    Compare::TypeFromOperator("lte"));
+    EXPECT_EQ(Compare::LESS_THAN_EQUAL,    Compare::TypeFromOperator("&lt;="));
+    EXPECT_EQ(Compare::GREATER_THAN,       Compare::TypeFromOperator(">"));
+    EXPECT_EQ(Compare::GREATER_THAN,       Compare::TypeFromOperator("gt"));
+    EXPECT_EQ(Compare::GREATER_THAN,       Compare::TypeFromOperator("&gt;"));
+    EXPECT_EQ(Compare::GREATER_THAN_EQUAL, Compare::TypeFromOperator(">="));
+    EXPECT_EQ(Compare::GREATER_THAN_EQUAL, Compare::TypeFromOperator("gte"));
+    EXPECT_EQ(Compare::GREATER_THAN_EQUAL, Compare::TypeFromOperator("&gt;="));
+    EXPECT_EQ(Compare::CONTAINS,           Compare::TypeFromOperator("&"));
+    EXPECT_EQ(Compare::NOT_CONTAINS,       Compare::TypeFromOperator("~"));
+
+    // invalid operator strings throw
+    EXPECT_THROW(Compare::TypeFromOperator(""),        std::runtime_error);
+    EXPECT_THROW(Compare::TypeFromOperator("invalid"), std::runtime_error);
+}
+
+TEST(CompareTest, AlignedEndOk)
+{
+    BamRecord r1; r1.alignedEnd_ = 300;
+    BamRecord r2; r2.alignedEnd_ = 200;
+    BamRecord r3; r3.alignedEnd_ = 400;
+    BamRecord r4; r4.alignedEnd_ = 100;
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::AlignedEnd());
+
+    EXPECT_EQ(r4.alignedEnd_, records.at(0).AlignedEnd());
+    EXPECT_EQ(r2.alignedEnd_, records.at(1).AlignedEnd());
+    EXPECT_EQ(r1.alignedEnd_, records.at(2).AlignedEnd());
+    EXPECT_EQ(r3.alignedEnd_, records.at(3).AlignedEnd());
+}
+
+TEST(CompareTest, AlignedStartOk)
+{
+    BamRecord r1; r1.alignedStart_ = 300;
+    BamRecord r2; r2.alignedStart_ = 200;
+    BamRecord r3; r3.alignedStart_ = 400;
+    BamRecord r4; r4.alignedStart_ = 100;
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::AlignedStart());
+
+    EXPECT_EQ(r4.alignedStart_, records.at(0).AlignedStart());
+    EXPECT_EQ(r2.alignedStart_, records.at(1).AlignedStart());
+    EXPECT_EQ(r1.alignedStart_, records.at(2).AlignedStart());
+    EXPECT_EQ(r3.alignedStart_, records.at(3).AlignedStart());
+}
+
+TEST(CompareTest, AlignedStrandOk)
+{
+    BamRecord r1; r1.Impl().SetReverseStrand(true);
+    BamRecord r2; r2.Impl().SetReverseStrand(false);
+    BamRecord r3; r3.Impl().SetReverseStrand(true);
+    BamRecord r4; r4.Impl().SetReverseStrand(false);
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::AlignedStrand());
+
+    EXPECT_EQ(Strand::FORWARD, records.at(0).AlignedStrand());
+    EXPECT_EQ(Strand::FORWARD, records.at(1).AlignedStrand());
+    EXPECT_EQ(Strand::REVERSE, records.at(2).AlignedStrand());
+    EXPECT_EQ(Strand::REVERSE, records.at(3).AlignedStrand());
+}
+
+TEST(CompareTest, BarcodeForwardOk)
+{
+    BamRecord r1; r1.Barcodes(std::make_pair<int16_t,int16_t>(30,20));
+    BamRecord r2; r2.Barcodes(std::make_pair<int16_t,int16_t>(20,30));
+    BamRecord r3; r3.Barcodes(std::make_pair<int16_t,int16_t>(40,10));
+    BamRecord r4; r4.Barcodes(std::make_pair<int16_t,int16_t>(10,40));
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::BarcodeForward());
+
+    EXPECT_EQ(r4.BarcodeForward(), records.at(0).BarcodeForward());
+    EXPECT_EQ(r2.BarcodeForward(), records.at(1).BarcodeForward());
+    EXPECT_EQ(r1.BarcodeForward(), records.at(2).BarcodeForward());
+    EXPECT_EQ(r3.BarcodeForward(), records.at(3).BarcodeForward());
+}
+
+TEST(CompareTest, BarcodeReverseOk)
+{
+    BamRecord r1; r1.Barcodes(std::make_pair<int16_t,int16_t>(30,20));
+    BamRecord r2; r2.Barcodes(std::make_pair<int16_t,int16_t>(20,30));
+    BamRecord r3; r3.Barcodes(std::make_pair<int16_t,int16_t>(40,10));
+    BamRecord r4; r4.Barcodes(std::make_pair<int16_t,int16_t>(10,40));
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::BarcodeReverse());
+
+    EXPECT_EQ(r3.BarcodeReverse(), records.at(0).BarcodeReverse());
+    EXPECT_EQ(r1.BarcodeReverse(), records.at(1).BarcodeReverse());
+    EXPECT_EQ(r2.BarcodeReverse(), records.at(2).BarcodeReverse());
+    EXPECT_EQ(r4.BarcodeReverse(), records.at(3).BarcodeReverse());
+}
+
+TEST(CompareTest, BarcodeQualityOk)
+{
+    uint8_t q1 = 30;
+    uint8_t q2 = 20;
+    uint8_t q3 = 40;
+    uint8_t q4 = 10;
+
+    auto records = vector<BamRecord>
+    {
+        tests::makeRecordWithTag("bq", Tag(q1)),
+        tests::makeRecordWithTag("bq", Tag(q2)),
+        tests::makeRecordWithTag("bq", Tag(q3)),
+        tests::makeRecordWithTag("bq", Tag(q4))
+    };
+    std::sort(records.begin(), records.end(), Compare::BarcodeQuality());
+
+    EXPECT_EQ(q4, records.at(0).BarcodeQuality());
+    EXPECT_EQ(q2, records.at(1).BarcodeQuality());
+    EXPECT_EQ(q1, records.at(2).BarcodeQuality());
+    EXPECT_EQ(q3, records.at(3).BarcodeQuality());
+}
+
+TEST(CompareTest, CustomCompareOk)
+{
+    struct CustomCompare : public Compare::MemberFunctionBase<bool, &BamRecord::HasDeletionTag> { };
+
+    auto records = vector<BamRecord>
+    {
+        tests::makeRecordWithTag("dt", Tag(string("foo"))),
+        tests::makeRecordWithTag("dt", Tag(string("foo"))),
+        tests::makeRecordWithTag("dt", Tag(string("foo"))),
+        tests::makeRecordWithTag("dt", Tag(string("foo")))
+    };
+    records.push_back(BamRecord());
+    records.push_back(BamRecord());
+    records.push_back(BamRecord());
+    records.push_back(BamRecord());
+    EXPECT_EQ(8, records.size());
+
+    std::sort(records.begin(), records.end(), CustomCompare());
+
+    EXPECT_FALSE(records.at(0).HasDeletionTag());
+    EXPECT_FALSE(records.at(1).HasDeletionTag());
+    EXPECT_FALSE(records.at(2).HasDeletionTag());
+    EXPECT_FALSE(records.at(3).HasDeletionTag());
+    EXPECT_TRUE(records.at(4).HasDeletionTag());
+    EXPECT_TRUE(records.at(5).HasDeletionTag());
+    EXPECT_TRUE(records.at(6).HasDeletionTag());
+    EXPECT_TRUE(records.at(7).HasDeletionTag());
+}
+
+TEST(CompareTest, FullNameOk)
+{
+    BamRecord r1; r1.Impl().Name("c");
+    BamRecord r2; r2.Impl().Name("b");
+    BamRecord r3; r3.Impl().Name("d");
+    BamRecord r4; r4.Impl().Name("a");
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::FullName());
+
+    EXPECT_EQ(r4.FullName(), records.at(0).FullName());
+    EXPECT_EQ(r2.FullName(), records.at(1).FullName());
+    EXPECT_EQ(r1.FullName(), records.at(2).FullName());
+    EXPECT_EQ(r3.FullName(), records.at(3).FullName());
+}
+
+TEST(CompareTest, LocalContextFlagOk)
+{
+    BamRecord r1; r1.LocalContextFlags(LocalContextFlags::BARCODE_AFTER);
+    BamRecord r2; r2.LocalContextFlags(LocalContextFlags::ADAPTER_AFTER);
+    BamRecord r3; r3.LocalContextFlags(LocalContextFlags::REVERSE_PASS);
+    BamRecord r4; r4.LocalContextFlags(LocalContextFlags::NO_LOCAL_CONTEXT);
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::LocalContextFlag());
+
+    EXPECT_EQ(r4.LocalContextFlags(), records.at(0).LocalContextFlags());
+    EXPECT_EQ(r2.LocalContextFlags(), records.at(1).LocalContextFlags());
+    EXPECT_EQ(r1.LocalContextFlags(), records.at(2).LocalContextFlags());
+    EXPECT_EQ(r3.LocalContextFlags(), records.at(3).LocalContextFlags());
+}
+
+TEST(CompareTest, MapQualityOk)
+{
+    BamRecord r1; r1.Impl().MapQuality(30);
+    BamRecord r2; r2.Impl().MapQuality(20);
+    BamRecord r3; r3.Impl().MapQuality(40);
+    BamRecord r4; r4.Impl().MapQuality(10);
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::MapQuality());
+
+    EXPECT_EQ(r4.MapQuality(), records.at(0).MapQuality());
+    EXPECT_EQ(r2.MapQuality(), records.at(1).MapQuality());
+    EXPECT_EQ(r1.MapQuality(), records.at(2).MapQuality());
+    EXPECT_EQ(r3.MapQuality(), records.at(3).MapQuality());
+}
+
+TEST(CompareTest, MovieNameOk)
+{
+    auto rg1 = ReadGroupInfo { "a", "SUBREAD" };
+    auto rg2 = ReadGroupInfo { "b", "SUBREAD" };
+    auto rg3 = ReadGroupInfo { "c", "SUBREAD" };
+    auto rg4 = ReadGroupInfo { "d", "SUBREAD" };
+
+    BamHeader header;
+    header.AddReadGroup(rg1)
+          .AddReadGroup(rg2)
+          .AddReadGroup(rg3)
+          .AddReadGroup(rg4);
+
+    BamRecord r1(header); r1.ReadGroup(rg3);
+    BamRecord r2(header); r2.ReadGroup(rg2);
+    BamRecord r3(header); r3.ReadGroup(rg4);
+    BamRecord r4(header); r4.ReadGroup(rg1);
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::MovieName());
+
+    EXPECT_EQ(r4.MovieName(), records.at(0).MovieName());
+    EXPECT_EQ(r2.MovieName(), records.at(1).MovieName());
+    EXPECT_EQ(r1.MovieName(), records.at(2).MovieName());
+    EXPECT_EQ(r3.MovieName(), records.at(3).MovieName());
+}
+
+TEST(CompareTest, NoneOk)
+{
+    BamRecord r1; r1.Impl().Name("c");
+    BamRecord r2; r2.Impl().Name("b");
+    BamRecord r3; r3.Impl().Name("d");
+    BamRecord r4; r4.Impl().Name("a");
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::None());
+
+    EXPECT_EQ(r1.FullName(), records.at(0).FullName());
+    EXPECT_EQ(r2.FullName(), records.at(1).FullName());
+    EXPECT_EQ(r3.FullName(), records.at(2).FullName());
+    EXPECT_EQ(r4.FullName(), records.at(3).FullName());
+}
+
+TEST(CompareTest, NumDeletedBasesOk)
+{
+    // create test data
+    auto records = tests::makeMappedRecords();
+
+    // sanity checks on initial conditions
+    EXPECT_EQ(6, records.size());
+    EXPECT_EQ(0, records.at(0).NumDeletedBases());
+    EXPECT_EQ(3, records.at(1).NumDeletedBases());
+    EXPECT_EQ(3, records.at(2).NumDeletedBases());
+    EXPECT_EQ(0, records.at(3).NumDeletedBases());
+    EXPECT_EQ(3, records.at(4).NumDeletedBases());
+    EXPECT_EQ(3, records.at(5).NumDeletedBases());
+
+    // sort & check
+    std::sort(records.begin(), records.end(), Compare::NumDeletedBases());
+    EXPECT_EQ(0, records.at(0).NumDeletedBases());
+    EXPECT_EQ(0, records.at(1).NumDeletedBases());
+    EXPECT_EQ(3, records.at(2).NumDeletedBases());
+    EXPECT_EQ(3, records.at(3).NumDeletedBases());
+    EXPECT_EQ(3, records.at(4).NumDeletedBases());
+    EXPECT_EQ(3, records.at(5).NumDeletedBases());
+}
+
+TEST(CompareTest, NumInsertedBasesOk)
+{
+    // create test data
+    auto records = tests::makeMappedRecords();
+
+    // sanity checks on initial conditions
+    EXPECT_EQ(6, records.size());
+    EXPECT_EQ(0, records.at(0).NumInsertedBases());
+    EXPECT_EQ(0, records.at(1).NumInsertedBases());
+    EXPECT_EQ(2, records.at(2).NumInsertedBases());
+    EXPECT_EQ(0, records.at(3).NumInsertedBases());
+    EXPECT_EQ(0, records.at(4).NumInsertedBases());
+    EXPECT_EQ(2, records.at(5).NumInsertedBases());
+
+    // sort & check
+    std::sort(records.begin(), records.end(), Compare::NumInsertedBases());
+    EXPECT_EQ(0, records.at(0).NumInsertedBases());
+    EXPECT_EQ(0, records.at(1).NumInsertedBases());
+    EXPECT_EQ(0, records.at(2).NumInsertedBases());
+    EXPECT_EQ(0, records.at(3).NumInsertedBases());
+    EXPECT_EQ(2, records.at(4).NumInsertedBases());
+    EXPECT_EQ(2, records.at(5).NumInsertedBases());
+}
+
+TEST(CompareTest, NumMatchesOk)
+{
+    // create test data
+    auto records = tests::makeMappedRecords();
+
+    // sanity checks on initial conditions
+    EXPECT_EQ(6, records.size());
+    EXPECT_EQ(10, records.at(0).NumMatches());
+    EXPECT_EQ(10, records.at(1).NumMatches());
+    EXPECT_EQ(6,  records.at(2).NumMatches());
+    EXPECT_EQ(10, records.at(3).NumMatches());
+    EXPECT_EQ(10, records.at(4).NumMatches());
+    EXPECT_EQ(6,  records.at(5).NumMatches());
+
+    // sort & check
+    std::sort(records.begin(), records.end(), Compare::NumMatches());
+    EXPECT_EQ(6,  records.at(0).NumMatches());
+    EXPECT_EQ(6,  records.at(1).NumMatches());
+    EXPECT_EQ(10, records.at(2).NumMatches());
+    EXPECT_EQ(10, records.at(3).NumMatches());
+    EXPECT_EQ(10, records.at(4).NumMatches());
+    EXPECT_EQ(10, records.at(5).NumMatches());
+}
+
+TEST(CompareTest, NumMismatchesOk)
+{
+    // create test data
+    auto records = tests::makeMappedRecords();
+
+    // sanity checks on initial conditions
+    EXPECT_EQ(6, records.size());
+    EXPECT_EQ(0, records.at(0).NumMismatches());
+    EXPECT_EQ(0, records.at(1).NumMismatches());
+    EXPECT_EQ(2, records.at(2).NumMismatches());
+    EXPECT_EQ(0, records.at(3).NumMismatches());
+    EXPECT_EQ(0, records.at(4).NumMismatches());
+    EXPECT_EQ(2, records.at(5).NumMismatches());
+
+    // sort & check
+    std::sort(records.begin(), records.end(), Compare::NumMismatches());
+    EXPECT_EQ(0, records.at(0).NumMismatches());
+    EXPECT_EQ(0, records.at(1).NumMismatches());
+    EXPECT_EQ(0, records.at(2).NumMismatches());
+    EXPECT_EQ(0, records.at(3).NumMismatches());
+    EXPECT_EQ(2, records.at(4).NumMismatches());
+    EXPECT_EQ(2, records.at(5).NumMismatches());
+}
+
+TEST(CompareTest, QueryEndOk)
+{
+    Position q1 = 30;
+    Position q2 = 20;
+    Position q3 = 40;
+    Position q4 = 10;
+
+    auto records = vector<BamRecord>
+    {
+        tests::makeRecordWithTag("qe", Tag(q1)),
+        tests::makeRecordWithTag("qe", Tag(q2)),
+        tests::makeRecordWithTag("qe", Tag(q3)),
+        tests::makeRecordWithTag("qe", Tag(q4))
+    };
+    std::sort(records.begin(), records.end(), Compare::QueryEnd());
+
+    EXPECT_EQ(q4, records.at(0).QueryEnd());
+    EXPECT_EQ(q2, records.at(1).QueryEnd());
+    EXPECT_EQ(q1, records.at(2).QueryEnd());
+    EXPECT_EQ(q3, records.at(3).QueryEnd());
+}
+
+TEST(CompareTest, QueryStartOk)
+{
+    Position q1 = 30;
+    Position q2 = 20;
+    Position q3 = 40;
+    Position q4 = 10;
+
+    auto records = vector<BamRecord>
+    {
+        tests::makeRecordWithTag("qs", Tag(q1)),
+        tests::makeRecordWithTag("qs", Tag(q2)),
+        tests::makeRecordWithTag("qs", Tag(q3)),
+        tests::makeRecordWithTag("qs", Tag(q4))
+    };
+    std::sort(records.begin(), records.end(), Compare::QueryStart());
+
+    EXPECT_EQ(q4, records.at(0).QueryStart());
+    EXPECT_EQ(q2, records.at(1).QueryStart());
+    EXPECT_EQ(q1, records.at(2).QueryStart());
+    EXPECT_EQ(q3, records.at(3).QueryStart());
+}
+
+TEST(CompareTest, ReadGroupIdOk)
+{
+    auto rg1 = ReadGroupInfo { "foo", "SUBREAD" };
+    auto rg2 = ReadGroupInfo { "bar", "SUBREAD" };
+    auto rg3 = ReadGroupInfo { "c",   "SUBREAD" };
+    auto rg4 = ReadGroupInfo { "d",   "SUBREAD" };
+
+    BamHeader header;
+    header.AddReadGroup(rg1)
+          .AddReadGroup(rg2)
+          .AddReadGroup(rg3)
+          .AddReadGroup(rg4);
+
+    BamRecord r1(header); r1.ReadGroup(rg3); // -> 99365356
+    BamRecord r2(header); r2.ReadGroup(rg2); // -> d9f305e4
+    BamRecord r3(header); r3.ReadGroup(rg4); // -> 54397cd6
+    BamRecord r4(header); r4.ReadGroup(rg1); // -> a60ddc69
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::ReadGroupId()); // lexical, NOT numeric ordering
+
+    EXPECT_EQ(r3.ReadGroupId(), records.at(0).ReadGroupId());
+    EXPECT_EQ(r1.ReadGroupId(), records.at(1).ReadGroupId());
+    EXPECT_EQ(r4.ReadGroupId(), records.at(2).ReadGroupId());
+    EXPECT_EQ(r2.ReadGroupId(), records.at(3).ReadGroupId());
+}
+
+TEST(CompareTest, ReadGroupNumericIdOk)
+{
+    auto rg1 = ReadGroupInfo { "a", "SUBREAD" };
+    auto rg2 = ReadGroupInfo { "b", "SUBREAD" };
+    auto rg3 = ReadGroupInfo { "c", "SUBREAD" };
+    auto rg4 = ReadGroupInfo { "d", "SUBREAD" };
+
+    BamHeader header;
+    header.AddReadGroup(rg1)
+          .AddReadGroup(rg2)
+          .AddReadGroup(rg3)
+          .AddReadGroup(rg4);
+
+    BamRecord r1(header); r1.ReadGroup(rg3); // -> -1724492970
+    BamRecord r2(header); r2.ReadGroup(rg2); // ->   235381373
+    BamRecord r3(header); r3.ReadGroup(rg4); // ->  1413053654
+    BamRecord r4(header); r4.ReadGroup(rg1); // ->  1153643386
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::ReadGroupNumericId()); // numeric ordering
+
+    EXPECT_EQ(r1.ReadGroupNumericId(), records.at(0).ReadGroupNumericId());
+    EXPECT_EQ(r2.ReadGroupNumericId(), records.at(1).ReadGroupNumericId());
+    EXPECT_EQ(r4.ReadGroupNumericId(), records.at(2).ReadGroupNumericId());
+    EXPECT_EQ(r3.ReadGroupNumericId(), records.at(3).ReadGroupNumericId());
+}
+
+TEST(CompareTest, ReadAccuracyOk)
+{
+    Accuracy a1 = 30;
+    Accuracy a2 = 20;
+    Accuracy a3 = 40;
+    Accuracy a4 = 10;
+
+    auto records = vector<BamRecord>
+    {
+        tests::makeRecordWithTag("rq", Tag(a1)),
+        tests::makeRecordWithTag("rq", Tag(a2)),
+        tests::makeRecordWithTag("rq", Tag(a3)),
+        tests::makeRecordWithTag("rq", Tag(a4))
+    };
+    std::sort(records.begin(), records.end(), Compare::ReadAccuracy());
+
+    EXPECT_EQ(a4, records.at(0).ReadAccuracy());
+    EXPECT_EQ(a2, records.at(1).ReadAccuracy());
+    EXPECT_EQ(a1, records.at(2).ReadAccuracy());
+    EXPECT_EQ(a3, records.at(3).ReadAccuracy());
+}
+
+TEST(CompareTest, ReferenceEndOk)
+{
+    // create test data
+    auto records = tests::makeMappedRecords();
+
+    // sanity checks on initial conditions
+    EXPECT_EQ(6, records.size());
+    EXPECT_EQ(110, records.at(0).ReferenceEnd());
+    EXPECT_EQ(113, records.at(1).ReferenceEnd());
+    EXPECT_EQ(111, records.at(2).ReferenceEnd());
+    EXPECT_EQ(110, records.at(3).ReferenceEnd());
+    EXPECT_EQ(113, records.at(4).ReferenceEnd());
+    EXPECT_EQ(111, records.at(5).ReferenceEnd());
+
+    // sort & check
+    std::sort(records.begin(), records.end(), Compare::ReferenceEnd());
+    EXPECT_EQ(110, records.at(0).ReferenceEnd());
+    EXPECT_EQ(110, records.at(1).ReferenceEnd());
+    EXPECT_EQ(111, records.at(2).ReferenceEnd());
+    EXPECT_EQ(111, records.at(3).ReferenceEnd());
+    EXPECT_EQ(113, records.at(4).ReferenceEnd());
+    EXPECT_EQ(113, records.at(5).ReferenceEnd());
+}
+
+TEST(CompareTest, ReferenceIdOk)
+{
+    BamRecord r1; r1.Impl().ReferenceId(30);
+    BamRecord r2; r2.Impl().ReferenceId(20);
+    BamRecord r3; r3.Impl().ReferenceId(40);
+    BamRecord r4; r4.Impl().ReferenceId(10);
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::ReferenceId());
+
+    EXPECT_EQ(r4.ReferenceId(), records.at(0).ReferenceId());
+    EXPECT_EQ(r2.ReferenceId(), records.at(1).ReferenceId());
+    EXPECT_EQ(r1.ReferenceId(), records.at(2).ReferenceId());
+    EXPECT_EQ(r3.ReferenceId(), records.at(3).ReferenceId());
+}
+
+TEST(CompareTest, ReferenceNameOk)
+{
+    auto seq1 = SequenceInfo { "seq1" };
+    auto seq2 = SequenceInfo { "seq2" };
+    auto seq3 = SequenceInfo { "seq3" };
+    auto seq4 = SequenceInfo { "seq4" };
+
+    BamHeader header;
+    header.AddSequence(seq1)  // -> 0
+          .AddSequence(seq2)  // -> 1
+          .AddSequence(seq3)  // -> 2
+          .AddSequence(seq4); // -> 3
+
+    BamRecord r1(header); r1.Impl().SetMapped(true); r1.Impl().ReferenceId(2);
+    BamRecord r2(header); r2.Impl().SetMapped(true); r2.Impl().ReferenceId(1);
+    BamRecord r3(header); r3.Impl().SetMapped(true); r3.Impl().ReferenceId(3);
+    BamRecord r4(header); r4.Impl().SetMapped(true); r4.Impl().ReferenceId(0);
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::ReferenceName());
+
+    EXPECT_EQ(seq1.Name(), records.at(0).ReferenceName());
+    EXPECT_EQ(seq2.Name(), records.at(1).ReferenceName());
+    EXPECT_EQ(seq3.Name(), records.at(2).ReferenceName());
+    EXPECT_EQ(seq4.Name(), records.at(3).ReferenceName());
+}
+
+TEST(CompareTest, ReferenceStartOk)
+{
+    BamRecord r1; r1.Impl().Position(30);
+    BamRecord r2; r2.Impl().Position(20);
+    BamRecord r3; r3.Impl().Position(40);
+    BamRecord r4; r4.Impl().Position(10);
+
+    auto records = vector<BamRecord>{ r1, r2, r3, r4 };
+    std::sort(records.begin(), records.end(), Compare::ReferenceStart());
+
+    EXPECT_EQ(r4.ReferenceStart(), records.at(0).ReferenceStart());
+    EXPECT_EQ(r2.ReferenceStart(), records.at(1).ReferenceStart());
+    EXPECT_EQ(r1.ReferenceStart(), records.at(2).ReferenceStart());
+    EXPECT_EQ(r3.ReferenceStart(), records.at(3).ReferenceStart());
+}
+
+TEST(CompareTest, ZmwOk)
+{
+    int32_t z1 = 30;
+    int32_t z2 = 20;
+    int32_t z3 = 40;
+    int32_t z4 = 10;
+
+    auto records = vector<BamRecord>
+    {
+        tests::makeRecordWithTag("zm", Tag(z1)),
+        tests::makeRecordWithTag("zm", Tag(z2)),
+        tests::makeRecordWithTag("zm", Tag(z3)),
+        tests::makeRecordWithTag("zm", Tag(z4))
+    };
+    std::sort(records.begin(), records.end(), Compare::Zmw());
+
+    EXPECT_EQ(z4, records.at(0).HoleNumber());
+    EXPECT_EQ(z2, records.at(1).HoleNumber());
+    EXPECT_EQ(z1, records.at(2).HoleNumber());
+    EXPECT_EQ(z3, records.at(3).HoleNumber());
+}
diff --git a/tests/src/test_DataSetCore.cpp b/tests/src/test_DataSetCore.cpp

new file mode 100644 (file)

index 0000000..df9eae2
--- /dev/null
+++ b/tests/src/test_DataSetCore.cpp
@@ -0,0 +1,537 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/DataSet.h>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace tests {
+
+static inline
+DataSet CreateDataSet(void)
+{
+    DataSet d;
+    d.Name("foo");
+    return d;
+}
+
+} // namespace tests
+
+TEST(DataSetCoreTest, XmlNameParts)
+{
+    internal::XmlName name("ns:node_name");
+    EXPECT_EQ(boost::string_ref("ns"),           name.Prefix());
+    EXPECT_EQ(boost::string_ref("node_name"),    name.LocalName());
+    EXPECT_EQ(boost::string_ref("ns:node_name"), name.QualifiedName());
+
+    internal::XmlName bareName("node_name");
+    EXPECT_EQ(boost::string_ref(""),          bareName.Prefix());
+    EXPECT_EQ(boost::string_ref("node_name"), bareName.LocalName());
+    EXPECT_EQ(boost::string_ref("node_name"), bareName.QualifiedName());
+
+    internal::XmlName leadingColon(":node_name");
+    EXPECT_EQ(boost::string_ref(""),           leadingColon.Prefix());
+    EXPECT_EQ(boost::string_ref(":node_name"), leadingColon.LocalName());
+    EXPECT_EQ(boost::string_ref(":node_name"), leadingColon.QualifiedName());
+}
+
+TEST(DataSetCoreTest, DefaultsOk)
+{
+    DataSet dataset;
+    EXPECT_EQ(DataSet::GENERIC, dataset.Type());
+    EXPECT_FALSE(dataset.CreatedAt().empty());
+    EXPECT_FALSE(dataset.MetaType().empty());
+    EXPECT_FALSE(dataset.TimeStampedName().empty());
+    EXPECT_FALSE(dataset.UniqueId().empty());
+    EXPECT_FALSE(dataset.Version().empty());
+
+    EXPECT_EQ(0, dataset.TimeStampedName().find("pacbio_dataset_"));
+
+    EXPECT_TRUE(dataset.Format().empty());
+    EXPECT_TRUE(dataset.ModifiedAt().empty());
+    EXPECT_TRUE(dataset.Name().empty());
+    EXPECT_TRUE(dataset.ResourceId().empty());
+    EXPECT_TRUE(dataset.Tags().empty());
+    EXPECT_EQ(0, dataset.ExternalResources().Size());
+    EXPECT_EQ(0, dataset.Filters().Size());
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    EXPECT_EQ(string{"3.0.1"}, dataset.Version());
+}
+
+TEST(DataSetCoreTest, TimeStampedNamesOk)
+{
+    DataSet dataset;
+    AlignmentSet alignmentSet;
+    BarcodeSet barcodeSet;
+    ContigSet contigSet;
+    ConsensusAlignmentSet consensusAlignmentSet;
+    ConsensusReadSet consensusReadSet;
+    HdfSubreadSet hdfSubreadSet;
+    ReferenceSet referenceSet;
+    SubreadSet subreadSet;
+
+    EXPECT_EQ(0, dataset.TimeStampedName().find("pacbio_dataset_dataset-"));
+    EXPECT_EQ(0, alignmentSet.TimeStampedName().find("pacbio_dataset_alignmentset-"));
+    EXPECT_EQ(0, barcodeSet.TimeStampedName().find("pacbio_dataset_barcodeset-"));
+    EXPECT_EQ(0, contigSet.TimeStampedName().find("pacbio_dataset_contigset-"));
+    EXPECT_EQ(0, consensusAlignmentSet.TimeStampedName().find("pacbio_dataset_consensusalignmentset-"));
+    EXPECT_EQ(0, consensusReadSet.TimeStampedName().find("pacbio_dataset_consensusreadset-"));
+    EXPECT_EQ(0, hdfSubreadSet.TimeStampedName().find("pacbio_dataset_hdfsubreadset-"));
+    EXPECT_EQ(0, referenceSet.TimeStampedName().find("pacbio_dataset_referenceset-"));
+    EXPECT_EQ(0, subreadSet.TimeStampedName().find("pacbio_dataset_subreadset-"));
+}
+
+TEST(DataSetCoreTest, BasicGettersSettersOk)
+{
+    DataSet dataset;
+    dataset.CreatedAt("now");
+    dataset.Format("format");
+    dataset.MetaType("meta");
+    dataset.ModifiedAt("later");
+    dataset.Name("foo");
+    dataset.ResourceId("path/to/file");
+    dataset.Tags("tag tag");
+    dataset.TimeStampedName("now:30");
+    dataset.UniqueId("uuid");
+    dataset.Version("0.0.0");
+
+    EXPECT_EQ(string("now"),          dataset.CreatedAt());
+    EXPECT_EQ(string("format"),       dataset.Format());
+    EXPECT_EQ(string("meta"),         dataset.MetaType());
+    EXPECT_EQ(string("later"),        dataset.ModifiedAt());
+    EXPECT_EQ(string("foo"),          dataset.Name());
+    EXPECT_EQ(string("path/to/file"), dataset.ResourceId());
+    EXPECT_EQ(string("tag tag"),      dataset.Tags());
+    EXPECT_EQ(string("now:30"),       dataset.TimeStampedName());
+    EXPECT_EQ(string("uuid"),         dataset.UniqueId());
+    EXPECT_EQ(string("0.0.0"),        dataset.Version());
+}
+
+TEST(DataSetCoreTest, CopyOk)
+{
+    DataSet d1;
+    d1.Name("foo");
+
+    // copy ctor
+    DataSet d2(d1);
+    EXPECT_EQ(string("foo"), d2.Name());
+
+    // copy assignment
+    DataSet d3;
+    d3 = d1;
+    EXPECT_EQ(string("foo"), d3.Name());
+}
+
+TEST(DataSetCoreTest, MoveOk)
+{
+    DataSet d1;
+    d1.Name("foo");
+
+    // move ctor
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    DataSet d2(std::move(tests::CreateDataSet()));
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+    EXPECT_EQ(string("foo"), d2.Name());
+
+    // move assignment
+    DataSet d3;
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    d3 = std::move(tests::CreateDataSet());
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif 
+    EXPECT_EQ(string("foo"), d3.Name());
+}
+
+TEST(DataSetCoreTest, AddExternalResources)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.ExternalResources().Size());
+
+    ExternalResource resource1("metatype", "id");
+    resource1.Name("file1");
+
+    ExternalResource resource2("metatype", "id2");
+    resource2.Name("file2");
+
+    dataset.ExternalResources().Add(resource1);
+    dataset.ExternalResources().Add(resource2);
+    EXPECT_EQ(2, dataset.ExternalResources().Size());
+
+     // disallow duplicates (checking on ResourceId)
+    ExternalResource duplicateResource("metatype", "id");
+    dataset.ExternalResources().Add(duplicateResource);
+    EXPECT_EQ(2, dataset.ExternalResources().Size());
+
+    // direct access
+    const ExternalResources& resources = dataset.ExternalResources();
+    EXPECT_EQ(string("file1"), resources[0].Name());
+    EXPECT_EQ(string("file2"), resources[1].Name());
+
+    // iterable
+    size_t i = 0;
+    for (auto r : resources) {
+        if (i == 0)
+            EXPECT_EQ(string("file1"), r.Name());
+        else
+            EXPECT_EQ(string("file2"), r.Name());
+        ++i;
+    }
+}
+
+TEST(DataSetCoreTest, EditExternalResources)
+{
+    DataSet dataset;
+
+    ExternalResource resource("metatype", "id");
+    resource.Name("file1");
+    dataset.ExternalResources().Add(resource);
+
+    resource.Name("file2").ResourceId("id2");
+    dataset.ExternalResources().Add(resource);
+    EXPECT_EQ(2, dataset.ExternalResources().Size());
+
+    // edit
+    dataset.ExternalResources()[0].Name("some new name");
+    EXPECT_EQ(string("some new name"), dataset.ExternalResources()[0].Name());
+    EXPECT_EQ(string("file2"),         dataset.ExternalResources()[1].Name());
+}
+
+TEST(DataSetCoreTest, NestedExternalResources)
+{
+    ExternalResource resource("metatype", "filename");
+    resource.ExternalResources().Add(ExternalResource("metatype.child",  "filename.child"));
+    resource.ExternalResources().Add(ExternalResource("metatype.child2", "filename.child2"));
+
+    const ExternalResources& childResources = resource.ExternalResources();
+    EXPECT_EQ(2, childResources.Size());
+    EXPECT_EQ(string("metatype.child"),  childResources[0].MetaType());
+    EXPECT_EQ(string("metatype.child2"), childResources[1].MetaType());
+    EXPECT_EQ(string("filename.child"),  childResources[0].ResourceId());
+    EXPECT_EQ(string("filename.child2"), childResources[1].ResourceId());
+}
+
+TEST(DataSetCoreTest, AddFilters)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    Filter filter;
+    filter.Properties().Add(Property("rq", "0.85", ">"));
+    filter.Properties().Add(Property("RNAME", "chr1", "=="));
+    EXPECT_EQ(2, filter.Properties().Size());
+
+    Filter filter2;
+    filter2.Properties().Add(Property("rq", "0.50", ">="));
+    filter2.Properties().Add(Property("RNAME", "chr2", "!="));
+    EXPECT_EQ(2, filter2.Properties().Size());
+
+    dataset.Filters().Add(filter);
+    dataset.Filters().Add(filter2);
+
+    const Filters& filters = dataset.Filters();
+    EXPECT_EQ(2, filters.Size());
+    EXPECT_EQ(2, filters[0].Properties().Size());
+    EXPECT_EQ(2, filters[1].Properties().Size());
+
+    // direct access
+    const Property& p0 = filters[0].Properties()[0];
+    EXPECT_EQ(string("rq"),   p0.Name());
+    EXPECT_EQ(string("0.85"), p0.Value());
+    EXPECT_EQ(string(">"),    p0.Operator());
+
+    const Property& p1 = filters[0].Properties()[1];
+    EXPECT_EQ(string("RNAME"), p1.Name());
+    EXPECT_EQ(string("chr1"),  p1.Value());
+    EXPECT_EQ(string("=="),    p1.Operator());
+
+    const Property& p2 = filters[1].Properties()[0];
+    EXPECT_EQ(string("rq"),   p2.Name());
+    EXPECT_EQ(string("0.50"), p2.Value());
+    EXPECT_EQ(string(">="),   p2.Operator());
+
+    const Property& p3 = filters[1].Properties()[1];
+    EXPECT_EQ(string("RNAME"), p3.Name());
+    EXPECT_EQ(string("chr2"),  p3.Value());
+    EXPECT_EQ(string("!="),    p3.Operator());
+
+    // iteratable
+    size_t i = 0;
+    size_t j = 0;
+    for (const Filter& f : filters) {
+        if (i == 0) {
+            const Properties& properties = f.Properties();
+            for (const Property& p : properties) {
+                if (j == 0) {
+                    EXPECT_EQ(string("rq"),   p.Name());
+                    EXPECT_EQ(string("0.85"), p.Value());
+                    EXPECT_EQ(string(">"),    p.Operator());
+                } else {
+                    EXPECT_EQ(string("RNAME"), p.Name());
+                    EXPECT_EQ(string("chr1"),  p.Value());
+                    EXPECT_EQ(string("=="),    p.Operator());
+                }
+                ++j;
+            }
+        } else {
+            const Properties& properties = f.Properties();
+            for (const Property& p : properties) {
+                if (j == 0) {
+                    EXPECT_EQ(string("rq"),   p.Name());
+                    EXPECT_EQ(string("0.50"), p.Value());
+                    EXPECT_EQ(string(">="),   p.Operator());
+                } else {
+                    EXPECT_EQ(string("RNAME"), p.Name());
+                    EXPECT_EQ(string("chr2"),  p.Value());
+                    EXPECT_EQ(string("!="),    p.Operator());
+                }
+                ++j;
+            }
+        }
+        ++i;
+        j = 0;
+    }
+
+}
+
+TEST(DataSetCoreTest, EditFilters)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    Filter filter;
+    filter.Properties().Add(Property("rq", "0.85", ">"));
+    filter.Properties().Add(Property("RNAME", "chr1", "=="));
+    EXPECT_EQ(2, filter.Properties().Size());
+
+    Filter filter2;
+    filter2.Properties().Add(Property("rq", "0.50", ">="));
+    filter2.Properties().Add(Property("RNAME", "chr2", "!="));
+    EXPECT_EQ(2, filter2.Properties().Size());
+
+    dataset.Filters().Add(filter);
+    dataset.Filters().Add(filter2);
+    EXPECT_EQ(2, dataset.Filters().Size());
+    EXPECT_EQ(2, dataset.Filters()[0].Properties().Size());
+    EXPECT_EQ(2, dataset.Filters()[1].Properties().Size());
+
+    // edit property in-place
+    Property& p = dataset.Filters()[0].Properties()[0];
+    p.Name("someNewName");
+    p.Value("someNewValue");
+    p.Operator("==");
+
+    const Property& p0 = dataset.Filters()[0].Properties()[0];
+    EXPECT_EQ(string("someNewName"),  p0.Name());
+    EXPECT_EQ(string("someNewValue"), p0.Value());
+    EXPECT_EQ(string("=="),           p0.Operator());
+
+    const Property& p1 = dataset.Filters()[0].Properties()[1];
+    EXPECT_EQ(string("RNAME"), p1.Name());
+    EXPECT_EQ(string("chr1"),  p1.Value());
+    EXPECT_EQ(string("=="),    p1.Operator());
+
+    const Property& p2 = dataset.Filters()[1].Properties()[0];
+    EXPECT_EQ(string("rq"),   p2.Name());
+    EXPECT_EQ(string("0.50"), p2.Value());
+    EXPECT_EQ(string(">="),   p2.Operator());
+
+    const Property& p3 = dataset.Filters()[1].Properties()[1];
+    EXPECT_EQ(string("RNAME"), p3.Name());
+    EXPECT_EQ(string("chr2"),  p3.Value());
+    EXPECT_EQ(string("!="),    p3.Operator());
+}
+
+TEST(DataSetCoreTest, AddSubDataSets)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    DataSetBase sub1;
+    sub1.Name("subset_1");
+
+    DataSetBase sub2;
+    sub2.Name("subset_2");
+
+    dataset.SubDataSets().Add(sub1);
+    dataset.SubDataSets().Add(sub2);
+    EXPECT_EQ(2, dataset.SubDataSets().Size());
+
+    // direct access
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    EXPECT_EQ(string("subset_1"), subdatasets[0].Name());
+    EXPECT_EQ(string("subset_2"), subdatasets[1].Name());
+
+    // iterable
+    size_t i = 0;
+    for (const DataSetBase& ds : subdatasets) {
+        if (i == 0)
+            EXPECT_EQ(string("subset_1"), ds.Name());
+        else
+            EXPECT_EQ(string("subset_2"), ds.Name());
+        ++i;
+    }
+}
+
+TEST(DataSetCoreTest, EditSubDataSets)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    DataSetBase sub1;
+    sub1.Name("subset_1");
+
+    DataSetBase sub2;
+    sub2.Name("subset_2");
+
+    dataset.SubDataSets().Add(sub1);
+    dataset.SubDataSets().Add(sub2);
+    EXPECT_EQ(2, dataset.SubDataSets().Size());
+
+    // edit
+    dataset.SubDataSets()[0].Name("subset_1_edited");
+
+    // direct access
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    EXPECT_EQ(string("subset_1_edited"), subdatasets[0].Name());
+    EXPECT_EQ(string("subset_2"), subdatasets[1].Name());
+
+    // iterable
+    size_t i = 0;
+    for (const DataSetBase& ds : subdatasets) {
+        if (i == 0)
+            EXPECT_EQ(string("subset_1_edited"), ds.Name());
+        else
+            EXPECT_EQ(string("subset_2"), ds.Name());
+        ++i;
+    }
+}
+
+TEST(DataSetCoreTest, RemoveExternalResources)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.ExternalResources().Size());
+
+    ExternalResource resource1("metatype", "id");
+    resource1.Name("file1");
+
+    ExternalResource resource2("metatype", "id2");
+    resource2.Name("file2");
+
+    dataset.ExternalResources().Add(resource1);
+    dataset.ExternalResources().Add(resource2);
+    EXPECT_EQ(2, dataset.ExternalResources().Size());
+
+    // remove
+    dataset.ExternalResources().Remove(resource1);
+    EXPECT_EQ(1, dataset.ExternalResources().Size());
+
+    // direct access
+    const ExternalResources& resources = dataset.ExternalResources();
+    EXPECT_EQ(string("file2"), resources[0].Name());
+
+    // iterable
+    size_t i = 0;
+    for (auto r : resources) {
+        if (i == 0)
+            EXPECT_EQ(string("file2"), r.Name());
+        ++i;
+    }
+}
+
+TEST(DataSetCoreTest, RemoveFilters)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    Filter filter;
+    filter.Properties().Add(Property("rq", "0.85", ">"));
+    filter.Properties().Add(Property("RNAME", "chr1", "=="));
+    EXPECT_EQ(2, filter.Properties().Size());
+
+    Filter filter2;
+    filter2.Properties().Add(Property("rq", "0.50", ">="));
+    filter2.Properties().Add(Property("RNAME", "chr2", "!="));
+    EXPECT_EQ(2, filter2.Properties().Size());
+
+    dataset.Filters().Add(filter);
+    dataset.Filters().Add(filter2);
+    EXPECT_EQ(2, dataset.Filters().Size());
+
+    // remove
+    dataset.Filters().Remove(filter);
+    EXPECT_EQ(1, dataset.Filters().Size());
+
+    const Filters& filters = dataset.Filters();
+    EXPECT_EQ(2, filters[0].Properties().Size());
+}
+
+TEST(DataSetCoreTest, RemoveSubDataSets)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    DataSetBase sub1;
+    sub1.Name("subset_1");
+
+    DataSetBase sub2;
+    sub2.Name("subset_2");
+
+    dataset.SubDataSets().Add(sub1);
+    dataset.SubDataSets().Add(sub2);
+    EXPECT_EQ(2, dataset.SubDataSets().Size());
+
+    // remove
+    dataset.SubDataSets().Remove(sub2);
+    EXPECT_EQ(1, dataset.SubDataSets().Size());
+}
diff --git a/tests/src/test_DataSetIO.cpp b/tests/src/test_DataSetIO.cpp

new file mode 100644 (file)

index 0000000..8f2adb5
--- /dev/null
+++ b/tests/src/test_DataSetIO.cpp
@@ -0,0 +1,1516 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include "TestData.h"
+#include "../src/FileUtils.h"
+#include <gtest/gtest.h>
+#include <pbbam/DataSet.h>
+#include <pbbam/internal/DataSetElement.h>
+#include <fstream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include <unistd.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+const string alignedBamFn  = tests::Data_Dir + "/aligned.bam";
+const string bamGroupFofn  = tests::Generated_Dir + "/group.fofn";
+
+const string ali1XmlFn = tests::Data_Dir + "/dataset/ali1.xml";
+const string ali2XmlFn = tests::Data_Dir + "/dataset/ali2.xml";
+const string ali3XmlFn = tests::Data_Dir + "/dataset/ali3.xml";
+const string ali4XmlFn = tests::Data_Dir + "/dataset/ali4.xml";
+const string mappingStaggeredXmlFn = tests::Data_Dir + "/dataset/bam_mapping_staggered.xml";
+const string barcodeXmlFn = tests::Data_Dir + "/dataset/barcode.dataset.xml";
+const string ccsReadXmlFn = tests::Data_Dir + "/dataset/ccsread.dataset.xml";
+const string lambdaContigsXmlFn = tests::Data_Dir + "/dataset/lambda_contigs.xml";
+const string pbalchemyXmlFn   = tests::Data_Dir + "/dataset/pbalchemy10kbp.xml";
+const string referenceXmlFn   = tests::Data_Dir + "/dataset/reference.dataset.xml";
+const string subread1XmlFn    = tests::Data_Dir + "/dataset/subread_dataset1.xml";
+const string subread2XmlFn    = tests::Data_Dir + "/dataset/subread_dataset2.xml";
+const string subread3XmlFn    = tests::Data_Dir + "/dataset/subread_dataset3.xml";
+const string transformedXmlFn = tests::Data_Dir + "/dataset/transformed_rs_subread_dataset.xml";
+
+static void TestFromXmlString(void);
+static void TestAli1Xml(void);
+static void TestAli2Xml(void);
+static void TestAli3Xml(void);
+static void TestAli4Xml(void);
+static void TestMappingStaggeredXml(void);
+static void TestBarcodeXml(void);
+static void TestCcsReadXml(void);
+static void TestLambdaContigsXml(void);
+static void TestPbalchemyXml(void);
+static void TestReferenceXml(void);
+static void TestSubread1Xml(void);
+static void TestSubread2Xml(void);
+static void TestSubread3Xml(void);
+static void TestTransformedXml(void);
+
+static inline
+void changeCurrentDirectory(const std::string& dir)
+{ ASSERT_EQ(0, chdir(dir.c_str())); }
+
+TEST(DataSetIOTest, FromBamFilename)
+{
+    DataSet dataset(alignedBamFn);
+
+    EXPECT_EQ(1, dataset.ExternalResources().Size());
+    const ExternalResource& bamRef = dataset.ExternalResources()[0];
+
+    EXPECT_EQ(alignedBamFn, bamRef.ResourceId());
+}
+
+TEST(DataSetIOTest, FromBamFilenames)
+{
+    std::ifstream fofn(bamGroupFofn);
+    std::vector<std::string> files;
+    std::string file;
+    while (std::getline(fofn, file)) if (!file.empty()) files.emplace_back(file);
+    DataSet dataset(files);
+    EXPECT_EQ(3, dataset.ExternalResources().Size());
+}
+
+TEST(DataSetIOTest, FromBamFileObject)
+{
+    BamFile bamFile(alignedBamFn);
+    DataSet dataset(bamFile.Filename());
+
+    EXPECT_EQ(1, dataset.ExternalResources().Size());
+    const ExternalResource& bamRef = dataset.ExternalResources()[0];
+
+    EXPECT_EQ(alignedBamFn, bamRef.ResourceId());
+}
+
+TEST(DataSetIOTest, FromFofn)
+{
+    DataSet dataset(bamGroupFofn);
+    EXPECT_EQ(3, dataset.ExternalResources().Size());
+}
+
+TEST(DataSetIOTest, FromXml)
+{
+    EXPECT_NO_THROW(TestFromXmlString());
+}
+
+TEST(DataSetIOTest, FromXmlFile)
+{
+    EXPECT_NO_THROW(TestAli1Xml());
+    EXPECT_NO_THROW(TestAli2Xml());
+    EXPECT_NO_THROW(TestAli3Xml());
+    EXPECT_NO_THROW(TestAli4Xml());
+    EXPECT_NO_THROW(TestMappingStaggeredXml());
+    EXPECT_NO_THROW(TestBarcodeXml());
+    EXPECT_NO_THROW(TestCcsReadXml());
+    EXPECT_NO_THROW(TestLambdaContigsXml());
+    EXPECT_NO_THROW(TestPbalchemyXml());
+    EXPECT_NO_THROW(TestReferenceXml());
+    EXPECT_NO_THROW(TestSubread1Xml());
+    EXPECT_NO_THROW(TestSubread2Xml());
+    EXPECT_NO_THROW(TestSubread3Xml());
+    EXPECT_NO_THROW(TestTransformedXml());
+}
+
+TEST(DataSetIOTest, ThrowsOnNonexistentFofnFile)
+{
+    EXPECT_THROW(DataSet{"does/not/exist.fofn"}, std::runtime_error);
+}
+
+TEST(DataSetIOTest, ThrowsOnNonexistentXmlFile)
+{
+    EXPECT_THROW(DataSet{"does/not/exist.xml"}, std::runtime_error);
+}
+
+TEST(DataSetIOTest, ToXml)
+{
+    // top-level data
+    DataSet dataset(DataSet::ALIGNMENT);
+    dataset.CreatedAt("2015-01-27T09:00:01");
+    dataset.MetaType("PacBio.DataSet.AlignmentSet");
+    dataset.Name("DataSet_AlignmentSet");
+    dataset.Tags("barcode moreTags mapping mytags");
+    dataset.TimeStampedName("my_tsn");
+    dataset.UniqueId("b095d0a3-94b8-4918-b3af-a3f81bbe519c");
+    dataset.Attribute("xmlns",              "http://pacificbiosciences.com/PacBioDatasets.xsd")
+           .Attribute("xmlns:xsi",          "http://www.w3.org/2001/XMLSchema-instance")
+           .Attribute("xsi:schemaLocation", "http://pacificbiosciences.com/PacBioDatasets.xsd");
+
+    // external resources
+    ExternalResource resource1("AlignmentFile.AlignmentBamFile", "file:/mnt/path/to/alignments2.bam");    
+    resource1.Name("Third Alignments BAM");
+    resource1.Description("Points to an example Alignments BAM file.");
+    resource1.Tags("Example");
+    resource1.TimeStampedName("my_tsn");
+    resource1.UniqueId("my_uuid");
+    FileIndex pbi1("PacBio.Index.PacBioIndex", "file:/mnt/path/to/alignments2.pbi");
+    pbi1.TimeStampedName("my_tsn");
+    pbi1.UniqueId("my_uuid");
+    resource1.FileIndices().Add(pbi1);
+    dataset.ExternalResources().Add(resource1);
+
+    ExternalResource resource2("AlignmentFile.AlignmentBamFile", "file:./alignments3.bam");
+    resource2.Name("Fourth Alignments BAM");
+    resource2.Description("Points to another example Alignments BAM file, by relative path.");
+    resource2.Tags("Example");
+    resource2.TimeStampedName("my_tsn");
+    resource2.UniqueId("my_uuid");
+    FileIndex pbi2("PacBio.Index.PacBioIndex", "file:/mnt/path/to/alignments3.pbi");
+    pbi2.TimeStampedName("my_tsn");
+    pbi2.UniqueId("my_uuid");
+
+    resource2.FileIndices().Add(pbi2);
+    dataset.ExternalResources().Add(resource2);
+
+    // sub-datasets with filters
+    DataSetBase subDataSet1;
+    subDataSet1.Name("HighQuality Read Alignments");
+    subDataSet1.TimeStampedName("my_tsn");
+    subDataSet1.UniqueId("ab95d0a3-94b8-4918-b3af-a3f81bbe519c");
+    Filter filter1;
+    filter1.Properties().Add(Property("rq", "0.85", ">"));
+    subDataSet1.Filters().Add(filter1);
+    dataset.SubDataSets().Add(subDataSet1);
+
+    DataSetBase subDataSet2;
+    subDataSet2.Name("Alignments to chromosome 1");
+    subDataSet2.TimeStampedName("my_tsn");
+    subDataSet2.UniqueId("ac95d0a3-94b8-4918-b3af-a3f81bbe519c");
+    Filter filter2;
+    filter2.Properties().Add(Property("RNAME", "chr1", "=="));
+    subDataSet2.Filters().Add(filter2);
+    dataset.SubDataSets().Add(subDataSet2);
+
+    // write dataset
+    const string expectedXml =
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<pbds:AlignmentSet "
+                "CreatedAt=\"2015-01-27T09:00:01\" "
+                "MetaType=\"PacBio.DataSet.AlignmentSet\" "
+                "Name=\"DataSet_AlignmentSet\" "
+                "Tags=\"barcode moreTags mapping mytags\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"b095d0a3-94b8-4918-b3af-a3f81bbe519c\" Version=\"3.0.1\" "
+                "xmlns=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+                "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:pbbase=\"http://pacificbiosciences.com/PacBioBaseDataModel.xsd\" "
+                "xmlns:pbds=\"http://pacificbiosciences.com/PacBioDatasets.xsd\">\n"
+        "\t<pbbase:ExternalResources>\n"
+        "\t\t<pbbase:ExternalResource "
+                "Description=\"Points to an example Alignments BAM file.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Third Alignments BAM\" "
+                "ResourceId=\"file:/mnt/path/to/alignments2.bam\" "
+                "Tags=\"Example\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"my_uuid\" Version=\"3.0.1\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                "MetaType=\"PacBio.Index.PacBioIndex\" "
+                "ResourceId=\"file:/mnt/path/to/alignments2.pbi\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"my_uuid\" Version=\"3.0.1\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t\t<pbbase:ExternalResource "
+                "Description=\"Points to another example Alignments BAM file, by relative path.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Fourth Alignments BAM\" "
+                "ResourceId=\"file:./alignments3.bam\" "
+                "Tags=\"Example\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"my_uuid\" Version=\"3.0.1\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                "MetaType=\"PacBio.Index.PacBioIndex\" "
+                "ResourceId=\"file:/mnt/path/to/alignments3.pbi\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"my_uuid\" Version=\"3.0.1\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t</pbbase:ExternalResources>\n"
+        "\t<pbds:DataSets>\n"
+        "\t\t<pbds:DataSet "
+                "MetaType=\"PacBio.DataSet.DataSet\" "
+                "Name=\"HighQuality Read Alignments\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"ab95d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+                "Version=\"3.0.1\">\n"
+        "\t\t\t<pbds:Filters>\n"
+        "\t\t\t\t<pbds:Filter>\n"
+        "\t\t\t\t\t<pbbase:Properties>\n"
+        "\t\t\t\t\t\t<pbbase:Property Name=\"rq\" Operator=\">\" Value=\"0.85\" />\n"
+        "\t\t\t\t\t</pbbase:Properties>\n"
+        "\t\t\t\t</pbds:Filter>\n"
+        "\t\t\t</pbds:Filters>\n"
+        "\t\t</pbds:DataSet>\n"
+        "\t\t<pbds:DataSet "
+                "MetaType=\"PacBio.DataSet.DataSet\" "
+                "Name=\"Alignments to chromosome 1\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"ac95d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+                "Version=\"3.0.1\">\n"
+        "\t\t\t<pbds:Filters>\n"
+        "\t\t\t\t<pbds:Filter>\n"
+        "\t\t\t\t\t<pbbase:Properties>\n"
+        "\t\t\t\t\t\t<pbbase:Property Name=\"RNAME\" Operator=\"==\" Value=\"chr1\" />\n"
+        "\t\t\t\t\t</pbbase:Properties>\n"
+        "\t\t\t\t</pbds:Filter>\n"
+        "\t\t\t</pbds:Filters>\n"
+        "\t\t</pbds:DataSet>\n"
+        "\t</pbds:DataSets>\n"
+        "</pbds:AlignmentSet>\n";
+
+    stringstream s;
+    dataset.SaveToStream(s);
+    EXPECT_EQ(expectedXml, s.str());
+}
+
+static void TestFromXmlString(void)
+{
+    const string inputXml =
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<pbds:AlignmentSet "
+            "CreatedAt=\"2015-01-27T09:00:01\" "
+            "MetaType=\"PacBio.DataSet.AlignmentSet\" "
+            "Name=\"DataSet_AlignmentSet\" "
+            "Tags=\"barcode moreTags mapping mytags\" "
+            "UniqueId=\"b095d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+            "Version=\"2.3.0\" "
+            "xmlns=\"http://pacificbiosciences.com/PacBioDataModel.xsd\" "
+            "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+            "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDataModel.xsd\">\n"
+        "\t<pbbase:ExternalResources>\n"
+        "\t\t<pbbase:ExternalResource "
+                "Description=\"Points to an example Alignments BAM file.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Third Alignments BAM\" "
+                "ResourceId=\"file:/mnt/path/to/alignments2.bam\" "
+                "Tags=\"Example\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                    "MetaType=\"PacBio.Index.PacBioIndex\" "
+                    "ResourceId=\"file:/mnt/path/to/alignments2.pbi\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t\t<pbbase:ExternalResource "
+                "Description=\"Points to another example Alignments BAM file, by relative path.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Fourth Alignments BAM\" "
+                "ResourceId=\"file:./alignments3.bam\" "
+                "Tags=\"Example\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                    "MetaType=\"PacBio.Index.PacBioIndex\" "
+                    "ResourceId=\"file:/mnt/path/to/alignments3.pbi\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t</pbbase:ExternalResources>\n"
+        "\t<pbds:DataSets>\n"
+        "\t\t<pbds:DataSet "
+                "Name=\"HighQuality Read Alignments\" "
+                "UniqueId=\"ab95d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+                "Version=\"2.3.0\">\n"
+        "\t\t\t<pbds:Filters>\n"
+        "\t\t\t\t<pbds:Filter>\n"
+        "\t\t\t\t\t<pbbase:Properties>\n"
+        "\t\t\t\t\t\t<pbbase:Property Name=\"rq\" Operator=\">\" Value=\"0.85\" />\n"
+        "\t\t\t\t\t</pbbase:Properties>\n"
+        "\t\t\t\t</pbds:Filter>\n"
+        "\t\t\t</pbds:Filters>\n"
+        "\t\t</pbds:DataSet>\n"
+        "\t\t<pbds:DataSet "
+                "Name=\"Alignments to chromosome 1\" "
+                "UniqueId=\"ac95d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+                "Version=\"2.3.0\">\n"
+        "\t\t\t<pbds:Filters>\n"
+        "\t\t\t\t<pbds:Filter>\n"
+        "\t\t\t\t\t<pbbase:Properties>\n"
+        "\t\t\t\t\t\t<pbbase:Property Name=\"RNAME\" Operator=\"==\" Value=\"chr1\" />\n"
+        "\t\t\t\t\t</pbbase:Properties>\n"
+        "\t\t\t\t</pbds:Filter>\n"
+        "\t\t\t</pbds:Filters>\n"
+        "\t\t</pbds:DataSet>\n"
+        "\t</pbds:DataSets>\n"
+        "</pbds:AlignmentSet>\n";
+
+    const DataSet dataset = DataSet::FromXml(inputXml);
+
+    EXPECT_EQ(DataSet::ALIGNMENT,                     dataset.Type());
+    EXPECT_EQ("2015-01-27T09:00:01",                  dataset.CreatedAt());
+    EXPECT_EQ("PacBio.DataSet.AlignmentSet",          dataset.MetaType());
+    EXPECT_EQ("DataSet_AlignmentSet",                 dataset.Name());
+    EXPECT_EQ("barcode moreTags mapping mytags",      dataset.Tags());
+    EXPECT_EQ("b095d0a3-94b8-4918-b3af-a3f81bbe519c", dataset.UniqueId());
+    EXPECT_EQ("2.3.0",                                dataset.Version());
+    EXPECT_EQ("http://pacificbiosciences.com/PacBioDataModel.xsd", dataset.Attribute("xmlns"));
+    EXPECT_EQ("http://www.w3.org/2001/XMLSchema-instance",         dataset.Attribute("xmlns:xsi"));
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    EXPECT_EQ(2, resources.Size());
+
+    const ExternalResource& resource1 = resources[0];
+    EXPECT_EQ("Third Alignments BAM",                      resource1.Name());
+    EXPECT_EQ("Points to an example Alignments BAM file.", resource1.Description());
+    EXPECT_EQ("AlignmentFile.AlignmentBamFile",            resource1.MetaType());
+    EXPECT_EQ("file:/mnt/path/to/alignments2.bam",         resource1.ResourceId());
+    EXPECT_EQ("Example",                                   resource1.Tags());
+    const FileIndices& fileIndices1 = resource1.FileIndices();
+    EXPECT_EQ(1, fileIndices1.Size());
+    const FileIndex& pbi1 = fileIndices1[0];
+    EXPECT_EQ("PacBio.Index.PacBioIndex",          pbi1.MetaType());
+    EXPECT_EQ("file:/mnt/path/to/alignments2.pbi", pbi1.ResourceId());
+
+    const ExternalResource& resource2 = resources[1];
+    EXPECT_EQ("Fourth Alignments BAM",                     resource2.Name());
+    EXPECT_EQ("Points to another example Alignments BAM file, by relative path.", resource2.Description());
+    EXPECT_EQ("AlignmentFile.AlignmentBamFile",            resource2.MetaType());
+    EXPECT_EQ("file:./alignments3.bam",                    resource2.ResourceId());
+    EXPECT_EQ("Example",                                   resource2.Tags());
+    const FileIndices& fileIndices2 = resource2.FileIndices();
+    EXPECT_EQ(1, fileIndices2.Size());
+    const FileIndex& pbi2 = fileIndices2[0];
+    EXPECT_EQ("PacBio.Index.PacBioIndex",          pbi2.MetaType());
+    EXPECT_EQ("file:/mnt/path/to/alignments3.pbi", pbi2.ResourceId());
+
+    const SubDataSets& subDatasets = dataset.SubDataSets();
+    EXPECT_EQ(2, subDatasets.Size());
+
+    const DataSetBase& sub1 = subDatasets[0];
+    EXPECT_EQ("HighQuality Read Alignments",          sub1.Name());
+    EXPECT_EQ("ab95d0a3-94b8-4918-b3af-a3f81bbe519c", sub1.UniqueId());
+    EXPECT_EQ("2.3.0",                                sub1.Version());
+    const Filters& sub1Filters = sub1.Filters();
+    EXPECT_EQ(1, sub1Filters.Size());
+    const Filter& sub1Filter = sub1Filters[0];
+    EXPECT_EQ(1, sub1Filter.Properties().Size());
+    const Property& property1 = sub1Filter.Properties()[0];
+    EXPECT_EQ("rq",   property1.Name());
+    EXPECT_EQ(">",    property1.Operator());
+    EXPECT_EQ("0.85", property1.Value());
+
+    const DataSetBase& sub2 = subDatasets[1];
+    EXPECT_EQ("Alignments to chromosome 1",          sub2.Name());
+    EXPECT_EQ("ac95d0a3-94b8-4918-b3af-a3f81bbe519c", sub2.UniqueId());
+    EXPECT_EQ("2.3.0",                                sub2.Version());
+    const Filters& sub2Filters = sub2.Filters();
+    EXPECT_EQ(1, sub2Filters.Size());
+    const Filter& sub2Filter = sub2Filters[0];
+    EXPECT_EQ(1, sub2Filter.Properties().Size());
+    const Property& property2 = sub2Filter.Properties()[0];
+    EXPECT_EQ("RNAME",   property2.Name());
+    EXPECT_EQ("==",    property2.Operator());
+    EXPECT_EQ("chr1", property2.Value());
+}
+
+static void TestAli1Xml(void)
+{
+    const DataSet dataset(ali1XmlFn);
+    EXPECT_EQ(DataSet::ALIGNMENT, dataset.Type());
+    EXPECT_EQ(string("2015-01-27T09:00:01"),                  dataset.CreatedAt());
+    EXPECT_EQ(string("PacBio.DataSet.AlignmentSet"),          dataset.MetaType());
+    EXPECT_EQ(string("DataSet_AlignmentSet"),                 dataset.Name());
+    EXPECT_EQ(string("barcode moreTags mapping mytags"),      dataset.Tags());
+    EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(string("2.3.0"),                                dataset.Version());
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(string("First Alignments BAM"), resource.Name());
+            EXPECT_EQ(string("Points to an example Alignments BAM file."), resource.Description());
+            EXPECT_EQ(string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(string("file:///mnt/path/to/alignments0.bam"), resource.ResourceId());
+            EXPECT_EQ(string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:///mnt/path/to/alignments0.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(string("Second Alignments BAM"), resource.Name());
+            EXPECT_EQ(string("Points to another example Alignments BAM file, by relative path."), resource.Description());
+            EXPECT_EQ(string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(string("file:./alignments1.bam"),  resource.ResourceId());
+            EXPECT_EQ(string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:///mnt/path/to/alignments1.pbi"), index.ResourceId());
+        }
+    }
+
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    ASSERT_EQ(2, subdatasets.Size());
+    for (size_t i = 0; i < subdatasets.Size(); ++i) {
+        const DataSetBase& subdataset = subdatasets[i];
+        if (i == 0) {
+            EXPECT_EQ(string(""), subdataset.CreatedAt());
+            EXPECT_EQ(string(""), subdataset.MetaType());
+            EXPECT_EQ(string("HighQuality Read Alignments"), subdataset.Name());
+            EXPECT_EQ(string(""), subdataset.Tags());
+            EXPECT_EQ(string("ab95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(string("rq"), property.Name());
+            EXPECT_EQ(string("0.85"), property.Value());
+            EXPECT_EQ(string(">"), property.Operator());
+        }
+        else {
+            EXPECT_EQ(string(""), subdataset.CreatedAt());
+            EXPECT_EQ(string(""), subdataset.MetaType());
+            EXPECT_EQ(string("Alignments to chromosome 1"), subdataset.Name());
+            EXPECT_EQ(string(""), subdataset.Tags());
+            EXPECT_EQ(string("ac95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(string("RNAME"), property.Name());
+            EXPECT_EQ(string("chr1"), property.Value());
+            EXPECT_EQ(string("=="), property.Operator());
+        }
+    }
+}
+
+static void TestAli2Xml(void)
+{
+    const DataSet dataset(ali2XmlFn);
+    EXPECT_EQ(DataSet::ALIGNMENT, dataset.Type());
+    EXPECT_EQ(string("2015-01-27T09:00:01"),                  dataset.CreatedAt());
+    EXPECT_EQ(string("PacBio.DataSet.AlignmentSet"),          dataset.MetaType());
+    EXPECT_EQ(string("DataSet_AlignmentSet"),                 dataset.Name());
+    EXPECT_EQ(string("barcode moreTags mapping mytags"),      dataset.Tags());
+    EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(string("2.3.0"),                                dataset.Version());
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(string("First Alignments BAM"), resource.Name());
+            EXPECT_EQ(string("Points to an example Alignments BAM file."), resource.Description());
+            EXPECT_EQ(string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(string("file:///mnt/path/to/alignments2.bam"), resource.ResourceId());
+            EXPECT_EQ(string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:///mnt/path/to/alignments2.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(string("Second Alignments BAM"), resource.Name());
+            EXPECT_EQ(string("Points to another example Alignments BAM file, by relative path."), resource.Description());
+            EXPECT_EQ(string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(string("file:./alignments3.bam"),  resource.ResourceId());
+            EXPECT_EQ(string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:///mnt/path/to/alignments3.pbi"), index.ResourceId());
+        }
+    }
+
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    ASSERT_EQ(2, subdatasets.Size());
+    for (size_t i = 0; i < subdatasets.Size(); ++i) {
+        const DataSetBase& subdataset = subdatasets[i];
+        if (i == 0) {
+            EXPECT_EQ(string(""), subdataset.CreatedAt());
+            EXPECT_EQ(string(""), subdataset.MetaType());
+            EXPECT_EQ(string("HighQuality Read Alignments"), subdataset.Name());
+            EXPECT_EQ(string(""), subdataset.Tags());
+            EXPECT_EQ(string("ab95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(string("rq"), property.Name());
+            EXPECT_EQ(string("0.85"), property.Value());
+            EXPECT_EQ(string(">"), property.Operator());
+        }
+        else {
+            EXPECT_EQ(string(""), subdataset.CreatedAt());
+            EXPECT_EQ(string(""), subdataset.MetaType());
+            EXPECT_EQ(string("Alignments to chromosome 1"), subdataset.Name());
+            EXPECT_EQ(string(""), subdataset.Tags());
+            EXPECT_EQ(string("ac95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(string("RNAME"), property.Name());
+            EXPECT_EQ(string("chr1"), property.Value());
+            EXPECT_EQ(string("=="), property.Operator());
+        }
+    }
+}
+
+static void TestAli3Xml(void)
+{
+    const DataSet dataset(ali3XmlFn);
+    EXPECT_EQ(DataSet::ALIGNMENT, dataset.Type());
+    EXPECT_EQ(string("2015-01-27T09:00:01"),                  dataset.CreatedAt());
+    EXPECT_EQ(string("PacBio.DataSet.AlignmentSet"),          dataset.MetaType());
+    EXPECT_EQ(string("DataSet_AlignmentSet"),                 dataset.Name());
+    EXPECT_EQ(string("barcode moreTags mapping mytags"),      dataset.Tags());
+    EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(string("2.3.0"),                                dataset.Version());
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(string("First Alignments BAM"), resource.Name());
+            EXPECT_EQ(string("Points to an example Alignments BAM file."), resource.Description());
+            EXPECT_EQ(string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(string("file:///mnt/path/to/alignments2.bam"), resource.ResourceId());
+            EXPECT_EQ(string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:///mnt/path/to/alignments2.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(string("Second Alignments BAM"), resource.Name());
+            EXPECT_EQ(string("Points to another example Alignments BAM file, by relative path."), resource.Description());
+            EXPECT_EQ(string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(string("file:./alignments3.bam"),  resource.ResourceId());
+            EXPECT_EQ(string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:///mnt/path/to/alignments3.pbi"), index.ResourceId());
+        }
+    }
+
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    ASSERT_EQ(2, subdatasets.Size());
+    for (size_t i = 0; i < subdatasets.Size(); ++i) {
+        const DataSetBase& subdataset = subdatasets[i];
+        if (i == 0) {
+            EXPECT_EQ(string(""), subdataset.CreatedAt());
+            EXPECT_EQ(string(""), subdataset.MetaType());
+            EXPECT_EQ(string("HighQuality Read Alignments"), subdataset.Name());
+            EXPECT_EQ(string(""), subdataset.Tags());
+            EXPECT_EQ(string("ab95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(string("rq"), property.Name());
+            EXPECT_EQ(string("0.75"), property.Value());
+            EXPECT_EQ(string(">"), property.Operator());
+        }
+        else {
+            EXPECT_EQ(string(""), subdataset.CreatedAt());
+            EXPECT_EQ(string(""), subdataset.MetaType());
+            EXPECT_EQ(string("Alignments to chromosome 1"), subdataset.Name());
+            EXPECT_EQ(string(""), subdataset.Tags());
+            EXPECT_EQ(string("ac95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(string("RNAME"), property.Name());
+            EXPECT_EQ(string("chr1"), property.Value());
+            EXPECT_EQ(string("=="), property.Operator());
+        }
+    }
+}
+
+static void TestAli4Xml(void)
+{
+    const DataSet dataset(ali4XmlFn);
+    EXPECT_EQ(DataSet::ALIGNMENT, dataset.Type());
+    EXPECT_EQ(string("2015-01-27T09:00:01"),                  dataset.CreatedAt());
+    EXPECT_EQ(string("PacBio.DataSet.AlignmentSet"),          dataset.MetaType());
+    EXPECT_EQ(string("DataSet_AlignmentSet"),                 dataset.Name());
+    EXPECT_EQ(string("barcode moreTags mapping mytags"),      dataset.Tags());
+    EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(string("2.3.0"),                                dataset.Version());
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(string("First Alignments BAM"), resource.Name());
+            EXPECT_EQ(string("Points to an example Alignments BAM file."), resource.Description());
+            EXPECT_EQ(string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(string("file:///mnt/path/to/alignments0.bam"), resource.ResourceId());
+            EXPECT_EQ(string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:///mnt/path/to/alignments0.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(string("Second Alignments BAM"), resource.Name());
+            EXPECT_EQ(string("Points to another example Alignments BAM file, by relative path."), resource.Description());
+            EXPECT_EQ(string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(string("file:./alignments1.bam"),  resource.ResourceId());
+            EXPECT_EQ(string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:///mnt/path/to/alignments1.pbi"), index.ResourceId());
+        }
+    }
+
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    ASSERT_EQ(2, subdatasets.Size());
+    for (size_t i = 0; i < subdatasets.Size(); ++i) {
+        const DataSetBase& subdataset = subdatasets[i];
+        if (i == 0) {
+            EXPECT_EQ(string(""), subdataset.CreatedAt());
+            EXPECT_EQ(string(""), subdataset.MetaType());
+            EXPECT_EQ(string("HighQuality Read Alignments"), subdataset.Name());
+            EXPECT_EQ(string(""), subdataset.Tags());
+            EXPECT_EQ(string("ab95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(string("rq"), property.Name());
+            EXPECT_EQ(string("0.85"), property.Value());
+            EXPECT_EQ(string(">"), property.Operator());
+        }
+        else {
+            EXPECT_EQ(string(""), subdataset.CreatedAt());
+            EXPECT_EQ(string(""), subdataset.MetaType());
+            EXPECT_EQ(string("Alignments to chromosome 1"), subdataset.Name());
+            EXPECT_EQ(string(""), subdataset.Tags());
+            EXPECT_EQ(string("ac95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(string("RNAME"), property.Name());
+            EXPECT_EQ(string("chr1"), property.Value());
+            EXPECT_EQ(string("=="), property.Operator());
+        }
+    }
+}
+
+static void TestMappingStaggeredXml(void)
+{
+    const DataSet dataset(mappingStaggeredXmlFn);
+    EXPECT_EQ(DataSet::GENERIC, dataset.Type());
+    EXPECT_EQ(string("2015-05-13T10:58:26"),    dataset.CreatedAt());
+    EXPECT_EQ(string("PacBio.DataSet.DataSet"), dataset.MetaType());
+    EXPECT_EQ(string(""), dataset.Name());
+    EXPECT_EQ(string(""), dataset.Tags());
+    EXPECT_EQ(string("30f72098-bc5b-e06b-566c-8b28dda909a8"), dataset.UniqueId());
+    EXPECT_EQ(string("2.3.0"), dataset.Version());
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(string(""), resource.Name());
+            EXPECT_EQ(string(""), resource.Description());
+            EXPECT_EQ(string(""), resource.MetaType());
+            EXPECT_EQ(string("file:tests/data/bam_mapping_1.bam"), resource.ResourceId());
+            EXPECT_EQ(string(""), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:tests/data/bam_mapping_1.bam.bai"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(string(""), resource.Name());
+            EXPECT_EQ(string(""), resource.Description());
+            EXPECT_EQ(string(""), resource.MetaType());
+            EXPECT_EQ(string("file:tests/data/bam_mapping_2.bam"), resource.ResourceId());
+            EXPECT_EQ(string(""), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:tests/data/bam_mapping_2.bam.bai"), index.ResourceId());
+        }
+    }
+
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    ASSERT_EQ(2, subdatasets.Size());
+    for (size_t i = 0; i < subdatasets.Size(); ++i) {
+        const DataSetBase& subdataset = subdatasets[i];
+        if (i == 0) {
+            EXPECT_EQ(string("2015-05-13T10:58:26"),    subdataset.CreatedAt());
+            EXPECT_EQ(string(""), subdataset.MetaType());
+            EXPECT_EQ(string(""), subdataset.Name());
+            EXPECT_EQ(string(""), subdataset.Tags());
+            EXPECT_EQ(string("c5402d06-4643-057c-e300-fe229b4e8909"), subdataset.UniqueId());
+            EXPECT_EQ(string("2.3.0"), subdataset.Version());
+
+            const ExternalResources& resources = subdataset.ExternalResources();
+            ASSERT_EQ(1, resources.Size());
+            const ExternalResource& resource = resources[0];
+            EXPECT_EQ(string("file:tests/data/bam_mapping_2.bam"), resource.ResourceId());
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:tests/data/bam_mapping_2.bam.bai"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(string("2015-05-13T10:58:26"),    subdataset.CreatedAt());
+            EXPECT_EQ(string(""), subdataset.MetaType());
+            EXPECT_EQ(string(""), subdataset.Name());
+            EXPECT_EQ(string(""), subdataset.Tags());
+            EXPECT_EQ(string("f8b54a55-5fb7-706f-ab35-39afc9c86924"), subdataset.UniqueId());
+            EXPECT_EQ(string("2.3.0"), subdataset.Version());
+
+            const ExternalResources& resources = subdataset.ExternalResources();
+            ASSERT_EQ(1, resources.Size());
+            const ExternalResource& resource = resources[0];
+            EXPECT_EQ(string("file:tests/data/bam_mapping_1.bam"), resource.ResourceId());
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:tests/data/bam_mapping_1.bam.bai"), index.ResourceId());
+        }
+    }
+}
+
+static void TestBarcodeXml(void)
+{
+    const DataSet dataset(barcodeXmlFn);
+    EXPECT_EQ(DataSet::BARCODE, dataset.Type());
+    EXPECT_EQ(string("2015-01-27T09:00:01"),    dataset.CreatedAt());
+    EXPECT_EQ(string("PacBio.DataSet.BarcodeSet"), dataset.MetaType());
+    EXPECT_EQ(string("DataSet_BarcodeSet"), dataset.Name());
+    EXPECT_EQ(string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(string("2.3.0"), dataset.Version());
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(1, resources.Size());
+    const ExternalResource& resource = resources[0];
+    EXPECT_EQ(string("First Barcodes FASTA"), resource.Name());
+    EXPECT_EQ(string("Points to an example Barcodes FASTA file."), resource.Description());
+    EXPECT_EQ(string("BarcodeFile.BarcodeFastaFile"), resource.MetaType());
+    EXPECT_EQ(string("file:///mnt/path/to/barcode.fasta"), resource.ResourceId());
+    EXPECT_EQ(string("Example"), resource.Tags());
+
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(string("30"),     metadata.NumRecords());
+    EXPECT_EQ(string("400"),    metadata.TotalLength());
+
+    // access metadata extensions directly for now
+    EXPECT_EQ(string("paired"), metadata.ChildText("BarcodeConstruction"));
+}
+
+static void TestCcsReadXml(void)
+{
+    const DataSet dataset(ccsReadXmlFn);
+    EXPECT_EQ(DataSet::CONSENSUS_READ, dataset.Type());
+    EXPECT_EQ(string("2015-01-27T09:00:01"),    dataset.CreatedAt());
+    EXPECT_EQ(string("PacBio.DataSet.ConsensusReadSet"), dataset.MetaType());
+    EXPECT_EQ(string("DataSet_ConsensusReadSet"), dataset.Name());
+    EXPECT_EQ(string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(string("2.3.0"), dataset.Version());
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(string("First ConsensusRead BAM"), resource.Name());
+            EXPECT_EQ(string("Points to an example ConsensusRead BAM file."), resource.Description());
+            EXPECT_EQ(string("PacBio.ConsensusReadFile.ConsensusReadBamFile"), resource.MetaType());
+            EXPECT_EQ(string("file:///mnt/path/to/ccsreads0.bam"), resource.ResourceId());
+            EXPECT_EQ(string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("PacBio.Index.PacBioIndex"), index.MetaType());
+            EXPECT_EQ(string("file:///mnt/path/to/ccsreads0.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(string("Second ConsensusRead BAM"), resource.Name());
+            EXPECT_EQ(string("Points to another example ConsensusRead BAM file."), resource.Description());
+            EXPECT_EQ(string("PacBio.ConsensusReadFile.ConsensusReadBamFile"), resource.MetaType());
+            EXPECT_EQ(string("file:///mnt/path/to/ccsreads1.bam"), resource.ResourceId());
+            EXPECT_EQ(string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("PacBio.Index.PacBioIndex"), index.MetaType());
+            EXPECT_EQ(string("file:///mnt/path/to/ccsreads0.pbi"), index.ResourceId());
+        }
+    }
+}
+
+static void TestLambdaContigsXml(void)
+{
+    const DataSet dataset(lambdaContigsXmlFn);
+    EXPECT_EQ(DataSet::REFERENCE, dataset.Type());
+    EXPECT_EQ(string("2015-05-28T10:56:36"),    dataset.CreatedAt());
+    EXPECT_EQ(string("PacBio.DataSet.ReferenceSet"), dataset.MetaType());
+    EXPECT_EQ(string(""), dataset.Name());
+    EXPECT_EQ(string(""), dataset.Tags());
+    EXPECT_EQ(string("596e87db-34f9-d2fd-c905-b017543170e1"), dataset.UniqueId());
+    EXPECT_EQ(string("2.3.0"), dataset.Version());
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(1, resources.Size());
+    const ExternalResource& resource = resources[0];
+    EXPECT_EQ(string("file:tests/data/lambda_contigs.fasta"), resource.ResourceId());
+}
+
+static void TestPbalchemyXml(void)
+{
+    const DataSet dataset(pbalchemyXmlFn);
+    EXPECT_EQ(DataSet::GENERIC, dataset.Type());
+    EXPECT_EQ(string("2015-05-22T16:56:16"),    dataset.CreatedAt());
+    EXPECT_EQ(string("PacBio.DataSet.DataSet"), dataset.MetaType());
+    EXPECT_EQ(string(""), dataset.Name());
+    EXPECT_EQ(string(""), dataset.Tags());
+    EXPECT_EQ(string("58e3f7c5-24c1-b58b-fbd5-37de268cc2f0"), dataset.UniqueId());
+    EXPECT_EQ(string("2.3.0"), dataset.Version());
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(1, resources.Size());
+    const ExternalResource& resource = resources[0];
+    EXPECT_EQ(string("file:tests/data/pbalchemy10kbp.pbalign.sorted.pbver1.bam"), resource.ResourceId());
+    const FileIndices& fileIndices = resource.FileIndices();
+    ASSERT_EQ(1, fileIndices.Size());
+    const FileIndex& index = fileIndices[0];
+    EXPECT_EQ(string("file:tests/data/pbalchemy10kbp.pbalign.sorted.pbver1.bam.bai"), index.ResourceId());
+
+    // TYPOs: Should be Filter Properties/Property not Parameter(s)
+
+}
+
+static void TestReferenceXml(void)
+{
+    const DataSet dataset(referenceXmlFn);
+    EXPECT_EQ(DataSet::REFERENCE, dataset.Type());
+    EXPECT_EQ(string("2015-01-27T09:00:01"),    dataset.CreatedAt());
+    EXPECT_EQ(string("PacBio.DataSet.ReferenceSet"), dataset.MetaType());
+    EXPECT_EQ(string("DataSet_ReferenceSet"), dataset.Name());
+    EXPECT_EQ(string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(string("2.3.0"), dataset.Version());
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(1, resources.Size());
+    const ExternalResource& resource = resources[0];
+    EXPECT_EQ(string("First References FASTA"), resource.Name());
+    EXPECT_EQ(string("Points to an example references FASTA file."), resource.Description());
+    EXPECT_EQ(string("PacBio.ReferenceFile.ReferenceFastaFile"), resource.MetaType());
+    EXPECT_EQ(string("file:///mnt/path/to/reference.fasta"), resource.ResourceId());
+    EXPECT_EQ(string("Example"), resource.Tags());
+    const FileIndices& fileIndices = resource.FileIndices();
+    ASSERT_EQ(2, fileIndices.Size());
+    for (size_t i = 0; i < fileIndices.Size(); ++i) {
+        const FileIndex& index = fileIndices[i];
+        if (i == 0) {
+            EXPECT_EQ(string("PacBio.Index.SaWriterIndex"), index.MetaType());
+            EXPECT_EQ(string("file:///mnt/path/to/reference.fasta.sa"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(string("PacBio.Index.SamIndex"), index.MetaType());
+            EXPECT_EQ(string("file:///mnt/path/to/reference.fasta.fai"), index.ResourceId());
+        }
+    }
+
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(string("500"),     metadata.NumRecords());
+    EXPECT_EQ(string("5000000"), metadata.TotalLength());
+
+    // access metadata extensions directly for now
+    EXPECT_EQ(string("Tribble"), metadata.ChildText("Organism"));
+    EXPECT_EQ(string("Diploid"), metadata.ChildText("Ploidy"));
+
+    const internal::DataSetListElement<internal::DataSetElement>& contigs =
+            metadata.Child<internal::DataSetListElement<internal::DataSetElement> >("Contigs");
+    ASSERT_EQ(1, contigs.NumChildren());
+    const internal::DataSetElement& contig = contigs[0];
+    EXPECT_EQ(string("gi|229359445|emb|AM181176.4|"), contig.Attribute("Name"));
+    EXPECT_EQ(string("Pseudomonas fluorescens SBW25 complete genome|quiver"), contig.Attribute("Description"));
+    EXPECT_EQ(string("6722109"), contig.Attribute("Length"));
+    EXPECT_EQ(string("f627c795efad7ce0050ed42b942d408e"), contig.Attribute("Digest"));
+}
+
+static void TestSubread1Xml(void)
+{
+    const DataSet dataset(subread1XmlFn);
+    EXPECT_EQ(DataSet::SUBREAD, dataset.Type());
+    EXPECT_EQ(string("2015-01-27T09:00:01"),    dataset.CreatedAt());
+    EXPECT_EQ(string("PacBio.DataSet.SubreadSet"), dataset.MetaType());
+    EXPECT_EQ(string("DataSet_SubreadSet"), dataset.Name());
+    EXPECT_EQ(string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(string("2.3.0"), dataset.Version());
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(string("First Subreads BAM"), resource.Name());
+            EXPECT_EQ(string("Points to an example Subreads BAM file."), resource.Description());
+            EXPECT_EQ(string("SubreadFile.SubreadBamFile"), resource.MetaType());
+            EXPECT_EQ(string("file:///mnt/path/to/subreads0.bam"), resource.ResourceId());
+            EXPECT_EQ(string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:///mnt/path/to/subreads0.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(string("Second Subreads BAM"), resource.Name());
+            EXPECT_EQ(string("Points to another example Subreads BAM file."), resource.Description());
+            EXPECT_EQ(string("SubreadFile.SubreadBamFile"), resource.MetaType());
+            EXPECT_EQ(string("file:///mnt/path/to/subreads1.bam"), resource.ResourceId());
+            EXPECT_EQ(string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:///mnt/path/to/subreads0.pbi"), index.ResourceId());
+        }
+    }
+
+    const Filters& filters = dataset.Filters();
+    ASSERT_EQ(2, filters.Size());
+    for (size_t i = 0; i < filters.Size(); ++i) {
+        const Filter& filter = filters[i];
+        if (i == 0) {
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(string("rq"), property.Name());
+            EXPECT_EQ(string("0.75"), property.Value());
+            EXPECT_EQ(string(">"), property.Operator());
+        } else {
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(string("QNAME"), property.Name());
+            EXPECT_EQ(string("100/0/0_100"), property.Value());
+            EXPECT_EQ(string("=="), property.Operator());
+        }
+    }
+
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(string("500"),    metadata.NumRecords());
+    EXPECT_EQ(string("500000"), metadata.TotalLength());
+}
+
+static void TestSubread2Xml(void)
+{
+    const DataSet dataset(subread2XmlFn);
+    EXPECT_EQ(DataSet::SUBREAD, dataset.Type());
+    EXPECT_EQ(string("2015-01-27T09:00:01"),    dataset.CreatedAt());
+    EXPECT_EQ(string("PacBio.DataSet.SubreadSet"), dataset.MetaType());
+    EXPECT_EQ(string("DataSet_SubreadSet"), dataset.Name());
+    EXPECT_EQ(string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(string("2.3.0"), dataset.Version());
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(string("First Subreads BAM"), resource.Name());
+            EXPECT_EQ(string("Points to an example Subreads BAM file."), resource.Description());
+            EXPECT_EQ(string("SubreadFile.SubreadBamFile"), resource.MetaType());
+            EXPECT_EQ(string("file:///mnt/path/to/subreads2.bam"), resource.ResourceId());
+            EXPECT_EQ(string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:///mnt/path/to/subreads2.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(string("Second Subreads BAM"), resource.Name());
+            EXPECT_EQ(string("Points to another example Subreads BAM file."), resource.Description());
+            EXPECT_EQ(string("SubreadFile.SubreadBamFile"), resource.MetaType());
+            EXPECT_EQ(string("file:///mnt/path/to/subreads3.bam"), resource.ResourceId());
+            EXPECT_EQ(string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:///mnt/path/to/subreads3.pbi"), index.ResourceId());
+        }
+    }
+
+    const Filters& filters = dataset.Filters();
+    ASSERT_EQ(2, filters.Size());
+    for (size_t i = 0; i < filters.Size(); ++i) {
+        const Filter& filter = filters[i];
+        if (i == 0) {
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(string("rq"), property.Name());
+            EXPECT_EQ(string("0.75"), property.Value());
+            EXPECT_EQ(string(">"), property.Operator());
+        } else {
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(string("QNAME"), property.Name());
+            EXPECT_EQ(string("100/0/0_100"), property.Value());
+            EXPECT_EQ(string("=="), property.Operator());
+        }
+    }
+
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(string("500"),    metadata.NumRecords());
+    EXPECT_EQ(string("500000"), metadata.TotalLength());
+}
+
+static void TestSubread3Xml(void)
+{
+    const DataSet dataset(subread3XmlFn);
+    EXPECT_EQ(DataSet::SUBREAD, dataset.Type());
+    EXPECT_EQ(string("2015-01-27T09:00:01"), dataset.CreatedAt());
+    EXPECT_EQ(string("PacBio.DataSet.SubreadSet"), dataset.MetaType());
+    EXPECT_EQ(string("DataSet_SubreadSet"), dataset.Name());
+    EXPECT_EQ(string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(string("2.3.0"), dataset.Version());
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(string("First Subreads BAM"), resource.Name());
+            EXPECT_EQ(string("Points to an example Subreads BAM file."), resource.Description());
+            EXPECT_EQ(string("SubreadFile.SubreadBamFile"), resource.MetaType());
+            EXPECT_EQ(string("file:///mnt/path/to/subreads2.bam"), resource.ResourceId());
+            EXPECT_EQ(string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:///mnt/path/to/subreads2.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(string("Second Subreads BAM"), resource.Name());
+            EXPECT_EQ(string("Points to another example Subreads BAM file."), resource.Description());
+            EXPECT_EQ(string("SubreadFile.SubreadBamFile"), resource.MetaType());
+            EXPECT_EQ(string("file:///mnt/path/to/subreads3.bam"), resource.ResourceId());
+            EXPECT_EQ(string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(string("file:///mnt/path/to/subreads3.pbi"), index.ResourceId());
+        }
+    }
+
+    const Filters& filters = dataset.Filters();
+    ASSERT_EQ(2, filters.Size());
+    for (size_t i = 0; i < filters.Size(); ++i) {
+        const Filter& filter = filters[i];
+        if (i == 0) {
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(string("rq"), property.Name());
+            EXPECT_EQ(string("0.85"), property.Value());
+            EXPECT_EQ(string(">"), property.Operator());
+        } else {
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(string("QNAME"), property.Name());
+            EXPECT_EQ(string("100/0/0_100"), property.Value());
+            EXPECT_EQ(string("=="), property.Operator());
+        }
+    }
+
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(string("500"),    metadata.NumRecords());
+    EXPECT_EQ(string("500000"), metadata.TotalLength());
+}
+
+static void TestTransformedXml(void)
+{
+    const DataSet dataset(transformedXmlFn);
+    EXPECT_EQ(DataSet::HDF_SUBREAD, dataset.Type());
+    EXPECT_EQ(string("PacBio.DataSet.SubreadSet"), dataset.MetaType());
+    EXPECT_EQ(string("Subreads from run r001173_42129_130607"), dataset.Name());
+    EXPECT_EQ(string("pacbio.secondary.instrument=RS"), dataset.Tags());
+    EXPECT_EQ(string("abbc9183-b01e-4671-8c12-19efee534647"), dataset.UniqueId());
+    EXPECT_EQ(string("0.5"), dataset.Version());
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(string("http://www.w3.org/2001/XMLSchema"),         dataset.Attribute("xmlns:xs"));
+    EXPECT_EQ(string("http://www.w3.org/2005/xpath-functions"), dataset.Attribute("xmlns:fn"));
+    EXPECT_EQ(string("java:java.util.UUID"), dataset.Attribute("xmlns:uuid"));
+    EXPECT_EQ(string("http://whatever"), dataset.Attribute("xmlns:bax"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(3, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(string("PacBio.SubreadFile.BaxFile"), resource.MetaType());
+            EXPECT_EQ(string("file:///mnt/secondary-siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.0.bax.h5"),
+                      resource.ResourceId());
+        }
+        else if (i == 1) {
+            EXPECT_EQ(string("PacBio.SubreadFile.BaxFile"), resource.MetaType());
+            EXPECT_EQ(string("file:///mnt/secondary-siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.1.bax.h5"),
+                      resource.ResourceId());
+        }
+        else {
+            EXPECT_EQ(string("PacBio.SubreadFile.BaxFile"), resource.MetaType());
+            EXPECT_EQ(string("file:///mnt/secondary-siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.2.bax.h5"),
+                      resource.ResourceId());
+        }
+    }
+
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(string("150000"),   metadata.NumRecords());
+    EXPECT_EQ(string("50000000"), metadata.TotalLength());
+}
+
+TEST(DataSetIOTest, InspectMalformedXml)
+{
+    const string xmlFn = tests::Data_Dir + "/dataset/malformed.xml";
+
+    DataSet ds(xmlFn);
+    stringstream s;
+    ds.SaveToStream(s);
+
+    const string expected =
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<SubreadSet CreatedAt=\"2015-08-19T15:39:36.331\" Description=\"Merged dataset from 1 files using DatasetMerger 0.1.2\" "
+                    "MetaType=\"PacBio.DataSet.HdfSubreadSet\" Name=\"Subreads from runr000013_42267_150403\" "
+                    "Tags=\"pacbio.secondary.instrument=RS\" TimeStampedName=\"hdfsubreadset_2015-08-19T15:39:36.331-07:00\" "
+                    "UniqueId=\"b4741521-2a4c-42df-8a13-0a755ca9ed1e\" Version=\"0.5\" "
+                    "xmlns=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                    "xmlns:ns0=\"http://pacificbiosciences.com/PacBioBaseDataModel.xsd\" "
+                    "xmlns:ns1=\"http://pacificbiosciences.com/PacBioSampleInfo.xsd\" "
+                    "xmlns:ns2=\"http://pacificbiosciences.com/PacBioCollectionMetadata.xsd\" "
+                    "xmlns:ns3=\"http://pacificbiosciences.com/PacBioReagentKit.xsd\" "
+                    "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+                    "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDatasets.xsd\">\n"
+        "\t<ns0:ExternalResources>\n"
+        "\t\t<ns0:ExternalResource MetaType=\"SubreadFile.SubreadBamFile\" "
+                                  "ResourceId=\"file:///mnt/secondary-siv/jenkins/jenkins-bot01/workspace/Ubuntu1404_Mainline_SA3_Tiny_tests/software/smrtanalysis/siv/testkit-jobs/sa3_pipelines/mapping/tiny/job_output-ubuntu1404/tasks/pbsmrtpipe.tasks.h5_subreads_to_subread-0//mnt/secondary-siv/jenkins/jenkins-bot01/workspace/Ubuntu1404_Mainline_SA3_Tiny_tests/software/smrtanalysis/siv/testkit-jobs/sa3_pipelines/mapping/tiny/job_output-ubuntu1404/tasks/pbsmrtpipe.tasks.h5_subreads_to_subread-0/file.subreads.subreads.bam\" "
+                                  "TimeStampedName=\"SubreadFile.SubreadBamFile_00000000000000\" "
+                                  "UniqueId=\"251acf71-9eb0-489e-9dd1-cdbd11432753\" />\n"
+        "\t</ns0:ExternalResources>\n"
+        "\t<DataSetMetadata>\n"
+        "\t\t<TotalLength>50000000</TotalLength>\n"
+        "\t\t<NumRecords>150000</NumRecords>\n"
+        "\t\t<ns2:Collections>\n"
+        "\t\t\t<ns2:CollectionMetadata Context=\"m150404_101626_42267_c100807920800000001823174110291514_s1_p0\" "
+                                      "InstrumentId=\"1\" InstrumentName=\"42267\" MetaType=\"PacBio.Collection\" "
+                                      "TimeStampedName=\"m150404_101626_42267_c100807920800000001823174110291514_s1_p0\" "
+                                      "UniqueId=\"d66c8372-2b70-4dcf-b64f-9f8b5cc351fd\">\n"
+        "\t\t\t\t<ns2:InstCtrlVer>2.3.0.1.142990</ns2:InstCtrlVer>\n"
+        "\t\t\t\t<ns2:SigProcVer>NRT@172.31.128.10:8082, SwVer=2301.142990, HwVer=1.0</ns2:SigProcVer>\n"
+        "\t\t\t\t<ns2:RunDetails>\n"
+        "\t\t\t\t\t<ns2:RunId>r000013_42267_150403</ns2:RunId>\n"
+        "\t\t\t\t\t<ns2:Name>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:Name>\n"
+        "\t\t\t\t</ns2:RunDetails>\n"
+        "\t\t\t\t<ns2:WellSample Name=\"Inst42267-040315-SAT-100pM-2kb-P6C4\">\n"
+        "\t\t\t\t\t<ns2:PlateId>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:PlateId>\n"
+        "\t\t\t\t\t<ns2:WellName>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:WellName>\n"
+        "\t\t\t\t\t<ns2:Concentration>0.0</ns2:Concentration>\n"
+        "\t\t\t\t\t<ns2:SampleReuseEnabled>false</ns2:SampleReuseEnabled>\n"
+        "\t\t\t\t\t<ns2:StageHotstartEnabled>false</ns2:StageHotstartEnabled>\n"
+        "\t\t\t\t\t<ns2:SizeSelectionEnabled>false</ns2:SizeSelectionEnabled>\n"
+        "\t\t\t\t\t<ns2:UseCount>1</ns2:UseCount>\n"
+        "\t\t\t\t\t<ns1:BioSamplePointers>\n"
+        "\t\t\t\t\t\t<ns1:BioSamplePointer>251acf71-9eb0-489e-9dd1-cdbd11432752</ns1:BioSamplePointer>\n"
+        "\t\t\t\t\t</ns1:BioSamplePointers>\n"
+        "\t\t\t\t</ns2:WellSample>\n"
+        "\t\t\t\t<ns2:Automation>\n"
+        "\t\t\t\t\t<ns0:AutomationParameters>\n"
+        "\t\t\t\t\t\t<ns0:AutomationParameter />\n"
+        "\t\t\t\t\t</ns0:AutomationParameters>\n"
+        "\t\t\t\t</ns2:Automation>\n"
+        "\t\t\t\t<ns2:CollectionNumber>7</ns2:CollectionNumber>\n"
+        "\t\t\t\t<ns2:CellIndex>4</ns2:CellIndex>\n"
+        "\t\t\t\t<ns2:CellPac Barcode=\"10080792080000000182317411029151\" />\n"
+        "\t\t\t\t<ns2:Primary>\n"
+        "\t\t\t\t\t<ns2:AutomationName>BasecallerV1</ns2:AutomationName>\n"
+        "\t\t\t\t\t<ns2:ConfigFileName>2-3-0_P6-C4.xml</ns2:ConfigFileName>\n"
+        "\t\t\t\t\t<ns2:SequencingCondition />\n"
+        "\t\t\t\t\t<ns2:OutputOptions>\n"
+        "\t\t\t\t\t\t<ns2:ResultsFolder>Analysis_Results</ns2:ResultsFolder>\n"
+        "\t\t\t\t\t\t<ns2:CollectionPathUri>rsy://mp-rsync/vol55//RS_DATA_STAGING/42267/Inst42267-040315-SAT-100pM-2kb-P6C4_13/A04_7/</ns2:CollectionPathUri>\n"
+        "\t\t\t\t\t\t<ns2:CopyFiles>\n"
+        "\t\t\t\t\t\t\t<ns2:CollectionFileCopy>Fasta</ns2:CollectionFileCopy>\n"
+        "\t\t\t\t\t\t</ns2:CopyFiles>\n"
+        "\t\t\t\t\t\t<ns2:Readout>Bases</ns2:Readout>\n"
+        "\t\t\t\t\t\t<ns2:MetricsVerbosity>Minimal</ns2:MetricsVerbosity>\n"
+        "\t\t\t\t\t</ns2:OutputOptions>\n"
+        "\t\t\t\t</ns2:Primary>\n"
+        "\t\t\t</ns2:CollectionMetadata>\n"
+        "\t\t</ns2:Collections>\n"
+        "\t\t<ns1:BioSamples>\n"
+        "\t\t\t<ns1:BioSample Description=\"Inst42267-SAT-100pM-2kbLambda-P6C4-Std120_CPS_040315\" "
+                            "MetaType=\"PacBio.Sample\" Name=\"Inst42267-040315-SAT-100pM-2kb-P6C4\" "
+                            "TimeStampedName=\"biosample_2015-08-19T15:39:36.331-07:00\" UniqueId=\"251acf71-9eb0-489e-9dd1-cdbd11432752\" />\n"
+        "\t\t</ns1:BioSamples>\n"
+        "\t</DataSetMetadata>\n"
+        "</SubreadSet>\n";
+
+    EXPECT_EQ(expected, s.str());
+}
+
+TEST(DataSetIOTest, RelativePathCarriedThroughOk_FromString)
+{
+    const string inputXml =
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<pbds:AlignmentSet "
+            "CreatedAt=\"2015-01-27T09:00:01\" "
+            "MetaType=\"PacBio.DataSet.AlignmentSet\" "
+            "Name=\"DataSet_AlignmentSet\" "
+            "Tags=\"barcode moreTags mapping mytags\" "
+            "TimeStampedName=\"biosample_2015-08-19T15:39:36.331-07:00\" "
+            "UniqueId=\"b095d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+            "Version=\"2.3.0\" "
+            "xmlns=\"http://pacificbiosciences.com/PacBioDataModel.xsd\" "
+            "xmlns:pbds=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+            "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+            "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDataModel.xsd\">\n"
+        "\t<pbbase:ExternalResources>\n"
+        "\t\t<pbbase:ExternalResource "
+                "Description=\"Points to an example Alignments BAM file.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Third Alignments BAM\" "
+                "ResourceId=\"../path/to/resource1.bam\" "
+                "Tags=\"Example\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                    "MetaType=\"PacBio.Index.PacBioIndex\" "
+                    "ResourceId=\"../path/to/resource1.bam.pbi\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t\t<pbbase:ExternalResource "
+                "Description=\"Points to another example Alignments BAM file, by relative path.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Fourth Alignments BAM\" "
+                "ResourceId=\"../path/to/resource2.bam\" "
+                "Tags=\"Example\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                    "MetaType=\"PacBio.Index.PacBioIndex\" "
+                    "ResourceId=\"../path/to/resource2.bam.pbi\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t</pbbase:ExternalResources>\n"
+        "</pbds:AlignmentSet>\n";
+
+    auto dataset = DataSet::FromXml(inputXml);
+
+    stringstream stream;
+    dataset.SaveToStream(stream);
+    auto outputXml = stream.str();
+
+    EXPECT_EQ(inputXml, outputXml);
+}
+
+TEST(DataSetIOTest, RelativePathCarriedThroughOk_FromFile)
+{
+    DataSet dataset(tests::Data_Dir + "/relative/relative.xml");
+    auto resources = dataset.ExternalResources();
+    EXPECT_EQ("./a/test.bam",  resources[0].ResourceId());
+    EXPECT_EQ("./b/test1.bam", resources[1].ResourceId());
+    EXPECT_EQ("./b/test2.bam", resources[2].ResourceId());
+
+    stringstream out;
+    dataset.SaveToStream(out);
+
+    auto newDataset = DataSet::FromXml(out.str());
+    auto newResources = newDataset.ExternalResources();
+    EXPECT_EQ("./a/test.bam",  newResources[0].ResourceId());
+    EXPECT_EQ("./b/test1.bam", newResources[1].ResourceId());
+    EXPECT_EQ("./b/test2.bam", newResources[2].ResourceId());
+}
+
+TEST(DataSetIOTest, DataSetFromRelativeBamFilename)
+{
+    // cache initial directory and move to location so we can test relatvie filename ok
+    const string startingDirectory = internal::FileUtils::CurrentWorkingDirectory();
+
+    const string targetDirectory = tests::Data_Dir + "/dataset";
+    changeCurrentDirectory(targetDirectory);
+    ASSERT_EQ(targetDirectory, internal::FileUtils::CurrentWorkingDirectory());
+
+    EXPECT_NO_THROW(
+    {
+        const string relativeBamFn = "../phi29.bam";
+        const DataSet ds(relativeBamFn);
+        const auto& files = ds.BamFiles();
+        EXPECT_EQ(1, files.size());
+    });
+
+    // restore working directory
+    changeCurrentDirectory(startingDirectory);
+}
+
diff --git a/tests/src/test_DataSetMetadata.cpp b/tests/src/test_DataSetMetadata.cpp

new file mode 100644 (file)

index 0000000..ac5d469
--- /dev/null
+++ b/tests/src/test_DataSetMetadata.cpp
@@ -0,0 +1,63 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/dataset/DataSet.h>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace tests {
+
+//static inline
+//DataSet CreateDataSet(void)
+//{
+//    DataSet d;
+//    d.Name("foo");
+//    return d;
+//}
+
+} // namespace tests
+
+TEST(DataSetMetadataTest, DummyTest) {
+    EXPECT_TRUE(true);
+}
diff --git a/tests/src/test_DataSetQuery.cpp b/tests/src/test_DataSetQuery.cpp

new file mode 100644 (file)

index 0000000..996fbfe
--- /dev/null
+++ b/tests/src/test_DataSetQuery.cpp
@@ -0,0 +1,500 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "TestData.h"
+#include <boost/any.hpp>
+#include <gtest/gtest.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/GenomicIntervalQuery.h>
+#include <pbbam/ZmwQuery.h>
+#include <pbbam/ZmwGroupQuery.h>
+#include <pbbam/DataSet.h>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+const string alignedBamFn      = tests::Data_Dir + "/aligned.bam";
+const string aligned2BamFn     = tests::Data_Dir + "/aligned2.bam";
+const string alignedCopyBamFn  = tests::GeneratedData_Dir + "/aligned.bam";
+const string aligned2CopyBamFn = tests::GeneratedData_Dir + "/aligned2.bam";
+
+const string group_fofn   = tests::Generated_Dir + "/group.fofn";
+const string group_file1  = tests::Data_Dir + "/group/test1.bam";
+const string group_file2  = tests::Data_Dir + "/group/test2.bam";
+const string group_file3  = tests::Data_Dir + "/group/test3.bam";
+
+const vector<string> group_file1_names =
+{
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/24962/0_427"
+};
+
+const vector<string> group_file2_names =
+{
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2114_2531",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2579_4055",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/4101_5571",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/5615_6237"
+};
+
+const vector<string> group_file3_names =
+{
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/45203/0_893",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/45203/0_893",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/46835/3759_4005",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/46835/4052_4686",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/46835/4732_4869",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/47698/9482_9628",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/47698/9675_10333",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/47698/10378_10609",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49194/0_798",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49194/845_1541",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49521/0_134"
+};
+
+static inline
+bool InGroup(const string& name, const vector<string>& group)
+{
+    for (const string& s : group) {
+        if (s == name)
+            return true;
+    }
+    return false;
+}
+
+TEST(DataSetQueryTest, EntireFileQueryTest)
+{
+    // single file
+    EXPECT_NO_THROW(
+    {
+        BamFile bamFile(alignedBamFn);
+
+        DataSet dataset;
+        dataset.ExternalResources().Add(bamFile);
+
+        int count =0;
+        EntireFileQuery query(dataset); // from DataSet object
+        for (const BamRecord& record : query) {
+            (void)record;
+            ++count;
+        }
+        EXPECT_EQ(4, count);
+
+        count = 0;
+        EntireFileQuery query2(alignedBamFn); // from BAM filename
+        for (const BamRecord& record : query2) {
+            (void)record;
+            ++count;
+        }
+        EXPECT_EQ(4, count);
+
+        count = 0;
+        EntireFileQuery query3(bamFile); // from BamFile object
+        for (const BamRecord& record : query3) {
+            (void)record;
+            ++count;
+        }
+        EXPECT_EQ(4, count);
+    });
+
+    // duplicate file attempt
+    EXPECT_NO_THROW(
+    {
+        BamFile bamFile(alignedBamFn);
+
+        DataSet dataset;
+        dataset.ExternalResources().Add(bamFile);
+        dataset.ExternalResources().Add(bamFile);
+
+        int count =0;
+        EntireFileQuery query(dataset);
+        for (const BamRecord& record : query) {
+            (void)record;
+            ++count;
+        }
+        EXPECT_EQ(4, count); // same as single
+    });
+
+    // true multi-file dataset
+    EXPECT_NO_THROW(
+    {
+        BamFile file1(group_file1); // 1 read
+        BamFile file2(group_file2); // 4 reads
+        BamFile file3(group_file3); // 13 reads
+
+        DataSet dataset;
+        dataset.ExternalResources().Add(file1);
+        dataset.ExternalResources().Add(file2);
+        dataset.ExternalResources().Add(file3);
+
+        int count = 0;
+        EntireFileQuery query(dataset);
+        for (const BamRecord& record : query) {
+
+            // ensure sequential merge of files
+            if (count == 0)     EXPECT_TRUE(InGroup(record.FullName(), group_file1_names));
+            else if (count < 5) EXPECT_TRUE(InGroup(record.FullName(), group_file2_names));
+            else                EXPECT_TRUE(InGroup(record.FullName(), group_file3_names));
+
+            ++count;
+        }
+        EXPECT_EQ(18, count);
+    });
+
+    // same as above, from FOFN
+    EXPECT_NO_THROW(
+    {
+        int count = 0;
+
+        DataSet dataset(group_fofn);
+        EntireFileQuery query(dataset);
+        for (const BamRecord& record : query) {
+
+            // ensure sequential merge of files
+            if (count == 0)     EXPECT_TRUE(InGroup(record.FullName(), group_file1_names));
+            else if (count < 5) EXPECT_TRUE(InGroup(record.FullName(), group_file2_names));
+            else                EXPECT_TRUE(InGroup(record.FullName(), group_file3_names));
+
+            ++count;
+        }
+        EXPECT_EQ(18, count);
+    });
+}
+
+TEST(DataSetQueryTest, GenomicIntervalQueryTest)
+{
+    const string rname = "lambda_NEB3011";
+
+    // single file
+    EXPECT_NO_THROW(
+    {
+        DataSet dataset(alignedBamFn); // from BAM filename
+
+        // count records
+        int count = 0;
+        GenomicInterval interval(rname, 5000, 6000);
+        GenomicIntervalQuery query(interval, dataset);
+        for (const BamRecord& record : query) {
+            (void)record;
+            ++count;
+        }
+        EXPECT_EQ(2, count);
+
+        // adjust interval and pass back in
+        count = 0;
+        interval.Start(9000);
+        interval.Stop(9500);
+        query.Interval(interval);
+        for (const BamRecord& record : query) {
+            (void)record;
+            ++count;
+        }
+        EXPECT_EQ(2, count);
+
+        // unknown ref
+        count = 0;
+        interval.Name("does not exist");
+        interval.Start(0);
+        interval.Stop(100);
+        EXPECT_THROW(query.Interval(interval), std::exception);
+        for (const BamRecord& record : query) { // iteration is still safe, just returns no data
+            (void)record;
+            ++count;
+        }
+        EXPECT_EQ(0, count);
+
+        // adjust again - make sure we can read a real region after an invalid one
+        interval.Name(rname);
+        interval.Start(5000);
+        interval.Stop(6000);
+        query.Interval(interval);
+        count = 0;
+        for (const BamRecord& record : query) {
+            (void)record;
+            ++count;
+        }
+        EXPECT_EQ(2, count);
+    });
+
+    // duplicate file
+    EXPECT_NO_THROW(
+    {
+        BamFile bamFile(alignedBamFn);
+
+        DataSet dataset;
+        dataset.ExternalResources().Add(bamFile);
+        dataset.ExternalResources().Add(bamFile);
+
+        // count records & also ensure sorted merge
+        int count = 0;
+        int prevId = 0;
+        int prevPos = 0;
+
+        GenomicInterval interval(rname, 5000, 6000);
+        GenomicIntervalQuery query(interval, dataset);
+        for (const BamRecord& record : query) {
+
+            EXPECT_TRUE(record.ReferenceId()   >= prevId);
+            EXPECT_TRUE(record.ReferenceStart() >= prevPos);
+
+            prevId = record.ReferenceId();
+            prevPos = record.ReferenceStart();
+            ++count;
+        }
+        EXPECT_EQ(2, count); // same as single file
+
+        // adjust interval and pass back in
+        count = 0;
+        interval.Start(9000);
+        interval.Stop(10000);
+        query.Interval(interval);
+        for (const BamRecord& record : query) {
+            (void)record;
+            ++count;
+        }
+        EXPECT_EQ(2, count); // same as single file
+
+        // unknown ref
+        count = 0;
+        interval.Name("does not exist");
+        interval.Start(0);
+        interval.Stop(100);
+        EXPECT_THROW(query.Interval(interval), std::exception);
+        for (const BamRecord& record : query) { // iteration is still safe, just returns no data
+            (void)record;
+            ++count;
+        }
+        EXPECT_EQ(0, count); // same as single file
+
+        // adjust again - make sure we can read a real region after an invalid one
+        interval.Name(rname);
+        interval.Start(5000);
+        interval.Stop(5300);
+        query.Interval(interval);
+        count = 0;
+        for (const BamRecord& record : query) {
+            (void)record;
+            ++count;
+        }
+        EXPECT_EQ(2, count); // same as single file
+    });
+
+    // multi file BAM (same record content for easy testing, but different filename(ResourceId)
+    EXPECT_NO_THROW(
+    {
+        BamFile bamFile(alignedBamFn);
+        BamFile copyFile(alignedCopyBamFn);
+
+        DataSet dataset;
+        dataset.ExternalResources().Add(bamFile);
+        dataset.ExternalResources().Add(copyFile);
+
+        // count records & also ensure sorted merge
+        int count = 0;
+        int prevId = 0;
+        int prevPos = 0;
+
+        GenomicInterval interval(rname, 5000, 6000);
+        GenomicIntervalQuery query(interval, dataset);
+        for (const BamRecord& record : query) {
+
+            EXPECT_TRUE(record.ReferenceId()   >= prevId);
+            EXPECT_TRUE(record.ReferenceStart() >= prevPos);
+
+            prevId = record.ReferenceId();
+            prevPos = record.ReferenceStart();
+            ++count;
+        }
+        EXPECT_EQ(4, count); // single file * 2
+
+        // adjust interval and pass back in
+        count = 0;
+        interval.Start(9000);
+        interval.Stop(10000);
+        query.Interval(interval);
+        for (const BamRecord& record : query) {
+            (void)record;
+            ++count;
+        }
+        EXPECT_EQ(4, count); // single file * 2
+
+        // unknown ref
+        count = 0;
+        interval.Name("does not exist");
+        interval.Start(0);
+        interval.Stop(100);
+        EXPECT_THROW(query.Interval(interval), std::exception);
+        for (const BamRecord& record : query) { // iteration is still safe, just returns no data
+            (void)record;
+            ++count;
+        }
+        EXPECT_EQ(0, count); // single file * 2
+
+        // adjust again - make sure we can read a real region after an invalid one
+        interval.Name(rname);
+        interval.Start(5000);
+        interval.Stop(5300);
+        query.Interval(interval);
+        count = 0;
+        for (const BamRecord& record : query) {
+            (void)record;
+            ++count;
+        }
+        EXPECT_EQ(4, count); // single file * 2
+    });
+}
+
+// TODO: implement me
+TEST(DataSetQueryTest, QNameQueryTest)
+{
+    EXPECT_TRUE(true);
+}
+
+TEST(DataSetQueryTest, ZmwQueryTest)
+{
+    const std::vector<int32_t> whitelist = { 13473, 30983 };
+
+    // single file
+    EXPECT_NO_THROW(
+    {
+        BamFile bamFile(aligned2BamFn);
+        ASSERT_TRUE(bamFile.PacBioIndexExists());
+        DataSet dataset(bamFile);
+
+        int count = 0;
+        ZmwQuery query(whitelist, dataset);
+        for (const BamRecord& record: query) {
+            const int32_t holeNumber = record.HoleNumber();
+            EXPECT_TRUE(holeNumber == 13473 || holeNumber == 30983);
+            ++count;
+        }
+        EXPECT_EQ(4, count);
+    });
+
+    // multi-file
+    EXPECT_NO_THROW(
+    {
+        BamFile bamFile(aligned2BamFn);
+        BamFile bamFile2(aligned2CopyBamFn);
+        ASSERT_TRUE(bamFile.PacBioIndexExists());
+        ASSERT_TRUE(bamFile2.PacBioIndexExists());
+
+        DataSet dataset;
+        dataset.ExternalResources().Add(ExternalResource(bamFile));
+        dataset.ExternalResources().Add(ExternalResource(bamFile2));
+
+        int count = 0;
+        ZmwQuery query(whitelist, dataset);
+        for (const BamRecord& r : query) {
+            const auto holeNumber = r.HoleNumber();
+            EXPECT_TRUE(holeNumber == 13473 || holeNumber == 30983);
+            ++count;
+        }
+        EXPECT_EQ(8, count);
+    });
+}
+
+TEST(DataSetQueryTest, ZmwGroupQueryTest)
+{
+    const std::vector<int32_t> whitelist = { 13473, 30983 };
+
+    // single-file
+    EXPECT_NO_THROW(
+    {
+        BamFile bamFile(aligned2BamFn);
+        ASSERT_TRUE(bamFile.PacBioIndexExists());
+        DataSet dataset(bamFile);
+
+        int count = 0;
+        int32_t groupZmw = -1;
+        ZmwGroupQuery query(whitelist, dataset);
+        for (const vector<BamRecord>& group : query)  {
+            for (const BamRecord& record: group) {
+                const auto holeNumber = record.HoleNumber();
+                if (groupZmw == -1)
+                    groupZmw = holeNumber;
+                EXPECT_TRUE(holeNumber == 13473 || holeNumber == 30983);
+                EXPECT_EQ(groupZmw, holeNumber);
+                ++count;
+            }
+            groupZmw = -1;
+        }
+        EXPECT_EQ(4, count);
+    });
+
+    // multi-file
+    EXPECT_NO_THROW(
+    {
+        BamFile bamFile(aligned2BamFn);
+        BamFile bamFile2(aligned2CopyBamFn);
+        ASSERT_TRUE(bamFile.PacBioIndexExists());
+        ASSERT_TRUE(bamFile2.PacBioIndexExists());
+
+        DataSet dataset;
+        dataset.ExternalResources().Add(ExternalResource(bamFile));
+        dataset.ExternalResources().Add(ExternalResource(bamFile2));
+
+        int totalCount = 0;
+        int numRecordsInGroup = 0;
+        int groupCount = 0;
+        int32_t groupZmw = -1;
+        ZmwGroupQuery query(whitelist, dataset);
+        for (const vector<BamRecord>& group : query)  {
+            for (const BamRecord& record: group) {
+                const auto holeNumber = record.HoleNumber();
+                ++numRecordsInGroup;
+                if (groupZmw == -1)
+                    groupZmw = holeNumber;
+                EXPECT_TRUE(holeNumber == 13473 || holeNumber == 30983);
+                EXPECT_EQ(groupZmw, holeNumber);
+                ++totalCount;
+            }
+            if (groupCount == 0)
+                EXPECT_EQ(4, numRecordsInGroup);
+            else if (groupCount == 1)
+                EXPECT_EQ(4, numRecordsInGroup);
+            else
+                EXPECT_TRUE(false); // should not get here
+            numRecordsInGroup = 0;
+            ++groupCount;
+            groupZmw = -1;
+        }
+        EXPECT_EQ(8, totalCount);
+    });
+}
diff --git a/tests/src/test_DataSetXsd.cpp b/tests/src/test_DataSetXsd.cpp

new file mode 100644 (file)

index 0000000..1238122
--- /dev/null
+++ b/tests/src/test_DataSetXsd.cpp
@@ -0,0 +1,182 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <pbbam/DataSet.h>
+#include <pbbam/DataSetXsd.h>
+#include <string>
+#include <sstream>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+TEST(DataSetXsdTest, DefaultsOk)
+{
+    NamespaceRegistry registry;
+
+    const NamespaceInfo& baseInfo = registry.Namespace(XsdType::BASE_DATA_MODEL);
+    const NamespaceInfo& dsInfo   = registry.Namespace(XsdType::DATASETS);
+    const NamespaceInfo& defaultInfo = registry.DefaultNamespace();
+
+    EXPECT_EQ(XsdType::DATASETS, registry.DefaultXsd());
+
+    EXPECT_EQ(string("pbds"),   dsInfo.Name());
+    EXPECT_EQ(string("pbbase"), baseInfo.Name());
+    EXPECT_EQ(string("pbds"),   defaultInfo.Name());
+
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioBaseDataModel.xsd"), baseInfo.Uri());
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"),      dsInfo.Uri());
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"),      defaultInfo.Uri());
+}
+
+TEST(DataSetXsdTest, EditDefaultOk)
+{
+    NamespaceRegistry registry;
+    registry.SetDefaultXsd(XsdType::DATASETS);
+
+    const NamespaceInfo& defaultInfo = registry.DefaultNamespace();
+
+    EXPECT_EQ(XsdType::DATASETS, registry.DefaultXsd());
+    EXPECT_EQ(string("pbds"), defaultInfo.Name());
+    EXPECT_EQ(string("http://pacificbiosciences.com/PacBioDatasets.xsd"), defaultInfo.Uri());
+}
+
+TEST(DataSetXsdTest, EditRegistryOk)
+{
+    NamespaceRegistry registry;
+    registry.Register(XsdType::DATASETS, NamespaceInfo("custom", "http://custom/uri.xsd"));
+
+    const NamespaceInfo& dsInfo = registry.Namespace(XsdType::DATASETS);
+
+    EXPECT_EQ(string("custom"),                dsInfo.Name());
+    EXPECT_EQ(string("http://custom/uri.xsd"), dsInfo.Uri());
+}
+
+TEST(DataSetXsdTest, EditDatasetRegistry)
+{
+    DataSet dataset(DataSet::ALIGNMENT);
+    dataset.CreatedAt("2015-01-27T09:00:01");
+    dataset.MetaType("PacBio.DataSet.AlignmentSet");
+    dataset.Name("DataSet_AlignmentSet");
+    dataset.Tags("barcode moreTags mapping mytags");
+    dataset.TimeStampedName("my_time_stamped_name");
+    dataset.UniqueId("b095d0a3-94b8-4918-b3af-a3f81bbe519c");
+    dataset.Attribute("xmlns",              "http://pacificbiosciences.com/PacBioDatasets.xsd")
+           .Attribute("xmlns:xsi",          "http://www.w3.org/2001/XMLSchema-instance")
+           .Attribute("xsi:schemaLocation", "http://pacificbiosciences.com/PacBioDatasets.xsd");
+
+    ExternalResource ext("Fake.MetaType", "filename");
+    ext.TimeStampedName("custom_tsn")
+       .UniqueId("my_uuid");
+    dataset.ExternalResources().Add(ext);
+
+    dataset.Namespaces().Register(XsdType::BASE_DATA_MODEL, NamespaceInfo("custom", "http://custom/uri.xsd"));
+
+    const string expectedXml =
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<pbds:AlignmentSet CreatedAt=\"2015-01-27T09:00:01\" MetaType=\"PacBio.DataSet.AlignmentSet\" "
+                "Name=\"DataSet_AlignmentSet\" Tags=\"barcode moreTags mapping mytags\" "
+                "TimeStampedName=\"my_time_stamped_name\" "
+                "UniqueId=\"b095d0a3-94b8-4918-b3af-a3f81bbe519c\" Version=\"3.0.1\" "
+                "xmlns=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+                "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:custom=\"http://custom/uri.xsd\" "
+                "xmlns:pbds=\"http://pacificbiosciences.com/PacBioDatasets.xsd\">\n"
+        "\t<custom:ExternalResources>\n"
+        "\t\t<custom:ExternalResource MetaType=\"Fake.MetaType\" ResourceId=\"filename\" TimeStampedName=\"custom_tsn\" UniqueId=\"my_uuid\" Version=\"3.0.1\" />\n"
+        "\t</custom:ExternalResources>\n"
+        "</pbds:AlignmentSet>\n";
+
+    stringstream s;
+    dataset.SaveToStream(s);
+    EXPECT_EQ(expectedXml, s.str());
+}
+
+TEST(DataSetXsdTest, ElementRegistryOk)
+{
+    { // default namespaces
+
+        DataSet ds;
+
+        // append child elements that do not have a C++ built-in, nor namespace prefix with addition
+        DataSetMetadata& metadata = ds.Metadata();
+        metadata.AddChild(internal::DataSetElement("SummaryStats"));
+        metadata.AddChild(internal::DataSetElement("CopyFiles"));
+        metadata.AddChild(internal::DataSetElement("BioSamples"));
+        metadata.AddChild(internal::DataSetElement("AutomationParameters"));
+
+        stringstream s;
+        ds.SaveToStream(s);
+        const string output = s.str();
+
+        // check that default namespace is propagated properly
+        EXPECT_TRUE(output.find("pbds:SummaryStats") != string::npos);
+        EXPECT_TRUE(output.find("pbmeta:CopyFiles") != string::npos);
+        EXPECT_TRUE(output.find("pbsample:BioSamples") != string::npos);
+        EXPECT_TRUE(output.find("pbbase:AutomationParameters") != string::npos);
+    }
+
+    { // custom namespaces
+
+        DataSet ds;
+
+        // setup custom namespaces
+        ds.Namespaces().Register(XsdType::BASE_DATA_MODEL,     NamespaceInfo("custom_base",   "http://custom/base.xsd"));
+        ds.Namespaces().Register(XsdType::COLLECTION_METADATA, NamespaceInfo("custom_meta",   "http://custom/meta.xsd"));
+        ds.Namespaces().Register(XsdType::DATASETS,            NamespaceInfo("custom_ds",     "http://custom/datasets.xsd"));
+        ds.Namespaces().Register(XsdType::SAMPLE_INFO,         NamespaceInfo("custom_sample", "http://custom/base.xsd"));
+
+        // append child elements that do not have a C++ built-in, nor namespace prefix with addition
+        DataSetMetadata& metadata = ds.Metadata();
+        metadata.AddChild(internal::DataSetElement("SummaryStats"));
+        metadata.AddChild(internal::DataSetElement("CopyFiles"));
+        metadata.AddChild(internal::DataSetElement("BioSamples"));
+        metadata.AddChild(internal::DataSetElement("AutomationParameters"));
+
+        stringstream s;
+        ds.SaveToStream(s);
+        const string output = s.str();
+
+        // check that custom namespace is propagated properly
+        EXPECT_TRUE(output.find("custom_ds:SummaryStats") != string::npos);
+        EXPECT_TRUE(output.find("custom_meta:CopyFiles") != string::npos);
+        EXPECT_TRUE(output.find("custom_sample:BioSamples") != string::npos);
+        EXPECT_TRUE(output.find("custom_base:AutomationParameters") != string::npos);
+    }
+}
diff --git a/tests/src/test_EndToEnd.cpp b/tests/src/test_EndToEnd.cpp

new file mode 100644 (file)

index 0000000..9675914
--- /dev/null
+++ b/tests/src/test_EndToEnd.cpp
@@ -0,0 +1,254 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#define protected public
+#endif
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <htslib/sam.h>
+#include <pbbam/BamFile.h>
+#include <pbbam/BamWriter.h>
+#include <pbbam/EntireFileQuery.h>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <cstdio>
+#include <cstdlib>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::tests;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace tests {
+
+struct Bam1Deleter
+{
+    void operator()(bam1_t* b) {
+        if (b)
+            bam_destroy1(b);
+        b = nullptr;
+    }
+};
+
+struct SamFileDeleter
+{
+    void operator()(samFile* file) {
+        if (file)
+            sam_close(file);
+        file = nullptr;
+    }
+};
+
+struct BamHdrDeleter
+{
+    void operator()(bam_hdr_t* hdr) {
+        if (hdr)
+            bam_hdr_destroy(hdr);
+        hdr = nullptr;
+    }
+};
+
+const string inputBamFn        = tests::Data_Dir + "/aligned.bam";
+const string goldStandardSamFn = tests::Data_Dir + "/aligned.sam";
+const string generatedBamFn    = tests::GeneratedData_Dir + "/generated.bam";
+const string generatedSamFn    = tests::GeneratedData_Dir + "/generated.sam";
+const vector<string> generatedFiles = { generatedBamFn, generatedSamFn };
+
+static inline
+int RunBam2Sam(const string& bamFn,
+               const string& samFn,
+               const string& args = string())
+{
+    stringstream s;
+    s << tests::Bam2Sam << " " << args << " " << bamFn << " > " << samFn;
+    return system(s.str().c_str());
+}
+
+static inline
+int RunDiff(const string& fn1, const string& fn2)
+{
+    stringstream s;
+    s << "diff " << fn1 << " " << fn2;
+    return system(s.str().c_str());
+}
+
+static inline
+void Remove(const vector<string>& files)
+{
+    for (const auto& fn : files)
+        remove(fn.c_str());
+}
+
+static inline
+void CheckGeneratedOutput(void)
+{
+    // convert to sam & diff against gold standard
+    const int convertRet = RunBam2Sam(generatedBamFn, generatedSamFn);
+    const int diffRet    = RunDiff(goldStandardSamFn, generatedSamFn);
+    EXPECT_EQ(0, convertRet);
+    EXPECT_EQ(0, diffRet);
+
+    // clean up
+    Remove(generatedFiles);
+}
+
+} // namespace tests
+} // namespace BAM
+} // namespace PacBio
+
+// sanity check for rest of tests below
+TEST(EndToEndTest, ReadAndWrite_PureHtslib)
+{
+    { // scoped to force flush & close before conversion/diff
+
+        // open files
+
+        unique_ptr<samFile, SamFileDeleter> inWrapper(sam_open(inputBamFn.c_str(), "r"));
+        samFile* in = inWrapper.get();
+        ASSERT_TRUE(in);
+
+        unique_ptr<samFile, SamFileDeleter> outWrapper(sam_open(generatedBamFn.c_str(), "wb"));
+        samFile* out = outWrapper.get();
+        ASSERT_TRUE(out);
+
+        // fetch & write header
+
+        unique_ptr<bam_hdr_t, BamHdrDeleter> headerWrapper(sam_hdr_read(in));
+        bam_hdr_t* hdr = headerWrapper.get();
+        ASSERT_TRUE(hdr);
+        ASSERT_EQ(0, sam_hdr_write(out, hdr));
+
+        // fetch & write records
+
+        unique_ptr<bam1_t, Bam1Deleter> record(bam_init1());
+        bam1_t* b = record.get();
+        ASSERT_TRUE(b);
+
+        while (sam_read1(in, hdr, b) >= 0)
+            sam_write1(out, hdr, b);
+    }
+
+    CheckGeneratedOutput();
+}
+
+TEST(EndToEndTest, ReadAndWrite_SingleThread)
+{
+    EXPECT_NO_THROW(
+    {
+        // open input BAM file
+        BamFile bamFile(tests::inputBamFn);
+
+        // open output BAM file
+        BamWriter writer(tests::generatedBamFn, bamFile.Header(), BamWriter::DefaultCompression, 1);
+
+        // copy BAM file
+        EntireFileQuery entireFile(bamFile);
+        for (const BamRecord& record : entireFile)
+            writer.Write(record);
+    });
+
+    CheckGeneratedOutput();
+}
+
+TEST(EndToEndTest, ReadAndWrite_APIDefaultThreadCount)
+{
+    EXPECT_NO_THROW(
+    {
+        // open input BAM file
+        BamFile bamFile(inputBamFn);
+
+        // open output BAM file
+        BamWriter writer(generatedBamFn, bamFile.Header());
+
+        // copy BAM file
+        EntireFileQuery entireFile(bamFile);
+        for (const BamRecord& record : entireFile)
+            writer.Write(record);
+    });
+
+    CheckGeneratedOutput();
+}
+
+TEST(EndToEndTest, ReadAndWrite_SystemDefaultThreadCount)
+{
+    EXPECT_NO_THROW(
+    {
+        // open input BAM file
+        BamFile bamFile(inputBamFn);
+
+        // open output BAM file
+        BamWriter writer(generatedBamFn,
+                         bamFile.Header(),
+                         BamWriter::DefaultCompression,
+                         0);
+
+        // copy BAM file
+        EntireFileQuery entireFile(bamFile);
+        for (const BamRecord& record : entireFile)
+            writer.Write(record);
+    });
+
+    CheckGeneratedOutput();
+}
+
+TEST(EndToEndTest, ReadAndWrite_UserThreadCount)
+{
+    EXPECT_NO_THROW(
+    {
+        // open input BAM file
+        BamFile bamFile(inputBamFn);
+
+        // open output BAM file
+        BamWriter writer(generatedBamFn,
+                         bamFile.Header(),
+                         BamWriter::DefaultCompression,
+                         3);
+
+        // copy BAM file
+        EntireFileQuery entireFile(bamFile);
+        for (const BamRecord& record : entireFile)
+            writer.Write(record);
+    });
+
+    CheckGeneratedOutput();
+}
diff --git a/tests/src/test_EntireFileQuery.cpp b/tests/src/test_EntireFileQuery.cpp

new file mode 100644 (file)

index 0000000..47c25db
--- /dev/null
+++ b/tests/src/test_EntireFileQuery.cpp
@@ -0,0 +1,137 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/BamWriter.h>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+const string inputBamFn = tests::Data_Dir + "/aligned.bam";
+
+TEST(EntireFileQueryTest, CountRecords)
+{
+    EXPECT_NO_THROW(
+    {
+        BamFile bamFile(inputBamFn);
+        int count = 0;
+        EntireFileQuery entireFile(bamFile);
+        for (const BamRecord& record : entireFile) {
+            (void)record;
+            ++count;
+        }
+
+        EXPECT_EQ(4, count);
+    });
+}
+
+TEST(EntireFileQueryTest, NonConstBamRecord)
+{
+    EXPECT_NO_THROW(
+    {
+        BamFile bamFile(inputBamFn);
+        int count = 0;
+        EntireFileQuery entireFile(bamFile);
+        for (BamRecord& record : entireFile) {
+            (void)record;
+            ++count;
+        }
+
+        EXPECT_EQ(4, count);
+    });
+}
+
+TEST(BamRecordTest, HandlesDeletionOK)
+{
+    // this file raised no error in Debug mode, but segfaulted when
+    // trying to access the aligned qualities in Release mode
+
+    const string problemBamFn = tests::Data_Dir + "/segfault.bam";
+    BamFile bamFile(problemBamFn);
+    int count = 0;
+    EntireFileQuery entireFile(bamFile);
+    for (const BamRecord& record : entireFile) {
+
+        const auto rawQualities     = record.Qualities(Orientation::GENOMIC, false);
+        const auto alignedQualities = record.Qualities(Orientation::GENOMIC, true);
+
+        const string rawExpected =
+            "IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII";
+
+        // 1=1D98=
+        const string alignedExpected =
+            "I!IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII";
+
+        EXPECT_EQ(rawExpected,     rawQualities.Fastq());
+        EXPECT_EQ(alignedExpected, alignedQualities.Fastq());
+
+        ++count;
+    }
+
+    EXPECT_EQ(1, count);
+}
+
+
+TEST(BamRecordTest, ReferenceName)
+{
+    {   // check reference name of first record
+        const string exampleBam  = tests::Data_Dir + "/aligned.bam";
+        BamFile bamFile(exampleBam);
+        EntireFileQuery records(bamFile);
+        auto firstIter = records.begin();
+        auto& firstRecord = *firstIter;
+        ASSERT_TRUE(firstRecord.IsMapped());
+        EXPECT_EQ("lambda_NEB3011", firstRecord.ReferenceName());
+    }
+
+    {   // unmapped records have no reference name, should throw
+        const string exampleBam  = tests::Data_Dir + "/unmap1.bam";
+        BamFile bamFile(exampleBam);
+        EntireFileQuery records(bamFile);
+        auto firstIter = records.begin();
+        auto& firstRecord = *firstIter;
+        ASSERT_FALSE(firstRecord.IsMapped());
+        EXPECT_THROW(firstRecord.ReferenceName(), std::runtime_error);
+    }
+}
diff --git a/tests/src/test_Fasta.cpp b/tests/src/test_Fasta.cpp

new file mode 100644 (file)

index 0000000..25b0390
--- /dev/null
+++ b/tests/src/test_Fasta.cpp
@@ -0,0 +1,105 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <pbbam/FastaReader.h>
+#include <pbbam/FastaSequence.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+static void CheckSequence(const size_t index, const FastaSequence& seq)
+{
+    SCOPED_TRACE("checking FASTA seq:" + std::to_string(index));
+    switch (index) {
+        case 0 :
+            EXPECT_EQ("1", seq.Name());
+            EXPECT_EQ("TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACAACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG", seq.Bases());
+            break;
+
+        case 1 :
+            EXPECT_EQ("2", seq.Name());
+            EXPECT_EQ("TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACAACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAAC", seq.Bases());
+            break;
+
+        case 2 :
+            EXPECT_EQ("3", seq.Name());
+            EXPECT_EQ("TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCT", seq.Bases());
+            break;
+
+        default:
+            ASSERT_TRUE(false); // invalid index
+    }
+}
+
+TEST(FastaSequenceTest, BasicConstructorOk)
+{
+    FastaSequence seq{ "1", "GATTACA" };
+    EXPECT_EQ("1",       seq.Name());
+    EXPECT_EQ("GATTACA", seq.Bases());
+}
+
+TEST(FastaReaderTest, IterableOk)
+{
+    const string fn = tests::GeneratedData_Dir + "/normal.fa";
+    FastaReader reader{ fn };
+
+    size_t count = 0;
+    FastaSequence seq;
+    while (reader.GetNext(seq)) {
+        CheckSequence(count, seq);
+        ++count;
+    }
+    EXPECT_EQ(3, count);
+}
+
+TEST(FastaReaderTest, ReadAllOk)
+{
+    const string fn = tests::GeneratedData_Dir + "/normal.fa";
+
+    size_t count = 0;
+    for (const auto& seq : FastaReader::ReadAll(fn)) {
+        CheckSequence(count, seq);
+        ++count;
+    }
+    EXPECT_EQ(3, count);
+}
diff --git a/tests/src/test_FileUtils.cpp b/tests/src/test_FileUtils.cpp

new file mode 100644 (file)

index 0000000..c1b4beb
--- /dev/null
+++ b/tests/src/test_FileUtils.cpp
@@ -0,0 +1,326 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <pbbam/../../src/FileUtils.h>
+#include <pbbam/../../src/TimeUtils.h>
+
+#include <boost/algorithm/string.hpp>
+
+#include <chrono>
+#include <string>
+#include <vector>
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+
+#include <iostream>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+TEST(FileUtilsTest, ExistsOk)
+{
+    EXPECT_FALSE(FileUtils::Exists("does_not_exist.txt"));
+
+    const string tmp = tests::GeneratedData_Dir + "/pbbam_exists_check.tmp";
+    const string cmd = string("touch ") + tmp;
+    ASSERT_EQ(0, system(cmd.c_str()));
+    EXPECT_TRUE(FileUtils::Exists(tmp));
+}
+
+TEST(FileUtilsTest, LastModifiedOk)
+{
+    // a little tricky to check without going a full 'mock' filesystem route, but we can approximate
+    //
+    // also, I can't seem to get better than second resolution (on OSX 10.9/clang at least, st_mtimespec.tv_nsec is always zero)
+
+    const auto now = CurrentTime();
+    const auto nowDuration = now.time_since_epoch();
+    const auto nowSeconds = chrono::duration_cast<chrono::seconds>(nowDuration).count();
+
+    const string tmp = tests::GeneratedData_Dir + "/pbbam_lastmod_check.tmp";
+    const string rmCmd = string("rm ") + tmp;
+    const string touchCmd = string("touch  ") + tmp;
+    int ret =  system(rmCmd.c_str());
+    (void)ret; // unused
+    ASSERT_EQ(0, system(touchCmd.c_str()));
+
+    const auto stamp = FileUtils::LastModified(tmp);
+    const auto stampDuration = stamp.time_since_epoch();
+    const auto stampSeconds = chrono::duration_cast<chrono::seconds>(stampDuration).count();
+
+    EXPECT_LE(nowSeconds, stampSeconds);
+}
+
+TEST(FileUtilsTest, ResolvedFilePathOk)
+{
+    const string testFrom = "/path/to/myDir";
+
+    // "raw" filenames - no URI scheme
+
+    const string absolutePath = "/absolute/path/to/file.txt";
+    const string relativePath = "../relative/path/to/file.txt";
+    const string noPathFn     = "file.txt";
+    
+    const string resolvedAbsolutePath = FileUtils::ResolvedFilePath(absolutePath, testFrom);
+    const string resolvedRelativePath = FileUtils::ResolvedFilePath(relativePath, testFrom);
+    const string resolvedNoPath       = FileUtils::ResolvedFilePath(noPathFn, testFrom);
+    const string resolvedAbsolutePath_defaultFrom = FileUtils::ResolvedFilePath(absolutePath);
+    const string resolvedRelativePath_defaultFrom = FileUtils::ResolvedFilePath(relativePath);
+    const string resolvedNoPath_defaultFrom       = FileUtils::ResolvedFilePath(noPathFn);
+
+    EXPECT_EQ("/absolute/path/to/file.txt",                  resolvedAbsolutePath);
+    EXPECT_EQ("/path/to/myDir/../relative/path/to/file.txt", resolvedRelativePath);
+    EXPECT_EQ("/path/to/myDir/file.txt",                     resolvedNoPath);
+
+    EXPECT_EQ("/absolute/path/to/file.txt",     resolvedAbsolutePath_defaultFrom);
+    EXPECT_EQ("./../relative/path/to/file.txt", resolvedRelativePath_defaultFrom);
+    EXPECT_EQ("./file.txt",                     resolvedNoPath_defaultFrom);
+
+    // filenames with URI scheme ("file://")
+
+    const string absoluteSchemeFn = "file:///absolute/path/to/file.txt";
+    const string relativeSchemeFn = "file://../relative/path/to/file.txt";
+    const string noPathSchemeFn   = "file://file.txt";
+
+    const string resolvedAbsoluteSchemePath = FileUtils::ResolvedFilePath(absoluteSchemeFn, testFrom);
+    const string resolvedRelativeSchemePath = FileUtils::ResolvedFilePath(relativeSchemeFn, testFrom);
+    const string resolvedNoPathSchemeFn     = FileUtils::ResolvedFilePath(noPathSchemeFn, testFrom);
+    const string resolvedAbsoluteSchemePath_defaultFrom = FileUtils::ResolvedFilePath(absoluteSchemeFn);
+    const string resolvedRelativeSchemePath_defaultFrom = FileUtils::ResolvedFilePath(relativeSchemeFn);
+    const string resolvedNoPathSchemeFn_defaultFrom     = FileUtils::ResolvedFilePath(noPathSchemeFn);
+
+    EXPECT_EQ("/absolute/path/to/file.txt",                  resolvedAbsoluteSchemePath);
+    EXPECT_EQ("/path/to/myDir/../relative/path/to/file.txt", resolvedRelativeSchemePath);
+    EXPECT_EQ("/path/to/myDir/file.txt",                     resolvedNoPathSchemeFn);
+
+    EXPECT_EQ("/absolute/path/to/file.txt",                  resolvedAbsoluteSchemePath_defaultFrom);
+    EXPECT_EQ("./../relative/path/to/file.txt", resolvedRelativeSchemePath_defaultFrom);
+    EXPECT_EQ("./file.txt",                     resolvedNoPathSchemeFn_defaultFrom);
+}
+
+TEST(FileUtilsTest, SizeOk)
+{
+    const string tmp = tests::GeneratedData_Dir + "/pbbam_empty_file.tmp";
+    const string cmd = string("touch ") + tmp;
+    ASSERT_EQ(0, system(cmd.c_str()));
+    EXPECT_EQ(0, FileUtils::Size(tmp));
+
+    EXPECT_THROW(FileUtils::Size("does_not_exist.txt"), std::runtime_error);
+}
+
+// ####################################################################################################
+// The code below is part of a simple check whether or not a (Windows-only) file path is absolute.
+//
+// NOTE: (and this is admittedly brittle for maintenance, but) the internal methods used are literally
+// copied here for direct driving. There's likely a better way going forward, than the manual copy/paste.
+// But in the absence of a similar runtime environment to build in & test against, while
+// the motivating behavior is blocking other work, this lets me get the fix in their hands ASAP and still
+// have some test code poking it beforehand. -DB
+//
+namespace test_windows {
+
+static string removeFileUriScheme(const string& uri)
+{
+    assert(!uri.empty());
+
+    auto schemeLess = uri;
+    const auto fileScheme = string{"file://"};
+    const auto schemeFound = schemeLess.find(fileScheme);
+    if (schemeFound != string::npos) {
+        if (schemeFound != 0)
+            throw runtime_error("Malformed URI: scheme not at beginning");
+        schemeLess = schemeLess.substr(fileScheme.size());
+    }
+    return schemeLess;
+}
+
+static
+string removeDiskName(const string& filePath)
+{
+    if (filePath.size() >= 2) {
+        const char firstChar = filePath.at(0);
+        if ((isalpha(firstChar) != 0) && (filePath.at(1) == ':'))
+            return filePath.substr(2);
+    }
+    return filePath;
+}
+
+static const char native_pathSeparator = '\\';
+
+static bool native_pathIsAbsolute(const string& filePath)
+{
+    assert(!filePath.empty());
+
+    // if starts with single slash or double slash [cases 1,3]
+    if (boost::algorithm::starts_with(filePath, "\\"))
+        return true;
+
+    // if starts with single or double-dots -> not absolute [case 4 + ".\file.txt"]
+    if (boost::algorithm::starts_with(filePath, "."))
+        return false;
+
+    // if starts with drive name and colon ("C:\foo\bar.txt")
+    if (filePath.size() >= 2) {
+        const char firstChar = filePath.at(0);
+        if ((isalpha(firstChar) != 0) && (filePath.at(1) == ':'))
+            return native_pathIsAbsolute(removeDiskName(filePath));
+    }
+
+    // otherwise, likely relative
+    return false;
+}
+
+static string native_resolvedFilePath(const string& filePath,
+                                      const string& from)
+{
+    // strip file:// scheme if present
+    auto schemeLess = removeFileUriScheme(filePath);
+
+    // if empty or already absolute path, just return it
+    // upfront empty check simplifies further parsing logic
+    if (schemeLess.empty() || native_pathIsAbsolute(schemeLess))
+        return schemeLess;
+
+    // else make relative from the provided 'from' directory
+    //
+    // first pop disk name, then any leading single-dot '.'
+    //
+    // since we're prepending the 'from' directory, we can remove
+    // any leading './' form our file path. this may just mean that
+    // we pop it off to add it right back (when from == '.'), but this
+    // keeps it consistent with other 'from' parent directories
+    //
+    schemeLess = removeDiskName(schemeLess);
+
+    const bool thisDirAtStart = (schemeLess.find(".") == 0);
+    if (thisDirAtStart) {
+        if (schemeLess.find(native_pathSeparator) == 1)
+            schemeLess = schemeLess.substr(2);
+    }
+    return from + native_pathSeparator + schemeLess;
+}
+
+} // namespace test_windows
+
+TEST(FileUtilsTest, WindowsPathsOk)
+{
+    { // remove disk name
+
+        // "C:\tmp.txt"
+        string f1 = "C:\\tmp.txt";
+        EXPECT_EQ(string("\\tmp.txt"), test_windows::removeDiskName(f1));
+
+        // "C:tmp.txt"
+        string f2 = "C:tmp.txt";
+        EXPECT_EQ(string("tmp.txt"), test_windows::removeDiskName(f2));
+
+        // "\tmp.txt"
+        string f3 = "\\tmp.txt";
+        EXPECT_EQ(f3, test_windows::removeDiskName(f3));
+
+        // "tmp.txt"
+        string f4 = "tmp.txt";
+        EXPECT_EQ(f4, test_windows::removeDiskName(f4));
+    }
+
+    { // isAbsolute ?
+
+        // "\\server\path\to\tmp.txt"
+        EXPECT_TRUE(test_windows::native_pathIsAbsolute("\\\\server\\path\\to\tmp.txt"));
+
+        // "..\tmp.txt"
+        EXPECT_FALSE(test_windows::native_pathIsAbsolute("..\\tmp.txt"));
+
+        // ".\tmp.txt"
+        EXPECT_FALSE(test_windows::native_pathIsAbsolute(".\\tmp.txt"));
+
+        // "C:\path\to\tmp.txt"
+        EXPECT_TRUE(test_windows::native_pathIsAbsolute("C:\\path\\to\\tmp.txt"));
+
+        // "C:..\path\to\tmp.txt"
+        EXPECT_FALSE(test_windows::native_pathIsAbsolute("C:..\\path\\to\\tmp.txt"));
+    }
+
+    { // resolve file path
+
+        const string myRootDir = "C:\\path\\to\\myRootDir";
+
+        // "\\server\path\to\tmp.txt"
+        const string fn1 = "\\\\server\\path\\to\tmp.txt";
+        const string fn1_expected  = fn1;
+        EXPECT_EQ(fn1_expected, test_windows::native_resolvedFilePath(fn1, myRootDir));
+
+        // "..\tmp.txt"
+        const string fn2 = "..\\tmp.txt";
+        const string fn2_expected = "C:\\path\\to\\myRootDir\\..\\tmp.txt";
+        EXPECT_EQ(fn2_expected, test_windows::native_resolvedFilePath(fn2, myRootDir));
+
+        // ".\tmp.txt"
+        const string fn3 = ".\\tmp.txt";
+        const string fn3_expected = "C:\\path\\to\\myRootDir\\tmp.txt";
+        EXPECT_EQ(fn3_expected, test_windows::native_resolvedFilePath(fn3, myRootDir));
+
+        // "C:\path\to\tmp.txt"
+        const string fn4 = "C:\\path\\to\\tmp.txt";
+        const string fn4_expected  = fn4;
+        EXPECT_EQ(fn4_expected, test_windows::native_resolvedFilePath(fn4, myRootDir));
+
+        // "C:..\path\to\tmp.txt"
+        const string fn5 = "C:..\\path\\to\\tmp.txt";
+        const string fn5_expected = "C:\\path\\to\\myRootDir\\..\\path\\to\\tmp.txt";
+        EXPECT_EQ(fn5_expected, test_windows::native_resolvedFilePath(fn5, myRootDir));
+
+        // "C:tmp.txt"
+        const string fn6 = "C:tmp.txt";
+        const string fn6_expected = "C:\\path\\to\\myRootDir\\tmp.txt";
+        EXPECT_EQ(fn6_expected, test_windows::native_resolvedFilePath(fn6, myRootDir));
+        EXPECT_EQ(fn3_expected, test_windows::native_resolvedFilePath(fn6, myRootDir)); // our path is equivalent to fn3's "./temp.txt"
+    }
+}
+//
+// ####################################################################################################
+
+
diff --git a/tests/src/test_Frames.cpp b/tests/src/test_Frames.cpp

new file mode 100644 (file)

index 0000000..797eb6a
--- /dev/null
+++ b/tests/src/test_Frames.cpp
@@ -0,0 +1,97 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/Frames.h>
+#include <string>
+#include <vector>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace tests {
+
+static const vector<uint16_t> testFrames =
+{
+    0,  8,  140, 0,   0,  7,   4,  0,  85, 2,
+    1,  3,  2,   10,  1,  20,  47, 10, 9,  60,
+    20, 3,  12,  5,   13, 165, 6,  14, 22, 12,
+    2,  4,  9,   218, 27, 3,   15, 2,  17, 2,
+    45, 24, 89,  10,  7,  1,   11, 15, 0,  7,
+    0,  28, 17,  12,  6,  10,  37, 0,  12, 52,
+    0,  7,  1,   14,  3,  26,  12, 0,  20, 17,
+    2,  13, 2,   9,   13, 7,   15, 29, 3,   6,
+    2,  1,  28,  10,  3,  14,  7,  1,  22, 1,
+    6,  6,  0,   19,  31, 6,   2,  14, 0,  0,
+    1000, 947, 948
+};
+
+static const vector<uint8_t> encodedFrames =
+{
+    0,     8,  102,   0,   0,   7,   4,   0,  75,   2,   1,   3,   2,
+    10,    1,   20,  47,  10,   9,  60,  20,   3,  12,   5,  13, 115,
+    6,    14,   22,  12,   2,   4,   9, 135,  27,   3,  15,   2,  17,
+    2,    45,   24,  77,  10,   7,   1,  11,  15,   0,   7,   0,  28,
+    17,   12,    6,  10,  37,   0,  12,  52,   0,   7,   1,  14,   3,
+    26,   12,    0,  20,  17,   2,  13,   2,   9,  13,   7,  15,  29,
+    3,     6,    2,   1,  28,  10,   3,  14,   7,   1,  22,   1,   6,
+    6,     0,   19,  31,   6,   2,  14,   0,   0,
+    255, 254,  255
+};
+
+} // namespace tests
+
+TEST(FramesTest, Constructors)
+{
+    const Frames f;
+    ASSERT_TRUE(f.Data().empty());
+
+    const Frames f2(tests::testFrames);
+    const auto d = f2.Data();
+    ASSERT_EQ(tests::testFrames, d);
+}
+
+TEST(FramesTest, Encoded)
+{
+    const Frames f(tests::testFrames);
+    const auto e = f.Encode();
+    ASSERT_EQ(tests::encodedFrames, e);
+}
diff --git a/tests/src/test_GenomicIntervalQuery.cpp b/tests/src/test_GenomicIntervalQuery.cpp

new file mode 100644 (file)

index 0000000..96727d7
--- /dev/null
+++ b/tests/src/test_GenomicIntervalQuery.cpp
@@ -0,0 +1,157 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.\r
+//\r
+// All rights reserved.\r
+//\r
+// Redistribution and use in source and binary forms, with or without\r
+// modification, are permitted (subject to the limitations in the\r
+// disclaimer below) provided that the following conditions are met:\r
+//\r
+//  * Redistributions of source code must retain the above copyright\r
+//    notice, this list of conditions and the following disclaimer.\r
+//\r
+//  * Redistributions in binary form must reproduce the above\r
+//    copyright notice, this list of conditions and the following\r
+//    disclaimer in the documentation and/or other materials provided\r
+//    with the distribution.\r
+//\r
+//  * Neither the name of Pacific Biosciences nor the names of its\r
+//    contributors may be used to endorse or promote products derived\r
+//    from this software without specific prior written permission.\r
+//\r
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE\r
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC\r
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED\r
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES\r
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\r
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS\r
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF\r
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND\r
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\r
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF\r
+// SUCH DAMAGE.\r
+\r
+// Author: Derek Barnett\r
+\r
+#ifdef PBBAM_TESTING\r
+#define private public\r
+#endif\r
+\r
+#include "TestData.h"\r
+#include <gtest/gtest.h>\r
+#include <pbbam/GenomicIntervalQuery.h>\r
+#include <iostream>\r
+#include <string>\r
+using namespace PacBio;\r
+using namespace PacBio::BAM;\r
+using namespace std;\r
+\r
+const string inputBamFn = tests::Data_Dir + "/aligned.bam";\r
+\r
+TEST(GenomicIntervalQueryTest, ReuseQueryAndCountRecords)\r
+{\r
+    const string rname = "lambda_NEB3011";\r
+\r
+    BamFile bamFile(inputBamFn);\r
+\r
+    // setup with normal interval\r
+    int count = 0;\r
+    GenomicInterval interval(rname, 5000, 6000);\r
+    GenomicIntervalQuery query(interval, bamFile);\r
+    for (const BamRecord& record : query) {\r
+        (void)record;\r
+        ++count;\r
+    }\r
+    EXPECT_EQ(2, count);\r
+\r
+    // adjust interval and pass back in\r
+    count = 0;\r
+    interval.Start(9300);\r
+    interval.Stop(9400);\r
+    query.Interval(interval);\r
+    for (const BamRecord& record : query) {\r
+        (void)record;\r
+        ++count;\r
+    }\r
+    EXPECT_EQ(2, count);\r
+\r
+    // adjust again (empty region)\r
+    count = 0;\r
+    interval.Name(rname);\r
+    interval.Start(1000);\r
+    interval.Stop(2000);\r
+    query.Interval(interval);\r
+    for (const BamRecord& record : query) {\r
+        (void)record;\r
+        ++count;\r
+    }\r
+    EXPECT_EQ(0, count);\r
+\r
+    // unknown ref\r
+    count = 0;\r
+    interval.Name("does not exist");\r
+    interval.Start(0);\r
+    interval.Stop(100);\r
+    EXPECT_THROW(query.Interval(interval), std::runtime_error);\r
+    for (const BamRecord& record : query) { // iteration is still safe, just returns no data\r
+        (void)record;\r
+        ++count;\r
+    }\r
+    EXPECT_EQ(0, count);\r
+\r
+    // adjust again - make sure we can read a real region after an invalid one\r
+    interval.Name(rname);\r
+    interval.Start(5000);\r
+    interval.Stop(6000);\r
+    query.Interval(interval);\r
+    count = 0;\r
+    for (const BamRecord& record : query) {\r
+        (void)record;\r
+        ++count;\r
+    }\r
+    EXPECT_EQ(2, count);\r
+}\r
+\r
+TEST(GenomicIntervalQueryTest, NonConstBamRecord)\r
+{\r
+    EXPECT_NO_THROW(\r
+    {\r
+        BamFile bamFile(inputBamFn);\r
+        int count = 0;\r
+\r
+        GenomicInterval interval("lambda_NEB3011", 8000, 10000);\r
+        GenomicIntervalQuery query(interval, bamFile);\r
+        for (BamRecord& record : query) {\r
+            (void)record;\r
+            ++count;\r
+        }\r
+        EXPECT_EQ(2, count);\r
+    });\r
+}\r
+\r
+TEST(GenomicIntervalQueryTest,  MissingBaiShouldThrow)\r
+{\r
+    GenomicInterval interval("lambda_NEB3011", 0, 100);\r
+    const string phi29Bam = tests::Data_Dir + "/phi29.bam";\r
+    const string hasBaiBam = tests::Data_Dir + "/aligned.bam";\r
+\r
+    {   // single file, missing BAI\r
+        EXPECT_THROW(GenomicIntervalQuery query(interval, phi29Bam), std::runtime_error);\r
+    }\r
+\r
+    {   // from dataset, all missing BAI\r
+        DataSet ds;\r
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam));\r
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam));\r
+        EXPECT_THROW(GenomicIntervalQuery query(interval, ds), std::runtime_error);\r
+    }\r
+\r
+    {   // from dataset, mixed BAI presence\r
+        DataSet ds;\r
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam));\r
+        ds.ExternalResources().Add(ExternalResource("PacBio.AlignmentFile.AlignmentBamFile", hasBaiBam));\r
+        EXPECT_THROW(GenomicIntervalQuery query(interval, ds), std::runtime_error);\r
+    }\r
+}\r
diff --git a/tests/src/test_IndexedFastaReader.cpp b/tests/src/test_IndexedFastaReader.cpp

new file mode 100644 (file)

index 0000000..68a0331
--- /dev/null
+++ b/tests/src/test_IndexedFastaReader.cpp
@@ -0,0 +1,212 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include "TestData.h"
+#include "pbbam/IndexedFastaReader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/EntireFileQuery.h"
+#include <gtest/gtest.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+const string lambdaFasta = tests::Data_Dir + "/lambdaNEB.fa";
+const string singleInsertionBam = tests::Data_Dir + "/aligned.bam";
+
+TEST(IndexedFastaReaderTests, PrintSingleInsertion)
+{
+    IndexedFastaReader r(lambdaFasta);
+
+    // Open BAM file
+    BamFile bamFile(singleInsertionBam);
+    EntireFileQuery bamQuery(bamFile);
+
+    auto it = bamQuery.begin();
+    auto record = *it++;
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+        r.ReferenceSubsequence(record, Orientation::NATIVE, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+        r.ReferenceSubsequence(record, Orientation::NATIVE, true, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+        r.ReferenceSubsequence(record, Orientation::GENOMIC, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+        r.ReferenceSubsequence(record, Orientation::GENOMIC, true, true));
+    record = *it++;
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+        r.ReferenceSubsequence(record, Orientation::NATIVE, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+        r.ReferenceSubsequence(record, Orientation::NATIVE, true, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+        r.ReferenceSubsequence(record, Orientation::GENOMIC, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+        r.ReferenceSubsequence(record, Orientation::GENOMIC, true, true));
+    record = *it++;
+    EXPECT_EQ("----------------------------------------------------AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA",
+        r.ReferenceSubsequence(record, Orientation::NATIVE, true));
+    EXPECT_EQ("AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA",
+        r.ReferenceSubsequence(record, Orientation::NATIVE, true, true));
+    EXPECT_EQ("----------------------------------------------------AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA",
+        r.ReferenceSubsequence(record, Orientation::GENOMIC, true));
+    EXPECT_EQ("AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA",
+        r.ReferenceSubsequence(record, Orientation::GENOMIC, true, true));
+    record = *it++;
+    EXPECT_EQ("AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA----------------------------------------------------",
+        r.ReferenceSubsequence(record, Orientation::GENOMIC, true));
+    EXPECT_EQ("----------------------------------------------------TTGCCGCTGTT-ACCGTGCTGCGATCTTCTGCCATCGACGGACGTCCCACATTGGTGACTT",
+        r.ReferenceSubsequence(record, Orientation::NATIVE, true));
+    EXPECT_EQ("AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA",
+        r.ReferenceSubsequence(record, Orientation::GENOMIC, true, true));
+    EXPECT_EQ("TTGCCGCTGTT-ACCGTGCTGCGATCTTCTGCCATCGACGGACGTCCCACATTGGTGACTT",
+        r.ReferenceSubsequence(record, Orientation::NATIVE, true, true));
+
+    // {
+    //     std::stringstream output;
+    //     auto itSS = bamQuery.begin();
+    //     {
+    //         const auto recordSS = *itSS;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::NATIVE, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::NATIVE, true, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::GENOMIC, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::GENOMIC, true, true) << std::endl;
+    //         output << std::endl;
+    //     }
+    //     ++itSS;
+    //     {
+    //         const auto recordSS = *itSS;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::NATIVE, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::NATIVE, true, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::GENOMIC, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::GENOMIC, true, true) << std::endl;
+    //         output << std::endl;
+    //     }
+    //     ++itSS;
+    //     {
+    //         const auto recordSS = *itSS;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::NATIVE, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::NATIVE, true, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::GENOMIC, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::GENOMIC, true, true) << std::endl;
+    //         output << std::endl;
+    //     }
+    //     ++itSS;
+    //     {
+    //         const auto recordSS = *itSS;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::GENOMIC, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::NATIVE, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::GENOMIC, true, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::NATIVE, true, true) << std::endl;
+    //     }
+    //     std::cerr << output.str();
+    // }
+}
+
+TEST(IndexedFastaReaderTests, ReadLambda)
+{
+    IndexedFastaReader r(lambdaFasta);
+
+    EXPECT_TRUE(r.HasSequence("lambda_NEB3011"));
+    EXPECT_FALSE(r.HasSequence("dog"));
+    EXPECT_EQ(1, r.NumSequences());
+    EXPECT_EQ(48502, r.SequenceLength("lambda_NEB3011"));
+
+    string seq = r.Subsequence("lambda_NEB3011:0-10");
+    EXPECT_EQ("GGGCGGCGAC", seq);
+
+    string seq2 = r.Subsequence("lambda_NEB3011", 0, 10);
+    EXPECT_EQ("GGGCGGCGAC", seq2);
+
+    // subsequence extending beyond bounds returns clipped
+    string seq3 = r.Subsequence("lambda_NEB3011", 48400, 48600);
+    EXPECT_EQ(102, seq3.length());
+
+    // bad subsequence
+}
+
+TEST(IndexedFastaReaderTests, Errors)
+{
+    IndexedFastaReader r(lambdaFasta);
+
+    //
+    // attempt access without "opening"
+    //
+    // EXPECT_THROW(r.NumSequences(), std::exception);
+    // EXPECT_THROW(r.HasSequence("lambda_NEB3011"), std::exception);
+    // EXPECT_THROW(r.SequenceLength("lambda_NEB3011"), std::exception);
+    // EXPECT_THROW(r.Subsequence("lambda_NEB3011:0-10"), std::exception);
+
+    //
+    // invalid accesses after opening
+    //
+    EXPECT_THROW(r.SequenceLength("dog"), std::exception);
+    EXPECT_THROW(r.Subsequence("dog:0-10"), std::exception);
+}
diff --git a/tests/src/test_Intervals.cpp b/tests/src/test_Intervals.cpp

new file mode 100644 (file)

index 0000000..14231b1
--- /dev/null
+++ b/tests/src/test_Intervals.cpp
@@ -0,0 +1,330 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/GenomicInterval.h>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+TEST(IntervalTest, Constructors)
+{
+    Interval<Position> empty;
+    Interval<Position> singleton(4);
+    Interval<Position> normal(5, 8);
+
+    EXPECT_EQ(0, empty.Start());
+    EXPECT_EQ(0, empty.Stop());
+
+    EXPECT_EQ(4, singleton.Start());
+    EXPECT_EQ(5, singleton.Stop());
+
+    EXPECT_EQ(5, normal.Start());
+    EXPECT_EQ(8, normal.Stop());
+
+    // TODO: check out-of-order intervals, etc
+}
+
+TEST(IntervalTest, EqualityTest)
+{
+    Interval<Position> empty;
+    Interval<Position> empty2;
+
+    Interval<Position> singleton(4);
+    Interval<Position> sameAsSingleton(4,5);
+
+    Interval<Position> normal(5, 8);
+    Interval<Position> sameAsNormal(5, 8);
+
+    Interval<Position> different(20, 40);
+
+    // self-equality
+    EXPECT_TRUE(empty == empty);
+    EXPECT_TRUE(singleton == singleton);
+    EXPECT_TRUE(normal == normal);
+    EXPECT_TRUE(different == different);
+
+    // same values equality
+    EXPECT_TRUE(empty == empty2);
+    EXPECT_TRUE(singleton == sameAsSingleton);
+    EXPECT_TRUE(normal == sameAsNormal);
+
+    // different values
+    EXPECT_FALSE(empty == singleton);
+    EXPECT_FALSE(empty == normal);
+    EXPECT_FALSE(empty == different);
+    EXPECT_FALSE(singleton == normal);
+    EXPECT_FALSE(normal == different);
+}
+
+TEST(IntervalTest, Copy)
+{
+    Interval<Position> interval1(5, 8);
+    Interval<Position> interval2(interval1);
+    Interval<Position> interval3 = interval1;
+
+    EXPECT_TRUE(interval1 == interval1);
+    EXPECT_TRUE(interval1 == interval2);
+    EXPECT_TRUE(interval1 == interval3);
+}
+
+TEST(IntervalTest, Modifier)
+{
+    Interval<Position> interval1(5, 8);
+    Interval<Position> interval2(interval1);
+    interval2.Start(2);
+    interval2.Stop(10);
+
+    EXPECT_FALSE(interval1 == interval2);
+    EXPECT_EQ(2, interval2.Start());
+    EXPECT_EQ(10, interval2.Stop());
+}
+
+TEST(IntervalTest, CoverTest)
+{
+    Interval<Position> interval1(2, 4);
+    Interval<Position> interval2(3, 5);
+    Interval<Position> interval3(6, 8);
+    Interval<Position> interval4(1, 7);
+    Interval<Position> interval5(5, 8);
+
+    EXPECT_TRUE(interval1.Covers(interval1));    // self-cover: a.covers(a)
+    EXPECT_TRUE(interval1.CoveredBy(interval1)); // self-cover: a.coveredBy(a)
+
+    EXPECT_TRUE(interval2.CoveredBy(interval4)); // a.coveredBy(b)
+    EXPECT_TRUE(interval4.Covers(interval2));    // thus b.covers(a)
+    EXPECT_FALSE(interval2 == interval4);        // if a != b
+    EXPECT_FALSE(interval2.Covers(interval4));   // then !a.covers(b)
+
+    EXPECT_FALSE(interval2.Covers(interval3));    // completely disjoint
+    EXPECT_FALSE(interval3.Covers(interval2));
+    EXPECT_FALSE(interval2.CoveredBy(interval3));
+    EXPECT_FALSE(interval3.CoveredBy(interval2));
+
+    EXPECT_FALSE(interval2.Covers(interval5));    // a.stop == b.start
+    EXPECT_FALSE(interval2.CoveredBy(interval5));
+
+    EXPECT_TRUE(interval5.Covers(interval3));    // shared endpoint, start contained, thus a.covers(b)
+    EXPECT_TRUE(interval3.CoveredBy(interval5)); // and b.coveredBy(a)
+}
+
+TEST(IntervalTest, IntersectTest)
+{
+    Interval<Position> interval1(2, 4);
+    Interval<Position> interval2(3, 5);
+    Interval<Position> interval3(6, 8);
+    Interval<Position> interval4(1, 7);
+    Interval<Position> interval5(5, 8);
+
+    EXPECT_TRUE(interval1.Intersects(interval1)); // self-intersection: a.intersects(a)
+
+    EXPECT_TRUE(interval1.Intersects(interval2)); // if a.intersects(b)
+    EXPECT_TRUE(interval2.Intersects(interval1)); // then b.intersects(a)
+
+    EXPECT_TRUE(interval4.Covers(interval1));      // if b.covers(a),
+    EXPECT_TRUE(interval1.Intersects(interval4));  // then a.intersects(b)
+    EXPECT_TRUE(interval4.Intersects(interval1));  // and b.intersects(a)
+
+    EXPECT_FALSE(interval2.Intersects(interval3)); // b.start > a.stop (obvious disjoint)
+    EXPECT_FALSE(interval2.Intersects(interval5)); // b.start == a.stop (intervals are right open, so disjoint)
+}
+
+TEST(IntervalTest, ValidityTest)
+{
+    Interval<Position> interval1;        // default ctor
+    Interval<Position> interval2(0,0);   // start == stop (zero)
+    Interval<Position> interval3(4,4);   // start == stop (nonzero)
+    Interval<Position> interval4(0,1);   // start < stop  (start is zero)
+    Interval<Position> interval5(4,5);   // start < stop  (start is nonzero)
+    Interval<Position> interval6(5,4);   // start > stop
+
+    EXPECT_FALSE(interval1.IsValid());
+    EXPECT_FALSE(interval2.IsValid());
+    EXPECT_FALSE(interval3.IsValid());
+    EXPECT_TRUE(interval4.IsValid());
+    EXPECT_TRUE(interval5.IsValid());
+    EXPECT_FALSE(interval6.IsValid());
+}
+
+TEST(IntervalTest, LengthTest)
+{
+    Interval<Position> interval1(2, 4);
+    Interval<Position> interval2(3, 5);
+    Interval<Position> interval3(6, 8);
+    Interval<Position> interval4(1, 7);
+    Interval<Position> interval5(5, 8);
+
+    EXPECT_EQ(2, interval1.Length());
+    EXPECT_EQ(2, interval2.Length());
+    EXPECT_EQ(2, interval3.Length());
+    EXPECT_EQ(6, interval4.Length());
+    EXPECT_EQ(3, interval5.Length());
+
+    // TODO: check out-of-order intervals, etc
+}
+
+TEST(GenomicIntervalTest, DefaultConstructor)
+{
+    GenomicInterval gi;
+    EXPECT_EQ("", gi.Name());
+    EXPECT_EQ(0,  gi.Start());
+    EXPECT_EQ(0,  gi.Stop());
+}
+
+TEST(GenomicIntervalTest, ExplicitConstructor)
+{
+    GenomicInterval gi("foo", 100, 200);
+    EXPECT_EQ("foo", gi.Name());
+    EXPECT_EQ(100,   gi.Start());
+    EXPECT_EQ(200,   gi.Stop());
+}
+
+TEST(GenomicIntervalTest, RegionStringConstructor)
+{
+    GenomicInterval gi("foo:100-200");
+    EXPECT_EQ("foo", gi.Name());
+    EXPECT_EQ(100,   gi.Start());
+    EXPECT_EQ(200,   gi.Stop());
+
+    GenomicInterval refOnly("foo");
+    EXPECT_EQ("foo", refOnly.Name());
+    EXPECT_EQ(0,     refOnly.Start());
+    EXPECT_EQ(1<<29, refOnly.Stop()); // htslib's default, "read-to-end" interval stop
+}
+
+TEST(GenomicIntervalTest, Copy)
+{
+    GenomicInterval interval1("foo", 10, 20);
+    GenomicInterval interval2(interval1);
+    GenomicInterval interval3 = interval1;
+
+    EXPECT_TRUE(interval1 == interval1);
+    EXPECT_TRUE(interval1 == interval2);
+    EXPECT_TRUE(interval1 == interval3);
+}
+
+TEST(GenomicIntervalTest, Modifiers)
+{
+    GenomicInterval interval1("foo", 10, 20);
+
+    // modify individual properties
+    GenomicInterval interval2(interval1);
+    interval2.Name("bar");
+    interval2.Start(2);
+    interval2.Stop(10);
+
+    // modify interval as a whole
+    GenomicInterval interval3(interval1);
+    interval3.Interval(interval2.Interval());
+
+    EXPECT_FALSE(interval1 == interval2);
+    EXPECT_EQ("bar", interval2.Name());
+    EXPECT_EQ(2,     interval2.Start());
+    EXPECT_EQ(10,    interval2.Stop());
+
+    EXPECT_EQ(interval1.Name(),     interval3.Name());
+    EXPECT_EQ(interval2.Interval(), interval3.Interval());
+}
+
+TEST(GenomicIntervalTest, CoverTest)
+{
+    GenomicInterval interval1("foo", 2, 4);
+    GenomicInterval interval2("foo", 3, 5);
+    GenomicInterval interval3("foo", 6, 8);
+    GenomicInterval interval4("foo", 1, 7);
+    GenomicInterval interval5("foo", 5, 8);
+
+    // same as interval2, but different ref
+    GenomicInterval interval6(interval2);
+    interval6.Name("bar");
+
+    EXPECT_TRUE(interval1.Covers(interval1));    // self-cover: a.covers(a)
+    EXPECT_TRUE(interval1.CoveredBy(interval1)); // self-cover: a.coveredBy(a)
+
+    EXPECT_TRUE(interval2.CoveredBy(interval4)); // a.coveredBy(b)
+    EXPECT_TRUE(interval4.Covers(interval2));    // thus b.covers(a)
+    EXPECT_FALSE(interval2 == interval4);        // if a != b
+    EXPECT_FALSE(interval2.Covers(interval4));   // then !a.covers(b)
+
+    EXPECT_FALSE(interval6.CoveredBy(interval4)); // interval 6 has same start/stop as 2, w/ different ref
+    EXPECT_FALSE(interval4.Covers(interval6));    //
+    EXPECT_FALSE(interval6 == interval4);         //
+    EXPECT_FALSE(interval6.Covers(interval4));    //
+
+    EXPECT_FALSE(interval2.Covers(interval3));    // completely disjoint
+    EXPECT_FALSE(interval3.Covers(interval2));
+    EXPECT_FALSE(interval2.CoveredBy(interval3));
+    EXPECT_FALSE(interval3.CoveredBy(interval2));
+
+    EXPECT_FALSE(interval2.Covers(interval5));    // a.stop == b.start
+    EXPECT_FALSE(interval2.CoveredBy(interval5));
+
+    EXPECT_TRUE(interval5.Covers(interval3));    // shared endpoint, start contained, thus a.covers(b)
+    EXPECT_TRUE(interval3.CoveredBy(interval5)); // and b.coveredBy(a)
+}
+
+TEST(GenomicIntervalTest, ValidityTest)
+{
+    GenomicInterval interval1;            // default ctor
+    GenomicInterval interval2("foo",0,0);     // valid id, start == stop (zero)
+    GenomicInterval interval3("foo",4,4);     // valid id, start == stop (nonzero)
+    GenomicInterval interval4("foo",0,1);     // valid id, start < stop  (start is zero)
+    GenomicInterval interval5("foo",4,5);     // valid id, start < stop  (start is nonzero)
+    GenomicInterval interval6("foo",5,4);     // valid id, start > stop
+    GenomicInterval interval7("",0,0);    // invalid id, start == stop (zero)
+    GenomicInterval interval8("",4,4);    // invalid id, start == stop (nonzero)
+    GenomicInterval interval9("",0,1);    // invalid id, start < stop  (start is zero)
+    GenomicInterval interval10("",4,5);   // invalid id, start < stop  (start is nonzero)
+    GenomicInterval interval11("",5,4);   // invalid id, start > stop
+
+    EXPECT_FALSE(interval1.IsValid());
+    EXPECT_FALSE(interval2.IsValid());
+    EXPECT_FALSE(interval3.IsValid());
+    EXPECT_TRUE(interval4.IsValid());
+    EXPECT_TRUE(interval5.IsValid());
+    EXPECT_FALSE(interval6.IsValid());
+    EXPECT_FALSE(interval7.IsValid());
+    EXPECT_FALSE(interval8.IsValid());
+    EXPECT_FALSE(interval9.IsValid());
+    EXPECT_FALSE(interval10.IsValid());
+    EXPECT_FALSE(interval11.IsValid());
+}
diff --git a/tests/src/test_PacBioIndex.cpp b/tests/src/test_PacBioIndex.cpp

new file mode 100644 (file)

index 0000000..fa17dc7
--- /dev/null
+++ b/tests/src/test_PacBioIndex.cpp
@@ -0,0 +1,1012 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <pbbam/BamFile.h>
+#include <pbbam/BamReader.h>
+#include <pbbam/BamWriter.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/PbiBuilder.h>
+#include <pbbam/PbiIndex.h>
+#include <pbbam/PbiLookupData.h>
+#include <pbbam/PbiRawData.h>
+#include <string>
+#include <cstdio>
+#include <cstdlib>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+const string test2BamFn = tests::Data_Dir + "/aligned2.bam";
+const string phi29BamFn = tests::Data_Dir + "/phi29.bam";
+
+namespace PacBio {
+namespace BAM {
+namespace tests {
+
+static
+PbiRawData Test2Bam_CoreIndexData(void)
+
+{
+    PbiRawData rawData;
+    rawData.Version(PbiFile::Version_3_0_1);
+    rawData.FileSections(PbiFile::BASIC | PbiFile::MAPPED | PbiFile::REFERENCE);
+    rawData.NumReads(10);
+
+    PbiRawBasicData& basicData = rawData.BasicData();
+    basicData.rgId_       = { -1197849594, -1197849594, -1197849594, -1197849594, -1197849594, -1197849594, -1197849594, -1197849594, -1197849594, -1197849594 };
+    basicData.qStart_     = {48,387,0,9936,10232,7468,5557,7285,426,7064};
+    basicData.qEnd_       = {1132,1134,344,10187,10394,8906,7235,8657,1045,7421};
+    basicData.holeNumber_ = {49050,32328,32328,6469,6469,30983,13473,13473,19915,30983};
+    basicData.readQual_   = {0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6};
+    basicData.ctxtFlag_   = {0,0,0,0,0,0,0,0,0,0};
+    basicData.fileOffset_ = { 33816576, 33825163, 33831333, 33834264, 33836542, 33838065, 33849818, 33863499, 33874621, 1392836608 };
+
+    PbiRawMappedData& mappedData = rawData.MappedData();
+    mappedData.tId_       = {0,0,0,0,0,0,0,0,0,0};
+    mappedData.tStart_    = {0,302,675,2170,2203,3572,4506,4507,4592,4669};
+    mappedData.tEnd_      = {471,1019,1026,2397,2326,5015,6125,5850,5203,5011};
+    mappedData.aStart_    = {653,395,1,9960,10271,7468,5574,7285,441,7075};
+    mappedData.aEnd_      = {1129,1134,344,10185,10394,8906,7235,8647,1040,7418};
+    mappedData.revStrand_ = {0,1,0,1,0,1,1,0,1,0};
+    mappedData.nM_        = {460,704,339,216,118,1394,1581,1313,583,333};
+    mappedData.nMM_       = {0,0,0,0,0,0,0,0,0,0};
+    mappedData.mapQV_     = {254,254,254,254,254,254,254,254,254,254};
+
+    PbiRawReferenceData& referenceData = rawData.ReferenceData();
+    referenceData.entries_ = {
+        PbiReferenceEntry{0,0,10},
+        PbiReferenceEntry{4294967295,4294967295,4294967295}
+    };
+
+    return rawData;
+}
+
+// NOTE: We have 2 different sets of offsets because the copied, new file differs in size than the existing one.
+//
+//       Unsure which combination of write parameters were used on the original. Things like thread count,
+//       compression level, etc. can effect compression ratio, BGZF block sizes, etc. even though the BAM record
+//       content itself is equal. So we'll just track these index values separately, for now at least.
+//
+static
+PbiRawData Test2Bam_ExistingIndex(void)
+{
+    PbiRawData index = Test2Bam_CoreIndexData();
+    index.BasicData().fileOffset_ = { 33816576, 33825163, 33831333, 33834264, 33836542, 33838065, 33849818, 33863499, 33874621, 1392836608 };
+    return index;
+}
+
+static
+PbiRawData Test2Bam_NewIndex(void)
+{
+    PbiRawData index = Test2Bam_CoreIndexData();
+    index.BasicData().fileOffset_ = { 33816576, 236126208, 391315456, 469106688, 537067520, 587792384, 867303424, 1182793728, 1449787392, 1582628864 };
+    return index;
+}
+
+static
+void ExpectRawIndicesEqual(const PbiRawData& expected, const PbiRawData& actual)
+{
+    // header data
+    EXPECT_EQ(expected.Version(),      actual.Version());
+    EXPECT_EQ(expected.FileSections(), actual.FileSections());
+    EXPECT_EQ(expected.NumReads(),     actual.NumReads());
+
+    // subread data
+    const PbiRawBasicData& e = expected.BasicData();
+    const PbiRawBasicData& a = actual.BasicData();
+    EXPECT_EQ(e.rgId_,       a.rgId_);
+    EXPECT_EQ(e.qStart_,     a.qStart_);
+    EXPECT_EQ(e.qEnd_,       a.qEnd_);
+    EXPECT_EQ(e.holeNumber_, a.holeNumber_);
+    EXPECT_EQ(e.readQual_,   a.readQual_);
+    EXPECT_EQ(e.ctxtFlag_,   a.ctxtFlag_);
+    EXPECT_EQ(e.fileOffset_, a.fileOffset_);
+
+    // mapped data
+    EXPECT_EQ(expected.HasMappedData(), actual.HasMappedData());
+    if (expected.HasMappedData() && actual.HasMappedData()) {
+        const PbiRawMappedData& e = expected.MappedData();
+        const PbiRawMappedData& a = actual.MappedData();
+        EXPECT_EQ(e.tId_,       a.tId_);
+        EXPECT_EQ(e.tStart_,    a.tStart_);
+        EXPECT_EQ(e.tEnd_,      a.tEnd_);
+        EXPECT_EQ(e.aStart_,    a.aStart_);
+        EXPECT_EQ(e.aEnd_,      a.aEnd_);
+        EXPECT_EQ(e.revStrand_, a.revStrand_);
+        EXPECT_EQ(e.nM_,        a.nM_);
+        EXPECT_EQ(e.nMM_,       a.nMM_);
+        EXPECT_EQ(e.mapQV_,     a.mapQV_);
+    }
+
+    // reference data
+    EXPECT_EQ(expected.HasReferenceData(), actual.HasReferenceData());
+    if (expected.HasReferenceData() && actual.HasReferenceData()) {
+        const PbiRawReferenceData& e = expected.ReferenceData();
+        const PbiRawReferenceData& a = actual.ReferenceData();
+        EXPECT_EQ(e.entries_, a.entries_);
+    }
+
+    // barcode data
+    EXPECT_EQ(expected.HasBarcodeData(), actual.HasBarcodeData());
+    if (expected.HasBarcodeData() && actual.HasBarcodeData()) {
+        const PbiRawBarcodeData& e = expected.BarcodeData();
+        const PbiRawBarcodeData& a = actual.BarcodeData();
+        EXPECT_EQ(e.bcForward_,   a.bcForward_);
+        EXPECT_EQ(e.bcReverse_,  a.bcReverse_);
+        EXPECT_EQ(e.bcQual_,   a.bcQual_);
+    }
+}
+
+static
+bool BasicLookupsEqual(const BasicLookupData& lhs,
+                         const BasicLookupData& rhs)
+{
+    return (lhs.rgId_ == rhs.rgId_ &&
+            lhs.qStart_ == rhs.qStart_ &&
+            lhs.qEnd_ == rhs.qEnd_ &&
+            lhs.holeNumber_ == rhs.holeNumber_ &&
+            lhs.readQual_ == rhs.readQual_ &&
+            lhs.ctxtFlag_ == rhs.ctxtFlag_ &&
+            lhs.fileOffset_ == rhs.fileOffset_);
+}
+
+static
+bool MappedLookupsEqual(const MappedLookupData& lhs,
+                        const MappedLookupData& rhs)
+{
+    return (lhs.tId_ == rhs.tId_ &&
+            lhs.tStart_ == rhs.tStart_ &&
+            lhs.tEnd_ == rhs.tEnd_ &&
+            lhs.aStart_ == rhs.aStart_ &&
+            lhs.aEnd_ == rhs.aEnd_ &&
+            lhs.nM_ == rhs.nM_ &&
+            lhs.nMM_ == rhs.nMM_ &&
+            lhs.mapQV_ == rhs.mapQV_ &&
+            lhs.forwardStrand_ == rhs.forwardStrand_ &&
+            lhs.reverseStrand_ == rhs.reverseStrand_);
+}
+
+static
+bool ReferenceLookupsEqual(const ReferenceLookupData& lhs,
+                           const ReferenceLookupData& rhs)
+{
+    return lhs.references_ == rhs.references_;
+}
+
+static
+bool BarcodeLookupsEqual(const BarcodeLookupData& lhs,
+                         const BarcodeLookupData& rhs)
+{
+    return (lhs.bcForward_ == rhs.bcForward_ &&
+            lhs.bcReverse_ == rhs.bcReverse_ &&
+            lhs.bcQual_ == rhs.bcQual_);
+}
+
+static
+bool PbiIndicesEqual(const PbiIndex& lhs, const PbiIndex& rhs)
+{
+    using namespace ::PacBio::BAM;
+    const unique_ptr<internal::PbiIndexPrivate>& lhsImpl = lhs.d_;
+    const unique_ptr<internal::PbiIndexPrivate>& rhsImpl = rhs.d_;
+    if (lhsImpl == rhsImpl)
+        return true;
+    if (lhsImpl == nullptr || rhsImpl == nullptr)
+        return false;
+
+    // metadata compare
+    if (lhsImpl->version_  != rhsImpl->version_  ||
+        lhsImpl->sections_ != rhsImpl->sections_ ||
+        lhsImpl->numReads_ != rhsImpl->numReads_)
+    { return false; }
+
+    // component compare
+    if ( !BasicLookupsEqual(lhsImpl->basicData_,         rhsImpl->basicData_)   ||
+         !MappedLookupsEqual(lhsImpl->mappedData_,       rhsImpl->mappedData_)    ||
+         !ReferenceLookupsEqual(lhsImpl->referenceData_, rhsImpl->referenceData_) ||
+         !BarcodeLookupsEqual(lhsImpl->barcodeData_,     rhsImpl->barcodeData_))
+    { return false; }
+
+    // if we get here, OK
+    return true;
+}
+
+} // namespace tests
+} // namespace BAM
+} // namespace PacBio
+
+TEST(PacBioIndexTest, CreateFromExistingBam)
+{
+    // do this in temp directory, so we can ensure write access
+    const string tempDir    = tests::GeneratedData_Dir + "/";
+    const string tempBamFn  = tempDir + "aligned_copy.bam";
+    const string tempPbiFn  = tempBamFn + ".pbi";
+    string cmd("cp ");
+    cmd += test2BamFn;
+    cmd += " ";
+    cmd += tempBamFn;
+    int cmdResult = system(cmd.c_str());
+    (void)cmdResult;
+
+    BamFile bamFile(tempBamFn);
+    PbiFile::CreateFrom(bamFile);
+    EXPECT_EQ(tempPbiFn, bamFile.PacBioIndexFilename());
+
+    PbiRawData index(bamFile.PacBioIndexFilename());
+    EXPECT_EQ(PbiFile::Version_3_0_1,  index.Version());
+    EXPECT_EQ(10, index.NumReads());
+    EXPECT_TRUE(index.HasMappedData());
+
+    const PbiRawData& expectedIndex = tests::Test2Bam_ExistingIndex();
+    tests::ExpectRawIndicesEqual(expectedIndex, index);
+
+    // clean up temp file(s)
+    remove(tempBamFn.c_str());
+    remove(tempPbiFn.c_str());
+}
+
+::testing::AssertionResult CanRead(BamReader& reader, BamRecord& record, int i)
+{
+    if (reader.GetNext(record))
+        return ::testing::AssertionSuccess() << "i: " << i;
+    else
+        return ::testing::AssertionFailure() << "i: " << i;
+}
+
+TEST(PacBioIndexTest, CreateOnTheFly)
+{
+    // do this in temp directory, so we can ensure write access
+    const string tempDir    = tests::GeneratedData_Dir + "/";
+    const string tempBamFn  = tempDir + "temp.bam";
+    const string tempPbiFn  = tempBamFn + ".pbi";
+
+    // NOTE: new file differs in size than existing (different write parameters may yield different file sizes, even though content is same)
+    const vector<int64_t> expectedNewOffsets = { 33816576, 236126208, 391315456, 469106688, 537067520, 587792384, 867303424, 1182793728, 1449787392, 1582628864 };
+    vector<int64_t> observedOffsets;
+
+    // create PBI on the fly from input BAM while we write to new file
+    {
+        BamFile bamFile(test2BamFn);
+        BamHeader header = bamFile.Header();
+
+        BamWriter writer(tempBamFn, header); // default compression, default thread count
+        PbiBuilder builder(tempPbiFn, header.Sequences().size());
+
+        int64_t vOffset = 0;
+        EntireFileQuery entireFile(bamFile);
+        for (const BamRecord& record : entireFile) {
+            writer.Write(record, &vOffset);
+            builder.AddRecord(record, vOffset);
+            observedOffsets.push_back(vOffset);
+        }
+    }
+
+    EXPECT_EQ(expectedNewOffsets, observedOffsets);
+
+    // sanity check on original file
+    {
+        const vector<int64_t> originalFileOffsets = { 33816576, 33825163, 33831333, 33834264, 33836542, 33838065, 33849818, 33863499, 33874621, 1392836608 };
+        BamRecord r;
+        BamReader reader(test2BamFn);
+        for (int i = 0; i < originalFileOffsets.size(); ++i) {
+            reader.VirtualSeek(originalFileOffsets.at(i));
+            EXPECT_TRUE(CanRead(reader, r, i));
+        }
+    }
+
+    // attempt to seek in our new file using both expected & observed offsets
+    {
+        BamRecord r;
+        BamReader reader(tempBamFn);
+        for (int i = 0; i < expectedNewOffsets.size(); ++i) {
+            reader.VirtualSeek(expectedNewOffsets.at(i));
+            EXPECT_TRUE(CanRead(reader, r, i));
+        }
+        for (int i = 0; i < observedOffsets.size(); ++i) {
+            reader.VirtualSeek(observedOffsets.at(i));
+            EXPECT_TRUE(CanRead(reader, r, i));
+        }
+    }
+
+    // compare data in new PBI file, to expected data
+    const PbiRawData& expectedIndex = tests::Test2Bam_NewIndex();
+    const PbiRawData& fromBuilt = PbiRawData(tempPbiFn);
+    tests::ExpectRawIndicesEqual(expectedIndex, fromBuilt);
+
+    // straight diff of newly-generated PBI file to existing PBI
+    // TODO: Come back to this once pbindexump is in place.
+    //       We can't exactly do this since file offsets may differ between 2 BAMs of differing compression levels.
+    //       Should add some sort of BAM checksum based on contents, not just size, for this reason.
+//    const string pbiDiffCmd = string("diff -q ") + test2BamFn + ".pbi " + tempPbiFn;
+//    EXPECT_EQ(0, system(pbiDiffCmd.c_str()));
+
+    // clean up temp file(s)
+    remove(tempBamFn.c_str());
+    remove(tempPbiFn.c_str());
+}
+
+TEST(PacBioIndexTest, RawLoadFromPbiFile)
+{
+    const BamFile bamFile(test2BamFn);
+    const string& pbiFilename = bamFile.PacBioIndexFilename();
+    const PbiRawData loadedIndex(pbiFilename);
+
+    const PbiRawData& expectedIndex = tests::Test2Bam_ExistingIndex();
+    tests::ExpectRawIndicesEqual(expectedIndex, loadedIndex);
+}
+
+TEST(PacBioIndexTest, BasicAndBarodeSectionsOnly)
+{
+    // do this in temp directory, so we can ensure write access
+    const string tempDir    = tests::GeneratedData_Dir + "/";
+    const string tempBamFn  = tempDir + "phi29.bam";
+    const string tempPbiFn  = tempBamFn + ".pbi";
+    string cmd("cp ");
+    cmd += phi29BamFn;
+    cmd += " ";
+    cmd += tempDir;
+    int cmdResult = system(cmd.c_str());
+    (void)cmdResult;
+
+    BamFile bamFile(tempBamFn);
+    PbiFile::CreateFrom(bamFile);
+    EXPECT_EQ(tempPbiFn, bamFile.PacBioIndexFilename());
+
+    PbiRawData index(bamFile.PacBioIndexFilename());
+    EXPECT_EQ(PbiFile::Version_3_0_1,  index.Version());
+    EXPECT_EQ(120, index.NumReads());
+    EXPECT_FALSE(index.HasMappedData());
+    EXPECT_TRUE(index.HasBarcodeData());
+
+    const vector<int16_t> expectedBcForward = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2};
+    const vector<int16_t> expectedBcReverse = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+        2,2,2,2,2,2,2,2,2,2,2,2,2,2};
+    const vector<int8_t>  expectedBcQuality = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+        1,1,1,1,1,1,1,1,1,1,1};
+
+    const PbiRawBarcodeData& barcodeData = index.BarcodeData();
+    EXPECT_EQ(expectedBcForward, barcodeData.bcForward_);
+    EXPECT_EQ(expectedBcReverse, barcodeData.bcReverse_);
+    EXPECT_EQ(expectedBcQuality, barcodeData.bcQual_);
+
+
+    // clean up temp file(s)
+    remove(tempBamFn.c_str());
+    remove(tempPbiFn.c_str());
+
+}
+
+
+TEST(PacBioIndexTest, ReferenceDataNotLoadedOnUnsortedBam)
+{
+    BamFile bamFile(test2BamFn);
+    PbiRawData raw(bamFile.PacBioIndexFilename());
+    EXPECT_TRUE(raw.HasReferenceData());
+}
+
+TEST(PacBioIndexTest, LookupLoadFromFileOk)
+{
+    BamFile bamFile(test2BamFn);
+    EXPECT_NO_THROW(
+    {
+        PbiIndex index(bamFile.PacBioIndexFilename());
+        EXPECT_EQ(10, index.NumReads());
+        EXPECT_EQ(vector<int64_t>({ 33816576, 33825163, 33831333, 33834264, 33836542, 33838065, 33849818, 33863499, 33874621, 1392836608 }), index.BasicData().VirtualFileOffsets());
+    });
+}
+
+TEST(PacBioIndexTest, ThrowOnNonExistentPbiFile)
+{
+    EXPECT_THROW(PbiRawData raw("does_not_exist.pbi"), std::exception);
+    EXPECT_THROW(PbiIndex idx("does_not_exist.pbi"),   std::exception);
+}
+
+TEST(PacBioIndexTest, ThrowOnNonPbiFile)
+{
+    // completely wrong format
+    const std::string fastaFn = tests::Data_Dir + "/lambdaNEB.fa";
+    EXPECT_THROW(PbiRawData raw(fastaFn), std::exception);
+    EXPECT_THROW(PbiIndex idx(fastaFn),   std::exception);
+
+    // BGZF file, but not PBI
+    const std::string& bamFn = tests::Data_Dir + "/ex2.bam";
+    EXPECT_THROW(PbiRawData raw(bamFn), std::exception);
+    EXPECT_THROW(PbiIndex idx(bamFn),   std::exception);
+}
+
+TEST(PacBioIndexTest, Copy_and_Move)
+{
+    const PbiIndex lookup(test2BamFn + ".pbi");
+
+    const PbiIndex copyConstructed(lookup);
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    const PbiIndex moveConstructed(std::move(PbiIndex(test2BamFn + ".pbi")));
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+    PbiIndex copyAssigned;
+    copyAssigned = lookup;
+
+    PbiIndex moveAssigned;
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    moveAssigned = std::move(PbiIndex(test2BamFn + ".pbi"));
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+    EXPECT_TRUE(tests::PbiIndicesEqual(lookup, copyConstructed));
+    EXPECT_TRUE(tests::PbiIndicesEqual(lookup, moveConstructed));
+    EXPECT_TRUE(tests::PbiIndicesEqual(lookup, copyAssigned));
+    EXPECT_TRUE(tests::PbiIndicesEqual(lookup, moveAssigned));
+}
+
+TEST(PacBioIndexTest, OrderedLookup)
+{
+    using PacBio::BAM::IndexList;
+    using PacBio::BAM::OrderedLookup;
+
+    OrderedLookup<int>::container_type oRawData;
+    oRawData[11] = { 0, 3, 4 };
+    oRawData[20] = { 1 };
+    oRawData[42] = { 2, 7, 8 };
+    oRawData[10] = { 5 };
+    oRawData[12] = { 6 };
+    oRawData[99] = { 9 };
+
+    OrderedLookup<int> oLookup(oRawData);
+
+    // EQUAL
+    EXPECT_EQ(IndexList({5}),       oLookup.LookupIndices(10, Compare::EQUAL));
+    EXPECT_EQ(IndexList({0, 3, 4}), oLookup.LookupIndices(11, Compare::EQUAL));
+    EXPECT_EQ(IndexList({6}),       oLookup.LookupIndices(12, Compare::EQUAL));
+    EXPECT_EQ(IndexList({1}),       oLookup.LookupIndices(20, Compare::EQUAL));
+    EXPECT_EQ(IndexList({2, 7, 8}), oLookup.LookupIndices(42, Compare::EQUAL));
+    EXPECT_EQ(IndexList({9}),       oLookup.LookupIndices(99, Compare::EQUAL));
+    EXPECT_EQ(IndexList(),          oLookup.LookupIndices(66, Compare::EQUAL)); // does not exist
+
+    // NOT_EQUAL
+    EXPECT_EQ(IndexList({0, 1, 2, 3, 4, 6, 7, 8, 9}),    oLookup.LookupIndices(10, Compare::NOT_EQUAL));
+    EXPECT_EQ(IndexList({1, 2, 5, 6, 7, 8, 9}),          oLookup.LookupIndices(11, Compare::NOT_EQUAL));
+    EXPECT_EQ(IndexList({0, 1, 2, 3, 4, 5, 7, 8, 9}),    oLookup.LookupIndices(12, Compare::NOT_EQUAL));
+    EXPECT_EQ(IndexList({0, 2, 3, 4, 5, 6, 7, 8, 9}),    oLookup.LookupIndices(20, Compare::NOT_EQUAL));
+    EXPECT_EQ(IndexList({0, 1, 3, 4, 5, 6, 9}),          oLookup.LookupIndices(42, Compare::NOT_EQUAL));
+    EXPECT_EQ(IndexList({0, 1, 2, 3, 4, 5, 6, 7, 8}),    oLookup.LookupIndices(99, Compare::NOT_EQUAL));
+    EXPECT_EQ(IndexList({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}), oLookup.LookupIndices(66, Compare::NOT_EQUAL)); // does not exist
+
+    // LESS_THAN
+    EXPECT_EQ(IndexList({0, 3, 4, 5, 6}), oLookup.LookupIndices(13, Compare::LESS_THAN));
+    EXPECT_EQ(IndexList({0, 3, 4, 5}),    oLookup.LookupIndices(12, Compare::LESS_THAN));
+    // do more checks
+
+    // LESS_THAN_EQUAL
+    EXPECT_EQ(IndexList({0, 3, 4, 5, 6}), oLookup.LookupIndices(13, Compare::LESS_THAN_EQUAL));
+    EXPECT_EQ(IndexList({0, 3, 4, 5, 6}), oLookup.LookupIndices(12, Compare::LESS_THAN_EQUAL));
+    // more checks?
+
+    // GREATER_THAN
+    EXPECT_EQ(IndexList({2,7,8,9}), oLookup.LookupIndices(41, Compare::GREATER_THAN));
+    EXPECT_EQ(IndexList({9}),       oLookup.LookupIndices(42, Compare::GREATER_THAN));
+    // more checks?
+
+    // GREATER_THAN_EQUAL
+    EXPECT_EQ(IndexList({2,7,8,9}), oLookup.LookupIndices(41, Compare::GREATER_THAN_EQUAL));
+    EXPECT_EQ(IndexList({2,7,8,9}), oLookup.LookupIndices(42, Compare::GREATER_THAN_EQUAL));
+    // more checks?
+}
+
+TEST(PacBioIndexTest, UnorderedLookup)
+{
+    using PacBio::BAM::IndexList;
+    using PacBio::BAM::UnorderedLookup;
+
+    UnorderedLookup<int>::container_type uRawData;
+    uRawData[11] = { 0, 3, 4 };
+    uRawData[20] = { 1 };
+    uRawData[42] = { 2, 7, 8 };
+    uRawData[10] = { 5 };
+    uRawData[12] = { 6 };
+    uRawData[99] = { 9 };
+
+    UnorderedLookup<int> uLookup(uRawData);
+
+    // EQUAL
+    EXPECT_EQ(IndexList({5}),       uLookup.LookupIndices(10, Compare::EQUAL));
+    EXPECT_EQ(IndexList({0, 3, 4}), uLookup.LookupIndices(11, Compare::EQUAL));
+    EXPECT_EQ(IndexList({6}),       uLookup.LookupIndices(12, Compare::EQUAL));
+    EXPECT_EQ(IndexList({1}),       uLookup.LookupIndices(20, Compare::EQUAL));
+    EXPECT_EQ(IndexList({2, 7, 8}), uLookup.LookupIndices(42, Compare::EQUAL));
+    EXPECT_EQ(IndexList({9}),       uLookup.LookupIndices(99, Compare::EQUAL));
+    EXPECT_EQ(IndexList(),          uLookup.LookupIndices(66, Compare::EQUAL)); // does not exist
+
+    // NOT_EQUAL
+    EXPECT_EQ(IndexList({0, 1, 2, 3, 4, 6, 7, 8, 9}),    uLookup.LookupIndices(10, Compare::NOT_EQUAL));
+    EXPECT_EQ(IndexList({1, 2, 5, 6, 7, 8, 9}),          uLookup.LookupIndices(11, Compare::NOT_EQUAL));
+    EXPECT_EQ(IndexList({0, 1, 2, 3, 4, 5, 7, 8, 9}),    uLookup.LookupIndices(12, Compare::NOT_EQUAL));
+    EXPECT_EQ(IndexList({0, 2, 3, 4, 5, 6, 7, 8, 9}),    uLookup.LookupIndices(20, Compare::NOT_EQUAL));
+    EXPECT_EQ(IndexList({0, 1, 3, 4, 5, 6, 9}),          uLookup.LookupIndices(42, Compare::NOT_EQUAL));
+    EXPECT_EQ(IndexList({0, 1, 2, 3, 4, 5, 6, 7, 8}),    uLookup.LookupIndices(99, Compare::NOT_EQUAL));
+    EXPECT_EQ(IndexList({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}), uLookup.LookupIndices(66, Compare::NOT_EQUAL)); // does not exist
+
+    // LESS_THAN
+    EXPECT_EQ(IndexList({0, 3, 4, 5, 6}), uLookup.LookupIndices(13, Compare::LESS_THAN));
+    EXPECT_EQ(IndexList({0, 3, 4, 5}),    uLookup.LookupIndices(12, Compare::LESS_THAN));
+    // more checks?
+
+    // LESS_THAN_EQUAL
+    EXPECT_EQ(IndexList({0, 3, 4, 5, 6}), uLookup.LookupIndices(13, Compare::LESS_THAN_EQUAL));
+    EXPECT_EQ(IndexList({0, 3, 4, 5, 6}), uLookup.LookupIndices(12, Compare::LESS_THAN_EQUAL));
+    // more checks?
+
+    // GREATER_THAN
+    EXPECT_EQ(IndexList({2,7,8,9}), uLookup.LookupIndices(41, Compare::GREATER_THAN));
+    EXPECT_EQ(IndexList({9}),       uLookup.LookupIndices(42, Compare::GREATER_THAN));
+    // more checks?
+
+    // GREATER_THAN_EQUAL
+    EXPECT_EQ(uLookup.LookupIndices(41, Compare::GREATER_THAN_EQUAL), IndexList({2,7,8,9}));
+    EXPECT_EQ(uLookup.LookupIndices(42, Compare::GREATER_THAN_EQUAL), IndexList({2,7,8,9}));
+    // more checks?
+}
+
+TEST(PacBioIndexTest, MergeBlocks)
+{
+    using PacBio::BAM::IndexList;
+    using PacBio::BAM::IndexResultBlock;
+    using PacBio::BAM::IndexResultBlocks;
+    using PacBio::BAM::mergedIndexBlocks;
+    using PacBio::BAM::OrderedLookup;
+
+    OrderedLookup<int>::container_type oRawData;
+    oRawData[11] = { 0, 3, 4 };
+    oRawData[20] = { 1 };
+    oRawData[42] = { 2, 7, 8 };
+    oRawData[10] = { 5 };
+    oRawData[12] = { 6 };
+    oRawData[99] = { 9 };
+
+    OrderedLookup<int> oLookup(oRawData);
+
+    // EQUAL
+    auto mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(10, Compare::EQUAL));
+    EXPECT_EQ(1, mergedBlocks.size());
+    EXPECT_EQ(IndexResultBlock(5, 1), mergedBlocks.at(0));
+
+    mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(11, Compare::EQUAL));
+    EXPECT_EQ(2, mergedBlocks.size());
+    EXPECT_EQ(IndexResultBlock(0, 1), mergedBlocks.at(0));
+    EXPECT_EQ(IndexResultBlock(3, 2), mergedBlocks.at(1));
+
+    mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(12, Compare::EQUAL));
+    EXPECT_EQ(1, mergedBlocks.size());
+    EXPECT_EQ(IndexResultBlock(6, 1), mergedBlocks.at(0));
+
+    mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(20, Compare::EQUAL));
+    EXPECT_EQ(1, mergedBlocks.size());
+    EXPECT_EQ(IndexResultBlock(1, 1), mergedBlocks.at(0));
+
+    mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(42, Compare::EQUAL));
+    EXPECT_EQ(2, mergedBlocks.size());
+    EXPECT_EQ(IndexResultBlock(2, 1), mergedBlocks.at(0));
+    EXPECT_EQ(IndexResultBlock(7, 2), mergedBlocks.at(1));
+
+    mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(99, Compare::EQUAL));
+    EXPECT_EQ(1, mergedBlocks.size());
+    EXPECT_EQ(IndexResultBlock(9, 1), mergedBlocks.at(0));
+
+    mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(66, Compare::EQUAL));
+    EXPECT_TRUE(mergedBlocks.empty());
+
+    // NOT_EQUAL
+    mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(10, Compare::NOT_EQUAL));
+    EXPECT_EQ(2, mergedBlocks.size());
+    EXPECT_EQ(IndexResultBlock(0, 5), mergedBlocks.at(0));
+    EXPECT_EQ(IndexResultBlock(6, 4), mergedBlocks.at(1));
+
+    mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(11, Compare::NOT_EQUAL));
+    EXPECT_EQ(2, mergedBlocks.size());
+    EXPECT_EQ(IndexResultBlock(1, 2), mergedBlocks.at(0));
+    EXPECT_EQ(IndexResultBlock(5, 5), mergedBlocks.at(1));
+
+    mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(12, Compare::NOT_EQUAL));
+    EXPECT_EQ(2, mergedBlocks.size());
+    EXPECT_EQ(IndexResultBlock(0, 6), mergedBlocks.at(0));
+    EXPECT_EQ(IndexResultBlock(7, 3), mergedBlocks.at(1));
+
+    mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(20, Compare::NOT_EQUAL));
+    EXPECT_EQ(2, mergedBlocks.size());
+    EXPECT_EQ(IndexResultBlock(0, 1), mergedBlocks.at(0));
+    EXPECT_EQ(IndexResultBlock(2, 8), mergedBlocks.at(1));
+
+    mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(42, Compare::NOT_EQUAL));
+    EXPECT_EQ(3, mergedBlocks.size());
+    EXPECT_EQ(IndexResultBlock(0, 2), mergedBlocks.at(0));
+    EXPECT_EQ(IndexResultBlock(3, 4), mergedBlocks.at(1));
+    EXPECT_EQ(IndexResultBlock(9, 1), mergedBlocks.at(2));
+
+    mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(99, Compare::NOT_EQUAL));
+    EXPECT_EQ(1, mergedBlocks.size());
+    EXPECT_EQ(IndexResultBlock(0, 9), mergedBlocks.at(0));
+
+    mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(66, Compare::NOT_EQUAL));
+    EXPECT_EQ(1, mergedBlocks.size());
+    EXPECT_EQ(IndexResultBlock(0, 10), mergedBlocks.at(0));
+}
+
+TEST(PacBioIndexTest, ApplyOffsetsToBlocks)
+{
+    using PacBio::BAM::BasicLookupData;
+    using PacBio::BAM::IndexList;
+    using PacBio::BAM::IndexResultBlock;
+    using PacBio::BAM::IndexResultBlocks;
+    using PacBio::BAM::mergedIndexBlocks;
+    using PacBio::BAM::OrderedLookup;
+
+    OrderedLookup<int>::container_type oRawData;
+    oRawData[11] = { 0, 3, 4 };
+    oRawData[20] = { 1 };
+    oRawData[42] = { 2, 7, 8 };
+    oRawData[10] = { 5 };
+    oRawData[12] = { 6 };
+    oRawData[99] = { 9 };
+
+    OrderedLookup<int> oLookup(oRawData);
+    auto mergedBlocks = mergedIndexBlocks(oLookup.LookupIndices(10, Compare::NOT_EQUAL));
+
+    EXPECT_EQ(2, mergedBlocks.size());
+    EXPECT_EQ(IndexResultBlock(0, 5), mergedBlocks.at(0));
+    EXPECT_EQ(IndexResultBlock(6, 4), mergedBlocks.at(1));
+
+    BasicLookupData basicLookupData;
+    basicLookupData.fileOffset_ = { 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 };
+    basicLookupData.ApplyOffsets(mergedBlocks);
+
+    EXPECT_EQ(2,  mergedBlocks.size());
+    EXPECT_EQ(0,  mergedBlocks.at(0).virtualOffset_);
+    EXPECT_EQ(5,  mergedBlocks.at(0).numReads_);
+    EXPECT_EQ(60, mergedBlocks.at(1).virtualOffset_);
+    EXPECT_EQ(4,  mergedBlocks.at(1).numReads_);
+}
+
+TEST(PacBioIndexTest, LookupMulti)
+{
+    using PacBio::BAM::BasicLookupData;
+    using PacBio::BAM::IndexList;
+    using PacBio::BAM::IndexResultBlock;
+    using PacBio::BAM::IndexResultBlocks;
+    using PacBio::BAM::mergedIndexBlocks;
+    using PacBio::BAM::UnorderedLookup;
+
+    UnorderedLookup<int32_t>::container_type uRawData;
+    uRawData[11] = { 0, 3, 4 };
+    uRawData[20] = { 1 };
+    uRawData[42] = { 2, 7, 8 };
+    uRawData[10] = { 5 };
+    uRawData[12] = { 6 };
+    uRawData[99] = { 9 };
+
+    BasicLookupData basicLookup;
+    basicLookup.rgId_ = UnorderedLookup<int32_t>(uRawData);
+    basicLookup.fileOffset_ = { 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 };
+
+    const std::vector<int32_t> whitelist = { 11, 42, 20 };
+    const auto indices = basicLookup.IndicesMulti(BasicLookupData::RG_ID, whitelist);
+
+    IndexResultBlocks mergedBlocks = mergedIndexBlocks(indices);
+    basicLookup.ApplyOffsets(mergedBlocks);
+
+    EXPECT_EQ(IndexList({0, 3, 4, 2, 7, 8, 1}), indices);
+    EXPECT_EQ(2, mergedBlocks.size());
+
+    const IndexResultBlock& block0 = mergedBlocks.at(0);
+    EXPECT_EQ(0, block0.firstIndex_);
+    EXPECT_EQ(5, block0.numReads_);
+    EXPECT_EQ(0, block0.virtualOffset_);
+
+    const IndexResultBlock& block1 = mergedBlocks.at(1);
+    EXPECT_EQ(7,  block1.firstIndex_);
+    EXPECT_EQ(2,  block1.numReads_);
+    EXPECT_EQ(70, block1.virtualOffset_);
+}
+
+TEST(PacBioIndexTest, LookupAPI)
+{
+    const PbiIndex index(test2BamFn + ".pbi");
+    const BasicLookupData& basicData = index.BasicData();
+    const MappedLookupData& mappedData = index.MappedData();
+    const BarcodeLookupData& barcodeData = index.BarcodeData();
+
+    // rgId == x
+    IndexResultBlocks rgResult = mergedIndexBlocks(basicData.Indices(BasicLookupData::RG_ID, -1197849594));
+    basicData.ApplyOffsets(rgResult);
+    EXPECT_EQ(1, rgResult.size());
+    EXPECT_EQ(0, rgResult.at(0).firstIndex_);
+    EXPECT_EQ(10, rgResult.at(0).numReads_);
+    EXPECT_EQ(33816576, rgResult.at(0).virtualOffset_);
+
+    // rg != x
+    IndexResultBlocks notRgResult = mergedIndexBlocks(basicData.Indices(BasicLookupData::RG_ID,
+                                                                        -1197849594,
+                                                                        Compare::NOT_EQUAL));
+    basicData.ApplyOffsets(notRgResult);
+    EXPECT_TRUE(notRgResult.empty());
+
+    // tEnd <= x
+    IndexResultBlocks tEndLteResult = mergedIndexBlocks(mappedData.Indices(MappedLookupData::T_END,
+                                                                            4500,
+                                                                            Compare::LESS_THAN_EQUAL));
+    basicData.ApplyOffsets(tEndLteResult);
+    EXPECT_EQ(1, tEndLteResult.size());
+    EXPECT_EQ(0, tEndLteResult.at(0).firstIndex_);
+    EXPECT_EQ(5, tEndLteResult.at(0).numReads_);
+    EXPECT_EQ(33816576, tEndLteResult.at(0).virtualOffset_);
+
+    // tEnd >= x
+    IndexResultBlocks tEndGteResult = mergedIndexBlocks(mappedData.Indices(MappedLookupData::T_START,
+                                                                           4500,
+                                                                           Compare::GREATER_THAN_EQUAL));
+    basicData.ApplyOffsets(tEndGteResult);
+    EXPECT_EQ(1, tEndGteResult.size());
+    EXPECT_EQ(6, tEndGteResult.at(0).firstIndex_);
+    EXPECT_EQ(4, tEndGteResult.at(0).numReads_);
+    EXPECT_EQ(33849818, tEndGteResult.at(0).virtualOffset_);
+
+    // strand query
+    IndexResultBlocks forward = mergedIndexBlocks(mappedData.Indices(MappedLookupData::STRAND,
+                                                                     Strand::FORWARD));
+    basicData.ApplyOffsets(forward);
+    EXPECT_EQ(5, forward.size());
+    EXPECT_EQ(0, forward.at(0).firstIndex_);
+    EXPECT_EQ(1, forward.at(0).numReads_);
+    EXPECT_EQ(33816576, forward.at(0).virtualOffset_);
+
+    EXPECT_EQ(2, forward.at(1).firstIndex_);
+    EXPECT_EQ(1, forward.at(1).numReads_);
+    EXPECT_EQ(33831333, forward.at(1).virtualOffset_);
+
+    EXPECT_EQ(4, forward.at(2).firstIndex_);
+    EXPECT_EQ(1, forward.at(2).numReads_);
+    EXPECT_EQ(33836542, forward.at(2).virtualOffset_);
+
+    EXPECT_EQ(7, forward.at(3).firstIndex_);
+    EXPECT_EQ(1, forward.at(3).numReads_);
+    EXPECT_EQ(33863499, forward.at(3).virtualOffset_);
+
+    EXPECT_EQ(9, forward.at(4).firstIndex_);
+    EXPECT_EQ(1, forward.at(4).numReads_);
+    EXPECT_EQ(1392836608, forward.at(4).virtualOffset_);
+
+    // 0,1,0,1,0,1,1,0,1,0
+    IndexResultBlocks reverse = mergedIndexBlocks(mappedData.Indices(MappedLookupData::STRAND,
+                                                                     Strand::REVERSE));
+    basicData.ApplyOffsets(reverse);
+    EXPECT_EQ(4, reverse.size());
+    EXPECT_EQ(1, reverse.at(0).firstIndex_);
+    EXPECT_EQ(1, reverse.at(0).numReads_);
+    EXPECT_EQ(33825163, reverse.at(0).virtualOffset_);
+
+    EXPECT_EQ(3, reverse.at(1).firstIndex_);
+    EXPECT_EQ(1, reverse.at(1).numReads_);
+    EXPECT_EQ(33834264, reverse.at(1).virtualOffset_);
+
+    EXPECT_EQ(5, reverse.at(2).firstIndex_);
+    EXPECT_EQ(2, reverse.at(2).numReads_);
+    EXPECT_EQ(33838065, reverse.at(2).virtualOffset_);
+
+    EXPECT_EQ(8, reverse.at(3).firstIndex_);
+    EXPECT_EQ(1, reverse.at(3).numReads_);
+    EXPECT_EQ(33874621, reverse.at(3).virtualOffset_);
+
+    // query data field that is not in the PBI
+    IndexResultBlocks missing = mergedIndexBlocks(barcodeData.Indices(BarcodeLookupData::BC_QUALITY,
+                                                                      77,
+                                                                      Compare::GREATER_THAN));
+    basicData.ApplyOffsets(missing);
+    EXPECT_TRUE(missing.empty());
+}
+
+TEST(PacBioIndexTest, LookupByZmw)
+{
+    BamFile f(tests::Data_Dir + "/dataset/bam_mapping.bam");
+    f.EnsurePacBioIndexExists();
+
+    const PbiIndex index(f.PacBioIndexFilename());
+    const BasicLookupData& basicData = index.BasicData();
+
+    IndexResultBlocks blocks =  mergedIndexBlocks(basicData.Indices(BasicLookupData::ZMW,
+                                                                      20000,
+                                                                      Compare::LESS_THAN));
+    basicData.ApplyOffsets(blocks);
+    EXPECT_EQ(14, blocks.size());
+
+    //
+    // we'll take a look at first 5 contiguous blocks of reads with ZMW < 20000
+    //
+    // skipped: { 49050, 32328, 32328 }
+    // block0:  { 6469, 6469 }
+    // skipped: { 30983 }
+    // block1:  { 13473, 13473, 19915 }
+    // skipped: { 30983 }
+    // block2:  { 19915, 7247, 7247 }
+    // skipped: { 38025 }
+    // block3:  { 13473 }
+    // skipped: { 36363, 36363, 31174, 31174, 38025, 50257, 50257 }
+    // block4:  { 14743, 14743 }
+    //
+
+    const IndexResultBlock& block0 = blocks.at(0);
+    EXPECT_EQ(3, block0.firstIndex_);
+    EXPECT_EQ(2, block0.numReads_);
+    EXPECT_EQ(32654529, block0.virtualOffset_);
+
+    const IndexResultBlock& block1 = blocks.at(1);
+    EXPECT_EQ(6, block1.firstIndex_);
+    EXPECT_EQ(3, block1.numReads_);
+    EXPECT_EQ(32669996, block1.virtualOffset_);
+
+    const IndexResultBlock& block2 = blocks.at(2);
+    EXPECT_EQ(10, block2.firstIndex_);
+    EXPECT_EQ(3,  block2.numReads_);
+    EXPECT_EQ(1388841957, block2.virtualOffset_);
+
+    const IndexResultBlock& block3 = blocks.at(3);
+    EXPECT_EQ(14, block3.firstIndex_);
+    EXPECT_EQ(1,  block3.numReads_);
+    EXPECT_EQ(1388864866, block3.virtualOffset_);
+
+    const IndexResultBlock& block4 = blocks.at(4);
+    EXPECT_EQ(22, block4.firstIndex_);
+    EXPECT_EQ(2,  block4.numReads_);
+    EXPECT_EQ(1388892121, block4.virtualOffset_);
+}
+
+TEST(PacBioIndexTest, LookupMultiZmw)
+{
+    BamFile f(tests::Data_Dir + "/dataset/bam_mapping.bam");
+    f.EnsurePacBioIndexExists();
+
+    const PbiIndex index(f.PacBioIndexFilename());
+    const BasicLookupData& basicData = index.BasicData();
+
+    const std::vector<int32_t> whitelist = { 13473, 38025 };
+    IndexResultBlocks blocks = mergedIndexBlocks(basicData.IndicesMulti(BasicLookupData::ZMW, whitelist));
+    basicData.ApplyOffsets(blocks);
+
+    EXPECT_EQ(3, blocks.size());
+
+    const IndexResultBlock& block0 = blocks.at(0);
+    EXPECT_EQ(6, block0.firstIndex_);
+    EXPECT_EQ(2, block0.numReads_);
+    EXPECT_EQ(32669996, block0.virtualOffset_);
+
+    const IndexResultBlock& block1 = blocks.at(1);
+    EXPECT_EQ(13, block1.firstIndex_);
+    EXPECT_EQ(2, block1.numReads_);
+    EXPECT_EQ(1388851626, block1.virtualOffset_);
+
+    const IndexResultBlock& block2 = blocks.at(2);
+    EXPECT_EQ(19, block2.firstIndex_);
+    EXPECT_EQ(1,  block2.numReads_);
+    EXPECT_EQ(1388881468, block2.virtualOffset_);
+}
+
+TEST(PacBioIndexTest, AggregatePBI)
+{
+
+    DataSet ds;
+    ExternalResources& resources = ds.ExternalResources();
+    resources.Add(BamFile{tests::Data_Dir + "/aligned.bam"});                           // 4 reads, BASIC | MAPPED | REFERENCE
+    resources.Add(BamFile{tests::Data_Dir + "/polymerase/production.subreads.bam"});    // 8 reads, BASIC | BARCODE
+    resources.Add(BamFile{tests::Data_Dir + "/polymerase/production_hq.hqregion.bam"}); // 1 read,  BASIC only
+
+    const PbiRawData index{ds};
+    const PbiRawBasicData&   mergedBasicData   = index.BasicData();
+    const PbiRawBarcodeData& mergedBarcodeData = index.BarcodeData();
+    const PbiRawMappedData&  mergedMappedData  = index.MappedData();
+
+    const uint32_t expectedTotal = 13; // 4 + 8 + 1
+
+    // 'meta' info
+    EXPECT_EQ(expectedTotal, index.NumReads());
+    EXPECT_EQ(PbiFile::BASIC | PbiFile::MAPPED | PbiFile::BARCODE, index.FileSections());
+    EXPECT_TRUE(index.HasBarcodeData());
+    EXPECT_TRUE(index.HasMappedData());
+    EXPECT_FALSE(index.HasReferenceData());
+
+    // file numbers
+    EXPECT_EQ(0, mergedBasicData.fileNumber_.at(0));
+    EXPECT_EQ(0, mergedBasicData.fileNumber_.at(1));
+    EXPECT_EQ(0, mergedBasicData.fileNumber_.at(2));
+    EXPECT_EQ(0, mergedBasicData.fileNumber_.at(3));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(4));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(5));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(6));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(7));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(8));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(9));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(10));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(11));
+    EXPECT_EQ(2, mergedBasicData.fileNumber_.at(12));
+
+    // basic data
+    EXPECT_EQ(0,    mergedBasicData.qStart_.at(0)); // file 1
+    EXPECT_EQ(0,    mergedBasicData.qStart_.at(1));
+    EXPECT_EQ(2659, mergedBasicData.qStart_.at(4)); // file 2
+    EXPECT_EQ(3116, mergedBasicData.qStart_.at(5));
+    EXPECT_EQ(2659, mergedBasicData.qStart_.at(12)); // file 3
+
+    EXPECT_EQ(21102592, mergedBasicData.fileOffset_.at(0)); // file 1
+    EXPECT_EQ(21102883, mergedBasicData.fileOffset_.at(1));
+    EXPECT_EQ(19857408, mergedBasicData.fileOffset_.at(4)); // file 2
+    EXPECT_EQ(19860696, mergedBasicData.fileOffset_.at(5));
+    EXPECT_EQ(20054016, mergedBasicData.fileOffset_.at(12)); // file 3
+
+    // mapped data
+    EXPECT_EQ(60,  mergedMappedData.mapQV_.at(0)); // file 1
+    EXPECT_EQ(60,  mergedMappedData.mapQV_.at(1));
+    EXPECT_EQ(255, mergedMappedData.mapQV_.at(4)); // file 2
+    EXPECT_EQ(255, mergedMappedData.mapQV_.at(5));
+    EXPECT_EQ(255, mergedMappedData.mapQV_.at(12)); // file 3
+
+    // barcode data
+    EXPECT_EQ(-1,  mergedBarcodeData.bcForward_.at(0)); // file 1
+    EXPECT_EQ(-1,  mergedBarcodeData.bcForward_.at(1));
+    EXPECT_EQ(92, mergedBarcodeData.bcForward_.at(4)); // file 2
+    EXPECT_EQ(92, mergedBarcodeData.bcForward_.at(5));
+    EXPECT_EQ(-1, mergedBarcodeData.bcForward_.at(12)); // file 3
+}
diff --git a/tests/src/test_PbiFilter.cpp b/tests/src/test_PbiFilter.cpp

new file mode 100644 (file)

index 0000000..449c83d
--- /dev/null
+++ b/tests/src/test_PbiFilter.cpp
@@ -0,0 +1,1359 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.\r
+//\r
+// All rights reserved.\r
+//\r
+// Redistribution and use in source and binary forms, with or without\r
+// modification, are permitted (subject to the limitations in the\r
+// disclaimer below) provided that the following conditions are met:\r
+//\r
+//  * Redistributions of source code must retain the above copyright\r
+//    notice, this list of conditions and the following disclaimer.\r
+//\r
+//  * Redistributions in binary form must reproduce the above\r
+//    copyright notice, this list of conditions and the following\r
+//    disclaimer in the documentation and/or other materials provided\r
+//    with the distribution.\r
+//\r
+//  * Neither the name of Pacific Biosciences nor the names of its\r
+//    contributors may be used to endorse or promote products derived\r
+//    from this software without specific prior written permission.\r
+//\r
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE\r
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC\r
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED\r
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES\r
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\r
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS\r
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF\r
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND\r
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT\r
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF\r
+// SUCH DAMAGE.\r
+\r
+// Author: Derek Barnett\r
+\r
+#ifdef PBBAM_TESTING\r
+#define private public\r
+#endif\r
+\r
+#include "TestData.h"\r
+#include <gtest/gtest.h>\r
+#include <pbbam/PbiFilter.h>\r
+#include <string>\r
+#include <cstdio>\r
+#include <cstdlib>\r
+using namespace PacBio;\r
+using namespace PacBio::BAM;\r
+using namespace std;\r
+\r
+namespace PacBio {\r
+namespace BAM {\r
+namespace tests {\r
+\r
+// helper structs & methods\r
+\r
+static\r
+PbiRawData test2Bam_RawIndex(void)\r
+{\r
+    PbiRawData index;\r
+    index.NumReads(4);\r
+\r
+    PbiRawBasicData& subreadData = index.BasicData();\r
+    subreadData.rgId_       = { -1197849594, -1197849594, -1197849594, -1197849594 };\r
+    subreadData.qStart_     = { 2114, 2579, 4101, 5615 };\r
+    subreadData.qEnd_       = { 2531, 4055, 5571, 6237 };\r
+    subreadData.holeNumber_ = { 14743, 14743, 14743, 14743 };\r
+    subreadData.readQual_   = { 0.901, 0.601, 0.901, 0.601 };\r
+    subreadData.ctxtFlag_   = { 0, 1, 2, 3 };\r
+    subreadData.fileOffset_ = { 35651584, 35655125, 35667128, 35679170 };\r
+\r
+    PbiRawMappedData& mappedData = index.mappedData_;\r
+    mappedData.tId_       = { 0, 0, 0, 0 };\r
+    mappedData.tStart_    = { 9507, 8453, 8455, 9291 };\r
+    mappedData.tEnd_      = { 9903, 9902, 9893, 9900 };\r
+    mappedData.aStart_    = { 2130, 2581, 4102, 5619 };\r
+    mappedData.aEnd_      = { 2531, 4055, 5560, 6237 };\r
+    mappedData.revStrand_ = { 0, 1, 0, 1 };\r
+    mappedData.mapQV_     = { 254, 254, 254, 254 };\r
+    mappedData.nM_        = { 384, 1411, 1393, 598 };\r
+    mappedData.nMM_       = { 0, 0, 0, 0 };\r
+\r
+    PbiRawBarcodeData& barcodeData = index.barcodeData_;\r
+    barcodeData.bcForward_ = { 0, 17, 256, 17 };\r
+    barcodeData.bcReverse_ = { 1, 18, 257, 18 };\r
+    barcodeData.bcQual_    = { 42, 80, 42, 110 };\r
+\r
+    PbiRawReferenceData& referenceData = index.referenceData_;\r
+    referenceData.entries_.emplace_back( 0, 0, 3 );\r
+    referenceData.entries_.emplace_back( 1 );\r
+    referenceData.entries_.emplace_back( PbiReferenceEntry::UNMAPPED_ID );\r
+\r
+    return index;\r
+}\r
+\r
+static const PbiRawData shared_index = test2Bam_RawIndex();\r
+\r
+static\r
+void checkFilterRows(const PbiFilter& filter, const std::vector<size_t> expectedRows)\r
+{\r
+    for (size_t row : expectedRows)\r
+        EXPECT_TRUE(filter.Accepts(shared_index, row));\r
+}\r
+\r
+static\r
+void checkFilterInternals(const PbiFilter& filter,\r
+                          const PbiFilter::CompositionType expectedType,\r
+                          const size_t expectedNumChildren,\r
+                          const std::vector<size_t> expectedRows)\r
+{\r
+    EXPECT_EQ(expectedType,        filter.d_->type_);\r
+    EXPECT_EQ(expectedNumChildren, filter.d_->filters_.size());\r
+    checkFilterRows(filter, expectedRows);\r
+}\r
+\r
+struct SimpleFilter\r
+{\r
+    bool Accepts(const PbiRawData& idx, const size_t row) const\r
+    { (void)idx; (void)row; return true; }\r
+};\r
+\r
+struct NoncompliantFilter { };\r
+\r
+struct SortUniqueTestFilter\r
+{\r
+    bool Accepts(const PbiRawData& idx, const size_t row) const\r
+    {\r
+        (void)idx;\r
+        switch(row) {\r
+            case 0: // fall through\r
+            case 1: // .\r
+            case 2: // .\r
+            case 3: // .\r
+            case 4: // .\r
+            case 7: // .\r
+            case 8: return true;\r
+            default:\r
+                return false;\r
+        }\r
+    }\r
+};\r
+\r
+struct SortUniqueTestFilter2\r
+{\r
+    bool Accepts(const PbiRawData& idx, const size_t row) const\r
+    {\r
+        (void)idx;\r
+        switch(row) {\r
+            case 3: // fall through\r
+            case 7: // .\r
+            case 5: return true;\r
+            default:\r
+                return false;\r
+        }\r
+    }\r
+};\r
+\r
+static inline\r
+PbiFilter emptyFilter(void)\r
+{ return PbiFilter{ }; }\r
+\r
+static inline\r
+PbiFilter simpleFilter(void)\r
+{ return PbiFilter{ SimpleFilter{ } }; }\r
+\r
+} // namespace tests\r
+} // namespace BAM\r
+} // namespace PacBio\r
+\r
+TEST(PbiFilterTest, DefaultCtorOk)\r
+{\r
+    auto filter = PbiFilter{ };\r
+    tests::checkFilterInternals(filter, PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});\r
+}\r
+\r
+TEST(PbiFilterTest, CompositionOk)\r
+{\r
+    auto filter = PbiFilter{ };\r
+    filter.Add(PbiFilter{ });\r
+    tests::checkFilterInternals(filter, PbiFilter::INTERSECT, 1, std::vector<size_t>{0,1,2,3});\r
+}\r
+\r
+TEST(PbiFilterTest, CustomFilterOk)\r
+{\r
+    { // ctor\r
+        auto filter = PbiFilter{ tests::SimpleFilter{ } };\r
+        tests::checkFilterInternals(filter, PbiFilter::INTERSECT, 1, std::vector<size_t>{});\r
+    }\r
+    { // Add\r
+        auto filter = PbiFilter{ };\r
+        filter.Add(tests::SimpleFilter{ });\r
+        tests::checkFilterInternals(filter, PbiFilter::INTERSECT, 1, std::vector<size_t>{});\r
+    }\r
+\r
+//    PbiFilter shouldNotCompile = PbiFilter{ tests::NoncompliantFilter{ } };                       // <-- when uncommented, should not compile\r
+//    PbiFilter shouldNotCompileEither; shouldNotCompileEither.Add(tests::NoncompliantFilter{ });   // <-- when uncommented, should not compile\r
+}\r
+\r
+TEST(PbiFilterTest, CopyOk)\r
+{\r
+    { // empty\r
+        const auto original = PbiFilter{ };\r
+\r
+        PbiFilter copyCtor(original);\r
+        PbiFilter copyAssign;\r
+        copyAssign = original;\r
+\r
+        tests::checkFilterInternals(original,   PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});\r
+        tests::checkFilterInternals(copyCtor,   PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});\r
+        tests::checkFilterInternals(copyAssign, PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});\r
+    }\r
+    { // with children\r
+        const auto original = PbiFilter{ tests::SimpleFilter{ } };\r
+\r
+        PbiFilter copyCtor(original);\r
+        PbiFilter copyAssign;\r
+        copyAssign = original;\r
+\r
+        tests::checkFilterInternals(original,   PbiFilter::INTERSECT, 1, std::vector<size_t>{});\r
+        tests::checkFilterInternals(copyCtor,   PbiFilter::INTERSECT, 1, std::vector<size_t>{});\r
+        tests::checkFilterInternals(copyAssign, PbiFilter::INTERSECT, 1, std::vector<size_t>{});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, MoveOk)\r
+{\r
+    { // empty\r
+        const auto original = tests::emptyFilter();\r
+\r
+        PbiFilter moveCtor(tests::emptyFilter());\r
+        PbiFilter moveAssign;\r
+        moveAssign = tests::emptyFilter();\r
+\r
+        tests::checkFilterInternals(original,   PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});\r
+        tests::checkFilterInternals(moveCtor,   PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});\r
+        tests::checkFilterInternals(moveAssign, PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});\r
+    }\r
+    { // with children\r
+        const auto original = tests::simpleFilter();\r
+\r
+        PbiFilter moveCtor(tests::simpleFilter());\r
+        PbiFilter moveAssign;\r
+        moveAssign = tests::simpleFilter();\r
+\r
+        tests::checkFilterInternals(original,   PbiFilter::INTERSECT, 1, std::vector<size_t>{0,1,2,3});\r
+        tests::checkFilterInternals(moveCtor,   PbiFilter::INTERSECT, 1, std::vector<size_t>{0,1,2,3});\r
+        tests::checkFilterInternals(moveAssign, PbiFilter::INTERSECT, 1, std::vector<size_t>{0,1,2,3});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, SortsAndUniquesChildFilterResultsOk)\r
+{\r
+    const auto childFilter = tests::SortUniqueTestFilter{ };\r
+    const auto filter = PbiFilter{ childFilter };\r
+    tests::checkFilterRows(childFilter, std::vector<size_t>{2, 7, 0, 3, 4, 1, 8});\r
+    tests::checkFilterRows(filter, std::vector<size_t>{0, 1, 2, 3, 4, 7, 8});\r
+}\r
+\r
+TEST(PbiFilterTest, UnionOk)\r
+{\r
+    { // empty\r
+        { // copy\r
+            const auto emptyFilter = tests::emptyFilter();\r
+            const auto emptyFilter2 = tests::emptyFilter();\r
+            const auto u = PbiFilter::Union({ emptyFilter, emptyFilter2 });\r
+            tests::checkFilterInternals(u, PbiFilter::UNION, 2, std::vector<size_t>{0,1,2,3});\r
+        }\r
+        { // move\r
+            const auto u = PbiFilter::Union({ PbiFilter{ }, PbiFilter{ } });\r
+            tests::checkFilterInternals(u, PbiFilter::UNION, 2, std::vector<size_t>{0,1,2,3});\r
+        }\r
+    }\r
+\r
+    { // with (no-data) children - just checking composition\r
+        { // copy\r
+            const auto simpleFilter = tests::SimpleFilter{ };\r
+            const auto simpleFilter2 = tests::SimpleFilter{ };\r
+            const auto u = PbiFilter::Union({ simpleFilter, simpleFilter2 });\r
+            tests::checkFilterInternals(u, PbiFilter::UNION, 2, std::vector<size_t>{});\r
+        }\r
+        { // move\r
+            const auto u = PbiFilter::Union({ tests::SimpleFilter{ }, tests::SimpleFilter{ } });\r
+            tests::checkFilterInternals(u, PbiFilter::UNION, 2, std::vector<size_t>{});\r
+        }\r
+    }\r
+\r
+    { // 2-child union, results sorted & unique-d by PbiFilter\r
+\r
+        const auto child1 = tests::SortUniqueTestFilter{ };\r
+        const auto child2 = tests::SortUniqueTestFilter2{ };\r
+        const auto u = PbiFilter::Union({ child1, child2 });\r
+\r
+        tests::checkFilterRows(child1, std::vector<size_t>{2, 7, 0, 3, 4, 1, 8});\r
+        tests::checkFilterRows(child2, std::vector<size_t>{3, 7, 5});\r
+        tests::checkFilterRows(u, std::vector<size_t>{0, 1, 2, 3, 4, 5, 7, 8});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, IntersectOk)\r
+{\r
+    { // empty\r
+        { // copy\r
+            const auto emptyFilter = tests::emptyFilter();\r
+            const auto emptyFilter2 = tests::emptyFilter();\r
+            const auto i = PbiFilter::Intersection({ emptyFilter, emptyFilter2 });\r
+            tests::checkFilterInternals(i, PbiFilter::INTERSECT, 2, std::vector<size_t>{0,1,2,3});\r
+        }\r
+        { // move\r
+            const auto i = PbiFilter::Intersection({ PbiFilter{ }, PbiFilter{ } });\r
+            tests::checkFilterInternals(i, PbiFilter::INTERSECT, 2, std::vector<size_t>{0,1,2,3});\r
+        }\r
+    }\r
+\r
+    { // with (no-data) children - just checking composition\r
+        { // copy\r
+            const auto simpleFilter = tests::SimpleFilter{ };\r
+            const auto simpleFilter2 = tests::SimpleFilter{ };\r
+            const auto i = PbiFilter::Intersection({ simpleFilter, simpleFilter2 });\r
+            tests::checkFilterInternals(i, PbiFilter::INTERSECT, 2, std::vector<size_t>{});\r
+        }\r
+        { // move\r
+            const auto i = PbiFilter::Intersection({ tests::SimpleFilter{ }, tests::SimpleFilter{ } });\r
+            tests::checkFilterInternals(i, PbiFilter::INTERSECT, 2, std::vector<size_t>{});\r
+        }\r
+    }\r
+\r
+    { // 2-child intersect, sorted & unique-d by PbiFilter\r
+\r
+        const auto child1 = tests::SortUniqueTestFilter{ };\r
+        const auto child2 = tests::SortUniqueTestFilter2{ };\r
+        const auto i = PbiFilter::Intersection({ child1, child2 });\r
+\r
+        tests::checkFilterRows(child1, std::vector<size_t>{2, 7, 0, 3, 4, 1, 8});\r
+        tests::checkFilterRows(child2, std::vector<size_t>{3, 7, 5 });\r
+        tests::checkFilterRows(i, std::vector<size_t>{3, 7});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, AlignedEndFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 4055 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 4055, Compare::NOT_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,2,3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 4000, Compare::LESS_THAN } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 5560, Compare::GREATER_THAN } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 5560, Compare::GREATER_THAN_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{2,3});\r
+    }\r
+\r
+    {\r
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 7000, Compare::GREATER_THAN } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, AlignedLengthFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiAlignedLengthFilter{ 500, Compare::GREATER_THAN_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,2,3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiAlignedLengthFilter{ 1000, Compare::GREATER_THAN_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,2});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, AlignedStartFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiAlignedStartFilter{ 2600, Compare::LESS_THAN } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,1});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiAlignedStartFilter{ 4102, Compare::GREATER_THAN } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiAlignedStartFilter{ 4102, Compare::GREATER_THAN_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{2,3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiAlignedStartFilter{ 6000, Compare::GREATER_THAN } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{ });\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, AlignedStrandFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiAlignedStrandFilter{ Strand::FORWARD } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,2});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiAlignedStrandFilter{ Strand::REVERSE } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiAlignedStrandFilter{ Strand::FORWARD, Compare::NOT_EQUAL } }; // same as Strand::REVERSE\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,3});\r
+    }\r
+\r
+    // unsupported compare types throw\r
+    EXPECT_THROW(PbiAlignedStrandFilter(Strand::FORWARD, Compare::LESS_THAN),          std::runtime_error);\r
+    EXPECT_THROW(PbiAlignedStrandFilter(Strand::FORWARD, Compare::LESS_THAN_EQUAL),    std::runtime_error);\r
+    EXPECT_THROW(PbiAlignedStrandFilter(Strand::FORWARD, Compare::GREATER_THAN),       std::runtime_error);\r
+    EXPECT_THROW(PbiAlignedStrandFilter(Strand::FORWARD, Compare::GREATER_THAN_EQUAL), std::runtime_error);\r
+}\r
+\r
+TEST(PbiFilterTest, BarcodeFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiBarcodeFilter{ 17 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiBarcodeFilter{ 18 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiBarcodeFilter{ 0 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, BarcodeForwardFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiBarcodeForwardFilter{ 17 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiBarcodeForwardFilter{ 400 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiBarcodeForwardFilter{ {0, 256} } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,2});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, BarcodeQualityFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiBarcodeQualityFilter{ 80, Compare::GREATER_THAN_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiBarcodeQualityFilter{ 40, Compare::LESS_THAN } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, BarcodeReverseFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiBarcodeReverseFilter{ 18 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiBarcodeReverseFilter{ 400 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{ });\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiBarcodeReverseFilter{ {1, 257} } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,2});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, BarcodesFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiBarcodesFilter{ 17, 18 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiBarcodesFilter{ 17, 19 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{ });\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiBarcodesFilter{ std::make_pair(17,18) } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,3});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, IdentityFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiIdentityFilter{ 0.95, Compare::GREATER_THAN_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{3});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, LocalContextFilterOk)\r
+{\r
+    { // == NO_LOCAL_CONTEXT\r
+        const auto filter = PbiFilter { PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0});\r
+    }\r
+    { // != ADAPTER_BEFORE (exact match)\r
+        const auto filter = PbiFilter { PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,2,3});\r
+    }\r
+    { // contains ADAPTER_BEFORE\r
+        const auto filter = PbiFilter { PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,3});\r
+    }\r
+    { // does not contain ADAPTER_BEFORE\r
+        const auto filter = PbiFilter { PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,2});\r
+    }\r
+    { // include both ADAPTER_BEFORE and ADAPTER_AFTER\r
+        const auto filter = PbiFilter::Intersection(\r
+        {\r
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS },\r
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::CONTAINS }\r
+        });\r
+        tests::checkFilterRows(filter, std::vector<size_t>{3});\r
+    }\r
+    { // exclude both ADAPTER_BEFORE and ADAPTER_AFTER\r
+        const auto filter = PbiFilter::Intersection(\r
+        {\r
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS },\r
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::NOT_CONTAINS }\r
+        });\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0});\r
+    }\r
+    { // include everything with either ADAPTER_BEFORE or ADAPTER_AFTER\r
+        const auto filter = PbiFilter::Union(\r
+        {\r
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS },\r
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::CONTAINS }\r
+        });\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,2,3});\r
+    }\r
+    { // include everything with either ADAPTER_BEFORE or ADAPTER_AFTER, but not both\r
+        const auto filter = PbiFilter::Intersection(\r
+        {\r
+                PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL },\r
+                PbiFilter::Union(\r
+                {\r
+                    PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS },\r
+                    PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::NOT_CONTAINS }\r
+                })\r
+        });\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,2});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, MapQualityFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiMapQualityFilter{ 254 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiMapQualityFilter{ 254, Compare::NOT_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, MovieNameFilterOk)\r
+{\r
+    const auto bamFile = BamFile{ tests::Data_Dir + string{ "/group/test2.bam" } };\r
+    const auto index = PbiRawData{ bamFile.PacBioIndexFilename() };\r
+\r
+    {\r
+        const auto filter = PbiFilter{ PbiMovieNameFilter{ "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0" } };\r
+        const auto expectedRows = std::vector<size_t>{0,1,2,3};\r
+        for (size_t row : expectedRows)\r
+            EXPECT_TRUE(filter.Accepts(index, row));\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiMovieNameFilter{ "does_not_exist" } };\r
+        const auto expectedRows = std::vector<size_t>{};\r
+        for (size_t row : expectedRows)\r
+            EXPECT_TRUE(filter.Accepts(index, row));\r
+    }\r
+    {\r
+        const auto names = vector<string>{"does_not_exist",\r
+                                          "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0"};\r
+        const auto filter = PbiFilter{ PbiMovieNameFilter{ names } };\r
+        const auto expectedRows = std::vector<size_t>{0,1,2,3};\r
+        for (size_t row : expectedRows)\r
+            EXPECT_TRUE(filter.Accepts(index, row));\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, NumDeletedBasesFilterOk)\r
+{\r
+    // del: { 12, 38, 45, 11} - calculated from raw data, not stored directly in testing object or read from PBI file\r
+\r
+    {\r
+        const auto filter = PbiFilter{ PbiNumDeletedBasesFilter{ 12, Compare::LESS_THAN_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiNumDeletedBasesFilter{ 45, Compare::EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{2});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, NumInsertedBasesFilterOk)\r
+{\r
+    // ins: { 17, 63, 65, 20 }  - calculated from raw data, not stored directly testing object or read from PBI file\r
+\r
+    {\r
+        const auto filter = PbiFilter{ PbiNumInsertedBasesFilter{ 63, Compare::GREATER_THAN_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,2});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiNumInsertedBasesFilter{ 17, Compare::NOT_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,2,3});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, NumMatchesFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiNumMatchesFilter{ 1000, Compare::GREATER_THAN_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,2});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiNumMatchesFilter{ 400, Compare::LESS_THAN } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, NumMismatchesFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiNumMismatchesFilter{ 0, Compare::EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiNumMismatchesFilter{ 0, Compare::NOT_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, QueryEndFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiQueryEndFilter{ 4055 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiQueryEndFilter{ 6200, Compare::GREATER_THAN_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{3});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, QueryLengthFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiQueryLengthFilter{ 500, Compare::GREATER_THAN_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,2,3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiQueryLengthFilter{ 1000, Compare::GREATER_THAN_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,2});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, QueryNameFilterOk)\r
+{\r
+    const auto bamFile = BamFile{ tests::Data_Dir + string{ "/group/test2.bam" } };\r
+    const auto index = PbiIndex{ bamFile.PacBioIndexFilename() };\r
+\r
+    {\r
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2579_4055" } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/5615_6237" } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{3});\r
+    }\r
+\r
+    {\r
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "does_not_exist/0/0_0" } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{});\r
+    }\r
+    {\r
+        const auto names = vector<string>{"m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2579_4055",\r
+                                          "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/5615_6237"};\r
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ names } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1,3});\r
+    }\r
+\r
+    // invalid QNAME syntax throws\r
+    EXPECT_THROW(\r
+    {\r
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "" } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{});\r
+    },\r
+    std::runtime_error);\r
+    EXPECT_THROW(\r
+    {\r
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "foo" } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{});\r
+    },\r
+    std::runtime_error);\r
+    EXPECT_THROW(\r
+    {\r
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "foo/bar" } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{});\r
+    },\r
+    std::runtime_error);\r
+    EXPECT_THROW(\r
+    {\r
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "foo/bar/baz_bam" } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{});\r
+    },\r
+    std::exception); // come back to see why this is not runtime_error but something else\r
+}\r
+\r
+TEST(PbiFilterTest, QueryStartFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiQueryStartFilter{ 4101 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{2});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiQueryStartFilter{ 5000 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiQueryStartFilter{ 5000, Compare::GREATER_THAN } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{3});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, ReadAccuracyFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiReadAccuracyFilter{ 0.9 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiReadAccuracyFilter{ 0.9, Compare::GREATER_THAN } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,2});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, ReadGroupFilterOk)\r
+{\r
+    { // numeric ID\r
+        const auto filter = PbiReadGroupFilter{ -1197849594 };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});\r
+\r
+        const auto filter2 = PbiReadGroupFilter{ 200 };\r
+        tests::checkFilterRows(filter2, std::vector<size_t>{});\r
+    }\r
+    { // string ID\r
+        const auto filter = PbiReadGroupFilter{ "b89a4406" };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});\r
+\r
+        const auto filter2 = PbiReadGroupFilter{ "b89a4406" };\r
+        tests::checkFilterRows(filter2, std::vector<size_t>{0,1,2,3});\r
+    }\r
+    { // ReadGroupInfo object\r
+        const auto rg = ReadGroupInfo{ "b89a4406" };\r
+        const auto filter = PbiReadGroupFilter{ rg };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});\r
+    }\r
+    { // multi-ID\r
+        const auto ids = vector<int32_t>({-1197849594, 200});\r
+        const auto filter = PbiReadGroupFilter{ ids };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});\r
+    }\r
+    { // multi-string\r
+        const auto ids = vector<string>({"b89a4406", "deadbeef"});\r
+        const auto filter = PbiReadGroupFilter{ ids };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});\r
+    }\r
+    { // multi-ReadGroupInfo\r
+        const auto ids = vector<ReadGroupInfo>({ ReadGroupInfo("b89a4406"), ReadGroupInfo("deadbeef")});\r
+        const auto filter = PbiReadGroupFilter{ ids };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, ReferenceEndFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiReferenceEndFilter{ 9900 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiReferenceEndFilter{ 9900, Compare::GREATER_THAN_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,1,3});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, ReferenceIdFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiReferenceIdFilter{ 0 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiReferenceIdFilter{ 0, Compare::NOT_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{});\r
+    }\r
+    {\r
+        const auto ids = vector<int32_t>({0, 42});\r
+        const auto filter = PbiFilter{ PbiReferenceIdFilter{ ids } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, ReferenceNameFilterOk)\r
+{\r
+    const auto bamFile = BamFile{ tests::Data_Dir + string{ "/group/test2.bam" } };\r
+    const auto index = PbiRawData{ bamFile.PacBioIndexFilename() };\r
+\r
+    {\r
+        const auto filter = PbiFilter{ PbiReferenceNameFilter{ "lambda_NEB3011" } };\r
+        const auto expectedRows = std::vector<size_t>{0,1,2,3};\r
+        for (size_t row : expectedRows)\r
+            EXPECT_TRUE(filter.Accepts(index, row));\r
+\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiReferenceNameFilter{ "lambda_NEB3011", Compare::NOT_EQUAL } };\r
+        const auto expectedRows = std::vector<size_t>{};\r
+        for (size_t row : expectedRows)\r
+            EXPECT_TRUE(filter.Accepts(index, row));\r
+    }\r
+    {\r
+        const auto names = vector<string>({ "lambda_NEB3011" }); // this file only has 1 :(\r
+        const auto filter = PbiFilter{ PbiReferenceNameFilter{ names } };\r
+        const auto expectedRows = std::vector<size_t>{0,1,2,3};\r
+        for (size_t row : expectedRows)\r
+            EXPECT_TRUE(filter.Accepts(index, row));\r
+    }\r
+\r
+    // unsupported compare types throw\r
+    EXPECT_THROW(PbiReferenceNameFilter("foo", Compare::LESS_THAN),          std::runtime_error);\r
+    EXPECT_THROW(PbiReferenceNameFilter("foo", Compare::LESS_THAN_EQUAL),    std::runtime_error);\r
+    EXPECT_THROW(PbiReferenceNameFilter("foo", Compare::GREATER_THAN),       std::runtime_error);\r
+    EXPECT_THROW(PbiReferenceNameFilter("foo", Compare::GREATER_THAN_EQUAL), std::runtime_error);\r
+}\r
+\r
+TEST(PbiFilterTest, ReferenceStartFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiReferenceStartFilter{ 8453 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{1});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiReferenceStartFilter{ 9200, Compare::GREATER_THAN_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,3});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, ZmwFilterOk)\r
+{\r
+    {\r
+        const auto filter = PbiFilter{ PbiZmwFilter{ 14743 } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});\r
+    }\r
+    {\r
+        const auto filter = PbiFilter{ PbiZmwFilter{ 14743, Compare::NOT_EQUAL } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{});\r
+    }\r
+    {\r
+        const auto zmws = vector<int32_t>({14743,42,200});\r
+        const auto filter = PbiFilter{ PbiZmwFilter{ zmws } };\r
+        tests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, FromDataSetOk)\r
+{\r
+    const auto expectedFilter =\r
+        PbiFilter::Union(\r
+        {\r
+            PbiFilter::Intersection(\r
+            {\r
+                PbiZmwFilter{ 14743 },\r
+                PbiReadAccuracyFilter { 0.9, Compare::GREATER_THAN_EQUAL }\r
+            }),\r
+\r
+            PbiReferenceStartFilter { 9200, Compare::GREATER_THAN_EQUAL }\r
+        });\r
+\r
+\r
+    auto properties1 = Properties{ };\r
+    properties1.Add(Property{ "zm", "14743",  "==" });\r
+    properties1.Add(Property{ "rq", "0.9", ">=" });\r
+\r
+    auto datasetFilter1 = Filter{ };\r
+    datasetFilter1.Properties(properties1);\r
+\r
+    auto properties2 = Properties{ };\r
+    properties2.Add(Property{ "pos", "9200", ">=" });\r
+\r
+    auto datasetFilter2 = Filter{ };\r
+    datasetFilter2.Properties(properties2);\r
+\r
+    auto datasetFilters = Filters{ };\r
+    datasetFilters.Add(datasetFilter1);\r
+    datasetFilters.Add(datasetFilter2);\r
+    auto dataset = DataSet{ };\r
+    dataset.Filters(datasetFilters);\r
+\r
+    const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+\r
+    for (size_t i = 0; i < tests::shared_index.NumReads(); ++i) {\r
+        EXPECT_EQ(expectedFilter.Accepts(tests::shared_index, i),\r
+                  generatedFilter.Accepts(tests::shared_index, i));\r
+    }\r
+}\r
+\r
+TEST(PbiFilterTest, BarcodeListFromDataSetXmlOk)\r
+{\r
+    auto runner = [](const Property& property,\r
+                     const PbiFilter& expectedFilter,\r
+                     const std::vector<size_t>& expectedResults)\r
+    {\r
+        auto filter = Filter{ };\r
+        filter.Properties().Add(property);\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter);\r
+\r
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+        tests::checkFilterRows(expectedFilter,  expectedResults);\r
+        tests::checkFilterRows(generatedFilter, expectedResults);\r
+    };\r
+\r
+    // single barcode\r
+    runner(Property{ "bc", "18", "==" },\r
+           PbiBarcodeFilter{ 18, Compare::EQUAL },\r
+           std::vector<size_t>{1,3});\r
+\r
+    // single barcode (bracketed)\r
+    runner(Property{ "bc", "[18]", "==" },\r
+           PbiBarcodeFilter{ 18, Compare::EQUAL },\r
+           std::vector<size_t>{1,3});\r
+\r
+    // barcode pair (square brackets)\r
+    runner(Property{ "bc", "[17,18]", "==" },\r
+           PbiBarcodesFilter{ {17, 18}, Compare::EQUAL },\r
+           std::vector<size_t>{1,3});\r
+\r
+    // barcode pair (parens)\r
+    runner(Property{ "bc", "(17,18)", "==" },\r
+           PbiBarcodesFilter{ {17, 18}, Compare::EQUAL },\r
+           std::vector<size_t>{1,3});\r
+\r
+    // barcode pair (curly brackets)\r
+    runner(Property{ "bc", "{17,18}", "==" },\r
+           PbiBarcodesFilter{ {17, 18}, Compare::EQUAL },\r
+           std::vector<size_t>{1,3});\r
+\r
+    // barcode pair (list, but no brackets)\r
+    runner(Property{ "bc", "17,18", "==" },\r
+           PbiBarcodesFilter{ {17, 18}, Compare::EQUAL },\r
+           std::vector<size_t>{1,3});\r
+\r
+    // barcode pair - same value\r
+    runner(Property{ "bc", "[18,18]", "==" },\r
+           PbiBarcodesFilter{ {18, 18}, Compare::EQUAL },\r
+           std::vector<size_t>{}); // none share forward & reverse\r
+\r
+    auto expectFail = [](const Property& property)\r
+    {\r
+        auto filter = Filter{ };\r
+        filter.Properties().Add(property);\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter);\r
+\r
+        EXPECT_THROW(PbiFilter::FromDataSet(dataset), std::runtime_error);\r
+    };\r
+\r
+    // list-ish, but only one value\r
+    expectFail(Property{ "bc", "[18,]", "==" });\r
+\r
+    // too many barcodes\r
+    expectFail(Property{ "bc", "[18,18,18]", "==" });\r
+}\r
+\r
+TEST(PbiFilterTest, LocalContextFiltersFromDataSetXmlOk)\r
+{\r
+    {   // no adapters or barcodes\r
+\r
+        const auto expectedFilter =\r
+                PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::EQUAL };\r
+\r
+        // XML: <Property Name="cx" Value="0" Operator="==" />\r
+        Property property("cx", "0", "==");\r
+\r
+        auto filter = Filter{ };\r
+        filter.Properties().Add(property);\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter);\r
+\r
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+        tests::checkFilterRows(expectedFilter,  std::vector<size_t>{0});\r
+        tests::checkFilterRows(generatedFilter, std::vector<size_t>{0});\r
+    }\r
+    {   // any adapters or barcodes\r
+\r
+        const auto expectedFilter =\r
+                PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL };\r
+\r
+        // XML: <Property Name="cx" Value="0" Operator="!=" />\r
+        Property property("cx", "0", "!=");\r
+\r
+        auto filter = Filter{ };\r
+        filter.Properties().Add(property);\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter);\r
+\r
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+        tests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});\r
+        tests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});\r
+    }\r
+    {   // contains adapter_before\r
+\r
+        const auto expectedFilter =\r
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS };\r
+\r
+        // XML: <Property Name="cx" Value="1" Operator="&" />\r
+        Property property("cx", "1", "&");\r
+\r
+        auto filter = Filter{ };\r
+        filter.Properties().Add(property);\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter);\r
+\r
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+        tests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,3});\r
+        tests::checkFilterRows(generatedFilter, std::vector<size_t>{1,3});\r
+    }\r
+    {   // contains adapter_before\r
+\r
+        const auto expectedFilter =\r
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS };\r
+\r
+        // XML: <Property Name="cx" Value="ADAPTER_BEFORE" Operator="&" />\r
+        Property property("cx", "ADAPTER_BEFORE", "&");\r
+\r
+        auto filter = Filter{ };\r
+        filter.Properties().Add(property);\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter);\r
+\r
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+        tests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,3});\r
+        tests::checkFilterRows(generatedFilter, std::vector<size_t>{1,3});\r
+    }\r
+    {   // contains adapter_after\r
+\r
+        const auto expectedFilter =\r
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER, Compare::CONTAINS };\r
+\r
+        // XML: <Property Name="cx" Value="2" Operator="&" />\r
+        Property property("cx", "2", "&");\r
+\r
+        auto filter = Filter{ };\r
+        filter.Properties().Add(property);\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter);\r
+\r
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+        tests::checkFilterRows(expectedFilter,  std::vector<size_t>{2,3});\r
+        tests::checkFilterRows(generatedFilter, std::vector<size_t>{2,3});\r
+    }\r
+    {   // contains adapter_before or adapter_after\r
+\r
+        const auto expectedFilter =\r
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER,\r
+                                       Compare::CONTAINS };\r
+\r
+        // XML: <Property Name="cx" Value="3" Operator="&" />\r
+        Property property("cx", "3", "&");\r
+\r
+        auto filter = Filter{ };\r
+        filter.Properties().Add(property);\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter);\r
+\r
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+        tests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});\r
+        tests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});\r
+    }\r
+    {   // contains adapter_before or adapter_after\r
+\r
+        const auto expectedFilter =\r
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER,\r
+                                       Compare::CONTAINS };\r
+\r
+        // XML: <Property Name="cx" Value="ADAPTER_BEFORE | ADAPTER_AFTER" Operator="&" />\r
+        Property property("cx", "ADAPTER_BEFORE | ADAPTER_AFTER", "&");\r
+\r
+        auto filter = Filter{ };\r
+        filter.Properties().Add(property);\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter);\r
+\r
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+        tests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});\r
+        tests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});\r
+    }\r
+    {   // contains adapter_before or adapter_after - no whitespace separation\r
+\r
+        const auto expectedFilter =\r
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER,\r
+                                       Compare::CONTAINS };\r
+\r
+        // XML: <Property Name="cx" Value="ADAPTER_BEFORE|ADAPTER_AFTER" Operator="&" />\r
+        Property property("cx", "ADAPTER_BEFORE|ADAPTER_AFTER", "&");\r
+\r
+        auto filter = Filter{ };\r
+        filter.Properties().Add(property);\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter);\r
+\r
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+        tests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});\r
+        tests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});\r
+    }\r
+    {   // contains adapter_before or adapter_after - a lot of whitespace separation\r
+\r
+        const auto expectedFilter =\r
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER,\r
+                                       Compare::CONTAINS };\r
+\r
+        // XML: <Property Name="cx" Value="ADAPTER_BEFORE        |           ADAPTER_AFTER" Operator="&" />\r
+        Property property("cx", "ADAPTER_BEFORE        |           ADAPTER_AFTER", "&");\r
+\r
+        auto filter = Filter{ };\r
+        filter.Properties().Add(property);\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter);\r
+\r
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+        tests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});\r
+        tests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});\r
+    }\r
+    {   // contains adapter_before or adapter_after, but not both\r
+\r
+        const auto expectedFilter = PbiFilter::Union(\r
+        {\r
+            PbiFilter::Intersection(\r
+            {\r
+                PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL },\r
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS }\r
+            }),\r
+            PbiFilter::Intersection(\r
+            {\r
+                PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL },\r
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER, Compare::NOT_CONTAINS }\r
+            })\r
+        });\r
+\r
+        // XML:\r
+        // <Filters>\r
+        //   <Filter>\r
+        //     <Properties>\r
+        //       <Property Name="cx" Value="0" Operator="!=" />\r
+        //       <Property Name="cx" Value="1" Operator="~" />\r
+        //     </Properties>\r
+        //   </Filter>\r
+        //   <Filter>\r
+        //     <Properties>\r
+        //       <Property Name="cx" Value="0" Operator="!=" />\r
+        //       <Property Name="cx" Value="2" Operator="~" />\r
+        //     </Properties>\r
+        //   </Filter>\r
+        // </Filters>\r
+\r
+        auto filter1 = Filter{ };\r
+        filter1.Properties().Add(Property("cx", "0", "!="));\r
+        filter1.Properties().Add(Property("cx", "1", "~"));\r
+\r
+        auto filter2 = Filter{ };\r
+        filter2.Properties().Add(Property("cx", "0", "!="));\r
+        filter2.Properties().Add(Property("cx", "2", "~"));\r
+\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter1);\r
+        dataset.Filters().Add(filter2);\r
+\r
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+        tests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2});\r
+        tests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2});\r
+\r
+    }\r
+    {   // contains adapter_before or adapter_after\r
+\r
+        const auto expectedFilter = PbiFilter::Union(\r
+        {\r
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS },\r
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::CONTAINS }\r
+        });\r
+\r
+        // XML:\r
+        // <Filters>\r
+        //   <Filter>\r
+        //     <Properties>\r
+        //       <Property Name="cx" Value="1" Operator="&" />\r
+        //     </Properties>\r
+        //   </Filter>\r
+        //   <Filter>\r
+        //     <Properties>\r
+        //       <Property Name="cx" Value="2" Operator="&" />\r
+        //     </Properties>\r
+        //   </Filter>\r
+        // </Filters>\r
+\r
+        auto filter1 = Filter{ };\r
+        filter1.Properties().Add(Property("cx", "1", "&"));\r
+\r
+        auto filter2 = Filter{ };\r
+        filter2.Properties().Add(Property("cx", "2", "&"));\r
+\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter1);\r
+        dataset.Filters().Add(filter2);\r
+\r
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+        tests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});\r
+        tests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});\r
+    }\r
+    { // adapter_before and adapter_after\r
+\r
+        const auto expectedFilter = PbiFilter::Intersection(\r
+        {\r
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS },\r
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::CONTAINS }\r
+        });\r
+\r
+        // XML:\r
+        // <Property Name="cx" Value="1" Operator="&" />\r
+        // <Property Name="cx" Value="2" Operator="&" />\r
+        Property property1("cx", "1", "&");\r
+        Property property2("cx", "2", "&");\r
+\r
+        auto filter = Filter{ };\r
+        filter.Properties().Add(property1);\r
+        filter.Properties().Add(property2);\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter);\r
+\r
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+        tests::checkFilterRows(expectedFilter,  std::vector<size_t>{3});\r
+        tests::checkFilterRows(generatedFilter, std::vector<size_t>{3});\r
+    }\r
+    {   // adapter_before, but no adapter_after\r
+\r
+        const auto expectedFilter = PbiFilter::Intersection(\r
+        {\r
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS },\r
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::NOT_CONTAINS }\r
+        });\r
+\r
+        // XML:\r
+        // <Property Name="cx" Value="1" Operator="&" />\r
+        // <Property Name="cx" Value="2" Operator="~" />\r
+        Property property1("cx", "1", "&");\r
+        Property property2("cx", "2", "~");\r
+\r
+        auto filter = Filter{ };\r
+        filter.Properties().Add(property1);\r
+        filter.Properties().Add(property2);\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter);\r
+\r
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+        tests::checkFilterRows(expectedFilter,  std::vector<size_t>{1});\r
+        tests::checkFilterRows(generatedFilter, std::vector<size_t>{1});\r
+    }\r
+    {   // contains no adapter_before\r
+\r
+        const auto expectedFilter =\r
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS };\r
+\r
+        // XML: <Property Name="cx" Value="1" Operator="~" />\r
+        Property property("cx", "1", "~");\r
+\r
+        auto filter = Filter{ };\r
+        filter.Properties().Add(property);\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter);\r
+\r
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+        tests::checkFilterRows(expectedFilter,  std::vector<size_t>{0,2});\r
+        tests::checkFilterRows(generatedFilter, std::vector<size_t>{0,2});\r
+    }\r
+    {   // contains no adapter_before or adapter_after\r
+\r
+        const auto expectedFilter = PbiFilter::Intersection(\r
+        {\r
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS },\r
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::NOT_CONTAINS }\r
+        });\r
+\r
+        // XML:\r
+        // <Property Name="cx" Value="1" Operator="~" />\r
+        // <Property Name="cx" Value="2" Operator="~" />\r
+        Property property1("cx", "1", "~");\r
+        Property property2("cx", "2", "~");\r
+\r
+        auto filter = Filter{ };\r
+        filter.Properties().Add(property1);\r
+        filter.Properties().Add(property2);\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter);\r
+\r
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+        tests::checkFilterRows(expectedFilter,  std::vector<size_t>{0});\r
+        tests::checkFilterRows(generatedFilter, std::vector<size_t>{0});\r
+    }\r
+    {   // contains no adapter_before or adapter_after\r
+\r
+        const auto expectedFilter =\r
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER,\r
+                                       Compare::NOT_CONTAINS };\r
+\r
+        // XML: <Property Name="cx" Value="3" Operator="~" />\r
+        Property property("cx", "3", "~");\r
+\r
+        auto filter = Filter{ };\r
+        filter.Properties().Add(property);\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter);\r
+\r
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);\r
+        tests::checkFilterRows(expectedFilter,  std::vector<size_t>{0});\r
+        tests::checkFilterRows(generatedFilter, std::vector<size_t>{0});\r
+    }\r
+    {   // throws on invalid enum name\r
+\r
+        Property property("cx", "DOES_NOT_EXIST", "~");\r
+\r
+        auto filter = Filter{ };\r
+        filter.Properties().Add(property);\r
+        DataSet dataset = DataSet{ };\r
+        dataset.Filters().Add(filter);\r
+\r
+        EXPECT_THROW(PbiFilter::FromDataSet(dataset), std::runtime_error);\r
+    }\r
+}\r
diff --git a/tests/src/test_PbiFilterQuery.cpp b/tests/src/test_PbiFilterQuery.cpp

new file mode 100644 (file)

index 0000000..cacd7cd
--- /dev/null
+++ b/tests/src/test_PbiFilterQuery.cpp
@@ -0,0 +1,444 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <pbbam/PbiFilterQuery.h>
+#include <algorithm>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+TEST(PbiFilterQueryTest, QueryOk)
+{
+    const auto bamFile = BamFile{ tests::Data_Dir + string{ "/group/test2.bam" } };
+
+    {
+        int count = 0;
+        PbiFilterQuery query( PbiQueryLengthFilter{ 500, Compare::GREATER_THAN_EQUAL}, bamFile);
+        for (const auto& r: query) {
+            ++count;
+            EXPECT_GE((r.QueryEnd() - r.QueryStart()), 500);
+        }
+        EXPECT_EQ(3, count);
+    }
+    {
+        // all records aligned to reverse strand && pos >= 9200
+        const auto filter = PbiFilter::Intersection(
+        {
+            PbiAlignedStrandFilter{Strand::REVERSE},
+            PbiReferenceStartFilter{9200, Compare::GREATER_THAN_EQUAL}
+        });
+
+        int count = 0;
+        PbiFilterQuery query(filter, bamFile);
+        for (const auto& r: query) {
+            ++count;
+            EXPECT_EQ(Strand::REVERSE, r.AlignedStrand());
+            EXPECT_GE((r.ReferenceStart()), 9200);
+            EXPECT_EQ(string("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/5615_6237"), r.FullName());
+        }
+        EXPECT_EQ(1, count);
+    }
+    {
+        // all records aligned to forward strand && pos >= 9200
+        const auto filter = PbiFilter::Intersection(
+        {
+            PbiAlignedStrandFilter{Strand::FORWARD},
+            PbiReferenceStartFilter{9200, Compare::GREATER_THAN_EQUAL}
+        });
+
+        int count = 0;
+        PbiFilterQuery query(filter, bamFile);
+        for (const auto& r: query) {
+            ++count;
+            EXPECT_EQ(Strand::FORWARD, r.AlignedStrand());
+            EXPECT_GE((r.ReferenceStart()), 9200);
+            EXPECT_EQ(string("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2114_2531"), r.FullName());
+        }
+        EXPECT_EQ(1, count);
+    }
+    {
+        // all records from RG ("b89a4406") with numMatches >= 1200
+        const auto filter = PbiFilter::Intersection(
+        {
+            PbiReadGroupFilter{"b89a4406"},
+            PbiNumMatchesFilter{1200, Compare::GREATER_THAN_EQUAL}
+        });
+
+        int count = 0;
+        PbiFilterQuery query(filter, bamFile);
+        for (const auto& r: query) {
+            ++count;
+            EXPECT_EQ(string("b89a4406"), r.ReadGroupId());
+            EXPECT_GE((r.NumMatches()), 1200);
+            if (count == 1)
+                EXPECT_EQ(string("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2579_4055"), r.FullName());
+            else if (count == 2)
+                EXPECT_EQ(string("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/4101_5571"), r.FullName());
+        }
+        EXPECT_EQ(2, count);
+    }
+}
+
+TEST(PbiFilterQueryTest, ZmwRangeFromDatasetOk)
+{
+    const auto expectedMovieName = string{ "m150404_101626_42267_c100807920800000001823174110291514_s1_p0" };
+
+    const DataSet ds(tests::Data_Dir + "/chunking/chunking.subreadset.xml");
+    EXPECT_EQ(3, ds.BamFiles().size());
+
+    { // movie name
+
+        int count = 0;
+        PbiFilterQuery query{ PbiMovieNameFilter{expectedMovieName}, ds };
+        for (const BamRecord& r : query) {
+            EXPECT_EQ(expectedMovieName, r.MovieName());
+            ++count;
+        }
+        EXPECT_EQ(1220, count);
+    }
+    
+    { // sequencing chemistries
+        set<string> chems{ ds.SequencingChemistries() };
+        set<string> expected{ "P6-C4" };
+        EXPECT_TRUE(equal(chems.begin(), chems.end(), expected.begin()));
+    }
+
+    { // min ZMW
+
+        int count = 0;
+        PbiFilterQuery query{ PbiZmwFilter{54, Compare::GREATER_THAN}, ds };
+        for (const BamRecord& r : query) {
+            EXPECT_GT(r.HoleNumber(), 54);
+            ++count;
+        }
+        EXPECT_EQ(1220, count);
+    }
+
+    { // max ZMW
+
+        int count = 0;
+        PbiFilterQuery query{ PbiZmwFilter{1816, Compare::LESS_THAN}, ds };
+        for (const BamRecord& r : query) {
+            EXPECT_LT(r.HoleNumber(),1816);
+            ++count;
+        }
+        EXPECT_EQ(150, count);
+    }
+
+    { // put all together, from DataSet XML
+
+        const PbiFilter filter = PbiFilter::FromDataSet(ds);
+        PbiFilterQuery query(filter, ds);
+        int count = 0;
+        for (const BamRecord& r : query) {
+            EXPECT_EQ(expectedMovieName, r.MovieName());
+            const auto zmw = r.HoleNumber();
+            EXPECT_GT(zmw, 54);
+            EXPECT_LT(zmw, 1816);
+            ++count;
+        }
+        EXPECT_EQ(150, count);
+    }
+    { // empty filter object - should return all records from the same dataset
+
+        PbiFilterQuery query(PbiFilter{ }, ds);
+        int count = 0;
+        for (const BamRecord& r : query) {
+            (void)r;
+            ++count;
+        }
+        EXPECT_EQ(1220, count);
+    }
+    { // no <Filters> element present at all
+
+        const DataSet ds(tests::GeneratedData_Dir + "/chunking_missingfilters.subreadset.xml");
+        const PbiFilter filter = PbiFilter::FromDataSet(ds);
+        PbiFilterQuery query(filter, ds);
+        int count = 0;
+        for (const BamRecord& r : query) {
+            (void)r;
+            ++count;
+        }
+        EXPECT_EQ(1220, count);
+    }
+    { // <Filters> element contains no child <Filter> elements
+
+        const DataSet ds(tests::GeneratedData_Dir + "/chunking_emptyfilters.subreadset.xml");
+        const PbiFilter filter = PbiFilter::FromDataSet(ds);
+        PbiFilterQuery query(filter, ds);
+        int count = 0;
+        for (const BamRecord& r : query) {
+            (void)r;
+            ++count;
+        }
+        EXPECT_EQ(1220, count);
+    }
+}
+
+TEST(PbiFilterQueryTest, MissingPbiShouldThrow)
+{
+    const PbiFilter filter{ PbiZmwFilter{31883} };
+    const string phi29Bam = tests::GeneratedData_Dir + "/missing_pbi.bam";
+    const string hasPbiBam = tests::Data_Dir + "/polymerase/production.scraps.bam";
+
+    { // single file, missing PBI
+
+        EXPECT_THROW(PbiFilterQuery(filter, phi29Bam), std::runtime_error);
+    }
+
+    { // from dataset, all missing PBI
+
+        DataSet ds;
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam));
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam));
+        EXPECT_THROW(PbiFilterQuery(filter, ds), std::runtime_error);
+    }
+
+    { // from dataset, mixed PBI presence
+
+        DataSet ds;
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam));
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.ScrapsBamFile", hasPbiBam));
+        EXPECT_THROW(PbiFilterQuery(filter, ds), std::runtime_error);
+    }
+}
+
+TEST(PbiFilterQueryTest, QNameWhitelistFile)
+{
+    const DataSet ds(tests::Data_Dir + "/polymerase/qnameFiltered.subreads.dataset.xml");
+    const PbiFilter filter = PbiFilter::FromDataSet(ds);
+    PbiFilterQuery query(filter, ds);
+    int count = 0;
+    for (const BamRecord& r : query) {
+        (void)r;
+        ++count;
+    }
+    EXPECT_EQ(3, count);
+}
+
+TEST(PbiFilterQueryTest, EmptyFiles)
+{
+    const BamFile file{ tests::Data_Dir + "/empty.bam" };
+    PbiFilterQuery query{ PbiFilter{}, file };
+    size_t count = 0;
+    for (const auto& r : query) {
+        (void)r;
+        ++count;
+    }
+    EXPECT_EQ(0, count);
+}
+
+TEST(PbiFilterQueryTest, BarcodeData)
+{
+   const BamFile file{ tests::Data_Dir + "/phi29.bam" };
+
+   // bc_quality == 1
+   {
+       size_t count = 0;
+       PbiFilterQuery query{ PbiBarcodeQualityFilter{1}, file };
+       for (const auto& r : query) {
+           (void)r;
+           ++count;
+       }
+       EXPECT_EQ(120, count);
+   }
+
+   // bc_quality != 1
+   {
+       size_t count = 0;
+       PbiFilterQuery query{ PbiBarcodeQualityFilter{1, Compare::NOT_EQUAL}, file };
+       for (const auto& r : query) {
+           (void)r;
+           ++count;
+       }
+       EXPECT_EQ(0, count);
+   }
+
+   // bc_forward == 0
+   {
+       size_t count = 0;
+       PbiFilterQuery query{ PbiBarcodeForwardFilter{0}, file };
+       for (const auto& r : query) {
+           (void)r;
+           ++count;
+       }
+       EXPECT_EQ(40, count);
+   }
+
+   // bc_forward == [0,2]
+   {
+       size_t count = 0;
+       const auto ids = vector<int16_t>{ 0, 2 };
+       PbiFilterQuery query{ PbiBarcodeForwardFilter{ ids }, file };
+       for (const auto& r : query) {
+           (void)r;
+           ++count;
+       }
+       EXPECT_EQ(80, count);
+   }
+
+   // bc_reverse != 0
+   {
+       size_t count = 0;
+       PbiFilterQuery query{ PbiBarcodeReverseFilter{0, Compare::NOT_EQUAL}, file };
+       for (const auto& r : query) {
+           (void)r;
+           ++count;
+       }
+       EXPECT_EQ(80, count);
+   }
+}
+
+TEST(PbiFilterQueryTest, BarcodeQualityFromXml)
+{
+
+const string xml_all = R"_XML_(
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet 
+   xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" 
+   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+   xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+   xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+   xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+   xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+   xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd" 
+   UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" 
+   TimeStampedName="subreadset_150304_231155" 
+   MetaType="PacBio.DataSet.SubreadSet" 
+   Name="DataSet_SubreadSet" 
+   Tags="" 
+   Version="3.0.0" 
+   CreatedAt="2015-01-27T09:00:01"> 
+<pbbase:ExternalResources>
+   <pbbase:ExternalResource 
+       UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193" 
+       TimeStampedName="subread_bam_150304_231155" 
+       MetaType="PacBio.SubreadFile.SubreadBamFile" 
+       ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam">
+       <pbbase:FileIndices>
+           <pbbase:FileIndex 
+               UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5194" 
+               TimeStampedName="bam_index_150304_231155" 
+               MetaType="PacBio.Index.PacBioIndex" 
+               ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi"/>
+       </pbbase:FileIndices>
+   </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+<pbds:Filters>
+    <pbds:Filter>
+        <pbbase:Properties>
+            <pbbase:Property Name="bq" Operator="=" Value="1"/>
+        </pbbase:Properties>
+    </pbds:Filter>
+</pbds:Filters>
+</pbds:SubreadSet>
+)_XML_";
+
+const string xml_none = R"_XML_(
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet
+   xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd"
+   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+   xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+   xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+   xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+   xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+   xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd"
+   UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c"
+   TimeStampedName="subreadset_150304_231155"
+   MetaType="PacBio.DataSet.SubreadSet"
+   Name="DataSet_SubreadSet"
+   Tags=""
+   Version="3.0.0"
+   CreatedAt="2015-01-27T09:00:01">
+<pbbase:ExternalResources>
+   <pbbase:ExternalResource
+       UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193"
+       TimeStampedName="subread_bam_150304_231155"
+       MetaType="PacBio.SubreadFile.SubreadBamFile"
+       ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam">
+       <pbbase:FileIndices>
+           <pbbase:FileIndex
+               UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5194"
+               TimeStampedName="bam_index_150304_231155"
+               MetaType="PacBio.Index.PacBioIndex"
+               ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi"/>
+       </pbbase:FileIndices>
+   </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+<pbds:Filters>
+    <pbds:Filter>
+        <pbbase:Properties>
+            <pbbase:Property Name="bq" Operator="!=" Value="1"/>
+        </pbbase:Properties>
+    </pbds:Filter>
+</pbds:Filters>
+</pbds:SubreadSet>
+)_XML_";
+
+     const BamFile file{ tests::Data_Dir + "/phi29.bam" }; 
+
+     {   // filter allows all records
+         const DataSet ds = DataSet::FromXml(xml_all);
+         const PbiFilterQuery query { PbiFilter::FromDataSet(ds), file };
+         size_t count = 0;
+         for (const auto& r : query) {
+             (void)r;
+             ++count;
+         }
+         EXPECT_EQ(120, count);
+     }
+     {    // filter allows no records
+         const DataSet ds = DataSet::FromXml(xml_none);
+         const PbiFilterQuery query { PbiFilter::FromDataSet(ds), file };
+         size_t count = 0;
+         for (const auto& r : query) {
+             (void)r;
+             ++count;
+         }
+         EXPECT_EQ(0, count);
+    }
+}
+
+
diff --git a/tests/src/test_Pulse2BaseCache.cpp b/tests/src/test_Pulse2BaseCache.cpp

new file mode 100644 (file)

index 0000000..e93fa73
--- /dev/null
+++ b/tests/src/test_Pulse2BaseCache.cpp
@@ -0,0 +1,84 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <pbbam/../../src/Pulse2BaseCache.h>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+TEST(Pulse2BaseCacheTest, CountsDetectedInConstructor)
+{
+    const string pulseCalls = "ACccTTAGtTCAtG";
+    const string trimmedPC  = "ACTTAGTCAG";
+
+    const Pulse2BaseCache cache{ pulseCalls };
+
+    EXPECT_EQ(pulseCalls.size(), cache.NumPulses());
+    EXPECT_EQ(trimmedPC.size(),  cache.NumBases());
+}
+
+TEST(Pulse2BaseCacheTest, RemovesSquashedPulsesFromString)
+{
+    const string pulseCalls = "ACccTTAGtTCAtG";
+    const string trimmedPC  = "ACTTAGTCAG";
+    const string altLabel   = "-G--A--T--AC--";
+    const string trimmedAlt = "-GA--T-AC-";
+
+    const Pulse2BaseCache cache{ pulseCalls };
+
+    EXPECT_EQ(trimmedPC,  cache.RemoveSquashedPulses(pulseCalls));
+    EXPECT_EQ(trimmedAlt, cache.RemoveSquashedPulses(altLabel));
+}
+
+TEST(Pulse2BaseCacheTest, RemovesSquashedPulsesFromVector)
+{
+    const string pulseCalls = "ACccTTAGtTCAtG";
+    const vector<uint16_t> pkMean        = {5,4,2,2,3,8,8,8,4,7,7,7,3,4};
+    const vector<uint16_t> trimmedPkmean = {5,4,3,8,8,8,7,7,7,4};
+
+    const Pulse2BaseCache cache{ pulseCalls };
+
+    EXPECT_EQ(trimmedPkmean, cache.RemoveSquashedPulses(pkMean));
+}
diff --git a/tests/src/test_QNameQuery.cpp b/tests/src/test_QNameQuery.cpp

new file mode 100644 (file)

index 0000000..1a2dcd1
--- /dev/null
+++ b/tests/src/test_QNameQuery.cpp
@@ -0,0 +1,96 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Yuan Li
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <pbbam/QNameQuery.h>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+static const string dataDir = tests::Data_Dir + "/group/";
+static const string test1fn = string(dataDir) + "test1.bam";
+static const string test2fn = string(dataDir) + "test2.bam";
+static const string test3fn = string(dataDir) + "test3.bam";
+
+static
+void TestQNameQuery(const string& fn, const vector<int>& expected)
+{
+    EXPECT_NO_THROW(
+    {
+        vector<int> counts;
+        QNameQuery qQuery(fn);
+        for (const vector<BamRecord>& records : qQuery)
+            counts.push_back(records.size());
+        EXPECT_EQ(expected, counts);
+    });
+}
+
+static
+void TestNoneConstQNameQuery(const string& fn, const vector<int>& expected)
+{
+    EXPECT_NO_THROW(
+    {
+        vector<int> counts;
+        QNameQuery qQuery(fn);
+        for (vector<BamRecord>& records : qQuery)
+            counts.push_back(records.size());
+        EXPECT_EQ(expected, counts);
+    });
+}
+
+TEST(QNameQueryTest, CountQSizes)
+{
+    // test case 1 has exactly one bamRecord.
+    string fn = test1fn;
+    vector<int> expected({1});
+    TestQNameQuery(fn, expected);
+    TestNoneConstQNameQuery(fn, expected);
+
+    // test case 2 has bamRecords of four subreads.
+    fn = test2fn;
+    expected = {1, 1, 1, 1};
+    TestQNameQuery(fn, expected);
+    TestNoneConstQNameQuery(fn, expected);
+
+    fn = test3fn;
+    expected = {2,1,1,1,1,1,1,2,1,1,1};
+    TestQNameQuery(fn, expected);
+    TestNoneConstQNameQuery(fn, expected);
+}
+
diff --git a/tests/src/test_QualityValues.cpp b/tests/src/test_QualityValues.cpp

new file mode 100644 (file)

index 0000000..98795a3
--- /dev/null
+++ b/tests/src/test_QualityValues.cpp
@@ -0,0 +1,120 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/QualityValues.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+TEST(QualityValueTest, DefaultsOk)
+{
+    const QualityValue value;
+    EXPECT_EQ(0,   value);
+    EXPECT_EQ('!', value.Fastq());
+}
+
+TEST(QualityValueTest, FromNumber)
+{
+    const QualityValue zero(0);
+    const QualityValue thirtyThree(33);
+    const QualityValue valid(42);
+    const QualityValue max(93);
+    const QualityValue tooHigh(94);
+    const QualityValue wayTooHigh(INT8_MAX);
+
+    EXPECT_EQ(0,  zero);
+    EXPECT_EQ(33, thirtyThree);
+    EXPECT_EQ(42, valid);
+    EXPECT_EQ(93, max);
+    EXPECT_EQ(93, tooHigh);
+    EXPECT_EQ(93, wayTooHigh);
+
+    EXPECT_EQ('!', zero.Fastq());
+    EXPECT_EQ('B', thirtyThree.Fastq());
+    EXPECT_EQ('K', valid.Fastq());
+    EXPECT_EQ('~', max.Fastq());
+    EXPECT_EQ('~', tooHigh.Fastq());
+    EXPECT_EQ('~', wayTooHigh.Fastq());
+}
+
+TEST(QualityValueTest, FromFastq)
+{
+    const QualityValue zero        = QualityValue::FromFastq('!');
+    const QualityValue thirtyThree = QualityValue::FromFastq('B');
+    const QualityValue valid       = QualityValue::FromFastq('K');
+    const QualityValue max         = QualityValue::FromFastq('~');
+
+    EXPECT_EQ(0,  zero);
+    EXPECT_EQ(33, thirtyThree);
+    EXPECT_EQ(42, valid);
+    EXPECT_EQ(93, max);
+}
+
+TEST(QualityValuesTest, Default)
+{
+    const QualityValues qvs;
+    EXPECT_TRUE(qvs.empty());
+    EXPECT_EQ(string(), qvs.Fastq());
+}
+
+TEST(QualityValuesTest, FromNumbers)
+{
+    const string fastqString = "~~~KKBB!!";
+    const vector<uint8_t> values = { 93, 93, 93, 42, 42, 33, 33, 0, 0 };
+
+    QualityValues qvs;
+    for (auto qv : values)
+        qvs.push_back(qv);
+    EXPECT_EQ(fastqString, qvs.Fastq());
+}
+
+TEST(QualityValuesTest, FromFastq)
+{
+    const string fastqString = "~~~KKBB!!";
+    const vector<uint8_t> values = { 93, 93, 93, 42, 42, 33, 33, 0, 0 };
+
+    const QualityValues& qvs = QualityValues::FromFastq(fastqString);
+    EXPECT_EQ(fastqString.size(), qvs.size());
+    EXPECT_EQ(values.size(), qvs.size());
+    for (size_t i = 0; i < fastqString.size(); ++i)
+        EXPECT_EQ(values.at(i), qvs.at(i));
+}
diff --git a/tests/src/test_ReadAccuracyQuery.cpp b/tests/src/test_ReadAccuracyQuery.cpp

new file mode 100644 (file)

index 0000000..05d8bfc
--- /dev/null
+++ b/tests/src/test_ReadAccuracyQuery.cpp
@@ -0,0 +1,72 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <pbbam/ReadAccuracyQuery.h>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+TEST(ReadAccuracyQueryTest, QueryOk)
+{
+    const auto bamFile = BamFile{ tests::Data_Dir + string{ "/group/test2.bam" } };
+
+    {
+        int count = 0;
+        ReadAccuracyQuery query(0.901, Compare::GREATER_THAN_EQUAL, bamFile);
+        for (const auto& r: query) {
+            ++count;
+            EXPECT_GE(r.ReadAccuracy(), 0.901);
+        }
+        EXPECT_EQ(4, count);
+    }
+    {
+        int count = 0;
+        ReadAccuracyQuery query(0.95, Compare::GREATER_THAN_EQUAL, bamFile);
+        for (const auto& r: query) {
+            ++count;
+            EXPECT_GE(r.ReadAccuracy(), 0.901);
+        }
+        EXPECT_EQ(0, count);
+    }
+}
diff --git a/tests/src/test_ReadGroupInfo.cpp b/tests/src/test_ReadGroupInfo.cpp

new file mode 100644 (file)

index 0000000..8b9f23a
--- /dev/null
+++ b/tests/src/test_ReadGroupInfo.cpp
@@ -0,0 +1,230 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett, Lance Hepler
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/ReadGroupInfo.h>
+#include <string>
+#include <vector>
+using namespace PacBio::BAM;
+using namespace std;
+
+TEST(ReadGroupInfoTest, IdFromMovieNameAndReadType)
+{
+    ReadGroupInfo rg("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0", "HQREGION");
+    EXPECT_EQ("00082ba1", rg.Id());
+}
+
+TEST(ReadGroupInfoTest, FrameCodecSetOk)
+{
+    ReadGroupInfo rg("test");
+    rg.IpdCodec(FrameCodec::V1);
+    EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::IPD));
+    EXPECT_EQ("ip", rg.BaseFeatureTag(BaseFeature::IPD));
+    EXPECT_EQ(FrameCodec::V1, rg.IpdCodec());
+}
+
+TEST(ReadGroupInfoTest, SequencingChemistryOk)
+{
+    {   // P6-C4
+        const string& chem = "P6-C4";
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100356300","100356200","2.1"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100356300","100356200","2.3"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100356300","100612400","2.1"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100356300","100612400","2.3"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100372700","100356200","2.1"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100372700","100356200","2.3"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100372700","100612400","2.1"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100372700","100612400","2.3"));
+
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100356300")
+          .SequencingKit("100356200")
+          .BasecallerVersion("2.1");
+        EXPECT_EQ(chem, rg.SequencingChemistry());
+    }
+
+    {   // S/P1-C1/beta
+        const string& chem = "S/P1-C1/beta";
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-620-000","3.0"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-620-000","3.1"));
+
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-620-000")
+          .BasecallerVersion("3.0");
+        EXPECT_EQ(chem, rg.SequencingChemistry());
+    }
+
+    {   // S/P1-C1.1 (Echidna)
+        const string& chem = "S/P1-C1.1";
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-867-300","3.1"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-867-300","3.2"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-867-300","3.3"));
+
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-867-300")
+          .BasecallerVersion("3.1");
+        EXPECT_EQ(chem, rg.SequencingChemistry());
+    }
+
+    {   // S/P1-C1.2 (Flea)
+        const string& chem = "S/P1-C1.2";
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-902-100","3.1"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-902-100","3.2"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-902-100","3.3"));
+
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-902-100")
+          .BasecallerVersion("3.1");
+        EXPECT_EQ(chem, rg.SequencingChemistry());
+    }
+    {   // S/P1-C1.3 (Goat)
+        const string& chem = "S/P1-C1.3";
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-972-200","3.2"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-972-200","3.3"));
+
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-972-200")
+          .BasecallerVersion("3.3");
+        EXPECT_EQ(chem, rg.SequencingChemistry());
+    }
+}
+
+TEST(ReadGroupInfoTest, SequencingChemistryThrowsOnBadTriple)
+{
+    // check that we actually throw
+    ReadGroupInfo rg("BAD");
+    rg.BindingKit("100372700")
+      .SequencingKit("100-619-400")
+      .BasecallerVersion("2.0");
+    EXPECT_THROW(rg.SequencingChemistry(), InvalidSequencingChemistryException);
+
+    // now check thrown contents
+    try {
+        ReadGroupInfo rg("BAD");
+        rg.BindingKit("100372700")
+          .SequencingKit("100-619-400")
+          .BasecallerVersion("2.0");
+    } catch (InvalidSequencingChemistryException& e) {
+        EXPECT_EQ(string("100372700"),   e.BindingKit());
+        EXPECT_EQ(string("100-619-400"), e.SequencingKit());
+        EXPECT_EQ(string("2.0"),         e.BasecallerVersion());
+    }
+}
+
+TEST(ReadGroupInfoTest, BasecallerVersion)
+{
+    // too short
+    try {
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-867-300")
+          .BasecallerVersion("3");
+        const string chem = rg.SequencingChemistry();
+        (void)chem;
+
+    } catch (std::runtime_error& e) {
+        EXPECT_EQ(string("basecaller version too short: 3"), string(e.what()));
+    }
+
+    // initial implementation assumed single digit version numbers:
+    //    const string ver{ basecallerVersion.substr(0, 3) };
+    // So '3.299.dummy' would incorrectly be interpreted as (OK) '3.2'.
+    // 3.
+
+    try {
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-867-300")
+          .BasecallerVersion("3.199.dummy");   
+        const string chem = rg.SequencingChemistry();
+        (void)chem;
+
+    } catch (InvalidSequencingChemistryException& e) {
+        EXPECT_EQ("100-619-300", e.BindingKit());
+        EXPECT_EQ("100-867-300", e.SequencingKit());
+        EXPECT_EQ("3.199.dummy", e.BasecallerVersion());
+    }
+    //EXPECT_THROW(rg.SequencingChemistry(), InvalidSequencingChemistryException);
+}
+
+TEST(ReadGroupInfoTest, ClearBaseFeatures)
+{
+    ReadGroupInfo rg("test");
+    rg.BaseFeatureTag(BaseFeature::DELETION_QV,     "dq");
+    rg.BaseFeatureTag(BaseFeature::DELETION_TAG,    "dt");
+    rg.BaseFeatureTag(BaseFeature::INSERTION_QV,    "iq");
+    rg.BaseFeatureTag(BaseFeature::MERGE_QV,        "mq");
+    rg.BaseFeatureTag(BaseFeature::SUBSTITUTION_QV, "sq");
+    EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::DELETION_QV));
+    EXPECT_EQ("dq", rg.BaseFeatureTag(BaseFeature::DELETION_QV));
+
+    rg.ClearBaseFeatures();
+    EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::DELETION_QV));
+    EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::DELETION_TAG));
+    EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::INSERTION_QV));
+    EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::MERGE_QV));
+    EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::SUBSTITUTION_QV));
+}
+
+TEST(ReadGroupInfoTest, RemoveBaseFeature)
+{
+    ReadGroupInfo rg("test");
+    rg.BaseFeatureTag(BaseFeature::DELETION_QV,     "dq");
+    rg.BaseFeatureTag(BaseFeature::DELETION_TAG,    "dt");
+    rg.BaseFeatureTag(BaseFeature::INSERTION_QV,    "iq");
+    rg.BaseFeatureTag(BaseFeature::MERGE_QV,        "mq");
+    rg.BaseFeatureTag(BaseFeature::SUBSTITUTION_QV, "sq");
+    EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::DELETION_QV));
+    EXPECT_EQ("dq", rg.BaseFeatureTag(BaseFeature::DELETION_QV));
+
+    rg.RemoveBaseFeature(BaseFeature::DELETION_QV);
+    EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::DELETION_QV));
+    
+    EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::DELETION_TAG));
+    EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::INSERTION_QV));
+    EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::MERGE_QV));
+    EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::SUBSTITUTION_QV));
+}
+
diff --git a/tests/src/test_SamWriter.cpp b/tests/src/test_SamWriter.cpp

new file mode 100644 (file)

index 0000000..f13b5df
--- /dev/null
+++ b/tests/src/test_SamWriter.cpp
@@ -0,0 +1,150 @@
+// Copyright (c) 2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <pbbam/SamWriter.h>
+#include <fstream>
+#include <iostream>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+TEST(SamWriterTest, HeaderOk)
+{
+    // setup header
+    const string hdrText = {
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.3\n"
+        "@RG\tID:6002b307\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;"
+             "SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\t"
+             "PU:test\tPM:SEQUEL\n"
+    };
+
+    EXPECT_NO_THROW(
+    {
+        // write header to file
+        const string generatedFn = tests::GeneratedData_Dir + "/samwriter_hdr_only.sam";
+        {
+            const BamHeader inputHeader(hdrText);
+            SamWriter writer(generatedFn, inputHeader);
+            (void)writer;
+        };
+
+        // check header
+        {
+            ifstream f(generatedFn);
+            const string text((istreambuf_iterator<char>(f)),
+                               istreambuf_iterator<char>());
+            EXPECT_EQ(hdrText, text);
+        }
+
+        // clean up
+        remove(generatedFn.c_str());
+    });
+}
+
+TEST(SamWriterTest, SingleRecordOk)
+{
+
+    // setup header
+    const string hdrLine1 = { "@HD\tVN:1.1\tSO:unknown\tpb:3.0.3" };
+    const string hdrLine2 = {
+        "@RG\tID:6002b307\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;"
+             "SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\t"
+             "PU:test\tPM:SEQUEL"
+    };
+    const string hdrText = hdrLine1 + "\n" + hdrLine2 + "\n";
+    const BamHeader inputHeader(hdrText);
+
+    // setup record
+    BamRecord record(inputHeader);
+    record.Impl().Name("test/100/0_5");
+    record.Impl().SetSequenceAndQualities("ACGTC", 5, "@@@@@");
+    record.Impl().CigarData("");
+    record.Impl().Bin(0);
+    record.Impl().Flag(0);
+    record.Impl().InsertSize(0);
+    record.Impl().MapQuality(0);
+    record.Impl().MatePosition(-1);
+    record.Impl().MateReferenceId(-1);
+    record.Impl().Position(-1);
+    record.Impl().ReferenceId(-1);
+    record.Impl().SetMapped(false);
+
+    TagCollection tags;
+    tags["zm"] = static_cast<int32_t>(100);
+    tags["qs"] = static_cast<Position>(0);
+    tags["qe"] = static_cast<Position>(5);
+    tags["np"] = static_cast<int32_t>(1);
+    tags["rq"] = static_cast<float>(0.6);
+    tags["RG"] = std::string{ "6002b307" };
+    tags["sn"] = vector<float>{0.2f,0.2f,0.2f,0.2f};
+    record.Impl().Tags(tags);
+
+    const string expectedSamRecord = {
+        "test/100/0_5\t4\t*\t0\t0\t*\t*\t0\t0\tACGTC\t@@@@@\tRG:Z:6002b307\t"
+        "np:i:1\tqe:i:5\tqs:i:0\trq:f:0.6\tsn:B:f,0.2,0.2,0.2,0.2\tzm:i:100"
+    };
+
+    EXPECT_NO_THROW(
+    {
+        // write data to file
+        const string generatedFn = tests::GeneratedData_Dir + "/samwriter_hdr_and_record.sam";
+        {
+            SamWriter writer(generatedFn, inputHeader);
+            writer.Write(record);
+        };
+
+        // check header & record
+        {
+            ifstream f(generatedFn);
+            string line1;
+            string line2;
+            string line3;
+            std::getline(f, line1);
+            std::getline(f, line2);
+            std::getline(f, line3);
+            EXPECT_EQ(hdrLine1, line1);
+            EXPECT_EQ(hdrLine2, line2);
+            EXPECT_EQ(expectedSamRecord, line3);
+        }
+
+        // cleanup
+        remove(generatedFn.c_str());
+    });
+}
diff --git a/tests/src/test_SequenceUtils.cpp b/tests/src/test_SequenceUtils.cpp

new file mode 100644 (file)

index 0000000..20bf5e6
--- /dev/null
+++ b/tests/src/test_SequenceUtils.cpp
@@ -0,0 +1,117 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/../../src/SequenceUtils.h>
+#include <string>
+#include <vector>
+#include <climits>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+TEST(SequenceUtilsTest, ComplementChar)
+{
+                        // complement
+    const char A = 'A'; // T
+    const char B = 'B'; // V
+    const char C = 'C'; // G
+    const char D = 'D'; // H
+    const char E = 'E'; // null
+    const char F = 'F'; // null
+    const char G = 'G'; // C
+    const char H = 'H'; // D
+    const char I = 'I'; // null
+    const char J = 'J'; // null
+    const char K = 'K'; // M
+    const char L = 'L'; // null
+    const char M = 'M'; // K
+    const char N = 'N'; // N
+    const char O = 'O'; // null
+    const char P = 'P'; // null
+    const char Q = 'Q'; // null
+    const char R = 'R'; // Y
+    const char S = 'S'; // S
+    const char T = 'T'; // A
+    const char U = 'U'; // A
+    const char V = 'V'; // B
+    const char W = 'W'; // W
+    const char X = 'X'; // null
+    const char Y = 'Y'; // R
+    const char Z = 'Z'; // null
+
+    EXPECT_EQ(T, Complement(A));
+    EXPECT_EQ(V, Complement(B));
+    EXPECT_EQ(G, Complement(C));
+    EXPECT_EQ(H, Complement(D));
+    EXPECT_EQ(0, Complement(E));
+    EXPECT_EQ(0, Complement(F));
+    EXPECT_EQ(C, Complement(G));
+    EXPECT_EQ(D, Complement(H));
+    EXPECT_EQ(0, Complement(I));
+    EXPECT_EQ(0, Complement(J));
+    EXPECT_EQ(M, Complement(K));
+    EXPECT_EQ(0, Complement(L));
+    EXPECT_EQ(K, Complement(M));
+    EXPECT_EQ(N, Complement(N));
+    EXPECT_EQ(0, Complement(O));
+    EXPECT_EQ(0, Complement(P));
+    EXPECT_EQ(0, Complement(Q));
+    EXPECT_EQ(Y, Complement(R));
+    EXPECT_EQ(S, Complement(S));
+    EXPECT_EQ(A, Complement(T));
+    EXPECT_EQ(A, Complement(U));
+    EXPECT_EQ(B, Complement(V));
+    EXPECT_EQ(W, Complement(W));
+    EXPECT_EQ(0, Complement(X));
+    EXPECT_EQ(R, Complement(Y));
+    EXPECT_EQ(0, Complement(Z));
+}
+
+TEST(SequenceUtilsTest, ReverseComplement)
+{
+    string input1 = "ATATATCCCGGCG";
+    const string rc1 = "CGCCGGGATATAT";
+
+    ReverseComplement(input1);
+    EXPECT_EQ(rc1, input1);
+}
diff --git a/tests/src/test_StringUtils.cpp b/tests/src/test_StringUtils.cpp

new file mode 100644 (file)

index 0000000..d335246
--- /dev/null
+++ b/tests/src/test_StringUtils.cpp
@@ -0,0 +1,70 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/../../src/StringUtils.h>
+#include <string>
+#include <vector>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+TEST(StringUtilsTest, BasicSplit)
+{
+    const string test = "foo\tbar\tbaz";
+    const vector<string> tokens = internal::Split(test, '\t');
+    EXPECT_EQ(3, tokens.size());
+    EXPECT_TRUE(tokens.at(0) == "foo");
+    EXPECT_TRUE(tokens.at(1) == "bar");
+    EXPECT_TRUE(tokens.at(2) == "baz");
+}
+
+TEST(StringUtilsTest, SplitKeepsEmptyTokens)
+{
+    const string test = "foo\tbar\t\tbaz";
+    const vector<string> tokens = internal::Split(test, '\t');
+    EXPECT_EQ(4, tokens.size());
+    EXPECT_TRUE(tokens.at(0) == "foo");
+    EXPECT_TRUE(tokens.at(1) == "bar");
+    EXPECT_TRUE(tokens.at(2) == "");
+    EXPECT_TRUE(tokens.at(3) == "baz");
+}
diff --git a/tests/src/test_SubreadLengthQuery.cpp b/tests/src/test_SubreadLengthQuery.cpp

new file mode 100644 (file)

index 0000000..9d55e21
--- /dev/null
+++ b/tests/src/test_SubreadLengthQuery.cpp
@@ -0,0 +1,81 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <pbbam/SubreadLengthQuery.h>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+TEST(SubreadLengthQueryTest, QueryOk)
+{
+    const auto bamFile = BamFile{ tests::Data_Dir + string{ "/group/test2.bam" } };
+
+    {
+        int count = 0;
+        SubreadLengthQuery query(500, Compare::GREATER_THAN_EQUAL, bamFile);
+        for (const auto& r: query) {
+            ++count;
+            EXPECT_GE((r.QueryEnd() - r.QueryStart()), 500);
+        }
+        EXPECT_EQ(3, count);
+    }
+    {
+        int count = 0;
+        SubreadLengthQuery query(1000, Compare::GREATER_THAN_EQUAL, bamFile);
+        for (const auto& r: query) {
+            ++count;
+            EXPECT_GE((r.QueryEnd() - r.QueryStart()), 1000);
+        }
+        EXPECT_EQ(2, count);
+    }
+    {
+        int count = 0;
+        SubreadLengthQuery query(5000, Compare::GREATER_THAN_EQUAL, bamFile);
+        for (const auto& r: query) {
+            ++count;
+            EXPECT_GE((r.QueryEnd() - r.QueryStart()), 5000);
+        }
+        EXPECT_EQ(0, count);
+    }
+}
diff --git a/tests/src/test_Tags.cpp b/tests/src/test_Tags.cpp

new file mode 100644 (file)

index 0000000..6755204
--- /dev/null
+++ b/tests/src/test_Tags.cpp
@@ -0,0 +1,1144 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <boost/type_traits/is_convertible.hpp>
+#include <gtest/gtest.h>
+#include <pbbam/BamTagCodec.h>
+#include <pbbam/TagCollection.h>
+#include <pbbam/SamTagCodec.h>
+#include <algorithm>
+#include <iostream>
+#include <map>
+#include <string>
+
+#include <typeinfo>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+TEST(TagTest, TagConstruction)
+{
+    int8_t   i8   = 0;
+    uint8_t  u8   = 0;
+    int16_t  i16  = 0;
+    uint16_t  u16 = 0;
+    int32_t  i32  = 0;
+    uint32_t u32  = 0;
+    float    f    = 0.0;
+    string   str  = "";
+    vector<int8_t>   i8_array;
+    vector<uint8_t>  u8_array;
+    vector<int16_t>  i16_array;
+    vector<uint16_t> u16_array;
+    vector<int32_t>  i32_array;
+    vector<uint32_t> u32_Array;
+    vector<float>    float_array;
+
+    signed char   c  = 'A';
+    unsigned char uc = 'A';
+
+    Tag i8Tag(i8);
+    Tag u8Tag(u8);
+    Tag i16Tag(i16);
+    Tag u16Tag(u16);
+    Tag i32Tag(i32);
+    Tag u32Tag(u32);
+    Tag floatTag(f);
+    Tag stringTag(str);
+    Tag i8_array_Tag(i8_array);
+    Tag u8_array_Tag(u8_array);
+    Tag i16_array_Tag(i16_array);
+    Tag u16_array_Tag(u16_array);
+    Tag i32_array_Tag(i32_array);
+    Tag u32_array_Tag(u32_Array);
+    Tag float_array_Tag(float_array);
+
+    Tag charTag(c, TagModifier::ASCII_CHAR);
+    Tag ucharTag(uc, TagModifier::ASCII_CHAR);
+
+    EXPECT_TRUE(i8Tag.Type()     == TagDataType::INT8);
+    EXPECT_TRUE(u8Tag.Type()     == TagDataType::UINT8);
+    EXPECT_TRUE(i16Tag.Type()    == TagDataType::INT16);
+    EXPECT_TRUE(u16Tag.Type()    == TagDataType::UINT16);
+    EXPECT_TRUE(i32Tag.Type()    == TagDataType::INT32);
+    EXPECT_TRUE(u32Tag.Type()    == TagDataType::UINT32);
+    EXPECT_TRUE(floatTag.Type()  == TagDataType::FLOAT);
+    EXPECT_TRUE(stringTag.Type() == TagDataType::STRING);
+    EXPECT_TRUE(i8_array_Tag.Type()    == TagDataType::INT8_ARRAY);
+    EXPECT_TRUE(u8_array_Tag.Type()    == TagDataType::UINT8_ARRAY);
+    EXPECT_TRUE(i16_array_Tag.Type()   == TagDataType::INT16_ARRAY);
+    EXPECT_TRUE(u16_array_Tag.Type()   == TagDataType::UINT16_ARRAY);
+    EXPECT_TRUE(i32_array_Tag.Type()   == TagDataType::INT32_ARRAY);
+    EXPECT_TRUE(u32_array_Tag.Type()   == TagDataType::UINT32_ARRAY);
+    EXPECT_TRUE(float_array_Tag.Type() == TagDataType::FLOAT_ARRAY);
+
+    EXPECT_TRUE(charTag.ToAscii()  == 'A');
+    EXPECT_TRUE(ucharTag.ToAscii() == 'A');
+}
+
+TEST(TagTest, CopyAndCompare)
+{
+    int8_t   i8   = 0;
+    uint8_t  u8   = 0;
+    int16_t  i16  = 0;
+    uint16_t  u16 = 0;
+    int32_t  i32  = 0;
+    uint32_t u32  = 0;
+    float    f    = 0.0;
+    string   str  = "";
+    vector<int8_t>   i8_array;
+    vector<uint8_t>  u8_array;
+    vector<int16_t>  i16_array;
+    vector<uint16_t> u16_array;
+    vector<int32_t>  i32_array;
+    vector<uint32_t> u32_Array;
+    vector<float>    float_array;
+
+    Tag i8Tag(i8);
+    Tag u8Tag(u8);
+    Tag i16Tag(i16);
+    Tag u16Tag(u16);
+    Tag i32Tag(i32);
+    Tag u32Tag(u32);
+    Tag floatTag(f);
+    Tag stringTag(str);
+    Tag i8_array_Tag(i8_array);
+    Tag u8_array_Tag(u8_array);
+    Tag i16_array_Tag(i16_array);
+    Tag u16_array_Tag(u16_array);
+    Tag i32_array_Tag(i32_array);
+    Tag u32_array_Tag(u32_Array);
+    Tag float_array_Tag(float_array);
+
+    Tag i8Tag2 = i8Tag;
+    Tag u8Tag2 = u8Tag;
+    Tag i16Tag2 = i16Tag;
+    Tag u16Tag2 = u16Tag;
+    Tag i32Tag2 = i32Tag;
+    Tag u32Tag2 = u32Tag;
+    Tag floatTag2 = floatTag;
+    Tag stringTag2 = stringTag;
+    Tag i8_array_Tag2 = i8_array_Tag;
+    Tag u8_array_Tag2 = u8_array_Tag;
+    Tag i16_array_Tag2 = i16_array_Tag;
+    Tag u16_array_Tag2 = u16_array_Tag;
+    Tag i32_array_Tag2 = i32_array_Tag;
+    Tag u32_array_Tag2 = u32_array_Tag;
+    Tag float_array_Tag2 = float_array_Tag;
+
+    EXPECT_EQ(i8Tag, i8Tag2);
+    EXPECT_EQ(u8Tag, u8Tag2);
+    EXPECT_EQ(i16Tag, i16Tag2);
+    EXPECT_EQ(u16Tag, u16Tag2);
+    EXPECT_EQ(i32Tag, i32Tag2);
+    EXPECT_EQ(u32Tag, u32Tag2);
+    EXPECT_EQ(floatTag, floatTag2);
+    EXPECT_EQ(stringTag, stringTag2);
+    EXPECT_EQ(i8_array_Tag, i8_array_Tag2);
+    EXPECT_EQ(u8_array_Tag, u8_array_Tag2);
+    EXPECT_EQ(i16_array_Tag, i16_array_Tag2);
+    EXPECT_EQ(u16_array_Tag, u16_array_Tag2);
+    EXPECT_EQ(i32_array_Tag, i32_array_Tag2);
+    EXPECT_EQ(u32_array_Tag, u32_array_Tag2);
+    EXPECT_EQ(float_array_Tag, float_array_Tag2);
+}
+
+TEST(TagTest, Type_None)
+{
+     Tag tag;
+
+     EXPECT_TRUE(tag.Type() == TagDataType::INVALID);
+     EXPECT_TRUE(tag.IsNull());
+     EXPECT_TRUE(tag.Typename() == "none");
+
+     EXPECT_FALSE(tag.IsNumeric());
+     EXPECT_FALSE(tag.IsString());
+     EXPECT_FALSE(tag.IsArray());
+}
+
+TEST(TagTest, Type_Int8)
+{
+    const int8_t v = -42;
+    const Tag tag(v);
+
+    int8_t v2;
+    EXPECT_NO_THROW(v2 = tag.ToInt8());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::INT8);
+    EXPECT_TRUE(tag.Typename() == "int8_t");
+    EXPECT_TRUE(tag.IsInt8());
+
+    EXPECT_TRUE(tag.IsSignedInt());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsNumeric());
+
+    EXPECT_FALSE(tag.IsUnsignedInt());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_UInt8)
+{
+    const uint8_t v = 42;
+    const Tag tag(v);
+
+    uint8_t v2;
+    EXPECT_NO_THROW(v2 = tag.ToUInt8());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT8);
+    EXPECT_TRUE(tag.Typename() == "uint8_t");
+    EXPECT_TRUE(tag.IsUInt8());
+
+    EXPECT_TRUE(tag.IsUnsignedInt());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsNumeric());
+
+    EXPECT_FALSE(tag.IsSignedInt());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_Ascii)
+{
+    const char          c  = '$';
+    const signed char   sc = '$';
+    const unsigned char uc = '$';
+    const uint8_t u8 = 65;
+    const int8_t  i8 = 66;
+
+    { // old style: construct-then-modify
+
+        Tag fromPlainChar = Tag(c);
+        Tag fromSignedChar = Tag(sc);
+        Tag fromUnsignedChar = Tag(uc);
+        Tag fromUint8 = Tag(u8);
+        Tag fromInt8  = Tag(i8);
+        fromPlainChar.Modifier(TagModifier::ASCII_CHAR);
+        fromSignedChar.Modifier(TagModifier::ASCII_CHAR);
+        fromUnsignedChar.Modifier(TagModifier::ASCII_CHAR);
+        fromUint8.Modifier(TagModifier::ASCII_CHAR);
+        fromInt8.Modifier(TagModifier::ASCII_CHAR);
+
+        EXPECT_TRUE(fromPlainChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromPlainChar.IsIntegral());
+        EXPECT_TRUE(fromPlainChar.IsNumeric());
+        EXPECT_EQ('$', fromPlainChar.ToAscii());
+
+        EXPECT_TRUE(fromSignedChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromSignedChar.IsIntegral());
+        EXPECT_TRUE(fromSignedChar.IsNumeric());
+        EXPECT_EQ('$', fromSignedChar.ToAscii());
+
+        EXPECT_TRUE(fromUnsignedChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromUnsignedChar.IsIntegral());
+        EXPECT_TRUE(fromUnsignedChar.IsNumeric());
+        EXPECT_EQ('$', fromUnsignedChar.ToAscii());
+
+        EXPECT_TRUE(fromUint8.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromUint8.IsIntegral());
+        EXPECT_TRUE(fromUint8.IsNumeric());
+        EXPECT_EQ('A', fromUint8.ToAscii());
+
+        EXPECT_TRUE(fromInt8.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromInt8.IsIntegral());
+        EXPECT_TRUE(fromInt8.IsNumeric());
+        EXPECT_EQ('B', fromInt8.ToAscii());
+    }
+
+    { // new style: construct directly as ASCII
+
+        const Tag fromPlainChar    = Tag(c,  TagModifier::ASCII_CHAR);
+        const Tag fromSignedChar   = Tag(sc, TagModifier::ASCII_CHAR);
+        const Tag fromUnsignedChar = Tag(uc, TagModifier::ASCII_CHAR);
+        const Tag fromUint8 = Tag(u8, TagModifier::ASCII_CHAR);
+        const Tag fromInt8  = Tag(i8, TagModifier::ASCII_CHAR);
+
+        EXPECT_TRUE(fromPlainChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromPlainChar.IsIntegral());
+        EXPECT_TRUE(fromPlainChar.IsNumeric());
+        EXPECT_EQ('$', fromPlainChar.ToAscii());
+
+        EXPECT_TRUE(fromSignedChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromSignedChar.IsIntegral());
+        EXPECT_TRUE(fromSignedChar.IsNumeric());
+        EXPECT_EQ('$', fromSignedChar.ToAscii());
+
+        EXPECT_TRUE(fromUnsignedChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromUnsignedChar.IsIntegral());
+        EXPECT_TRUE(fromUnsignedChar.IsNumeric());
+        EXPECT_EQ('$', fromUnsignedChar.ToAscii());
+
+        EXPECT_TRUE(fromUint8.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromUint8.IsIntegral());
+        EXPECT_TRUE(fromUint8.IsNumeric());
+        EXPECT_EQ('A', fromUint8.ToAscii());
+
+        EXPECT_TRUE(fromInt8.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromInt8.IsIntegral());
+        EXPECT_TRUE(fromInt8.IsNumeric());
+        EXPECT_EQ('B', fromInt8.ToAscii());
+    }
+
+    // check invalid constructs
+    EXPECT_THROW(Tag('A', TagModifier::HEX_STRING), std::runtime_error);
+}
+
+TEST(TagTest, Type_Int16)
+{
+    const int16_t v = -42;
+    const Tag tag(v);
+
+    int16_t v2;
+    EXPECT_NO_THROW(v2 = tag.ToInt16());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::INT16);
+    EXPECT_TRUE(tag.Typename() == "int16_t");
+    EXPECT_TRUE(tag.IsInt16());
+    EXPECT_TRUE(tag.IsSignedInt());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsNumeric());
+
+    EXPECT_FALSE(tag.IsUnsignedInt());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_UInt16)
+{
+    const uint16_t v = 42;
+    const Tag tag(v);
+
+    uint16_t v2;
+    EXPECT_NO_THROW(v2 = tag.ToUInt16());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT16);
+    EXPECT_TRUE(tag.Typename() == "uint16_t");
+    EXPECT_TRUE(tag.IsUInt16());
+    EXPECT_TRUE(tag.IsUnsignedInt());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsNumeric());
+
+    EXPECT_FALSE(tag.IsSignedInt());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_Int32)
+{
+    const int32_t v = -42;
+    const Tag tag(v);
+
+    int32_t v2;
+    EXPECT_NO_THROW(v2 = tag.ToInt32());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::INT32);
+    EXPECT_TRUE(tag.Typename() == "int32_t");
+    EXPECT_TRUE(tag.IsInt32());
+    EXPECT_TRUE(tag.IsSignedInt());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsNumeric());
+
+    EXPECT_FALSE(tag.IsUnsignedInt());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_UInt32)
+{
+    const uint32_t v = 42;
+    const Tag tag(v);
+
+    uint32_t v2;
+    EXPECT_NO_THROW(v2 = tag.ToUInt32());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT32);
+    EXPECT_TRUE(tag.Typename() == "uint32_t");
+    EXPECT_TRUE(tag.IsUInt32());
+    EXPECT_TRUE(tag.IsUnsignedInt());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsNumeric());
+
+    EXPECT_FALSE(tag.IsSignedInt());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_Float)
+{
+    const float v = 3.141;
+    const Tag tag(v);
+
+    float v2;
+    EXPECT_NO_THROW(v2 = tag.ToFloat());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::FLOAT);
+    EXPECT_TRUE(tag.Typename() == "float");
+    EXPECT_TRUE(tag.IsFloat());
+    EXPECT_TRUE(tag.IsNumeric());
+
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsIntegral());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_String)
+{
+    const string v = "foo_who";
+    const Tag tag(v);
+
+    string v2;
+    EXPECT_NO_THROW(v2 = tag.ToString());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::STRING);
+    EXPECT_TRUE(tag.Typename() == "string");
+    EXPECT_TRUE(tag.IsString());
+
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+    EXPECT_FALSE(tag.IsArray());
+
+    EXPECT_EQ(v, v2);
+
+    // "Hex format" string
+    const Tag hex("DEADBEEF", TagModifier::HEX_STRING);
+    EXPECT_TRUE(hex.Type() == TagDataType::STRING);
+    EXPECT_TRUE(hex.Typename() == "string");
+    EXPECT_TRUE(hex.IsString());
+    EXPECT_TRUE(hex.HasModifier(TagModifier::HEX_STRING));
+    EXPECT_FALSE(hex.IsNull());
+    EXPECT_FALSE(hex.IsNumeric());
+    EXPECT_FALSE(hex.IsArray());
+
+    // check invalid constructs
+    EXPECT_THROW(Tag("DEADBEEF", TagModifier::ASCII_CHAR), std::runtime_error);
+}
+
+TEST(TagTest, Type_Int8Array)
+{
+    const vector<int8_t> v = { -42, 100, 0 };
+    const Tag tag(v);
+
+    vector<int8_t> v2;
+    EXPECT_NO_THROW(v2 = tag.ToInt8Array());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::INT8_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<int8_t>");
+    EXPECT_TRUE(tag.IsInt8Array());
+    EXPECT_TRUE(tag.IsSignedArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsArray());
+
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_UInt8Array)
+{
+    const vector<uint8_t> v = { 42, 200, 0 };
+    const Tag tag(v);
+
+    vector<uint8_t> v2;
+    EXPECT_NO_THROW(v2 = tag.ToUInt8Array());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT8_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<uint8_t>");
+    EXPECT_TRUE(tag.IsUInt8Array());
+    EXPECT_TRUE(tag.IsUnsignedArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsArray());
+
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_Int16Array)
+{
+    const vector<int16_t> v = { 42, -300, 0 };
+    const Tag tag(v);
+
+    vector<int16_t> v2;
+    EXPECT_NO_THROW(v2 = tag.ToInt16Array());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::INT16_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<int16_t>");
+    EXPECT_TRUE(tag.IsInt16Array());
+    EXPECT_TRUE(tag.IsSignedArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsArray());
+
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_UInt16Array)
+{
+    const vector<uint16_t> v = { 42, 300, 0 };
+    const Tag tag(v);
+
+    vector<uint16_t> v2;
+    EXPECT_NO_THROW(v2 = tag.ToUInt16Array());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT16_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<uint16_t>");
+    EXPECT_TRUE(tag.IsUInt16Array());
+    EXPECT_TRUE(tag.IsUnsignedArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsArray());
+
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+
+    EXPECT_EQ(v, v2);;
+}
+
+TEST(TagTest, Type_Int32Array)
+{
+    const vector<int32_t> v = { 42, -300, 0 };
+    const Tag tag(v);
+
+    vector<int32_t> v2;
+    EXPECT_NO_THROW(v2 = tag.ToInt32Array());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::INT32_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<int32_t>");
+    EXPECT_TRUE(tag.IsInt32Array());
+    EXPECT_TRUE(tag.IsSignedArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsArray());
+
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_UInt32Array)
+{
+    const vector<uint32_t> v = { 42, 300, 0 };
+    const Tag tag(v);
+
+    vector<uint32_t> v2;
+    EXPECT_NO_THROW(v2 = tag.ToUInt32Array());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT32_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<uint32_t>");
+    EXPECT_TRUE(tag.IsUInt32Array());
+    EXPECT_TRUE(tag.IsUnsignedArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsArray());
+
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_FloatArray)
+{
+    const vector<float> v = { 1.1f, 1.2f, 1.3f };
+    const Tag tag(v);
+
+    vector<float> v2;
+    EXPECT_NO_THROW(v2 = tag.ToFloatArray());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::FLOAT_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<float>");
+    EXPECT_TRUE(tag.IsFloatArray());
+    EXPECT_TRUE(tag.IsArray());
+
+    EXPECT_FALSE(tag.IsIntegralArray());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, CastBackToOriginalOk)
+{
+    int8_t   i8   = 0;
+    uint8_t  u8   = 0;
+    int16_t  i16  = 0;
+    uint16_t  u16 = 0;
+    int32_t  i32  = 0;
+    uint32_t u32  = 0;
+    float    f    = 0.0;
+    string   str  = "";
+    vector<int8_t>   i8_array;
+    vector<uint8_t>  u8_array;
+    vector<int16_t>  i16_array;
+    vector<uint16_t> u16_array;
+    vector<int32_t>  i32_array;
+    vector<uint32_t> u32_array;
+    vector<float>    float_array;
+
+    Tag i8Tag(i8);
+    Tag u8Tag(u8);
+    Tag i16Tag(i16);
+    Tag u16Tag(u16);
+    Tag i32Tag(i32);
+    Tag u32Tag(u32);
+    Tag floatTag(f);
+    Tag stringTag(str);
+    Tag i8_array_Tag(i8_array);
+    Tag u8_array_Tag(u8_array);
+    Tag i16_array_Tag(i16_array);
+    Tag u16_array_Tag(u16_array);
+    Tag i32_array_Tag(i32_array);
+    Tag u32_array_Tag(u32_array);
+    Tag float_array_Tag(float_array);
+
+    EXPECT_NO_THROW({
+        i8 = i8Tag.ToInt8();
+        u8 = u8Tag.ToUInt8();
+        i16 = i16Tag.ToInt16();
+        u16 = u16Tag.ToUInt16();
+        i32 = i32Tag.ToInt32();
+        u32 = u32Tag.ToUInt32();
+        f = floatTag.ToFloat();
+        str = stringTag.ToString();
+        i8_array = i8_array_Tag.ToInt8Array();
+        u8_array = u8_array_Tag.ToUInt8Array();
+        i16_array = i16_array_Tag.ToInt16Array();
+        u16_array = u16_array_Tag.ToUInt16Array();
+        i32_array = i32_array_Tag.ToInt32Array();
+        u32_array = u32_array_Tag.ToUInt32Array();
+        float_array = float_array_Tag.ToFloatArray();
+    });
+}
+
+TEST(TagTest, ConvertToInt8)
+{
+    Tag zero(int32_t(0));
+    Tag min(int32_t(INT8_MIN));
+    Tag normal(int32_t(42));
+    Tag max(int32_t(INT8_MAX));
+    Tag underflow(int32_t(INT8_MIN-1));
+    Tag overflow(int32_t(INT8_MAX+1));
+    Tag floatTag(float(3.14));
+    Tag stringTag(string("foo"));
+    Tag arrayTag(vector<int8_t>({1, 2, 3}));
+
+    // allowed
+    EXPECT_NO_THROW(
+    {
+        zero.ToInt8();
+        min.ToInt8();
+        normal.ToInt8();
+        max.ToInt8();
+    });
+
+    // not allowed
+    EXPECT_THROW(underflow.ToInt8(), std::exception);
+    EXPECT_THROW(overflow.ToInt8(),  std::exception);
+    EXPECT_THROW(floatTag.ToInt8(),  std::exception);
+    EXPECT_THROW(stringTag.ToInt8(), std::exception);
+    EXPECT_THROW(arrayTag.ToInt8(),  std::exception);
+}
+
+TEST(TagTest, ConvertToUInt8)
+{
+    Tag zero(int32_t(0));
+    Tag neg(int32_t(-1));
+    Tag normal(int32_t(42));
+    Tag max(int32_t(UINT8_MAX));
+    Tag overflow(int32_t(UINT8_MAX+1));
+    Tag floatTag(float(3.14));
+    Tag stringTag(string("foo"));
+    Tag arrayTag(vector<uint8_t>({1, 2, 3}));
+
+    // allowed
+    EXPECT_NO_THROW(
+    {
+        zero.ToUInt8();
+        normal.ToUInt8();
+        max.ToUInt8();
+    });
+
+    // not allowed
+    EXPECT_THROW(neg.ToUInt8(),       std::exception);
+    EXPECT_THROW(overflow.ToUInt8(),  std::exception);
+    EXPECT_THROW(floatTag.ToUInt8(),  std::exception);
+    EXPECT_THROW(stringTag.ToUInt8(), std::exception);
+    EXPECT_THROW(arrayTag.ToUInt8(),  std::exception);
+}
+
+TEST(TagTest, ConvertToInt16)
+{
+    Tag zero(int32_t(0));
+    Tag min(int32_t(INT16_MIN));
+    Tag normal(int32_t(42));
+    Tag max(int32_t(INT16_MAX));
+    Tag underflow(int32_t(INT16_MIN-1));
+    Tag overflow(int32_t(INT16_MAX+1));
+    Tag floatTag(float(3.14));
+    Tag stringTag(string("foo"));
+    Tag arrayTag(vector<int16_t>({1, 2, 3}));
+
+    // allowed
+    EXPECT_NO_THROW(
+    {
+        zero.ToInt16();
+        min.ToInt16();
+        normal.ToInt16();
+        max.ToInt16();
+    });
+
+    // not allowed
+    EXPECT_THROW(underflow.ToInt16(), std::exception);
+    EXPECT_THROW(overflow.ToInt16(),  std::exception);
+    EXPECT_THROW(floatTag.ToInt16(),  std::exception);
+    EXPECT_THROW(stringTag.ToInt16(), std::exception);
+    EXPECT_THROW(arrayTag.ToInt16(),  std::exception);
+}
+
+TEST(TagTest, ConvertToUInt16)
+{
+    Tag zero(int32_t(0));
+    Tag neg(int32_t(-1));
+    Tag normal(int32_t(42));
+    Tag max(int32_t(UINT16_MAX));
+    Tag overflow(int32_t(UINT16_MAX+1));
+    Tag floatTag(float(3.14));
+    Tag stringTag(string("foo"));
+    Tag arrayTag(vector<uint16_t>({1, 2, 3}));
+
+    // allowed
+    EXPECT_NO_THROW(
+    {
+        zero.ToUInt16();
+        normal.ToUInt16();
+        max.ToUInt16();
+    });
+
+    // not allowed
+    EXPECT_THROW(neg.ToUInt16(),       std::exception);
+    EXPECT_THROW(overflow.ToUInt16(),  std::exception);
+    EXPECT_THROW(floatTag.ToUInt16(),  std::exception);
+    EXPECT_THROW(stringTag.ToUInt16(), std::exception);
+    EXPECT_THROW(arrayTag.ToUInt16(),  std::exception);
+}
+
+TEST(TagTest, ConvertToInt32)
+{
+    Tag zero(int32_t(0));
+    Tag min(int32_t(INT32_MIN));
+    Tag normal(int32_t(42));
+    Tag max(int32_t(INT32_MAX));
+    Tag floatTag(float(3.14));
+    Tag stringTag(string("foo"));
+    Tag arrayTag(vector<int32_t>({1, 2, 3}));
+
+    // no 64-bit ctors - will not compile
+    //
+    // Tag underflow(int64_t(INT32_MIN-1));
+    // Tag overflow(int64_t(INT32_MAX+1));
+
+    // allowed
+    EXPECT_NO_THROW(
+    {
+        zero.ToInt32();
+        min.ToInt32();
+        normal.ToInt32();
+        max.ToInt32();
+    });
+
+    // not allowed
+    EXPECT_THROW(floatTag.ToInt32(),  std::exception);
+    EXPECT_THROW(stringTag.ToInt32(), std::exception);
+    EXPECT_THROW(arrayTag.ToInt32(),  std::exception);
+}
+
+TEST(TagTest, ConvertToUInt32)
+{
+    Tag zero(int32_t(0));
+    Tag neg(int32_t(-1));
+    Tag normal(int32_t(42));
+    Tag max(uint32_t(UINT32_MAX));
+    Tag floatTag(float(3.14));
+    Tag stringTag(string("foo"));
+    Tag arrayTag(vector<uint32_t>({1, 2, 3}));
+
+    // no 64-bit ctors - will not compile
+    //
+    // Tag overflow(int64_t(UINT32_MAX+1));
+
+    // allowed
+    EXPECT_NO_THROW(
+    {
+        zero.ToUInt32();
+        normal.ToUInt32();
+        max.ToUInt32();
+    });
+
+    // not allowed
+    EXPECT_THROW(neg.ToUInt32(),       std::exception);
+    EXPECT_THROW(floatTag.ToUInt32(),  std::exception);
+    EXPECT_THROW(stringTag.ToUInt32(), std::exception);
+    EXPECT_THROW(arrayTag.ToUInt32(),  std::exception);
+}
+
+TEST(TagCollectionTest, DefaultConstruction)
+{
+    TagCollection tags;
+    EXPECT_TRUE(tags.empty());
+    EXPECT_FALSE(tags.Contains("XY"));
+}
+
+TEST(TagCollectionTest, AddSimpleTags)
+{
+    const int32_t intValue = -42;
+    const string strValue = "foo";
+    const string hexStrValue = "1abc75";
+
+    TagCollection tags;
+    tags["ST"] = strValue;
+    tags["XY"] = intValue;
+    tags["HX"] = hexStrValue;
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+
+    EXPECT_EQ(3, tags.size());
+    EXPECT_TRUE(tags.Contains("XY"));
+    EXPECT_TRUE(tags.Contains("ST"));
+    EXPECT_TRUE(tags.Contains("HX"));
+    EXPECT_FALSE(tags.Contains("ZZ"));
+
+    EXPECT_TRUE( tags["XY"].ToInt32() == intValue );
+    EXPECT_TRUE( tags["ST"].ToString() == strValue );
+    EXPECT_TRUE( tags["HX"].ToString() == hexStrValue );
+    EXPECT_TRUE( tags["HX"].HasModifier(TagModifier::HEX_STRING) );
+}
+
+TEST(SamTagCodecTest, DecodeTest)
+{
+    string tagString;
+    tagString.append("HX:H:1abc75");
+    tagString.append("\t");
+    tagString.append("ST:Z:foo");
+    tagString.append("\t");
+    tagString.append("VC:B:i,42,-100,37,2048");
+    tagString.append("\t");
+    tagString.append("XY:i:-42");
+
+    TagCollection expected;
+    expected["ST"] = string("foo");
+    expected["XY"] = int32_t(-42);
+    expected["HX"] = Tag("1abc75", TagModifier::HEX_STRING);
+    expected["VC"] = vector<int32_t>( { 42, -100, 37, 2048 } );
+
+    TagCollection tags = SamTagCodec::Decode(tagString);
+
+    EXPECT_TRUE(tags.Contains("ST"));
+    EXPECT_TRUE(tags.Contains("HX"));
+    EXPECT_TRUE(tags.Contains("XY"));
+    EXPECT_TRUE(tags.Contains("VC"));
+
+    EXPECT_EQ(string("foo"), tags["ST"].ToString());
+    EXPECT_TRUE(tags["HX"].HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(string("1abc75"), tags["HX"].ToString());
+    EXPECT_EQ((int8_t)-42, tags["XY"].ToInt8());
+    EXPECT_EQ(vector<int32_t>( { 42, -100, 37, 2048 } ), tags["VC"].ToInt32Array());
+}
+
+TEST(SamTagCodecTest, EncodeTest)
+{
+    TagCollection tags;
+    tags["ST"] = string("foo");
+    tags["XY"] = int32_t(-42);
+    tags["HX"] = Tag("1abc75", TagModifier::HEX_STRING);
+    tags["VC"] = vector<int32_t>( { 42, -100, 37, 2048 } );
+
+    // "HX:H:1abc75\tST:Z:foo\0\tVC:B:i,42,-100,37,2048\tXY:i:-42"
+    string expected;
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("ST:Z:foo");
+    expected.append("\t");
+    expected.append("VC:B:i,42,-100,37,2048");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const string sam = SamTagCodec::Encode(tags);
+    EXPECT_EQ(expected, sam);
+}
+
+TEST(BamTagCodecTest, DecodeTest)
+{
+    vector<uint8_t> data;
+    data.push_back(uint8_t('H'));
+    data.push_back(uint8_t('X'));
+    data.push_back(uint8_t('H'));
+    data.push_back(uint8_t('1'));
+    data.push_back(uint8_t('a'));
+    data.push_back(uint8_t('b'));
+    data.push_back(uint8_t('c'));
+    data.push_back(uint8_t('7'));
+    data.push_back(uint8_t('5'));
+    data.push_back(uint8_t(0));
+
+    data.push_back(uint8_t('X'));
+    data.push_back(uint8_t('Y'));
+    data.push_back(uint8_t('i'));
+    const int32_t x = -42;
+    char valueBytes[sizeof x];
+    std::copy(static_cast<const char*>(static_cast<const void*>(&x)),
+              static_cast<const char*>(static_cast<const void*>(&x)) + sizeof x,
+              valueBytes);
+    data.push_back(valueBytes[0]);
+    data.push_back(valueBytes[1]);
+    data.push_back(valueBytes[2]);
+    data.push_back(valueBytes[3]);
+
+    data.push_back('C');
+    data.push_back('A');
+    data.push_back('B');
+    data.push_back('C');
+    const uint32_t numChars = 3;
+    char numCharsValueBytes[sizeof numChars];
+    std::copy(static_cast<const char*>(static_cast<const void*>(&numChars)),
+              static_cast<const char*>(static_cast<const void*>(&numChars)) + sizeof numChars,
+              numCharsValueBytes);
+    data.push_back(numCharsValueBytes[0]);
+    data.push_back(numCharsValueBytes[1]);
+    data.push_back(numCharsValueBytes[2]);
+    data.push_back(numCharsValueBytes[3]);
+
+    const vector<uint8_t> charArray = vector<uint8_t>({34, 5, 125});
+    data.push_back(charArray.at(0));
+    data.push_back(charArray.at(1));
+    data.push_back(charArray.at(2));
+
+    TagCollection tags = BamTagCodec::Decode(data);
+
+    EXPECT_TRUE(tags["HX"].HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(string("1abc75"), tags["HX"].ToString());
+    EXPECT_EQ(x, tags["XY"].ToInt32());
+    EXPECT_EQ(charArray, tags["CA"].ToUInt8Array());
+
+    // sanity check - convert tags back to SAM
+    string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const string sam = SamTagCodec::Encode(tags);
+    EXPECT_EQ(expected, sam);
+}
+
+TEST(BamTagCodecTest, EncodeTest)
+{
+    vector<uint8_t> expected;
+
+    expected.push_back('C');
+    expected.push_back('A');
+    expected.push_back('B');
+    expected.push_back('C');
+    const uint32_t numChars = 3;
+    char numCharsValueBytes[sizeof numChars];
+    std::copy(static_cast<const char*>(static_cast<const void*>(&numChars)),
+              static_cast<const char*>(static_cast<const void*>(&numChars)) + sizeof numChars,
+              numCharsValueBytes);
+    expected.push_back(numCharsValueBytes[0]);
+    expected.push_back(numCharsValueBytes[1]);
+    expected.push_back(numCharsValueBytes[2]);
+    expected.push_back(numCharsValueBytes[3]);
+
+    const vector<uint8_t> charArray = vector<uint8_t>({34, 5, 125});
+    expected.push_back(charArray.at(0));
+    expected.push_back(charArray.at(1));
+    expected.push_back(charArray.at(2));
+
+    expected.push_back(uint8_t('H'));
+    expected.push_back(uint8_t('X'));
+    expected.push_back(uint8_t('H'));
+    expected.push_back(uint8_t('1'));
+    expected.push_back(uint8_t('a'));
+    expected.push_back(uint8_t('b'));
+    expected.push_back(uint8_t('c'));
+    expected.push_back(uint8_t('7'));
+    expected.push_back(uint8_t('5'));
+    expected.push_back(uint8_t(0));
+
+    expected.push_back(uint8_t('X'));
+    expected.push_back(uint8_t('Y'));
+    expected.push_back(uint8_t('i'));
+    const int32_t x = -42;
+    char valueBytes[sizeof x];
+    std::copy(static_cast<const char*>(static_cast<const void*>(&x)),
+              static_cast<const char*>(static_cast<const void*>(&x)) + sizeof x,
+              valueBytes);
+    expected.push_back(valueBytes[0]);
+    expected.push_back(valueBytes[1]);
+    expected.push_back(valueBytes[2]);
+    expected.push_back(valueBytes[3]);
+
+    TagCollection tags;
+    tags["HX"] = Tag("1abc75", TagModifier::HEX_STRING);
+    tags["CA"] = charArray;
+    tags["XY"] = x;
+
+    const vector<uint8_t>& data = BamTagCodec::Encode(tags);
+    EXPECT_EQ(expected, data);
+}
+
+TEST(BamTagCodecTest, AsciiTagsTest)
+{
+    vector<uint8_t> expected;
+    expected.reserve(20);
+    expected.push_back('I'); // I8:A:B
+    expected.push_back('8');
+    expected.push_back('A');
+    expected.push_back('B');
+    expected.push_back('P'); // PC:A:$
+    expected.push_back('C');
+    expected.push_back('A');
+    expected.push_back('$');
+    expected.push_back('S'); // SC:A:$
+    expected.push_back('C');
+    expected.push_back('A');
+    expected.push_back('$');
+    expected.push_back('U'); // U8:A:A
+    expected.push_back('8');
+    expected.push_back('A');
+    expected.push_back('A');
+    expected.push_back('U'); // UC:A:$
+    expected.push_back('C');
+    expected.push_back('A');
+    expected.push_back('$');
+
+    const char          c  = '$';
+    const signed char   sc = '$';
+    const unsigned char uc = '$';
+    const uint8_t u8 = 65;
+    const int8_t  i8 = 66;
+
+    { // old style: construct-then-modify
+
+        Tag fromPlainChar = Tag(c);
+        Tag fromSignedChar = Tag(sc);
+        Tag fromUnsignedChar = Tag(uc);
+        Tag fromUint8 = Tag(u8);
+        Tag fromInt8  = Tag(i8);
+        fromPlainChar.Modifier(TagModifier::ASCII_CHAR);
+        fromSignedChar.Modifier(TagModifier::ASCII_CHAR);
+        fromUnsignedChar.Modifier(TagModifier::ASCII_CHAR);
+        fromUint8.Modifier(TagModifier::ASCII_CHAR);
+        fromInt8.Modifier(TagModifier::ASCII_CHAR);
+
+        TagCollection tags;
+        tags["PC"] = fromPlainChar;
+        tags["SC"] = fromSignedChar;
+        tags["UC"] = fromUnsignedChar;
+        tags["U8"] = fromUint8;
+        tags["I8"] = fromInt8;
+
+        const vector<uint8_t>& data = BamTagCodec::Encode(tags);
+        EXPECT_EQ(expected, data);
+    }
+
+    { // new style: construct directly as ASCII
+
+        const Tag fromPlainChar    = Tag(c,  TagModifier::ASCII_CHAR);
+        const Tag fromSignedChar   = Tag(sc, TagModifier::ASCII_CHAR);
+        const Tag fromUnsignedChar = Tag(uc, TagModifier::ASCII_CHAR);
+        const Tag fromUint8 = Tag(u8, TagModifier::ASCII_CHAR);
+        const Tag fromInt8  = Tag(i8, TagModifier::ASCII_CHAR);
+
+        TagCollection tags;
+        tags["PC"] = fromPlainChar;
+        tags["SC"] = fromSignedChar;
+        tags["UC"] = fromUnsignedChar;
+        tags["U8"] = fromUint8;
+        tags["I8"] = fromInt8;
+
+        const vector<uint8_t>& data = BamTagCodec::Encode(tags);
+        EXPECT_EQ(expected, data);
+    }
+}
diff --git a/tests/src/test_TimeUtils.cpp b/tests/src/test_TimeUtils.cpp

new file mode 100644 (file)

index 0000000..90f1489
--- /dev/null
+++ b/tests/src/test_TimeUtils.cpp
@@ -0,0 +1,67 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/../../src/TimeUtils.h>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+TEST(TimeUtilsTest, ToIso8601)
+{
+    const time_t rawTime = 436428750L;
+    const auto timestamp = std::chrono::system_clock::from_time_t(rawTime);
+
+    const auto expected = string{ "1983-10-31T06:12:30Z" }; // no ms in test case
+    const auto actual = internal::ToIso8601(timestamp);
+    EXPECT_EQ(expected, actual);
+}
+
+TEST(TimeUtilsTest, ToDataSetFormat)
+{
+    const time_t rawTime = 436428750L;
+    const auto timestamp = std::chrono::system_clock::from_time_t(rawTime);
+
+    const auto expected = string{ "831031_061230" }; // no ms in test case
+    const std::string& actual = internal::ToDataSetFormat(timestamp);
+    EXPECT_EQ(expected, actual);
+}
diff --git a/tests/src/test_UnmappedReadsQuery.cpp b/tests/src/test_UnmappedReadsQuery.cpp

new file mode 100644 (file)

index 0000000..cf5f46a
--- /dev/null
+++ b/tests/src/test_UnmappedReadsQuery.cpp
@@ -0,0 +1,117 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+//#ifdef PBBAM_TESTING
+//#define private public
+//#endif
+
+//#include "TestData.h"
+//#include <gtest/gtest.h>
+//#include <pbbam/EntireFileQuery.h>
+
+//#include <pbbam/UnmappedReadsQuery.h>
+//#include <string>
+//using namespace PacBio;
+//using namespace PacBio::BAM;
+//using namespace std;
+
+//const string inputBamFn1 = tests::Data_Dir + "/unmap1.bam";
+//const string inputBamFn2 = tests::Data_Dir + "/unmap2.bam";
+
+//TEST(UnmappedReadsQueryTest, UnmappedOnlyFile)
+//{
+//    // open input BAM file
+//    BamFile bamFile(inputBamFn1);
+//    EXPECT_TRUE(bamFile);
+
+//    // check all records, and save unmapped count
+//    int count = 0;
+//    int unmappedExpected = 0;
+//    EntireFileQuery entireFile(bamFile);
+//    EXPECT_TRUE(entireFile);
+//    for ( const BamRecord& record : entireFile ) {
+//        ++count;
+//        if (!record.IsMapped())
+//            ++unmappedExpected;
+//    }
+//    EXPECT_EQ(10, count);
+//    EXPECT_EQ(10, unmappedExpected);
+
+//    // query unmapped records only
+//    int unmappedObserved = 0;
+//    UnmappedReadsQuery unmappedReads(bamFile);
+//    EXPECT_TRUE(unmappedReads);
+//    for ( const BamRecord& record : unmappedReads ) {
+//        EXPECT_FALSE(record.IsMapped());
+//        ++unmappedObserved;
+//    }
+//    EXPECT_EQ(unmappedExpected, unmappedObserved);
+//}
+
+//TEST(UnmappedReadsQueryTest, MixedFile)
+//{
+//    // open input BAM file
+//    BamFile bamFile(inputBamFn2);
+//    EXPECT_TRUE(bamFile);
+
+//    // check all records, and save unmapped count
+//    int count = 0;
+//    int unmappedExpected = 0;
+//    EntireFileQuery entireFile(bamFile);
+//    EXPECT_TRUE(entireFile);
+//    for ( const BamRecord& record : entireFile ) {
+//        ++count;
+//        if (!record.IsMapped())
+//            ++unmappedExpected;
+//    }
+//    EXPECT_EQ(19, count);
+//    EXPECT_EQ(9, unmappedExpected);
+
+//    // query unmapped records only
+//    int unmappedObserved = 0;
+//    UnmappedReadsQuery unmappedReads(bamFile);
+//    EXPECT_TRUE(unmappedReads);
+//    for ( const BamRecord& record : unmappedReads ) {
+//        EXPECT_FALSE(record.IsMapped());
+//        ++unmappedObserved;
+//    }
+//    EXPECT_EQ(unmappedExpected, unmappedObserved);
+//}
+
+// TODO: handle no index case
+
+// TODO: additional special cases as needed
diff --git a/tests/src/test_Validator.cpp b/tests/src/test_Validator.cpp

new file mode 100644 (file)

index 0000000..b3d0638
--- /dev/null
+++ b/tests/src/test_Validator.cpp
@@ -0,0 +1,615 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include <gtest/gtest.h>
+#include <pbbam/BamFile.h>
+#include <pbbam/BamHeader.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/Cigar.h>
+#include <pbbam/ReadGroupInfo.h>
+#include <pbbam/Validator.h>
+
+#include "../src/StringUtils.h"
+#include "../src/ValidationErrors.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace tests {
+
+static BamRecord makeValidMappedRecord(void)
+{
+    BamRecordImpl impl;
+    impl.Bin(4680);
+    impl.Flag(2);
+    impl.InsertSize(0);
+    impl.MapQuality(10);
+    impl.MatePosition(-1);
+    impl.MateReferenceId(-1);
+    impl.Name("movie1/54130/0_10");
+    impl.Position(1);
+    impl.ReferenceId(0);
+    impl.SetMapped(true);
+    impl.SetSequenceAndQualities("AATGAGGAGA");
+    impl.CigarData(Cigar{ "10=" });
+
+    TagCollection tags;
+    tags["RG"] = string{ "3f58e5b8" };
+    tags["dq"] = string{ "2222'$22'2" };
+    tags["dt"] = string{ "NNNNAGNNGN" };
+    tags["iq"] = string{ "(+#1'$#*1&" };
+    tags["mq"] = string{ "&1~51*5&~2" };
+    tags["sq"] = string{ "<32<4<<<<3" };
+    tags["ip"] = vector<uint8_t>{ 2,0,10,22,34,0,2,3,0,16 };
+    tags["np"] = static_cast<int32_t>(1);
+    tags["qe"] = static_cast<int32_t>(10);
+    tags["qs"] = static_cast<int32_t>(0);
+    tags["zm"] = static_cast<int32_t>(54130);
+    tags["cx"] = static_cast<int32_t>(2);
+    tags["AS"] = static_cast<int32_t>(-3020);
+    tags["NM"] = static_cast<int32_t>(134);
+    tags["rq"] = static_cast<float>(0.854);
+    tags["sn"] = vector<float>{ 2.0,2.0,2.0,2.0 };
+    impl.Tags(tags);
+
+    return BamRecord(impl);
+}
+
+static BamRecord makeValidUnmappedRecord(void)
+{
+    BamRecordImpl impl;
+    impl.Bin(4680);
+    impl.Flag(4);
+    impl.InsertSize(0);
+    impl.MapQuality(10);
+    impl.MatePosition(-1);
+    impl.MateReferenceId(-1);
+    impl.Name("m140906_231018_42161_c100676332550000001823129611271486_s1_p0/8/0_10");
+    impl.Position(-1);
+    impl.ReferenceId(-1);
+    impl.SetSequenceAndQualities("AATGAGGAGA");
+
+    TagCollection tags;
+    tags["RG"] = string{ "b5482b33" };
+    tags["dq"] = string{ "2222222222" };
+    tags["dt"] = string{ "NNNNNNNNNN" };
+    tags["iq"] = string{ ",*11111001" };
+    tags["mq"] = string{ "&47088')34" };
+    tags["sq"] = string{ "8<4<:<6<0<" };
+    tags["ip"] = vector<uint8_t>{ 255,9,20,43,38,12,9,30,39,22 };
+    tags["np"] = static_cast<int32_t>(1);
+    tags["qe"] = static_cast<int32_t>(10);
+    tags["qs"] = static_cast<int32_t>(0);
+    tags["zm"] = static_cast<int32_t>(8);
+    tags["cx"] = static_cast<int32_t>(2);
+    tags["AS"] = static_cast<int32_t>(-3020);
+    tags["NM"] = static_cast<int32_t>(134);
+    tags["rq"] = static_cast<float>(0.811);
+    tags["sn"] = vector<float>{ 2.0,2.0,2.0,2.0 };
+    impl.Tags(tags);
+
+    return BamRecord(impl);
+}
+
+static ReadGroupInfo makeValidReadGroup(void)
+{
+    ReadGroupInfo rg("f5b4ffb6");
+    rg.MovieName("movie32");
+    rg.ReadType("CCS");
+    rg.BindingKit("100372700");
+    rg.SequencingKit("100612400");
+    rg.BasecallerVersion("2.3");
+    rg.FrameRateHz("100");
+    rg.Control("TRUE");
+    return rg;
+}
+
+// valid, 'starter' objects
+static const ReadGroupInfo validReadGroup       = makeValidReadGroup();
+static const BamRecord     validMappedRecord    = makeValidMappedRecord();
+static const BamRecord     validUnmappedRecord  = makeValidUnmappedRecord();
+
+} // namespace tests
+} // namespace BAM
+} // namespace PacBio
+
+TEST(ValidatorErrorsTest, SetMaxNumErrors)
+{
+    {   // default - use "no max"
+        internal::ValidationErrors errors;
+        EXPECT_EQ(internal::ValidationErrors::MAX, errors.maxNumErrors_);
+    }
+    {   // max of zero doesn't make sense... make equivalent to "no max"
+        internal::ValidationErrors errors(0);
+        EXPECT_EQ(internal::ValidationErrors::MAX, errors.maxNumErrors_);
+    }
+    {   // max = 1
+        internal::ValidationErrors errors(1);
+        EXPECT_EQ(1, errors.maxNumErrors_);
+    }
+    {   // max = 10
+        internal::ValidationErrors errors(10);
+        EXPECT_EQ(10, errors.maxNumErrors_);
+    }
+}
+
+TEST(ValidatorErrorsTest, ThrowOnMaxReached)
+{
+    {
+        internal::ValidationErrors errors(1);
+        EXPECT_THROW(errors.AddFileError("foo", "you"), ValidationException);
+    }
+    {
+        internal::ValidationErrors errors(2);
+        errors.AddFileError("foo", "you");
+        EXPECT_THROW(errors.AddFileError("foo", "me"), ValidationException);
+    }
+}
+
+TEST(ValidatorErrorsTest, ExceptionFromResults)
+{
+    const string error1 = "error1";
+    const string error2 = "error2";
+
+    try {
+
+        internal::ValidationErrors errors(4);
+        errors.AddFileError("path/to/foo.bam", error1);
+        errors.AddFileError("path/to/foo.bam", error2);
+        errors.AddReadGroupError("deadbeef", "invalid sequencing chemistry combination detected");
+        errors.AddRecordError("m140906_231018_42161_c100676332550000001823129611271486_s1_p0/8/0_10",
+                              "MergeQV does not match expected length");
+
+    } catch (ValidationException& e) {
+
+        EXPECT_EQ(1, e.FileErrors().size());                       // only 1 file
+        EXPECT_EQ(2, e.FileErrors().at("path/to/foo.bam").size()); // 2 errors for this file
+        EXPECT_EQ(1, e.ReadGroupErrors().size());
+        EXPECT_EQ(1, e.RecordErrors().size());
+    }
+}
+
+TEST(ValidatorTest, ValidReadGroup)
+{
+    ASSERT_NO_THROW(Validator::Validate(tests::validReadGroup));
+}
+
+TEST(ValidatorTest, ReadGroupRequiredComponents)
+{
+    {   // missing ID
+        ReadGroupInfo rg = tests::validReadGroup;
+        rg.Id("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {   // missing movie name
+        ReadGroupInfo rg = tests::validReadGroup;
+        rg.MovieName("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {   // missing read type
+        ReadGroupInfo rg = tests::validReadGroup;
+        rg.ReadType("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {   // missing binding kit
+        ReadGroupInfo rg = tests::validReadGroup;
+        rg.BindingKit("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {   // missing sequencing kit
+        ReadGroupInfo rg = tests::validReadGroup;
+        rg.SequencingKit("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {   // missing basecaller version
+        ReadGroupInfo rg = tests::validReadGroup;
+        rg.BasecallerVersion("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {   // missing frame rate
+        ReadGroupInfo rg = tests::validReadGroup;
+        rg.FrameRateHz("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+}
+
+TEST(ValidatorTest, ReadGroupValues)
+{
+    {   // mismatch expected ID vs stored ID - change ID
+        ReadGroupInfo rg = tests::validReadGroup;
+        rg.Id("deadbeef");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {   // mismatch expected ID vs stored ID - change read type
+        ReadGroupInfo rg = tests::validReadGroup;
+        rg.ReadType("SUBREAD");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {   // mismatch expected ID vs stored ID - change movie name
+        ReadGroupInfo rg = tests::validReadGroup;
+        rg.MovieName("foo");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {   // unknown read type
+        ReadGroupInfo rg = tests::validReadGroup;
+        rg.ReadType("FOO");
+
+        // recompute ID so we're only checking the new read type, not read ID
+        rg.Id( MakeReadGroupId(rg.MovieName(), rg.ReadType()) );
+
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {   // invalid chemistry triple - change binding kit
+        ReadGroupInfo rg = tests::validReadGroup;
+        rg.BindingKit("foo");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {   // invalid chemistry triple - change sequencing kit
+        ReadGroupInfo rg = tests::validReadGroup;
+        rg.SequencingKit("foo");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {   // invalid chemistry triple - change basecaller version
+        ReadGroupInfo rg = tests::validReadGroup;
+        rg.BasecallerVersion("0.42");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    { // non-numeric frame rate
+        ReadGroupInfo rg = tests::validReadGroup;
+        rg.FrameRateHz("foo");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+}
+
+TEST(ValidatorTest, ValidHeader)
+{
+    const BamHeader validMappedHeader {
+        "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1\n"
+        "@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:734d5f3b2859595f4bd87a2fe6b7389b\n"
+        "@RG\tID:3f58e5b8\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;"
+              "MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BASECALLERVERSION=2.1;"
+              "FRAMERATEHZ=75.000000;BINDINGKIT=100356300;SEQUENCINGKIT=100356200"
+              "\tPU:movie1\n"
+    };
+
+    const BamHeader validUnmappedHeader {
+        "@HD\tVN:1.5\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+            "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+            "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\t"
+            "PU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0\n"
+    };
+
+    ASSERT_NO_THROW(Validator::Validate(validMappedHeader));
+    ASSERT_NO_THROW(Validator::Validate(validUnmappedHeader));
+}
+
+TEST(ValidatorTest, ValidateHeader)
+{
+    const BamHeader validMappedHeader {
+        "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1\n"
+        "@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:734d5f3b2859595f4bd87a2fe6b7389b\n"
+        "@RG\tID:3f58e5b8\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;"
+              "MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BASECALLERVERSION=2.1;"
+              "FRAMERATEHZ=75.000000;BINDINGKIT=100356300;SEQUENCINGKIT=100356200"
+              "\tPU:movie1\n"
+    };
+
+    {   // invalid SAM version - non-numeric
+        BamHeader header = validMappedHeader.DeepCopy();
+        header.Version("foo");
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+    {   // invalid SAM version - negative version numbers
+        BamHeader header = validMappedHeader.DeepCopy();
+        header.Version("-1.4.0");
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+    {   // invalid sort order
+        BamHeader header = validMappedHeader.DeepCopy();
+        header.SortOrder("not_a_valid_sort_order");
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+
+    // invalid PacBioBamVersion numbers (non-numeric, negative, earlier than min)
+    // already throw when you try to set them... so we have to catch & ignore
+    // initial exception to get to validator
+
+    {   // invalid PacBioBAM version - non-numeric
+        BamHeader header = validMappedHeader.DeepCopy();
+        try {
+            header.PacBioBamVersion("foo");
+        } catch (...) { }
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+    {   // invalid PacBioBAM version - negative version numbers
+        BamHeader header = validMappedHeader.DeepCopy();
+        try {
+            header.PacBioBamVersion("-1.4.0");
+        } catch (...) { }
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+    {   // invalid PacBioBAM version - earlier than minimum allowed
+        BamHeader header = validMappedHeader.DeepCopy();
+        try {
+            header.PacBioBamVersion("3.0.0");
+        } catch (...) { }
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+}
+
+TEST(ValidatorTest, ValidRecord)
+{
+    const BamHeader validMappedHeader {
+        "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1\n"
+        "@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:734d5f3b2859595f4bd87a2fe6b7389b\n"
+        "@RG\tID:3f58e5b8\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;"
+              "MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BASECALLERVERSION=2.1;"
+              "FRAMERATEHZ=75.000000;BINDINGKIT=100356300;SEQUENCINGKIT=100356200"
+              "\tPU:movie1\n"
+    };
+    BamRecord record(tests::validMappedRecord);
+    record.header_ = validMappedHeader;
+    ASSERT_NO_THROW(Validator::Validate(record));
+}
+
+static inline
+void ModifyTag(BamRecord* record,
+               const std::string& tagName,
+               const Tag& tag)
+{
+    if (record->Impl().HasTag(tagName))
+        record->Impl().EditTag(tagName, tag);
+    else
+        record->Impl().AddTag(tagName, tag);
+}
+
+static inline
+void CheckInvalidTagLength(const std::string& tagName, const Tag& tag)
+{
+    static const BamHeader validUnmappedHeader {
+        "@HD\tVN:1.5\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+            "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+            "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\t"
+            "PU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0\n"
+    };
+    BamRecord record(tests::validUnmappedRecord);
+    record.header_ = validUnmappedHeader;
+
+    ModifyTag(&record, tagName, tag);
+
+    EXPECT_THROW(Validator::Validate(record), ValidationException);
+    EXPECT_FALSE(Validator::IsValid(record));
+}
+
+TEST(ValidatorTest, TagDataLengths)
+{
+    const BamHeader validUnmappedHeader {
+        "@HD\tVN:1.5\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+            "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+            "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\t"
+            "PU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0\n"
+    };
+
+    // make these "variable-length" SEQ/tags too short for the read's stated
+    // queryStart/queryEnd
+
+    {   // SEQ
+        BamRecord record(tests::validUnmappedRecord);
+        record.header_ = validUnmappedHeader;
+        record.Impl().SetSequenceAndQualities("AA");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+
+    CheckInvalidTagLength("dq", QualityValues("@@").Fastq());  // DeletionQV
+    CheckInvalidTagLength("iq", QualityValues("@@").Fastq());  // InsertionQV
+    CheckInvalidTagLength("mq", QualityValues("@@").Fastq());  // MergeQV
+    CheckInvalidTagLength("sq", QualityValues("@@").Fastq());  // SubstitutionQV
+    CheckInvalidTagLength("dt", string("AA")); // DeletionTag
+    CheckInvalidTagLength("st", string("AA")); // SubstitutionTag
+
+    const auto& f = Frames{ vector<uint16_t>{42, 42, 42} };
+    const auto& frames = f.Data();
+    CheckInvalidTagLength("ip", frames); // IPD
+
+    // NOTE: disabling "internal" tag checks for now, only checking "standard"
+    //       PacBioBAM tags
+
+//    const auto& pulses = vector<uint16_t>{42, 42, 42};
+//    CheckInvalidTagLength("pv", QualityValues("@@").Fastq());  // AltLabelQV
+//    CheckInvalidTagLength("pq", QualityValues("@@").Fastq());  // LabelQV
+//    CheckInvalidTagLength("pg", QualityValues("@@").Fastq());  // PulseMergeQv
+//    CheckInvalidTagLength("pt", string("AA")); // AltLabelTag
+//    CheckInvalidTagLength("pc", string("AA")); // PulseCall
+//    CheckInvalidTagLength("pd", frames); // PrePulseFrames
+//    CheckInvalidTagLength("px", frames); // PulseCallWidth
+//    CheckInvalidTagLength("pw", frames); // PulseWidth
+//    CheckInvalidTagLength("pa", pulses); // Pkmean
+//    CheckInvalidTagLength("ps", pulses); // Pkmean2
+//    CheckInvalidTagLength("pm", pulses); // Pkmid
+//    CheckInvalidTagLength("pi", pulses); // Pkmid2
+}
+
+TEST(ValidatorTest, TagDataValues)
+{
+    const BamHeader validMappedHeader {
+        "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1\n"
+        "@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:734d5f3b2859595f4bd87a2fe6b7389b\n"
+        "@RG\tID:3f58e5b8\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;"
+              "MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BASECALLERVERSION=2.1;"
+              "FRAMERATEHZ=75.000000;BINDINGKIT=100356300;SEQUENCINGKIT=100356200"
+              "\tPU:movie1\n"
+    };
+
+    {   // missing qe
+        BamRecord record(tests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().RemoveTag("qe");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {   // missing qs
+        BamRecord record(tests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().RemoveTag("qs");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {   // queryStart should be < queryEnd
+        BamRecord record(tests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.QueryStart(10);
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {  // missing zm
+        BamRecord record(tests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().RemoveTag("zm");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {   // missing np
+        BamRecord record(tests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().RemoveTag("np");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {   // numPasses for SUBREAD type records should be 1
+        BamRecord record(tests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.NumPasses(42);
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {   // missing sn
+        BamRecord record(tests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().RemoveTag("sn");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+}
+
+TEST(ValidatorTest, MappedRecords)
+{
+    const BamHeader validMappedHeader {
+        "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1\n"
+        "@RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+            "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+            "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\t"
+            "PU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0\n"
+    };
+
+    {   // mapped record should have valid refID
+        BamRecord record(tests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().ReferenceId(-1);
+
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {   // mapped record should have valid position
+        BamRecord record(tests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().Position(-1);
+
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+
+}
+
+TEST(ValidatorTest, UnmappedRecords)
+{
+    const BamHeader validUnmappedHeader {
+        "@HD\tVN:1.5\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+            "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+            "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\t"
+            "PU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0\n"
+    };
+
+    {   // unmapped should have no refID
+        BamRecord record(tests::validUnmappedRecord);
+        record.header_ = validUnmappedHeader;
+        record.Impl().ReferenceId(0);
+
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {   // unmapped should have no position
+        BamRecord record(tests::validUnmappedRecord);
+        record.header_ = validUnmappedHeader;
+        record.Impl().Position(42);
+
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+}
diff --git a/tests/src/test_Version.cpp b/tests/src/test_Version.cpp

new file mode 100644 (file)

index 0000000..08bc7fe
--- /dev/null
+++ b/tests/src/test_Version.cpp
@@ -0,0 +1,335 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "../src/Version.h"
+
+#include <gtest/gtest.h>
+#include <sstream>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace tests {
+
+static inline Version MakeVersion(int x, int y, int z)
+{ return Version(x, y, z); }
+
+} // namespace tests
+} // namespace BAM
+} // namespace PacBio
+
+TEST(VersionTest, DefaultOk)
+{
+    Version v;
+    EXPECT_EQ(0, v.Major());
+    EXPECT_EQ(0, v.Minor());
+    EXPECT_EQ(0, v.Revision());
+}
+
+TEST(VersionTest, CopyAndMoveOk)
+{
+    {   // copy ctor
+        Version v1(3,1,1);
+        EXPECT_EQ(3, v1.Major());
+        EXPECT_EQ(1, v1.Minor());
+        EXPECT_EQ(1, v1.Revision());
+
+        Version v2(v1);
+        EXPECT_EQ(3, v2.Major());
+        EXPECT_EQ(1, v2.Minor());
+        EXPECT_EQ(1, v2.Revision());
+    }
+    {   // copy assign
+        Version v1(3,1,1);
+        EXPECT_EQ(3, v1.Major());
+        EXPECT_EQ(1, v1.Minor());
+        EXPECT_EQ(1, v1.Revision());
+
+        Version v2;
+        v2 = v1;
+        EXPECT_EQ(3, v2.Major());
+        EXPECT_EQ(1, v2.Minor());
+        EXPECT_EQ(1, v2.Revision());
+
+    }
+    {   // move ctor
+        Version v(tests::MakeVersion(3,1,1));
+        EXPECT_EQ(3, v.Major());
+        EXPECT_EQ(1, v.Minor());
+        EXPECT_EQ(1, v.Revision());
+
+    }
+    {   // move assign
+        Version v1(3,1,1);
+        EXPECT_EQ(3, v1.Major());
+        EXPECT_EQ(1, v1.Minor());
+        EXPECT_EQ(1, v1.Revision());
+
+        Version v2;
+        v2 = std::move(v1);
+        EXPECT_EQ(3, v2.Major());
+        EXPECT_EQ(1, v2.Minor());
+        EXPECT_EQ(1, v2.Revision());
+    }
+}
+
+TEST(VersionTest, FromIntsOk)
+{
+    {   // normal
+        Version v(3,1,1);
+        EXPECT_EQ(3, v.Major());
+        EXPECT_EQ(1, v.Minor());
+        EXPECT_EQ(1, v.Revision());
+    }
+
+    // negatives
+    EXPECT_THROW(Version(-3, 1, 1), std::runtime_error);
+}
+
+TEST(VersionTest, FromStringOk)
+{
+    {   // normal
+        Version v("3.1.1");
+        EXPECT_EQ(3, v.Major());
+        EXPECT_EQ(1, v.Minor());
+        EXPECT_EQ(1, v.Revision());
+    }
+
+    // negatives
+    EXPECT_THROW(Version("-3.1.1"), std::runtime_error);
+
+    // non-numeric
+    EXPECT_THROW(Version("foo.bar.baz"), std::runtime_error);
+
+    // empty
+    EXPECT_THROW(Version(""), std::runtime_error);
+}
+
+TEST(VersionTest, SettersOk)
+{
+    Version v(3,1,1);
+
+    v.Major(4);
+
+    EXPECT_EQ(4, v.Major());
+    EXPECT_EQ(1, v.Minor());
+    EXPECT_EQ(1, v.Revision());
+
+    v.Minor(7);
+
+    EXPECT_EQ(4, v.Major());
+    EXPECT_EQ(7, v.Minor());
+    EXPECT_EQ(1, v.Revision());
+
+    v.Revision(23);
+
+    EXPECT_EQ(4, v.Major());
+    EXPECT_EQ(7, v.Minor());
+    EXPECT_EQ(23, v.Revision());
+
+    {   // invalid
+        Version v1(3,1,1);
+        Version v2(3,1,1);
+        Version v3(3,1,1);
+        EXPECT_THROW(v1.Major(-1),    std::runtime_error);
+        EXPECT_THROW(v2.Minor(-1),    std::runtime_error);
+        EXPECT_THROW(v3.Revision(-1), std::runtime_error);
+    }
+}
+
+TEST(VersionTest, ComparisonsOk)
+{
+    const Version v0_0_0 = Version(0,0,0);
+    const Version v0_0_4 = Version(0,0,4);
+    const Version v0_1_0 = Version(0,1,0);
+    const Version v0_1_4 = Version(0,1,4);
+    const Version v3_0_0 = Version(3,0,0);
+    const Version v3_0_4 = Version(3,0,4);
+    const Version v3_1_0 = Version(3,1,0);
+    const Version v3_1_4 = Version(3,1,4);
+    const Version v3_1_5 = Version(3,1,5);
+
+    // operator==
+    EXPECT_TRUE(v0_0_0 == v0_0_0);
+    EXPECT_TRUE(v3_0_0 == v3_0_0);
+    EXPECT_TRUE(v0_1_0 == v0_1_0);
+    EXPECT_TRUE(v0_0_4 == v0_0_4);
+    EXPECT_TRUE(v3_1_0 == v3_1_0);
+    EXPECT_TRUE(v3_1_4 == v3_1_4);
+
+    EXPECT_FALSE(v3_1_4 == v0_0_0);
+    EXPECT_FALSE(v3_1_4 == v3_0_0);
+    EXPECT_FALSE(v3_1_4 == v0_1_0);
+    EXPECT_FALSE(v3_1_4 == v0_0_4);
+    EXPECT_FALSE(v3_1_4 == v3_1_0);
+    EXPECT_FALSE(v3_1_4 == v3_1_5);
+
+    // operator!=
+    EXPECT_FALSE(v0_0_0 != v0_0_0);
+    EXPECT_FALSE(v3_0_0 != v3_0_0);
+    EXPECT_FALSE(v0_1_0 != v0_1_0);
+    EXPECT_FALSE(v0_0_4 != v0_0_4);
+    EXPECT_FALSE(v3_1_0 != v3_1_0);
+    EXPECT_FALSE(v3_1_4 != v3_1_4);
+
+    EXPECT_TRUE(v3_1_4 != v0_0_0);
+    EXPECT_TRUE(v3_1_4 != v3_0_0);
+    EXPECT_TRUE(v3_1_4 != v0_1_0);
+    EXPECT_TRUE(v3_1_4 != v0_0_4);
+    EXPECT_TRUE(v3_1_4 != v3_1_0);
+    EXPECT_TRUE(v3_1_4 != v3_1_5);
+
+    // operator<
+    EXPECT_FALSE(v0_0_0 < v0_0_0);
+    EXPECT_TRUE(v0_0_0 < v0_0_4);
+    EXPECT_TRUE(v0_0_0 < v0_1_0);
+    EXPECT_TRUE(v0_0_0 < v3_0_0);
+    EXPECT_TRUE(v0_0_0 < v0_1_4);
+    EXPECT_TRUE(v0_0_0 < v3_0_4);
+    EXPECT_TRUE(v0_0_0 < v3_1_0);
+    EXPECT_TRUE(v0_0_0 < v3_1_4);
+
+    EXPECT_TRUE(v0_0_4 < v3_1_4);
+    EXPECT_TRUE(v0_1_0 < v3_1_4);
+    EXPECT_TRUE(v0_1_4 < v3_1_4);
+    EXPECT_TRUE(v3_0_0 < v3_1_4);
+    EXPECT_TRUE(v3_0_4 < v3_1_4);
+    EXPECT_TRUE(v3_1_0 < v3_1_4);
+    EXPECT_FALSE(v3_1_4 < v3_1_4);
+    EXPECT_FALSE(v3_1_5 < v3_1_4);
+
+    EXPECT_FALSE(v3_1_4 < v0_0_0);
+
+    // operator<=
+    EXPECT_TRUE(v0_0_0 <= v0_0_0);
+    EXPECT_TRUE(v0_0_0 <= v0_0_4);
+    EXPECT_TRUE(v0_0_0 <= v0_1_0);
+    EXPECT_TRUE(v0_0_0 <= v3_0_0);
+    EXPECT_TRUE(v0_0_0 <= v0_1_4);
+    EXPECT_TRUE(v0_0_0 <= v3_0_4);
+    EXPECT_TRUE(v0_0_0 <= v3_1_0);
+    EXPECT_TRUE(v0_0_0 <= v3_1_4);
+
+    EXPECT_TRUE(v0_0_4 <= v3_1_4);
+    EXPECT_TRUE(v0_1_0 <= v3_1_4);
+    EXPECT_TRUE(v0_1_4 <= v3_1_4);
+    EXPECT_TRUE(v3_0_0 <= v3_1_4);
+    EXPECT_TRUE(v3_0_4 <= v3_1_4);
+    EXPECT_TRUE(v3_1_0 <= v3_1_4);
+    EXPECT_TRUE(v3_1_4 <= v3_1_4);
+    EXPECT_FALSE(v3_1_5 <= v3_1_4);
+
+    EXPECT_FALSE(v3_1_4 <= v0_0_0);
+
+    // operator>
+    EXPECT_FALSE(v0_0_0 > v0_0_0);
+    EXPECT_FALSE(v0_0_0 > v0_0_4);
+    EXPECT_FALSE(v0_0_0 > v0_1_0);
+    EXPECT_FALSE(v0_0_0 > v3_0_0);
+    EXPECT_FALSE(v0_0_0 > v0_1_4);
+    EXPECT_FALSE(v0_0_0 > v3_0_4);
+    EXPECT_FALSE(v0_0_0 > v3_1_0);
+    EXPECT_FALSE(v0_0_0 > v3_1_4);
+
+    EXPECT_FALSE(v0_0_4 > v3_1_4);
+    EXPECT_FALSE(v0_1_0 > v3_1_4);
+    EXPECT_FALSE(v0_1_4 > v3_1_4);
+    EXPECT_FALSE(v3_0_0 > v3_1_4);
+    EXPECT_FALSE(v3_0_4 > v3_1_4);
+    EXPECT_FALSE(v3_1_0 > v3_1_4);
+    EXPECT_FALSE(v3_1_4 > v3_1_4);
+    EXPECT_TRUE(v3_1_5 > v3_1_4);
+
+    EXPECT_TRUE(v3_1_4 > v0_0_0);
+
+    // operator>=
+    EXPECT_TRUE(v0_0_0 >= v0_0_0);
+    EXPECT_FALSE(v0_0_0 >= v0_0_4);
+    EXPECT_FALSE(v0_0_0 >= v0_1_0);
+    EXPECT_FALSE(v0_0_0 >= v3_0_0);
+    EXPECT_FALSE(v0_0_0 >= v0_1_4);
+    EXPECT_FALSE(v0_0_0 >= v3_0_4);
+    EXPECT_FALSE(v0_0_0 >= v3_1_0);
+    EXPECT_FALSE(v0_0_0 >= v3_1_4);
+
+    EXPECT_FALSE(v0_0_4 >= v3_1_4);
+    EXPECT_FALSE(v0_1_0 >= v3_1_4);
+    EXPECT_FALSE(v0_1_4 >= v3_1_4);
+    EXPECT_FALSE(v3_0_0 >= v3_1_4);
+    EXPECT_FALSE(v3_0_4 >= v3_1_4);
+    EXPECT_FALSE(v3_1_0 >= v3_1_4);
+    EXPECT_TRUE(v3_1_4 >= v3_1_4);
+    EXPECT_TRUE(v3_1_5 >= v3_1_4);
+
+    EXPECT_TRUE(v3_1_4 >= v0_0_0);
+}
+
+TEST(VersionTest, ToStringOk)
+{
+    {
+        Version v(0,0,0);
+        EXPECT_EQ(string("0.0.0"), v.ToString());
+    }
+    {
+        Version v(3,1,4);
+        EXPECT_EQ(string("3.1.4"), v.ToString());
+    }
+    {
+        Version v;
+        v.Major(4);
+        EXPECT_EQ(string("4.0.0"), v.ToString());
+    }
+    {
+        const string s = "1.2.3";
+        Version v(s);
+        EXPECT_EQ(s, v.ToString());
+    }
+}
+
+TEST(VersionTest, OutputStreamOk)
+{
+    Version v(3,1,4);
+    Version v2(4,10,0);
+
+    stringstream s;
+    s << v << ", " << v2 << ", " << v << endl;
+
+    EXPECT_EQ(string("3.1.4, 4.10.0, 3.1.4\n"), s.str());
+}
diff --git a/tests/src/test_WhitelistedZmwReadStitcher.cpp b/tests/src/test_WhitelistedZmwReadStitcher.cpp

new file mode 100644 (file)

index 0000000..9c4ba7f
--- /dev/null
+++ b/tests/src/test_WhitelistedZmwReadStitcher.cpp
@@ -0,0 +1,260 @@
+// Copyright (c) 2014-2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <pbbam/BamFile.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/PbiRawData.h>
+#include <pbbam/virtual/WhitelistedZmwReadStitcher.h>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace tests {
+
+static
+void Compare(const BamRecord& b1, const BamRecord& b2)
+{
+    EXPECT_TRUE(b1.HasDeletionQV());
+    EXPECT_TRUE(b1.HasDeletionTag());
+    EXPECT_TRUE(b1.HasInsertionQV());
+    EXPECT_TRUE(b1.HasMergeQV());
+    EXPECT_TRUE(b1.HasSubstitutionQV());
+    EXPECT_TRUE(b1.HasSubstitutionTag());
+    EXPECT_TRUE(b1.HasLabelQV());
+    EXPECT_TRUE(b1.HasAltLabelQV());
+    EXPECT_TRUE(b1.HasAltLabelTag());
+    EXPECT_TRUE(b1.HasPkmean());
+    EXPECT_TRUE(b1.HasPkmid());
+    EXPECT_TRUE(b1.HasPulseCall());
+    EXPECT_TRUE(b1.HasIPD());
+    EXPECT_TRUE(b1.HasPulseWidth());
+    EXPECT_TRUE(b1.HasPrePulseFrames());
+    EXPECT_TRUE(b1.HasPulseCallWidth());
+    EXPECT_TRUE(b1.HasPulseMergeQV());
+
+    EXPECT_TRUE(b2.HasDeletionQV());
+    EXPECT_TRUE(b2.HasDeletionTag());
+    EXPECT_TRUE(b2.HasInsertionQV());
+    EXPECT_TRUE(b2.HasMergeQV());
+    EXPECT_TRUE(b2.HasSubstitutionQV());
+    EXPECT_TRUE(b2.HasSubstitutionTag());
+    EXPECT_TRUE(b2.HasLabelQV());
+    EXPECT_TRUE(b2.HasAltLabelQV());
+    EXPECT_TRUE(b2.HasAltLabelTag());
+    EXPECT_TRUE(b2.HasPkmean());
+    EXPECT_TRUE(b2.HasPkmid());
+    EXPECT_TRUE(b2.HasPulseCall());
+    EXPECT_TRUE(b2.HasIPD());
+    EXPECT_TRUE(b2.HasPulseWidth());
+    EXPECT_TRUE(b2.HasPrePulseFrames());
+    EXPECT_TRUE(b2.HasPulseCallWidth());
+    EXPECT_TRUE(b2.HasPulseMergeQV());
+
+    EXPECT_EQ(b1.FullName(),        b2.FullName());
+    EXPECT_EQ(b1.HoleNumber(),      b2.HoleNumber());
+    EXPECT_EQ(b1.NumPasses(),       b2.NumPasses());
+    EXPECT_EQ(b1.Sequence(),        b2.Sequence());
+    EXPECT_EQ(b1.Qualities(),       b2.Qualities());
+    EXPECT_EQ(b1.DeletionQV(),      b2.DeletionQV());
+    EXPECT_EQ(b1.DeletionTag(),     b2.DeletionTag());
+    EXPECT_EQ(b1.InsertionQV(),     b2.InsertionQV());
+    EXPECT_EQ(b1.MergeQV(),         b2.MergeQV());
+    EXPECT_EQ(b1.SubstitutionQV(),  b2.SubstitutionQV());
+    EXPECT_EQ(b1.SubstitutionTag(), b2.SubstitutionTag());
+    EXPECT_EQ(b1.LabelQV(),         b2.LabelQV());
+    EXPECT_EQ(b1.AltLabelQV(),      b2.AltLabelQV());
+    EXPECT_EQ(b1.AltLabelTag(),     b2.AltLabelTag());
+    EXPECT_EQ(b1.Pkmean(),          b2.Pkmean());
+    EXPECT_EQ(b1.Pkmid(),           b2.Pkmid());
+    EXPECT_EQ(b1.PulseCall(),       b2.PulseCall());
+    EXPECT_EQ(b1.IPD(),             b2.IPD());
+    EXPECT_EQ(b1.PulseWidth(),      b2.PulseWidth());
+    EXPECT_EQ(b1.PrePulseFrames(),  b2.PrePulseFrames());
+    EXPECT_EQ(b1.PulseCallWidth(),  b2.PulseCallWidth());
+    EXPECT_EQ(b1.ReadGroup(),       b2.ReadGroup());
+    EXPECT_EQ(b1.PulseMergeQV(),    b2.PulseMergeQV());
+}
+
+} // namespace tests
+} // namespace BAM
+} // namespace PacBio
+
+TEST(WhitelistedZmwReadStitching, EmptyList)
+{
+    const std::vector<int32_t> whitelist = { };
+    WhitelistedZmwReadStitcher stitcher(whitelist,
+                                        tests::Data_Dir + "/polymerase/internal.subreads.bam",
+                                        tests::Data_Dir + "/polymerase/internal.scraps.bam");
+    EXPECT_FALSE(stitcher.HasNext());
+    EXPECT_TRUE(stitcher.NextRaw().empty());
+}
+
+TEST(WhitelistedZmwReadStitching, SingleValue)
+{
+    const std::vector<int32_t> whitelist = { 200000 };
+    WhitelistedZmwReadStitcher stitcher(whitelist,
+                                        tests::Data_Dir + "/polymerase/internal.subreads.bam",
+                                        tests::Data_Dir + "/polymerase/internal.scraps.bam");
+
+    // create virtual record
+    EXPECT_TRUE(stitcher.HasNext());
+    auto virtualRecord = stitcher.Next();
+    EXPECT_FALSE(stitcher.HasNext());
+
+    // fetch original polymerase read (2nd record)
+    BamFile polyBam(tests::Data_Dir + "/polymerase/internal.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    EXPECT_TRUE(begin != end);
+    ++begin;
+    EXPECT_TRUE(begin != end);
+    auto polyRecord = *begin++;
+
+    EXPECT_EQ(200000, virtualRecord.HoleNumber());
+
+    tests::Compare(polyRecord, virtualRecord);
+}
+
+TEST(WhitelistedZmwReadStitching, UnknownZmw)
+{
+    const std::vector<int32_t> whitelist { 42 }; // ZMW not in our files
+    WhitelistedZmwReadStitcher stitcher(whitelist,
+                                        tests::Data_Dir + "/polymerase/internal.subreads.bam",
+                                        tests::Data_Dir + "/polymerase/internal.scraps.bam");
+
+    EXPECT_FALSE(stitcher.HasNext());
+    EXPECT_TRUE(stitcher.NextRaw().empty());
+}
+
+TEST(WhitelistedZmwReadStitching, MultiValue)
+{
+    const std::vector<int32_t> whitelist = { 100000, 300000 };
+    WhitelistedZmwReadStitcher stitcher(whitelist,
+                                        tests::Data_Dir + "/polymerase/internal.subreads.bam",
+                                        tests::Data_Dir + "/polymerase/internal.scraps.bam");
+
+
+    // create virtual records
+    EXPECT_TRUE(stitcher.HasNext());
+    auto virtualRecord1 = stitcher.Next();
+    EXPECT_TRUE(stitcher.HasNext());
+    auto virtualRecord2 = stitcher.Next();
+    EXPECT_FALSE(stitcher.HasNext());
+
+    // fetch original polymerase reads (2nd record)
+    BamFile polyBam(tests::Data_Dir + "/polymerase/internal.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+
+    EXPECT_TRUE(begin != end);
+    auto polyRecord1 = *begin++;
+    EXPECT_TRUE(begin != end);
+    ++begin;
+    EXPECT_TRUE(begin != end);
+    auto polyRecord2 = *begin++;
+    EXPECT_TRUE(begin == end);
+
+    EXPECT_EQ(100000, virtualRecord1.HoleNumber());
+    EXPECT_EQ(300000, virtualRecord2.HoleNumber());
+
+    tests::Compare(polyRecord1, virtualRecord1);
+    tests::Compare(polyRecord2, virtualRecord2);
+}
+
+TEST(WhitelistedZmwReadStitching, MultiValue_MixedKnownAndUnknown)
+{
+    const std::vector<int32_t> whitelist { 42, 200000, 24 };
+    WhitelistedZmwReadStitcher stitcher(whitelist,
+                                        tests::Data_Dir + "/polymerase/internal.subreads.bam",
+                                        tests::Data_Dir + "/polymerase/internal.scraps.bam");
+
+    // everything below should behave exactly as 'SingleValueOk' test,
+    // as the unknown ZMWs will have been removed during construction
+
+    // create virtual record
+    EXPECT_TRUE(stitcher.HasNext());
+    auto virtualRecord = stitcher.Next();
+    EXPECT_FALSE(stitcher.HasNext());
+
+    // fetch original polymerase read (2nd record)
+    BamFile polyBam(tests::Data_Dir + "/polymerase/internal.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    EXPECT_TRUE(begin != end);
+    ++begin;
+    EXPECT_TRUE(begin != end);
+    auto polyRecord = *begin++;
+
+    EXPECT_EQ(200000, virtualRecord.HoleNumber());
+
+    tests::Compare(polyRecord, virtualRecord);
+}
+
+TEST(WhitelistedZmwReadStitching, EmptyScrapsFileOk)
+{
+    const std::vector<int32_t> whitelist = { 10944689, 10944690 };
+    const std::string primaryBamFn = tests::Data_Dir + "/polymerase/scrapless.subreads.bam" ;
+    const std::string scrapsBamFn  = tests::Data_Dir + "/polymerase/scrapless.scraps.bam" ;
+
+    int count = 0;
+    WhitelistedZmwReadStitcher stitcher(whitelist, primaryBamFn, scrapsBamFn);
+    while (stitcher.HasNext()) {
+        auto record = stitcher.Next();
+        (void)record;
+        ++count;
+    }
+    EXPECT_EQ(2, count);
+
+    const BamFile primaryBam(primaryBamFn);
+    const BamFile scrapsBam(scrapsBamFn);
+    const PbiRawData primaryIdx(primaryBam.PacBioIndexFilename());
+    const PbiRawData scrapsIdx(scrapsBam.PacBioIndexFilename());
+    EXPECT_EQ(3, primaryIdx.NumReads());
+    EXPECT_EQ(0, scrapsIdx.NumReads());
+}
diff --git a/tests/src/test_ZmwQuery.cpp b/tests/src/test_ZmwQuery.cpp

new file mode 100644 (file)

index 0000000..287a4eb
--- /dev/null
+++ b/tests/src/test_ZmwQuery.cpp
@@ -0,0 +1,67 @@
+// Copyright (c) 2014-2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <pbbam/ZmwQuery.h>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+//TEST(EntireFileQueryTest, CountRecords)
+//{
+//    EXPECT_NO_THROW(
+//    {
+//        // open input BAM file
+//        BamFile bamFile(inputBamFn);
+
+//        // count records
+//        int count = 0;
+//        EntireFileQuery entireFile(bamFile);
+//        for (const BamRecord& record : entireFile) {
+//            (void)record;
+//            ++count;
+//        }
+
+//        EXPECT_EQ(3307, count);
+//    });
+//}
diff --git a/tests/src/test_ZmwReadStitcher.cpp b/tests/src/test_ZmwReadStitcher.cpp

new file mode 100644 (file)

index 0000000..3554c74
--- /dev/null
+++ b/tests/src/test_ZmwReadStitcher.cpp
@@ -0,0 +1,512 @@
+// Copyright (c) 2014-2016, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifdef PBBAM_TESTING
+#define private public
+#endif
+
+#include "TestData.h"
+#include <gtest/gtest.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/PbiFilter.h>
+#include <pbbam/virtual/VirtualPolymeraseReader.h>
+#include <pbbam/virtual/VirtualPolymeraseCompositeReader.h>
+#include <pbbam/virtual/ZmwReadStitcher.h>
+#include <string>
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace PacBio {
+namespace BAM {
+namespace tests {
+
+static
+void Compare(const BamRecord& b1, const BamRecord& b2)
+{
+    EXPECT_TRUE(b1.HasDeletionQV());
+    EXPECT_TRUE(b1.HasDeletionTag());
+    EXPECT_TRUE(b1.HasInsertionQV());
+    EXPECT_TRUE(b1.HasMergeQV());
+    EXPECT_TRUE(b1.HasSubstitutionQV());
+    EXPECT_TRUE(b1.HasSubstitutionTag());
+    EXPECT_TRUE(b1.HasLabelQV());
+    EXPECT_TRUE(b1.HasAltLabelQV());
+    EXPECT_TRUE(b1.HasAltLabelTag());
+    EXPECT_TRUE(b1.HasPkmean());
+    EXPECT_TRUE(b1.HasPkmid());
+    EXPECT_TRUE(b1.HasPulseCall());
+    EXPECT_TRUE(b1.HasIPD());
+    EXPECT_TRUE(b1.HasPulseWidth());
+    EXPECT_TRUE(b1.HasPrePulseFrames());
+    EXPECT_TRUE(b1.HasPulseCallWidth());
+    EXPECT_TRUE(b1.HasPulseMergeQV());
+
+    EXPECT_TRUE(b2.HasDeletionQV());
+    EXPECT_TRUE(b2.HasDeletionTag());
+    EXPECT_TRUE(b2.HasInsertionQV());
+    EXPECT_TRUE(b2.HasMergeQV());
+    EXPECT_TRUE(b2.HasSubstitutionQV());
+    EXPECT_TRUE(b2.HasSubstitutionTag());
+    EXPECT_TRUE(b2.HasLabelQV());
+    EXPECT_TRUE(b2.HasAltLabelQV());
+    EXPECT_TRUE(b2.HasAltLabelTag());
+    EXPECT_TRUE(b2.HasPkmean());
+    EXPECT_TRUE(b2.HasPkmid());
+    EXPECT_TRUE(b2.HasPulseCall());
+    EXPECT_TRUE(b2.HasIPD());
+    EXPECT_TRUE(b2.HasPulseWidth());
+    EXPECT_TRUE(b2.HasPrePulseFrames());
+    EXPECT_TRUE(b2.HasPulseCallWidth());
+    EXPECT_TRUE(b2.HasPulseMergeQV());
+
+    EXPECT_EQ(b1.FullName(),        b2.FullName());
+    EXPECT_EQ(b1.HoleNumber(),      b2.HoleNumber());
+    EXPECT_EQ(b1.NumPasses(),       b2.NumPasses());
+    EXPECT_EQ(b1.Sequence(),        b2.Sequence());
+    EXPECT_EQ(b1.Qualities(),       b2.Qualities());
+    EXPECT_EQ(b1.DeletionQV(),      b2.DeletionQV());
+    EXPECT_EQ(b1.DeletionTag(),     b2.DeletionTag());
+    EXPECT_EQ(b1.InsertionQV(),     b2.InsertionQV());
+    EXPECT_EQ(b1.MergeQV(),         b2.MergeQV());
+    EXPECT_EQ(b1.SubstitutionQV(),  b2.SubstitutionQV());
+    EXPECT_EQ(b1.SubstitutionTag(), b2.SubstitutionTag());
+    EXPECT_EQ(b1.LabelQV(),         b2.LabelQV());
+    EXPECT_EQ(b1.AltLabelQV(),      b2.AltLabelQV());
+    EXPECT_EQ(b1.AltLabelTag(),     b2.AltLabelTag());
+    EXPECT_EQ(b1.Pkmean(),          b2.Pkmean());
+    EXPECT_EQ(b1.Pkmid(),           b2.Pkmid());
+    EXPECT_EQ(b1.PulseCall(),       b2.PulseCall());
+    EXPECT_EQ(b1.IPD(),             b2.IPD());
+    EXPECT_EQ(b1.PulseWidth(),      b2.PulseWidth());
+    EXPECT_EQ(b1.PrePulseFrames(),  b2.PrePulseFrames());
+    EXPECT_EQ(b1.PulseCallWidth(),  b2.PulseCallWidth());
+    EXPECT_EQ(b1.ReadGroup(),       b2.ReadGroup());
+    EXPECT_EQ(b1.PulseMergeQV(),    b2.PulseMergeQV());
+}
+
+static
+size_t NumVirtualRecords(const string& primaryBamFn,
+                         const string& scrapsBamFn)
+{
+    ZmwReadStitcher stitcher(primaryBamFn, scrapsBamFn);
+    size_t count = 0;
+    while (stitcher.HasNext()) {
+        const auto record = stitcher.Next();
+        (void)record;
+        ++count;
+    }
+    return count;
+}
+
+} // namespace tests
+} // namespace BAM
+} // namespace PacBio
+
+TEST(ZmwReadStitching, FromBams_NoFilter)
+{
+    ZmwReadStitcher stitcher(tests::Data_Dir + "/polymerase/internal.subreads.bam",
+                             tests::Data_Dir + "/polymerase/internal.scraps.bam");
+    size_t count = 0;
+    while (stitcher.HasNext()) {
+        const auto record = stitcher.Next();
+        (void)record;
+        ++count;
+    }
+    EXPECT_EQ(3, count);
+}
+
+TEST(ZmwReadStitching, FromBams_Filtered)
+{
+    PbiFilter filter { PbiZmwFilter{100000} }; // setup to match DataSet w/ filter
+    ZmwReadStitcher stitcher(tests::Data_Dir + "/polymerase/internal.subreads.bam",
+                             tests::Data_Dir + "/polymerase/internal.scraps.bam",
+                             filter);
+    size_t count = 0;
+    while (stitcher.HasNext()) {
+        const auto record = stitcher.Next();
+        EXPECT_EQ(100000, record.HoleNumber());
+        ++count;
+    }
+    EXPECT_EQ(1, count);
+}
+
+TEST(ZmwReadStitching, FromDataSet_NoFilter)
+{
+    // dataset contains these resources (subreads/scraps + hqregion/scraps BAMs)
+    const string primaryFn1 = tests::Data_Dir + "/polymerase/production.subreads.bam";
+    const string scrapsFn1  = tests::Data_Dir + "/polymerase/production.scraps.bam";
+    const string primaryFn2 = tests::Data_Dir + "/polymerase/production_hq.hqregion.bam";
+    const string scrapsFn2  = tests::Data_Dir + "/polymerase/production_hq.scraps.bam";
+    const size_t numExpectedRecords =
+            tests::NumVirtualRecords(primaryFn1, scrapsFn1) +
+            tests::NumVirtualRecords(primaryFn2, scrapsFn2);
+
+    const string datasetFn = tests::Data_Dir +
+            "/polymerase/multiple_resources.subread.dataset.xml";
+
+    DataSet ds{ datasetFn };
+    ZmwReadStitcher stitcher{ ds };
+    size_t numObservedRecords = 0;
+    while (stitcher.HasNext()) {
+        const auto record = stitcher.Next();
+        (void)record;
+        ++numObservedRecords;
+    }
+    EXPECT_EQ(numExpectedRecords, numObservedRecords);
+}
+
+TEST(ZmwReadStitching, FromDataSet_Filtered)
+{
+    // dataset contains these resources (subreads/scraps + hqregion/scraps BAMs)
+    const string primaryFn1 = tests::Data_Dir + "/polymerase/production.subreads.bam";
+    const string scrapsFn1  = tests::Data_Dir + "/polymerase/production.scraps.bam";
+    const string primaryFn2 = tests::Data_Dir + "/polymerase/internal.subreads.bam";
+    const string scrapsFn2  = tests::Data_Dir + "/polymerase/internal.scraps.bam";
+    const string primaryFn3 = tests::Data_Dir + "/polymerase/production_hq.hqregion.bam";
+    const string scrapsFn3  = tests::Data_Dir + "/polymerase/production_hq.scraps.bam";
+    const size_t totalRecords =
+            tests::NumVirtualRecords(primaryFn1, scrapsFn1) +
+            tests::NumVirtualRecords(primaryFn2, scrapsFn2) +
+            tests::NumVirtualRecords(primaryFn3, scrapsFn3);
+    EXPECT_EQ(5, totalRecords);
+
+    // our filter will remove the 2 "production" BAM pairs
+    // using a ZMW filter that only the "internal" pair should pass
+    const string datasetFn = tests::Data_Dir +
+            "/polymerase/filtered_resources.subread.dataset.xml";
+
+    DataSet ds{ datasetFn };
+    ZmwReadStitcher stitcher{ ds };
+    size_t numObservedRecords = 0;
+    while (stitcher.HasNext()) {
+        const auto record = stitcher.Next();
+        (void)record;
+        ++numObservedRecords;
+    }
+    EXPECT_EQ(1, numObservedRecords);
+}
+
+TEST(ZmwReadStitching, FromDataSet_EmptyDataSet)
+{
+    ZmwReadStitcher stitcher{ DataSet{} };
+    EXPECT_FALSE(stitcher.HasNext());
+}
+
+TEST(ZmwReadStitching, EmptyScrapsFile)
+{
+    const std::string primaryBamFn = tests::Data_Dir + "/polymerase/scrapless.subreads.bam" ;
+    const std::string scrapsBamFn  = tests::Data_Dir + "/polymerase/scrapless.scraps.bam" ;
+
+    const BamFile primaryBam(primaryBamFn);
+    const BamFile scrapsBam(scrapsBamFn);
+    const PbiRawData primaryIdx(primaryBam.PacBioIndexFilename());
+    const PbiRawData scrapsIdx(scrapsBam.PacBioIndexFilename());
+    EXPECT_EQ(3, primaryIdx.NumReads());
+    EXPECT_EQ(0, scrapsIdx.NumReads());
+
+    int count = 0;
+    ZmwReadStitcher stitcher(primaryBamFn, scrapsBamFn);
+    while (stitcher.HasNext()) {
+        auto record = stitcher.Next();
+        (void)record;
+        ++count;
+    }
+    EXPECT_EQ(3, count);
+}
+
+TEST(ZmwReadStitching, VirtualRegions)
+{
+    // Create virtual polymerase read
+    ZmwReadStitcher stitcher(tests::Data_Dir + "/polymerase/internal.subreads.bam",
+                             tests::Data_Dir + "/polymerase/internal.scraps.bam");
+    auto virtualRecord = stitcher.Next();
+
+    auto regionMap = virtualRecord.VirtualRegionsMap();
+    auto adapter = virtualRecord.VirtualRegionsTable(VirtualRegionType::ADAPTER);
+
+    // Compare different accessors to same source
+    EXPECT_EQ(regionMap[VirtualRegionType::ADAPTER], adapter);
+
+    // Compare to truth
+    EXPECT_EQ(3047,adapter[0].beginPos);
+    EXPECT_EQ(3095,adapter[0].endPos);
+    EXPECT_EQ(3650,adapter[1].beginPos);
+    EXPECT_EQ(3700,adapter[1].endPos);
+    EXPECT_EQ(4289,adapter[2].beginPos);
+    EXPECT_EQ(4335,adapter[2].endPos);
+    EXPECT_EQ(4888,adapter[3].beginPos);
+    EXPECT_EQ(4939,adapter[3].endPos);
+    EXPECT_EQ(5498,adapter[4].beginPos);
+    EXPECT_EQ(5546,adapter[4].endPos);
+    EXPECT_EQ(6116,adapter[5].beginPos);
+    EXPECT_EQ(6173,adapter[5].endPos);
+    EXPECT_EQ(6740,adapter[6].beginPos);
+    EXPECT_EQ(6790,adapter[6].endPos);
+
+    auto barcode = virtualRecord.VirtualRegionsTable(VirtualRegionType::BARCODE);
+    EXPECT_EQ(regionMap[VirtualRegionType::BARCODE], barcode);
+    EXPECT_EQ(3025,barcode[0].beginPos);
+    EXPECT_EQ(3047,barcode[0].endPos);
+    EXPECT_EQ(3095,barcode[1].beginPos);
+    EXPECT_EQ(3116,barcode[1].endPos);
+    EXPECT_EQ(3628,barcode[2].beginPos);
+    EXPECT_EQ(3650,barcode[2].endPos);
+    EXPECT_EQ(3700,barcode[3].beginPos);
+    EXPECT_EQ(3722,barcode[3].endPos);
+    EXPECT_EQ(4267,barcode[4].beginPos);
+    EXPECT_EQ(4289,barcode[4].endPos);
+    EXPECT_EQ(4335,barcode[5].beginPos);
+    EXPECT_EQ(4356,barcode[5].endPos);
+    EXPECT_EQ(4864,barcode[6].beginPos);
+    EXPECT_EQ(4888,barcode[6].endPos);
+    EXPECT_EQ(4939,barcode[7].beginPos);
+    EXPECT_EQ(4960,barcode[7].endPos);
+    EXPECT_EQ(5477,barcode[8].beginPos);
+    EXPECT_EQ(5498,barcode[8].endPos);
+    EXPECT_EQ(5546,barcode[9].beginPos);
+    EXPECT_EQ(5571,barcode[9].endPos);
+    EXPECT_EQ(6087,barcode[10].beginPos);
+    EXPECT_EQ(6116,barcode[10].endPos);
+    EXPECT_EQ(6173,barcode[11].beginPos);
+    EXPECT_EQ(6199,barcode[11].endPos);
+    EXPECT_EQ(6719,barcode[12].beginPos);
+    EXPECT_EQ(6740,barcode[12].endPos);
+    EXPECT_EQ(6790,barcode[13].beginPos);
+    EXPECT_EQ(6812,barcode[13].endPos);
+
+    auto lqregion = virtualRecord.VirtualRegionsTable(VirtualRegionType::LQREGION);
+    EXPECT_EQ(regionMap[VirtualRegionType::LQREGION], lqregion);
+    EXPECT_EQ(0,lqregion[0].beginPos);
+    EXPECT_EQ(2659,lqregion[0].endPos);
+    EXPECT_EQ(7034,lqregion[1].beginPos);
+    EXPECT_EQ(7035,lqregion[1].endPos);
+
+
+    auto hqregion = virtualRecord.VirtualRegionsTable(VirtualRegionType::HQREGION);
+    EXPECT_EQ(regionMap[VirtualRegionType::HQREGION], hqregion);
+    EXPECT_EQ(2659,hqregion[0].beginPos);
+    EXPECT_EQ(7034,hqregion[0].endPos);
+}
+
+TEST(ZmwReadStitching, InternalSubreadsToOriginal)
+{
+    // Create virtual polymerase read
+    ZmwReadStitcher stitcher(tests::Data_Dir + "/polymerase/internal.subreads.bam",
+                             tests::Data_Dir + "/polymerase/internal.scraps.bam");
+    EXPECT_TRUE(stitcher.HasNext());
+    auto virtualRecord = stitcher.Next();
+
+    // Read original polymerase read
+    BamFile polyBam(tests::Data_Dir + "/polymerase/internal.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    EXPECT_TRUE(begin != end);
+    auto polyRecord = *begin;
+
+    // check
+    tests::Compare(polyRecord, virtualRecord);
+}
+
+TEST(ZmwReadStitching, InternalHQToOriginal)
+{
+    // Create virtual polymerase read
+    ZmwReadStitcher stitcher(tests::Data_Dir + "/polymerase/internal.hqregions.bam",
+                             tests::Data_Dir + "/polymerase/internal.lqregions.bam");
+    EXPECT_TRUE(stitcher.HasNext());
+    auto virtualRecord = stitcher.Next();
+
+    // Read original polymerase read
+    BamFile polyBam(tests::Data_Dir + "/polymerase/internal.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    EXPECT_TRUE(begin != end);
+    auto polyRecord = *begin;
+
+    // check
+    tests::Compare(polyRecord, virtualRecord);
+}
+
+TEST(ZmwReadStitching, ProductionSubreadsToOriginal)
+{
+    // Create virtual polymerase read
+    ZmwReadStitcher stitcher(tests::Data_Dir + "/polymerase/production.subreads.bam",
+                             tests::Data_Dir + "/polymerase/production.scraps.bam");
+
+    EXPECT_TRUE(stitcher.HasNext());
+    auto virtualRecord = stitcher.Next();
+    EXPECT_FALSE(stitcher.HasNext());
+
+    // Read original polymerase read
+    BamFile polyBam(tests::Data_Dir + "/polymerase/production.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    EXPECT_TRUE(begin != end);
+    auto polyRecord = *begin;
+
+    EXPECT_EQ(polyRecord.FullName(),        virtualRecord.FullName());
+    EXPECT_EQ(polyRecord.HoleNumber(),      virtualRecord.HoleNumber());
+    EXPECT_FLOAT_EQ(polyRecord.ReadAccuracy(),    virtualRecord.ReadAccuracy());
+    EXPECT_EQ(polyRecord.NumPasses(),       virtualRecord.NumPasses());
+    EXPECT_EQ(polyRecord.Sequence(),        virtualRecord.Sequence());
+    EXPECT_EQ(polyRecord.Qualities(),       virtualRecord.Qualities());
+    EXPECT_EQ(polyRecord.DeletionQV(),      virtualRecord.DeletionQV());
+    EXPECT_EQ(polyRecord.DeletionTag(),     virtualRecord.DeletionTag());
+    EXPECT_EQ(polyRecord.InsertionQV(),     virtualRecord.InsertionQV());
+    EXPECT_EQ(polyRecord.MergeQV(),         virtualRecord.MergeQV());
+    EXPECT_EQ(polyRecord.SubstitutionQV(),  virtualRecord.SubstitutionQV());
+    EXPECT_EQ(polyRecord.SubstitutionTag(), virtualRecord.SubstitutionTag());
+    EXPECT_EQ(polyRecord.IPD(),             virtualRecord.IPDV1Frames());
+    EXPECT_EQ(polyRecord.ReadGroup(),       virtualRecord.ReadGroup());
+}
+
+TEST(ZmwReadStitching, ProductionHQToOriginal)
+{
+    // Create virtual polymerase read
+    ZmwReadStitcher stitcher(tests::Data_Dir + "/polymerase/production_hq.hqregion.bam",
+                             tests::Data_Dir + "/polymerase/production_hq.scraps.bam");
+    EXPECT_TRUE(stitcher.HasNext());
+    auto virtualRecord = stitcher.Next();
+    EXPECT_FALSE(stitcher.HasNext());
+
+    // Read original polymerase read
+    BamFile polyBam(tests::Data_Dir + "/polymerase/production.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    EXPECT_TRUE(begin != end);
+    auto polyRecord = *begin;
+
+    EXPECT_FALSE(polyRecord.HasPulseCall());
+    EXPECT_FALSE(virtualRecord.HasPulseCall());
+    EXPECT_EQ(polyRecord.FullName(),        virtualRecord.FullName());
+    EXPECT_EQ(polyRecord.HoleNumber(),      virtualRecord.HoleNumber());
+    EXPECT_EQ(polyRecord.ReadAccuracy(),    virtualRecord.ReadAccuracy());
+    EXPECT_EQ(polyRecord.NumPasses(),       virtualRecord.NumPasses());
+    EXPECT_EQ(polyRecord.Sequence(),        virtualRecord.Sequence());
+    EXPECT_EQ(polyRecord.Qualities(),       virtualRecord.Qualities());
+    EXPECT_EQ(polyRecord.DeletionQV(),      virtualRecord.DeletionQV());
+    EXPECT_EQ(polyRecord.DeletionTag(),     virtualRecord.DeletionTag());
+    EXPECT_EQ(polyRecord.InsertionQV(),     virtualRecord.InsertionQV());
+    EXPECT_EQ(polyRecord.MergeQV(),         virtualRecord.MergeQV());
+    EXPECT_EQ(polyRecord.SubstitutionQV(),  virtualRecord.SubstitutionQV());
+    EXPECT_EQ(polyRecord.SubstitutionTag(), virtualRecord.SubstitutionTag());
+    EXPECT_EQ(polyRecord.IPD(),             virtualRecord.IPDV1Frames());
+    EXPECT_EQ(polyRecord.ReadGroup(),       virtualRecord.ReadGroup());
+
+    EXPECT_TRUE(polyRecord.HasDeletionQV());
+    EXPECT_TRUE(polyRecord.HasDeletionTag());
+    EXPECT_TRUE(polyRecord.HasInsertionQV());
+    EXPECT_TRUE(polyRecord.HasMergeQV());
+    EXPECT_TRUE(polyRecord.HasSubstitutionQV());
+    EXPECT_TRUE(polyRecord.HasSubstitutionTag());
+    EXPECT_TRUE(polyRecord.HasIPD());
+    EXPECT_FALSE(polyRecord.HasLabelQV());
+    EXPECT_FALSE(polyRecord.HasAltLabelQV());
+    EXPECT_FALSE(polyRecord.HasAltLabelTag());
+    EXPECT_FALSE(polyRecord.HasPkmean());
+    EXPECT_FALSE(polyRecord.HasPkmid());
+    EXPECT_FALSE(polyRecord.HasPulseCall());
+    EXPECT_FALSE(polyRecord.HasPulseWidth());
+    EXPECT_FALSE(polyRecord.HasPrePulseFrames());
+    EXPECT_FALSE(polyRecord.HasPulseCallWidth());
+
+    EXPECT_TRUE(virtualRecord.HasDeletionQV());
+    EXPECT_TRUE(virtualRecord.HasDeletionTag());
+    EXPECT_TRUE(virtualRecord.HasInsertionQV());
+    EXPECT_TRUE(virtualRecord.HasMergeQV());
+    EXPECT_TRUE(virtualRecord.HasSubstitutionQV());
+    EXPECT_TRUE(virtualRecord.HasSubstitutionTag());
+    EXPECT_TRUE(virtualRecord.HasIPD());
+    EXPECT_FALSE(virtualRecord.HasLabelQV());
+    EXPECT_FALSE(virtualRecord.HasAltLabelQV());
+    EXPECT_FALSE(virtualRecord.HasAltLabelTag());
+    EXPECT_FALSE(virtualRecord.HasPkmean());
+    EXPECT_FALSE(virtualRecord.HasPkmid());
+    EXPECT_FALSE(virtualRecord.HasPulseCall());
+    EXPECT_FALSE(virtualRecord.HasPulseWidth());
+    EXPECT_FALSE(virtualRecord.HasPrePulseFrames());
+    EXPECT_FALSE(virtualRecord.HasPulseCallWidth());
+}
+
+TEST(ZmwReadStitching, VirtualRecord_VirtualRegionsTable)
+{
+    ZmwReadStitcher stitcher(tests::Data_Dir + "/polymerase/production.subreads.bam",
+                             tests::Data_Dir + "/polymerase/production.scraps.bam");
+    EXPECT_TRUE(stitcher.HasNext());
+    const auto virtualRecord = stitcher.Next();
+
+    const auto subreads  = virtualRecord.VirtualRegionsTable(VirtualRegionType::SUBREAD);
+    const auto adapters  = virtualRecord.VirtualRegionsTable(VirtualRegionType::ADAPTER);
+    const auto hqRegions = virtualRecord.VirtualRegionsTable(VirtualRegionType::HQREGION);
+    const auto lqRegions = virtualRecord.VirtualRegionsTable(VirtualRegionType::LQREGION);
+    const auto barcodes  = virtualRecord.VirtualRegionsTable(VirtualRegionType::BARCODE);
+    const auto filtered  = virtualRecord.VirtualRegionsTable(VirtualRegionType::FILTERED);
+
+    EXPECT_FALSE(subreads.empty());
+    EXPECT_FALSE(adapters.empty());
+    EXPECT_FALSE(hqRegions.empty());
+    EXPECT_FALSE(lqRegions.empty());
+    EXPECT_FALSE(barcodes.empty());
+    EXPECT_TRUE(filtered.empty());    // this type not present in this data
+}
+
+TEST(ZmwReadStitching, LegacyTypedefsOk)
+{
+    {
+        VirtualPolymeraseReader reader(tests::Data_Dir + "/polymerase/internal.subreads.bam",
+                                       tests::Data_Dir + "/polymerase/internal.scraps.bam");
+        size_t count = 0;
+        while (reader.HasNext()) {
+            const auto record = reader.Next();
+            (void)record;
+            ++count;
+        }
+        EXPECT_EQ(3, count);
+    }
+
+    {
+        VirtualPolymeraseCompositeReader reader{ DataSet{} };
+        EXPECT_FALSE(reader.HasNext());
+    }
+}
+
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt

new file mode 100644 (file)

index 0000000..5c589c1
--- /dev/null
+++ b/tools/CMakeLists.txt
@@ -0,0 +1,47 @@
+
+
+if(DEFINED PacBioBAM_build_pbindex)
+
+    # Deprecating the "PacBioBAM_build_pbindex" command line option in favor of more
+    # general "PacBioBAM_build_tools", as we're starting to add new utilities.
+    #
+    # That said, I don't want to break current auto tests/builds, so I'm providing a
+    # warning message so devs are aware.
+    #
+    # construct warning message
+    set(pbindex_warning "\nDeprecated:\n-DPacBioBAM_build_pbindex\n")
+    if (PacBioBAM_build_pbindex)
+        set(pbindex_warning "${pbindex_warning} Building as requested,")
+    else()
+        set(pbindex_warning "${pbindex_warning} Skipping as requested,")
+    endif()
+    set(pbindex_warning "${pbindex_warning} but support for this option will be removed at some point in the future.\n")
+    message(AUTHOR_WARNING "${pbindex_warning} ** Use -DPacBioBAM_build_tools instead. **\n")
+
+    # force PacBioBAM_build_tools option
+    set(PacBioBAM_build_tools
+        ${PacBioBAM_build_pbindex} CACHE BOOL
+        "Build PacBioBAM with add'l utilities (e.g. pbindex, pbindexdump)." FORCE)
+endif()
+
+if (PacBioBAM_build_tools)
+
+    # tools directory
+    set(ToolsCommonDir ${PacBioBAM_ToolsDir}/common)
+    set(PacBioBAM_CramTestsDir ${PacBioBAM_TestsDir}/src/cram)
+
+    # quash warning with OptionParser
+    include(CheckCXXCompilerFlag)
+    check_cxx_compiler_flag("-Wno-unused-private-field" HAS_NO_UNUSED_PRIVATE_FIELD)
+    if(HAS_NO_UNUSED_PRIVATE_FIELD)
+        set(PacBioBAM_CXX_FLAGS "${PacBioBAM_CXX_FLAGS} -Wno-unused-private-field")
+    endif()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PacBioBAM_CXX_FLAGS}")
+
+    # tools
+    add_subdirectory(bam2sam)
+    add_subdirectory(pbindex)
+    add_subdirectory(pbindexdump)
+    add_subdirectory(pbmerge)
+
+endif()
diff --git a/tools/bam2sam/CMakeLists.txt b/tools/bam2sam/CMakeLists.txt

new file mode 100644 (file)

index 0000000..5554970
--- /dev/null
+++ b/tools/bam2sam/CMakeLists.txt
@@ -0,0 +1,39 @@
+
+set(Bam2SamSrcDir ${PacBioBAM_ToolsDir}/bam2sam/src)
+
+# create version header
+set(Bam2Sam_VERSION ${PacBioBAM_VERSION})
+configure_file(
+    ${Bam2SamSrcDir}/Bam2SamVersion.h.in ${GeneratedDir}/Bam2SamVersion.h @ONLY
+)
+
+# list source files
+set(BAM2SAM_SOURCES
+    ${ToolsCommonDir}/OptionParser.cpp
+    ${Bam2SamSrcDir}/main.cpp
+    ${Bam2SamSrcDir}/Bam2Sam.cpp
+)
+
+# build bam2sam executable
+include(PbbamTool)
+create_pbbam_tool(
+    TARGET  bam2sam
+    SOURCES ${BAM2SAM_SOURCES}
+)
+
+# cram tests
+if (PacBioBAM_build_tests)
+
+    configure_file(
+        ${PacBioBAM_CramTestsDir}/bam2sam.t.in
+        ${GeneratedDir}/bam2sam.t
+    )
+
+    add_test(
+        NAME bam2sam_CramTests
+        WORKING_DIRECTORY ${PacBioBAM_TestsDir}/scripts
+        COMMAND "python" cram.py
+            ${GeneratedDir}/bam2sam.t
+    )
+
+endif()
diff --git a/tools/bam2sam/src/Bam2Sam.cpp b/tools/bam2sam/src/Bam2Sam.cpp

new file mode 100644 (file)

index 0000000..5fde774
--- /dev/null
+++ b/tools/bam2sam/src/Bam2Sam.cpp
@@ -0,0 +1,121 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT  SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "Bam2Sam.h"
+#include <htslib/sam.h>
+#include <stdexcept>
+#include <memory>
+#include <cassert>
+using namespace bam2sam;
+using namespace std;
+
+namespace bam2sam {
+
+struct HtslibFileDeleter
+{
+    void operator()(samFile* file)
+    {
+        if (file)
+            sam_close(file);
+        file = nullptr;
+    }
+};
+
+struct HtslibHeaderDeleter
+{
+    void operator()(bam_hdr_t* hdr)
+    {
+        if (hdr)
+            bam_hdr_destroy(hdr);
+        hdr = nullptr;
+    }
+};
+
+struct HtslibRecordDeleter
+{
+    void operator()(bam1_t* b)
+    {
+        if (b)
+            bam_destroy1(b);
+        b = nullptr;
+    }
+};
+
+} // namespace bam2sam
+
+void PbBam2Sam::Run(const Settings &settings)
+{
+    int htslibResult = 0;
+
+    // open files
+
+    unique_ptr<samFile, HtslibFileDeleter> inFileWrapper(sam_open(settings.inputFilename_.c_str(), "rb"));
+    samFile* in = inFileWrapper.get();
+    if (!in || !in->fp.bgzf)
+        throw std::runtime_error("could not read from stdin");
+
+    unique_ptr<samFile, HtslibFileDeleter> outFileWrapper(sam_open("-", "w"));
+    samFile* out = outFileWrapper.get();
+    if (!out)
+        throw std::runtime_error("could not write to stdout");
+
+    // fetch & write header
+
+    unique_ptr<bam_hdr_t, HtslibHeaderDeleter> headerWrapper(bam_hdr_read(in->fp.bgzf));
+    bam_hdr_t* hdr = headerWrapper.get();
+    if (!hdr)
+        throw std::runtime_error("could not read header");
+
+    if (!settings.noHeader_) {
+        htslibResult = sam_hdr_write(out, hdr);
+        if (htslibResult != 0)
+            throw std::runtime_error("could not write header");
+        if (settings.printHeaderOnly_)
+            return;
+    }
+
+    // fetch & write records
+
+    unique_ptr<bam1_t, HtslibRecordDeleter> recordWrapper(bam_init1());
+    bam1_t* b = recordWrapper.get();
+
+    while ((htslibResult = sam_read1(in, hdr, b)) >= 0) {
+        htslibResult = sam_write1(out, hdr, b);
+        if (htslibResult < 0)
+            throw std::runtime_error("error writing record to stdout");
+    }
+}
diff --git a/tools/bam2sam/src/Bam2Sam.h b/tools/bam2sam/src/Bam2Sam.h

new file mode 100644 (file)

index 0000000..4a7ffbb
--- /dev/null
+++ b/tools/bam2sam/src/Bam2Sam.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT  SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef BAM2SAM_H
+#define BAM2SAM_H
+
+#include "Settings.h"
+
+namespace bam2sam {
+
+class PbBam2Sam
+{
+public:
+    static void Run(const Settings& settings);
+};
+
+} // namespace bam2sam
+
+#endif // PBIBAM2SAM_H
diff --git a/tools/bam2sam/src/Bam2SamVersion.h.in b/tools/bam2sam/src/Bam2SamVersion.h.in

new file mode 100644 (file)

index 0000000..10319b7
--- /dev/null
+++ b/tools/bam2sam/src/Bam2SamVersion.h.in
@@ -0,0 +1,49 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef BAM2SAMVERSION_H
+#define BAM2SAMVERSION_H
+
+#include <string>
+
+namespace bam2sam {
+
+const std::string Version = std::string("@Bam2Sam_VERSION@");
+
+} // namespace bam2sam
+
+#endif // BAM2SAMVERSION_H
diff --git a/tools/bam2sam/src/Settings.h b/tools/bam2sam/src/Settings.h

new file mode 100644 (file)

index 0000000..d570dc9
--- /dev/null
+++ b/tools/bam2sam/src/Settings.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT  SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef SETTINGS_H
+#define SETTINGS_H
+
+#include <string>
+#include <vector>
+
+namespace bam2sam {
+
+class Settings
+{
+public:
+    Settings(void)
+        : noHeader_(false)
+        , printHeaderOnly_(false)
+    { }
+
+public:
+    std::string inputFilename_;
+    bool noHeader_;
+    bool printHeaderOnly_;
+    std::vector<std::string> errors_;
+};
+
+} // namespace bam2sam
+
+#endif // SETTINGS_H
diff --git a/tools/bam2sam/src/main.cpp b/tools/bam2sam/src/main.cpp

new file mode 100644 (file)

index 0000000..d27b42f
--- /dev/null
+++ b/tools/bam2sam/src/main.cpp
@@ -0,0 +1,127 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "../common/OptionParser.h"
+#include "Bam2Sam.h"
+#include "Bam2SamVersion.h"
+#include <string>
+#include <vector>
+#include <cassert>
+#include <cstdlib>
+
+static
+bam2sam::Settings fromCommandLine(optparse::OptionParser& parser,
+                                    int argc, char* argv[])
+{
+    bam2sam::Settings settings;
+
+    const optparse::Values options = parser.parse_args(argc, argv);
+
+    // input
+    const std::vector<std::string> positionalArgs = parser.args();
+    const size_t numPositionalArgs = positionalArgs.size();
+    if (numPositionalArgs == 0)
+        settings.inputFilename_ = "-"; // stdin
+    else if (numPositionalArgs == 1)
+        settings.inputFilename_ = parser.args().front();
+    else {
+        assert(numPositionalArgs > 1);
+        settings.errors_.push_back("bam2sam does not support more than one input file per run");
+    }
+
+    // header options
+    if (options.is_set("no_header"))
+        settings.noHeader_ = options.get("no_header");
+    if (options.is_set("header_only"))
+        settings.printHeaderOnly_ = options.get("header_only");
+
+    if (settings.noHeader_ && settings.printHeaderOnly_)
+        settings.errors_.push_back("conflicting arguments requested: --no-header and --header-only");
+
+    return settings;
+}
+
+int main(int argc, char* argv[])
+{
+    // setup help & options
+    optparse::OptionParser parser;
+    parser.description("bam2sam converts a BAM file to SAM. It is essentially a stripped-down "
+                       "'samtools view', mostly useful for testing/debugging without requiring samtools. "
+                       "Input BAM file is read from a file or stdin, and SAM output is written to stdout."
+                       );
+    parser.prog("bam2sam");
+    parser.usage("bam2sam [options] [input]");
+    parser.version(bam2sam::Version);
+    parser.add_version_option(true);
+    parser.add_help_option(true);
+
+    auto optionGroup = optparse::OptionGroup(parser, "Options");
+    optionGroup.add_option("")
+               .dest("input")
+               .metavar("input")
+               .help("Input BAM file. If not provided, stdin will be used as input.");
+    optionGroup.add_option("--no-header")
+                .dest("no_header")
+                .action("store_true")
+                .help("Omit header from output.");
+    optionGroup.add_option("--header-only")
+               .dest("header_only")
+               .action("store_true")
+               .help("Print only the header (no records).");
+    parser.add_option_group(optionGroup);
+
+    // parse command line for settings
+    const bam2sam::Settings settings = fromCommandLine(parser, argc, argv);
+    if (!settings.errors_.empty()) {
+        std::cerr << std::endl;
+        for (const auto e : settings.errors_)
+            std::cerr << "ERROR: " << e << std::endl;
+        std::cerr << std::endl;
+        parser.print_help();
+        return EXIT_FAILURE;
+    }
+
+    // run tool
+    try {
+        bam2sam::PbBam2Sam::Run(settings);
+        return EXIT_SUCCESS;
+    }
+    catch (std::exception& e) {
+        std::cerr << "ERROR: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/tools/common/BamFileMerger.h b/tools/common/BamFileMerger.h

new file mode 100644 (file)

index 0000000..d2a6bb2
--- /dev/null
+++ b/tools/common/BamFileMerger.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT  SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef BAMFILEMERGER_H
+#define BAMFILEMERGER_H
+
+#include <pbbam/DataSet.h>
+#include <pbbam/PbiFilter.h>
+#include <pbbam/ProgramInfo.h>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+namespace common {
+
+class BamFileMerger
+{
+public:
+    /// \brief Runs merger on a dataset, applying any supplied filters.
+    ///
+    /// When this function exits, a merged BAM (and optional PBI) will have been
+    /// written and closed.
+    ///
+    /// \param[in] dataset          provides input filenames & filters
+    /// \param[in] outputFilename   resulting BAM output
+    /// \param[in] mergeProgram     info about the calling program. Adds a @PG entry to merged header.
+    /// \param[in] createPbi        if true, creates a PBI alongside output BAM
+    ///
+    /// \throws std::runtime_error if any any errors encountered while reading or writing
+    ///
+    static void Merge(const PacBio::BAM::DataSet& dataset,
+                      const std::string& outputFilename,
+                      const PacBio::BAM::ProgramInfo& mergeProgram = PacBio::BAM::ProgramInfo(),
+                      bool createPbi = true);
+};
+
+} // namespace common
+} // namespace BAM
+} // namespace PacBio
+
+#include "BamFileMerger.inl"
+
+#endif // BAMFILEMERGER_H
diff --git a/tools/common/BamFileMerger.inl b/tools/common/BamFileMerger.inl

new file mode 100644 (file)

index 0000000..18dfbca
--- /dev/null
+++ b/tools/common/BamFileMerger.inl
@@ -0,0 +1,262 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT  SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "BamFileMerger.h"
+
+#include <pbbam/BamHeader.h>
+#include <pbbam/BamReader.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/BamWriter.h>
+#include <pbbam/CompositeBamReader.h>
+#include <pbbam/PbiBuilder.h>
+
+#include <deque>
+#include <memory>
+#include <stdexcept>
+#include <cassert>
+
+namespace PacBio {
+namespace BAM {
+namespace common {
+
+// ICollator
+
+class ICollator
+{
+public:
+    ~ICollator(void) { }
+
+    bool GetNext(BamRecord& record)
+    {
+        // nothing left to read
+        if (mergeItems_.empty())
+            return false;
+
+        // non-destructive 'pop' of first item from queue
+        auto firstIter = mergeItems_.begin();
+        auto firstItem = PacBio::BAM::internal::CompositeMergeItem{ std::move(firstIter->reader),
+                                                                    std::move(firstIter->record)
+                                                                  };
+        mergeItems_.pop_front();
+
+        // store its record in our output record
+        std::swap(record, firstItem.record);
+
+        // try fetch 'next' from first item's reader
+        // if successful, re-insert it into container & re-sort on our new values
+        // otherwise, this item will go out of scope & reader destroyed
+        if (firstItem.reader->GetNext(firstItem.record)) {
+            mergeItems_.push_front(std::move(firstItem));
+            UpdateSort();
+        }
+
+        // return success
+        return true;
+    }
+
+protected:
+    std::deque<PacBio::BAM::internal::CompositeMergeItem> mergeItems_;
+
+protected:
+    ICollator(std::vector<std::unique_ptr<PacBio::BAM::BamReader> >&& readers)
+    {
+        for (auto&& reader : readers) {
+            auto item = internal::CompositeMergeItem{std::move(reader)};
+            if (item.reader->GetNext(item.record))
+                mergeItems_.push_back(std::move(item));
+        }
+    }
+
+    virtual void UpdateSort(void) =0;
+};
+
+// QNameCollator
+
+struct QNameSorter : std::binary_function<internal::CompositeMergeItem,
+                                          internal::CompositeMergeItem,
+                                          bool>
+{
+    bool operator()(const internal::CompositeMergeItem& lhs,
+                    const internal::CompositeMergeItem& rhs)
+    {
+        const BamRecord& l = lhs.record;
+        const BamRecord& r = rhs.record;
+
+        // movie name
+        const int cmp = l.MovieName().compare(r.MovieName());
+        if (cmp != 0)
+            return cmp < 0;
+
+        // hole number
+        const auto lhsZmw = l.HoleNumber();
+        const auto rhsZmw = r.HoleNumber();
+        if (lhsZmw != rhsZmw)
+            return lhsZmw < rhsZmw;
+
+        // shuffle CCS reads after all others
+        const auto lhsReadType = l.Type();
+        const auto rhsReadType = r.Type();
+        if (lhsReadType == RecordType::CCS)
+            return false;
+        if (rhsReadType == RecordType::CCS)
+            return true;
+
+        // sort on qStart, then finally qEnd
+        const auto lhsQStart = l.QueryStart();
+        const auto rhsQStart = r.QueryStart();
+        return lhsQStart < rhsQStart;
+    }
+};
+
+class QNameCollator : public ICollator
+{
+public:
+    QNameCollator(std::vector<std::unique_ptr<PacBio::BAM::BamReader>>&& readers)
+        : ICollator(std::move(readers))
+    { UpdateSort(); }
+
+    void UpdateSort(void)
+    { std::sort(mergeItems_.begin(), mergeItems_.end(), QNameSorter{ }); }
+};
+
+// AlignedCollator
+
+class AlignedCollator : public ICollator
+{
+public:
+    AlignedCollator(std::vector<std::unique_ptr<PacBio::BAM::BamReader>>&& readers)
+        : ICollator(std::move(readers))
+    { UpdateSort(); }
+
+    void UpdateSort(void)
+    { std::sort(mergeItems_.begin(), mergeItems_.end(), PacBio::BAM::PositionSorter{ }); }
+};
+
+// BamFileMerger
+
+inline
+void BamFileMerger::Merge(const DataSet& dataset,
+                          const std::string& outputFilename,
+                          const ProgramInfo& mergeProgram,
+                          bool createPbi)
+{
+    const PbiFilter filter = PbiFilter::FromDataSet(dataset);
+
+    std::vector<std::string> inputFilenames_;
+    const auto& bamFiles = dataset.BamFiles();
+    inputFilenames_.reserve(bamFiles.size());
+    for (const auto& file : bamFiles)
+        inputFilenames_.push_back(file.Filename());
+
+    if (inputFilenames_.empty())
+        throw std::runtime_error("no input filenames provided to BamFileMerger");
+
+    if (outputFilename.empty())
+        throw std::runtime_error("no output filename provide to BamFileMerger");
+
+
+    // attempt open input files
+    std::vector<std::unique_ptr<BamReader> > readers;
+    readers.reserve(inputFilenames_.size());
+    for (const auto& fn : inputFilenames_) {
+        if (filter.IsEmpty())
+            readers.emplace_back(new BamReader(fn));
+        else
+            readers.emplace_back(new PbiIndexedBamReader(filter, fn));
+    }
+
+    // read headers
+    std::vector<BamHeader> headers;
+    headers.reserve(readers.size());
+    for (auto&& reader : readers)
+        headers.push_back(reader->Header());
+
+    assert(!readers.empty());
+    assert(!headers.empty());
+
+    // merge headers
+    BamHeader mergedHeader = headers.front();
+    const std::string& usingSortOrder = mergedHeader.SortOrder();
+    const bool isCoordinateSorted = (usingSortOrder == "coordinate");
+    for (size_t i = 1; i < headers.size(); ++i) {
+        const BamHeader& header = headers.at(i);
+        if (header.SortOrder() != usingSortOrder)
+            throw std::runtime_error("BAM file sort orders do not match, aborting merge");
+        mergedHeader += headers.at(i);
+    }
+    if (mergeProgram.IsValid())
+        mergedHeader.AddProgram(mergeProgram);
+
+    // setup collator, based on sort order
+    std::unique_ptr<ICollator> collator;
+    if (isCoordinateSorted)
+        collator.reset(new AlignedCollator(std::move(readers)));
+    else
+        collator.reset(new QNameCollator(std::move(readers)));
+    // NOTE: readers *moved*, so no longer accessible here
+
+    // do merge, creating PBI on-the-fly
+    if (createPbi && (outputFilename != "-")) {
+
+        // TODO: this implementation recalculates all PBI values, when we really
+        //       only need to collate entries and update offsets
+
+        BamWriter writer(outputFilename, mergedHeader);
+        PbiBuilder builder{ (outputFilename + ".pbi"),
+                            mergedHeader.NumSequences(),
+                            isCoordinateSorted
+                          };
+        BamRecord record;
+        int64_t vOffset = 0;
+        while (collator->GetNext(record)) {
+            writer.Write(record, &vOffset);
+            builder.AddRecord(record, vOffset);
+        }
+    }
+
+    // otherwise just merge BAM
+    else {
+        BamWriter writer(outputFilename, mergedHeader);
+        BamRecord record;
+        while (collator->GetNext(record))
+            writer.Write(record);
+    }
+}
+
+} // namespace common
+} // namespace BAM
+} // namespace PacBio
diff --git a/tools/common/OptionParser.cpp b/tools/common/OptionParser.cpp

new file mode 100644 (file)

index 0000000..fc73176
--- /dev/null
+++ b/tools/common/OptionParser.cpp
@@ -0,0 +1,562 @@
+/**
+ * Copyright (C) 2010 Johannes Weißl <jargon@molb.org>
+ * License: your favourite BSD-style license
+ *
+ * See OptionParser.h for help.
+ */
+
+#include "OptionParser.h"
+
+#include <cstdlib>
+#include <algorithm>
+#include <complex>
+#include <ciso646>
+
+#if defined(ENABLE_NLS) && ENABLE_NLS
+# include <libintl.h>
+# define _(s) gettext(s)
+#else
+# define _(s) ((const char *) (s))
+#endif
+
+using namespace std;
+
+namespace optparse {
+
+////////// auxiliary (string) functions { //////////
+class str_wrap {
+public:
+  str_wrap(const string& l, const string& r) : lwrap(l), rwrap(r) {}
+  str_wrap(const string& w) : lwrap(w), rwrap(w) {}
+  string operator() (const string& s) { return lwrap + s + rwrap; }
+  const string lwrap, rwrap;
+};
+template<typename InputIterator, typename UnaryOperator>
+static string str_join_trans(const string& sep, InputIterator begin, InputIterator end, UnaryOperator op) {
+  string buf;
+  for (InputIterator it = begin; it != end; ++it) {
+    if (it != begin)
+      buf += sep;
+    buf += op(*it);
+  }
+  return buf;
+}
+template<class InputIterator>
+static string str_join(const string& sep, InputIterator begin, InputIterator end) {
+  return str_join_trans(sep, begin, end, str_wrap(""));
+}
+static string& str_replace(string& s, const string& patt, const string& repl) {
+  size_t pos = 0, n = patt.length();
+  while (true) {
+    pos = s.find(patt, pos);
+    if (pos == string::npos)
+      break;
+    s.replace(pos, n, repl);
+    pos += repl.size();
+  }
+  return s;
+}
+static string str_replace(const string& s, const string& patt, const string& repl) {
+  string tmp = s;
+  str_replace(tmp, patt, repl);
+  return tmp;
+}
+static string str_format(const string& s, size_t pre, size_t len, bool indent_first = true) {
+  stringstream ss;
+  string p;
+  if (indent_first)
+    p = string(pre, ' ');
+
+  size_t pos = 0, linestart = 0;
+  size_t line = 0;
+  while (true) {
+    bool wrap = false;
+
+    size_t new_pos = s.find_first_of(" \n\t", pos);
+    if (new_pos == string::npos)
+      break;
+    if (s[new_pos] == '\n') {
+      pos = new_pos + 1;
+      wrap = true;
+    }
+    if (line == 1)
+      p = string(pre, ' ');
+    if (wrap || new_pos + pre > linestart + len) {
+      ss << p << s.substr(linestart, pos - linestart - 1) << endl;
+      linestart = pos;
+      line++;
+    }
+    pos = new_pos + 1;
+  }
+  ss << p << s.substr(linestart) << endl;
+  return ss.str();
+}
+static string str_inc(const string& s) {
+  stringstream ss;
+  string v = (s != "") ? s : "0";
+  long i;
+  istringstream(v) >> i;
+  ss << i+1;
+  return ss.str();
+}
+static unsigned int cols() {
+  unsigned int n = 80;
+#ifndef _WIN32
+  const char *s = getenv("COLUMNS");
+  if (s)
+    istringstream(s) >> n;
+#endif
+  return n;
+}
+static string basename(const string& s) {
+  string b = s;
+  size_t i = b.find_last_not_of('/');
+  if (i == string::npos) {
+    if (b[0] == '/')
+      b.erase(1);
+    return b;
+  }
+  b.erase(i+1, b.length()-i-1);
+  i = b.find_last_of("/");
+  if (i != string::npos)
+    b.erase(0, i+1);
+  return b;
+}
+////////// } auxiliary (string) functions //////////
+
+
+////////// class OptionParser { //////////
+OptionParser::OptionParser() :
+  _usage(_("%prog [options]")),
+  _add_help_option(true),
+  _add_version_option(true),
+  _interspersed_args(true) {}
+
+Option& OptionParser::add_option(const string& opt) {
+  const string tmp[1] = { opt };
+  return add_option(vector<string>(&tmp[0], &tmp[1]));
+}
+Option& OptionParser::add_option(const string& opt1, const string& opt2) {
+  const string tmp[2] = { opt1, opt2 };
+  return add_option(vector<string>(&tmp[0], &tmp[2]));
+}
+Option& OptionParser::add_option(const string& opt1, const string& opt2, const string& opt3) {
+  const string tmp[3] = { opt1, opt2, opt3 };
+  return add_option(vector<string>(&tmp[0], &tmp[3]));
+}
+Option& OptionParser::add_option(const vector<string>& v) {
+  _opts.resize(_opts.size()+1);
+  Option& option = _opts.back();
+  string dest_fallback;
+  for (vector<string>::const_iterator it = v.begin(); it != v.end(); ++it) {
+    if (it->substr(0,2) == "--") {
+      const string s = it->substr(2);
+      if (option.dest() == "")
+        option.dest(str_replace(s, "-", "_"));
+      option._long_opts.insert(s);
+      _optmap_l[s] = &option;
+    } else if ( it->empty() ) {
+       continue;
+    } else {
+      const string s = it->substr(1,1);
+      if (dest_fallback == "")
+        dest_fallback = s;
+      option._short_opts.insert(s);
+      _optmap_s[s] = &option;
+    }
+  }
+  if (option.dest() == "")
+    option.dest(dest_fallback);
+  return option;
+}
+
+OptionParser& OptionParser::add_option_group(const OptionGroup& group) {
+  for (list<Option>::const_iterator oit = group._opts.begin(); oit != group._opts.end(); ++oit) {
+    const Option& option = *oit;
+    for (set<string>::const_iterator it = option._short_opts.begin(); it != option._short_opts.end(); ++it)
+      _optmap_s[*it] = &option;
+    for (set<string>::const_iterator it = option._long_opts.begin(); it != option._long_opts.end(); ++it)
+      _optmap_l[*it] = &option;
+  }
+  _groups.push_back(&group);
+  return *this;
+}
+
+const Option& OptionParser::lookup_short_opt(const string& opt) const {
+  optMap::const_iterator it = _optmap_s.find(opt);
+  if (it == _optmap_s.end())
+    error(_("no such option") + string(": -") + opt);
+  return *it->second;
+}
+
+void OptionParser::handle_short_opt(const string& opt, const string& arg) {
+
+  _remaining.pop_front();
+  string value;
+
+  const Option& option = lookup_short_opt(opt);
+  if (option._nargs == 1) {
+    value = arg.substr(2);
+    if (value == "") {
+      if (_remaining.empty())
+        error("-" + opt + " " + _("option requires an argument"));
+      value = _remaining.front();
+      _remaining.pop_front();
+    }
+  } else {
+    if (arg.length() > 2)
+      _remaining.push_front(string("-") + arg.substr(2));
+  }
+
+  process_opt(option, string("-") + opt, value);
+}
+
+const Option& OptionParser::lookup_long_opt(const string& opt) const {
+
+  list<string> matching;
+  for (optMap::const_iterator it = _optmap_l.begin(); it != _optmap_l.end(); ++it) {
+    if (it->first.compare(0, opt.length(), opt) == 0)
+      matching.push_back(it->first);
+  }
+  if (matching.size() > 1) {
+    string x = str_join(", ", matching.begin(), matching.end());
+    error(_("ambiguous option") + string(": --") + opt + " (" + x + "?)");
+  }
+  if (matching.size() == 0)
+    error(_("no such option") + string(": --") + opt);
+
+  return *_optmap_l.find(matching.front())->second;
+}
+
+void OptionParser::handle_long_opt(const string& optstr) {
+
+  _remaining.pop_front();
+  string opt, value;
+
+  size_t delim = optstr.find("=");
+  if (delim != string::npos) {
+    opt = optstr.substr(0, delim);
+    value = optstr.substr(delim+1);
+  } else
+    opt = optstr;
+
+  const Option& option = lookup_long_opt(opt);
+  if (option._nargs == 1 and delim == string::npos) {
+    if (not _remaining.empty()) {
+      value = _remaining.front();
+      _remaining.pop_front();
+    }
+  }
+
+  if (option._nargs == 1 and value == "")
+    error("--" + opt + " " + _("option requires an argument"));
+
+  process_opt(option, string("--") + opt, value);
+}
+
+Values& OptionParser::parse_args(const int argc, char const* const* const argv) {
+  if (prog() == "")
+    prog(basename(argv[0]));
+  return parse_args(&argv[1], &argv[argc]);
+}
+Values& OptionParser::parse_args(const vector<string>& v) {
+
+  _remaining.assign(v.begin(), v.end());
+
+  if (add_version_option() and version() != "") {
+    add_option("--version") .action("version") .help(_("show program's version number and exit"));
+    _opts.splice(_opts.begin(), _opts, --(_opts.end()));
+  }
+  if (add_help_option()) {
+    add_option("-h", "--help") .action("help") .help(_("show this help message and exit"));
+    _opts.splice(_opts.begin(), _opts, --(_opts.end()));
+  }
+
+  while (not _remaining.empty()) {
+    const string arg = _remaining.front();
+
+    if (arg == "--") {
+      _remaining.pop_front();
+      break;
+    }
+
+    if (arg.substr(0,2) == "--") {
+      handle_long_opt(arg.substr(2));
+    } else if (arg.substr(0,1) == "-" and arg.length() > 1) {
+      handle_short_opt(arg.substr(1,1), arg);
+    } else {
+      _remaining.pop_front();
+      _leftover.push_back(arg);
+      if (not interspersed_args())
+        break;
+    }
+  }
+  while (not _remaining.empty()) {
+    const string arg = _remaining.front();
+    _remaining.pop_front();
+    _leftover.push_back(arg);
+  }
+
+  for (strMap::const_iterator it = _defaults.begin(); it != _defaults.end(); ++it) {
+    if (not _values.is_set(it->first))
+      _values[it->first] = it->second;
+  }
+
+  for (list<Option>::const_iterator it = _opts.begin(); it != _opts.end(); ++it) {
+    if (it->get_default() != "" and not _values.is_set(it->dest()))
+        _values[it->dest()] = it->get_default();
+  }
+
+  return _values;
+}
+
+void OptionParser::process_opt(const Option& o, const string& opt, const string& value) {
+  if (o.action() == "store") {
+    string err = o.check_type(opt, value);
+    if (err != "")
+      error(err);
+    _values[o.dest()] = value;
+    _values.is_set_by_user(o.dest(), true);
+  }
+  else if (o.action() == "store_const") {
+    _values[o.dest()] = o.get_const();
+    _values.is_set_by_user(o.dest(), true);
+  }
+  else if (o.action() == "store_true") {
+    _values[o.dest()] = "1";
+    _values.is_set_by_user(o.dest(), true);
+  }
+  else if (o.action() == "store_false") {
+    _values[o.dest()] = "0";
+    _values.is_set_by_user(o.dest(), true);
+  }
+  else if (o.action() == "append") {
+    string err = o.check_type(opt, value);
+    if (err != "")
+      error(err);
+    _values[o.dest()] = value;
+    _values.all(o.dest()).push_back(value);
+    _values.is_set_by_user(o.dest(), true);
+  }
+  else if (o.action() == "append_const") {
+    _values[o.dest()] = o.get_const();
+    _values.all(o.dest()).push_back(o.get_const());
+    _values.is_set_by_user(o.dest(), true);
+  }
+  else if (o.action() == "count") {
+    _values[o.dest()] = str_inc(_values[o.dest()]);
+    _values.is_set_by_user(o.dest(), true);
+  }
+  else if (o.action() == "help") {
+    print_help();
+    std::exit(0);
+  }
+  else if (o.action() == "version") {
+    print_version();
+    std::exit(0);
+  }
+  else if (o.action() == "callback" && o.callback()) {
+    (*o.callback())(o, opt, value, *this);
+  }
+}
+
+string OptionParser::format_option_help(unsigned int indent /* = 2 */) const {
+  stringstream ss;
+
+  if (_opts.empty())
+    return ss.str();
+
+  for (list<Option>::const_iterator it = _opts.begin(); it != _opts.end(); ++it) {
+    if (it->help() != SUPPRESS_HELP)
+      ss << it->format_help(indent);
+  }
+
+  return ss.str();
+}
+
+string OptionParser::format_help() const {
+  stringstream ss;
+
+  if (usage() != SUPPRESS_USAGE)
+    ss << get_usage() << endl;
+
+  if (description() != "")
+    ss << str_format(description(), 0, cols()) << endl;
+
+  ss << _("Options") << ":" << endl;
+  ss << format_option_help();
+
+  for (list<OptionGroup const*>::const_iterator it = _groups.begin(); it != _groups.end(); ++it) {
+    const OptionGroup& group = **it;
+    ss << endl << "  " << group.title() << ":" << endl;
+    if (group.group_description() != "")
+      ss << str_format(group.group_description(), 4, cols()) << endl;
+    ss << group.format_option_help(4);
+  }
+
+  if (epilog() != "")
+    ss << endl << str_format(epilog(), 0, cols());
+
+  return ss.str();
+}
+void OptionParser::print_help() const {
+  cout << format_help();
+}
+
+void OptionParser::set_usage(const string& u) {
+  string lower = u;
+  transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
+  if (lower.compare(0, 7, "usage: ") == 0)
+    _usage = u.substr(7);
+  else
+    _usage = u;
+}
+string OptionParser::format_usage(const string& u) const {
+  stringstream ss;
+  ss << _("Usage") << ": " << u << endl;
+  return ss.str();
+}
+string OptionParser::get_usage() const {
+  if (usage() == SUPPRESS_USAGE)
+    return string("");
+  return format_usage(str_replace(usage(), "%prog", prog()));
+}
+void OptionParser::print_usage(ostream& out) const {
+  string u = get_usage();
+  if (u != "")
+    out << u << endl;
+}
+void OptionParser::print_usage() const {
+  print_usage(cout);
+}
+
+string OptionParser::get_version() const {
+  return str_replace(_version, "%prog", prog());
+}
+void OptionParser::print_version(ostream& out) const {
+  out << get_version() << endl;
+}
+void OptionParser::print_version() const {
+  print_version(cout);
+}
+
+void OptionParser::exit() const {
+  std::exit(2);
+}
+void OptionParser::error(const string& msg) const {
+  print_usage(cerr);
+  cerr << prog() << ": " << _("error") << ": " << msg << endl;
+  exit();
+}
+////////// } class OptionParser //////////
+
+////////// class Values { //////////
+const string& Values::operator[] (const string& d) const {
+  strMap::const_iterator it = _map.find(d);
+  static const string empty = "";
+  return (it != _map.end()) ? it->second : empty;
+}
+void Values::is_set_by_user(const string& d, bool yes) {
+  if (yes)
+    _userSet.insert(d);
+  else
+    _userSet.erase(d);
+}
+////////// } class Values //////////
+
+////////// class Option { //////////
+string Option::check_type(const string& opt, const string& val) const {
+  istringstream ss(val);
+  stringstream err;
+
+  if (type() == "int" || type() == "long") {
+    long t;
+    if (not (ss >> t))
+      err << _("option") << " " << opt << ": " << _("invalid integer value") << ": '" << val << "'";
+  }
+  else if (type() == "float" || type() == "double") {
+    double t;
+    if (not (ss >> t))
+      err << _("option") << " " << opt << ": " << _("invalid floating-point value") << ": '" << val << "'";
+  }
+  else if (type() == "choice") {
+    if (find(choices().begin(), choices().end(), val) == choices().end()) {
+      list<string> tmp = choices();
+      transform(tmp.begin(), tmp.end(), tmp.begin(), str_wrap("'"));
+      err << _("option") << " " << opt << ": " << _("invalid choice") << ": '" << val << "'"
+        << " (" << _("choose from") << " " << str_join(", ", tmp.begin(), tmp.end()) << ")";
+    }
+  }
+  else if (type() == "complex") {
+    complex<double> t;
+    if (not (ss >> t))
+      err << _("option") << " " << opt << ": " << _("invalid complex value") << ": '" << val << "'";
+  }
+
+  return err.str();
+}
+
+string Option::format_option_help(unsigned int indent /* = 2 */) const {
+
+  string mvar_short, mvar_long;
+  if (nargs() == 1) {
+    string mvar = metavar();
+    if (mvar == "") {
+      mvar = type();
+      transform(mvar.begin(), mvar.end(), mvar.begin(), ::toupper);
+     }
+    mvar_short = " " + mvar;
+    mvar_long = "=" + mvar;
+  }
+
+  stringstream ss;
+  ss << string(indent, ' ');
+
+  if (not _short_opts.empty()) {
+    ss << str_join_trans(", ", _short_opts.begin(), _short_opts.end(), str_wrap("-", mvar_short));
+    if (not _long_opts.empty())
+      ss << ", ";
+  }
+  if (not _long_opts.empty())
+    ss << str_join_trans(", ", _long_opts.begin(), _long_opts.end(), str_wrap("--", mvar_long));
+
+  if ( _short_opts.empty() && _long_opts.empty() )
+      ss << metavar();
+
+
+  return ss.str();
+}
+
+string Option::format_help(unsigned int indent /* = 2 */) const {
+  stringstream ss;
+  string h = format_option_help(indent);
+  unsigned int width = cols();
+  unsigned int opt_width = min(width*3/10, 36u);
+  bool indent_first = false;
+  ss << h;
+  // if the option list is too long, start a new paragraph
+  if (h.length() >= (opt_width-1)) {
+    ss << endl;
+    indent_first = true;
+  } else {
+    ss << string(opt_width - h.length(), ' ');
+    if (help() == "")
+      ss << endl;
+  }
+  if (help() != "") {
+    string help_str = (get_default() != "") ? str_replace(help(), "%default", get_default()) : help();
+    ss << str_format(help_str, opt_width, width, indent_first);
+  }
+  return ss.str();
+}
+
+Option& Option::action(const string& a) {
+  _action = a;
+  if (a == "store_const" || a == "store_true" || a == "store_false" ||
+      a == "append_const" || a == "count" || a == "help" || a == "version")
+    nargs(0);
+  return *this;
+}
+////////// } class Option //////////
+
+}
diff --git a/tools/common/OptionParser.h b/tools/common/OptionParser.h

new file mode 100644 (file)

index 0000000..3266f0f
--- /dev/null
+++ b/tools/common/OptionParser.h
@@ -0,0 +1,305 @@
+/**
+ * Copyright (C) 2010 Johannes Weißl <jargon@molb.org>
+ * License: your favourite BSD-style license
+ *
+ * git clone http://github.com/weisslj/cpp-optparse.git
+ *
+ * This is yet another option parser for C++. It is modelled after the
+ * excellent Python optparse API. Although incomplete, anyone familiar to
+ * optparse should feel at home:
+ * http://docs.python.org/library/optparse.html
+ *
+ * Design decisions:
+ * - elegant and easy usage more important than speed / flexibility
+ * - shortness more important than feature completeness
+ *   * no unicode
+ *   * no checking for user programming errors
+ *
+ * Why not use getopt/getopt_long?
+ * - not C++ / not completely POSIX
+ * - too cumbersome to use, would need lot of additional code
+ *
+ * Why not use Boost.Program_options?
+ * - boost not installed on all target platforms (esp. cluster, HPC, ...)
+ * - too big to include just for option handling:
+ *   322 *.h (44750 lines) + 7 *.cpp (2078 lines)
+ *
+ * Why not use tclap/Opag/Options/CmdLine/Anyoption/Argument_helper/...?
+ * - no reason, writing one is faster than code inspection :-)
+ * - similarity to Python desired for faster learning curve
+ *
+ * Future work:
+ * - nargs > 1?
+ * - comments?
+ *
+ * Python only features:
+ * - conflict handlers
+ * - adding new actions
+ *
+ *
+ * Example:
+ *
+ * using optparse::OptionParser;
+ *
+ * OptionParser parser = OptionParser() .description("just an example");
+ *
+ * parser.add_option("-f", "--file") .dest("filename")
+ *                   .help("write report to FILE") .metavar("FILE");
+ * parser.add_option("-q", "--quiet")
+ *                   .action("store_false") .dest("verbose") .set_default("1")
+ *                   .help("don't print status messages to stdout");
+ * 
+ * optparse::Values options = parser.parse_args(argc, argv);
+ * vector<string> args = parser.args();
+ *
+ * if (options.get("verbose"))
+ *     cout << options["filename"] << endl;
+ *
+ */
+
+#ifndef OPTIONPARSER_H_
+#define OPTIONPARSER_H_
+
+#include <iostream>
+#include <list>
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace optparse {
+
+class OptionParser;
+class OptionGroup;
+class Option;
+class Values;
+class Value;
+class Callback;
+
+typedef std::map<std::string,std::string> strMap;
+typedef std::map<std::string,std::list<std::string> > lstMap;
+typedef std::map<std::string,Option const*> optMap;
+
+const char* const SUPPRESS_HELP = "SUPPRESS" "HELP";
+const char* const SUPPRESS_USAGE = "SUPPRESS" "USAGE";
+
+//! Class for automatic conversion from string -> anytype
+class Value {
+  public:
+    Value() : str(), valid(false) {}
+    Value(const std::string& v) : str(v), valid(true) {}
+    operator const char*() { return str.c_str(); }
+    operator bool() { bool t; return (valid && (std::istringstream(str) >> t)) ? t : false; }
+    operator short() { short t; return (valid && (std::istringstream(str) >> t)) ? t : 0; }
+    operator unsigned short() { unsigned short t; return (valid && (std::istringstream(str) >> t)) ? t : 0; }
+    operator int() { int t; return (valid && (std::istringstream(str) >> t)) ? t : 0; }
+    operator unsigned int() { unsigned int t; return (valid && (std::istringstream(str) >> t)) ? t : 0; }
+    operator long() { long t; return (valid && (std::istringstream(str) >> t)) ? t : 0; }
+    operator unsigned long() { unsigned long t; return (valid && (std::istringstream(str) >> t)) ? t : 0; }
+    operator float() { float t; return (valid && (std::istringstream(str) >> t)) ? t : 0; }
+    operator double() { double t; return (valid && (std::istringstream(str) >> t)) ? t : 0; }
+    operator long double() { long double t; return (valid && (std::istringstream(str) >> t)) ? t : 0; }
+ private:
+    const std::string str;
+    bool valid;
+};
+
+class Values {
+  public:
+    Values() : _map() {}
+    const std::string& operator[] (const std::string& d) const;
+    std::string& operator[] (const std::string& d) { return _map[d]; }
+    bool is_set(const std::string& d) const { return _map.find(d) != _map.end(); }
+    bool is_set_by_user(const std::string& d) const { return _userSet.find(d) != _userSet.end(); }
+    void is_set_by_user(const std::string& d, bool yes);
+    Value get(const std::string& d) const { return (is_set(d)) ? Value((*this)[d]) : Value(); }
+
+    typedef std::list<std::string>::iterator iterator;
+    typedef std::list<std::string>::const_iterator const_iterator;
+    std::list<std::string>& all(const std::string& d) { return _appendMap[d]; }
+    const std::list<std::string>& all(const std::string& d) const { return _appendMap.find(d)->second; }
+
+  private:
+    strMap _map;
+    lstMap _appendMap;
+    std::set<std::string> _userSet;
+};
+
+class OptionParser {
+  public:
+    OptionParser();
+    virtual ~OptionParser() {}
+
+    OptionParser& usage(const std::string& u) { set_usage(u); return *this; }
+    OptionParser& version(const std::string& v) { _version = v; return *this; }
+    OptionParser& description(const std::string& d) { _description = d; return *this; }
+    OptionParser& add_help_option(bool h) { _add_help_option = h; return *this; }
+    OptionParser& add_version_option(bool v) { _add_version_option = v; return *this; }
+    OptionParser& prog(const std::string& p) { _prog = p; return *this; }
+    OptionParser& epilog(const std::string& e) { _epilog = e; return *this; }
+    OptionParser& set_defaults(const std::string& dest, const std::string& val) {
+      _defaults[dest] = val; return *this;
+    }
+    OptionParser& enable_interspersed_args() { _interspersed_args = true; return *this; }
+    OptionParser& disable_interspersed_args() { _interspersed_args = false; return *this; }
+    OptionParser& add_option_group(const OptionGroup& group);
+
+    const std::string& usage() const { return _usage; }
+    const std::string& version() const { return _version; }
+    const std::string& description() const { return _description; }
+    bool add_help_option() const { return _add_help_option; }
+    bool add_version_option() const { return _add_version_option; }
+    const std::string& prog() const { return _prog; }
+    const std::string& epilog() const { return _epilog; }
+    bool interspersed_args() const { return _interspersed_args; }
+
+    Option& add_option(const std::string& opt);
+    Option& add_option(const std::string& opt1, const std::string& opt2);
+    Option& add_option(const std::string& opt1, const std::string& opt2, const std::string& opt3);
+    Option& add_option(const std::vector<std::string>& opt);
+
+    Values& parse_args(int argc, char const* const* argv);
+    Values& parse_args(const std::vector<std::string>& args);
+    template<typename InputIterator>
+    Values& parse_args(InputIterator begin, InputIterator end) {
+      return parse_args(std::vector<std::string>(begin, end));
+    }
+
+    const std::list<std::string>& args() const { return _leftover; }
+    std::vector<std::string> args() {
+      return std::vector<std::string>(_leftover.begin(), _leftover.end());
+    }
+
+    std::string format_help() const;
+    std::string format_option_help(unsigned int indent = 2) const;
+    void print_help() const;
+
+    void set_usage(const std::string& u);
+    std::string get_usage() const;
+    void print_usage(std::ostream& out) const;
+    void print_usage() const;
+
+    std::string get_version() const;
+    void print_version(std::ostream& out) const;
+    void print_version() const;
+
+    void error(const std::string& msg) const;
+    void exit() const;
+
+  private:
+    const Option& lookup_short_opt(const std::string& opt) const;
+    const Option& lookup_long_opt(const std::string& opt) const;
+
+    void handle_short_opt(const std::string& opt, const std::string& arg);
+    void handle_long_opt(const std::string& optstr);
+
+    void process_opt(const Option& option, const std::string& opt, const std::string& value);
+
+    std::string format_usage(const std::string& u) const;
+
+    std::string _usage;
+    std::string _version;
+    std::string _description;
+    bool _add_help_option;
+    bool _add_version_option;
+    std::string _prog;
+    std::string _epilog;
+    bool _interspersed_args;
+
+    Values _values;
+
+    std::list<Option> _opts;
+    optMap _optmap_s;
+    optMap _optmap_l;
+    strMap _defaults;
+    std::list<OptionGroup const*> _groups;
+
+    std::list<std::string> _remaining;
+    std::list<std::string> _leftover;
+};
+
+class OptionGroup : public OptionParser {
+  public:
+    OptionGroup(const OptionParser& p, const std::string& t, const std::string& d = "") :
+      _parser(p), _title(t), _group_description(d) {}
+    virtual ~OptionGroup() {}
+
+    OptionGroup& title(const std::string& t) { _title = t; return *this; }
+    OptionGroup& group_description(const std::string& d) { _group_description = d; return *this; }
+    const std::string& title() const { return _title; }
+    const std::string& group_description() const { return _group_description; }
+
+  private:
+    const OptionParser& _parser;
+    std::string _title;
+    std::string _group_description;
+};
+
+class Option {
+  public:
+    Option() : _action("store"), _type("string"), _nargs(1), _callback(0) {}
+    virtual ~Option() {}
+
+    Option& action(const std::string& a);
+    Option& type(const std::string& t) { _type = t; return *this; }
+    Option& dest(const std::string& d) { _dest = d; return *this; }
+    Option& set_default(const std::string& d) { _default = d; return *this; }
+    template<typename T>
+    Option& set_default(T t) { std::ostringstream ss; ss << t; _default = ss.str(); return *this; }
+    Option& nargs(size_t n) { _nargs = n; return *this; }
+    Option& set_const(const std::string& c) { _const = c; return *this; }
+    template<typename InputIterator>
+    Option& choices(InputIterator begin, InputIterator end) {
+      _choices.assign(begin, end); type("choice"); return *this;
+    }
+    template<typename InputEnumerable>
+    Option& choices(InputEnumerable enumerable) {
+      _choices.assign(enumerable.begin(), enumerable.end()); type("choice"); return *this;
+    }
+    Option& help(const std::string& h) { _help = h; return *this; }
+    Option& metavar(const std::string& m) { _metavar = m; return *this; }
+    Option& callback(Callback& c) { _callback = &c; return *this; }
+
+    const std::string& action() const { return _action; }
+    const std::string& type() const { return _type; }
+    const std::string& dest() const { return _dest; }
+    const std::string& get_default() const { return _default; }
+    size_t nargs() const { return _nargs; }
+    const std::string& get_const() const { return _const; }
+    const std::list<std::string>& choices() const { return _choices; }
+    const std::string& help() const { return _help; }
+    const std::string& metavar() const { return _metavar; }
+    Callback* callback() const { return _callback; }
+
+  private:
+    std::string check_type(const std::string& opt, const std::string& val) const;
+    std::string format_option_help(unsigned int indent = 2) const;
+    std::string format_help(unsigned int indent = 2) const;
+
+    std::set<std::string> _short_opts;
+    std::set<std::string> _long_opts;
+
+    std::string _action;
+    std::string _type;
+    std::string _dest;
+    std::string _default;
+    size_t _nargs;
+    std::string _const;
+    std::list<std::string> _choices;
+    std::string _help;
+    std::string _metavar;
+    Callback* _callback;
+
+    friend class OptionParser;
+};
+
+class Callback {
+public:
+  virtual void operator() (const Option& option, const std::string& opt, const std::string& val, const OptionParser& parser) = 0;
+  virtual ~Callback() {}
+};
+
+}
+
+#endif
diff --git a/tools/pbindex/CMakeLists.txt b/tools/pbindex/CMakeLists.txt

new file mode 100644 (file)

index 0000000..0bfcf33
--- /dev/null
+++ b/tools/pbindex/CMakeLists.txt
@@ -0,0 +1,22 @@
+
+set(PbindexSrcDir ${PacBioBAM_ToolsDir}/pbindex/src)
+
+# create version header
+set(PbIndex_VERSION ${PacBioBAM_VERSION})
+configure_file(
+    ${PbindexSrcDir}/PbIndexVersion.h.in ${GeneratedDir}/PbIndexVersion.h @ONLY
+)
+
+# list source files
+set(PBINDEX_SOURCES
+    ${ToolsCommonDir}/OptionParser.cpp
+    ${PbindexSrcDir}/main.cpp
+    ${PbindexSrcDir}/PbIndex.cpp
+)
+
+# build pbindex executable
+include(PbbamTool)
+create_pbbam_tool(
+    TARGET  pbindex
+    SOURCES ${PBINDEX_SOURCES}
+)
diff --git a/tools/pbindex/src/PbIndex.cpp b/tools/pbindex/src/PbIndex.cpp

new file mode 100644 (file)

index 0000000..e25fa0e
--- /dev/null
+++ b/tools/pbindex/src/PbIndex.cpp
@@ -0,0 +1,78 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT  SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "PbIndex.h"
+#include <pbbam/BamFile.h>
+#include <pbbam/PbiRawData.h>
+#include <iostream>
+#include <cassert>
+#include <cstdlib>
+using namespace pbindex;
+using namespace std;
+
+Settings::Settings(void)
+    : printPbiContents_(false)
+{ }
+
+int PbIndex::Create(const Settings& settings)
+{
+    try
+    {
+        PacBio::BAM::BamFile bamFile(settings.inputBamFilename_);
+        bamFile.CreatePacBioIndex();
+        return EXIT_SUCCESS;
+    }
+    catch (std::runtime_error& e)
+    {
+        cerr << "pbindex ERROR: " << e.what() << endl;
+        return EXIT_FAILURE;
+    }
+}
+
+//int PbIndex::Print(const Settings& settings)
+//{
+
+//}
+
+int PbIndex::Run(const Settings& settings)
+{
+//    if (settings.printPbiContents_)
+//        return Print(settings);
+//    else
+        return Create(settings);
+}
+
diff --git a/tools/pbindex/src/PbIndex.h b/tools/pbindex/src/PbIndex.h

new file mode 100644 (file)

index 0000000..05bdf6d
--- /dev/null
+++ b/tools/pbindex/src/PbIndex.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT  SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef PBINDEX_H
+#define PBINDEX_H
+
+#include <string>
+#include <vector>
+
+namespace pbindex {
+
+class Settings
+{
+public:
+    Settings(void);
+
+public:
+
+public:
+    std::string inputBamFilename_;
+    bool printPbiContents_;
+    std::vector<std::string> errors_;
+};
+
+class PbIndex
+{
+public:
+    static int Run(const Settings& settings);
+private:
+    static int Create(const Settings& settings);
+//    static int Print(const Settings& settings);
+};
+
+} // namespace pbindex
+
+#endif // PBINDEX_H
diff --git a/tools/pbindex/src/PbIndexVersion.h.in b/tools/pbindex/src/PbIndexVersion.h.in

new file mode 100644 (file)

index 0000000..9faf96d
--- /dev/null
+++ b/tools/pbindex/src/PbIndexVersion.h.in
@@ -0,0 +1,49 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef PBINDEXVERSION_H
+#define PBINDEXVERSION_H
+
+#include <string>
+
+namespace pbindex {
+
+const std::string Version = std::string("@PbIndex_VERSION@");
+
+} // namespace pbindex
+
+#endif // PBINDEXVERSION_H
diff --git a/tools/pbindex/src/main.cpp b/tools/pbindex/src/main.cpp

new file mode 100644 (file)

index 0000000..0f1bfb9
--- /dev/null
+++ b/tools/pbindex/src/main.cpp
@@ -0,0 +1,102 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "../common/OptionParser.h"
+#include "PbIndex.h"
+#include "PbIndexVersion.h"
+#include <cassert>
+#include <iostream>
+using namespace std;
+
+static
+pbindex::Settings fromCommandLine(optparse::OptionParser& parser,
+                                  int argc, char* argv[])
+{
+    const optparse::Values options = parser.parse_args(argc, argv);
+    (void)options;
+
+    pbindex::Settings settings;
+
+    // get input filename
+    const vector<string> positionalArgs = parser.args();
+    const size_t numPositionalArgs = positionalArgs.size();
+    if (numPositionalArgs == 0)
+        settings.errors_.push_back("pbindex requires an input BAM filename");
+    else if (numPositionalArgs == 1)
+        settings.inputBamFilename_ = parser.args().front();
+    else {
+        assert(numPositionalArgs > 1);
+        settings.errors_.push_back("pbindex does not support more than one input file per run");
+    }
+
+    return settings;
+}
+
+int main(int argc, char* argv[])
+{
+    // setup help & options
+    optparse::OptionParser parser;
+    parser.description("pbindex creates a index file that enables random-access to PacBio-specific data in BAM files. "
+                       "Generated index filename will be the same as input BAM plus .pbi suffix."
+                       );
+    parser.prog("pbindex");
+    parser.usage("pbindex <input>");
+    parser.version(pbindex::Version);
+    parser.add_version_option(true);
+    parser.add_help_option(true);
+
+    auto ioGroup = optparse::OptionGroup(parser, "Input/Output");
+    ioGroup.add_option("")
+           .dest("input")
+           .metavar("input")
+           .help("Input BAM file");
+    parser.add_option_group(ioGroup);
+
+    // parse command line for settings
+    const pbindex::Settings settings = fromCommandLine(parser, argc, argv);
+    if (!settings.errors_.empty()) {
+        cerr << endl;
+        for (const auto e : settings.errors_)
+            cerr << "ERROR: " << e << endl;
+        cerr << endl;
+        parser.print_help();
+        return EXIT_FAILURE;
+    }
+
+    // run tool
+    return pbindex::PbIndex::Run(settings);
+}
diff --git a/tools/pbindexdump/CMakeLists.txt b/tools/pbindexdump/CMakeLists.txt

new file mode 100644 (file)

index 0000000..88c07b9
--- /dev/null
+++ b/tools/pbindexdump/CMakeLists.txt
@@ -0,0 +1,47 @@
+
+set(PbindexdumpSrcDir ${PacBioBAM_ToolsDir}/pbindexdump/src)
+
+# create version header
+set(PbIndexDump_VERSION ${PacBioBAM_VERSION})
+configure_file(
+    ${PbindexdumpSrcDir}/PbIndexDumpVersion.h.in ${GeneratedDir}/PbIndexDumpVersion.h @ONLY
+)
+
+# list source files
+set(PBINDEXDUMP_SOURCES
+    ${ToolsCommonDir}/OptionParser.cpp
+    ${PbindexdumpSrcDir}/CppFormatter.cpp
+    ${PbindexdumpSrcDir}/JsonFormatter.cpp
+    ${PbindexdumpSrcDir}/PbIndexDump.cpp
+    ${PbindexdumpSrcDir}/main.cpp
+)
+
+# build pbindexdump executable
+include(PbbamTool)
+create_pbbam_tool(
+    TARGET  pbindexdump
+    SOURCES ${PBINDEXDUMP_SOURCES}
+)
+
+# cram tests
+if (PacBioBAM_build_tests)
+
+    configure_file(
+        ${PacBioBAM_CramTestsDir}/pbindexdump_json.t.in
+        ${GeneratedDir}/pbindexdump_json.t
+    )
+
+    configure_file(
+        ${PacBioBAM_CramTestsDir}/pbindexdump_cpp.t.in
+        ${GeneratedDir}/pbindexdump_cpp.t
+    )
+
+    add_test(
+        NAME pbindexdump_CramTests
+        WORKING_DIRECTORY ${PacBioBAM_TestsDir}/scripts
+        COMMAND "python" cram.py
+            ${GeneratedDir}/pbindexdump_json.t
+            ${GeneratedDir}/pbindexdump_cpp.t
+    )
+
+endif()
diff --git a/tools/pbindexdump/src/CppFormatter.cpp b/tools/pbindexdump/src/CppFormatter.cpp

new file mode 100644 (file)

index 0000000..696421e
--- /dev/null
+++ b/tools/pbindexdump/src/CppFormatter.cpp
@@ -0,0 +1,177 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT  SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "CppFormatter.h"
+#include <pbbam/PbiRawData.h>
+
+#include <iostream>
+#include <sstream>
+
+using namespace pbindexdump;
+using namespace std;
+
+namespace pbindexdump {
+
+static
+string printCppReferenceData(const PacBio::BAM::PbiRawReferenceData& referenceData)
+{
+    auto result = string{ "" };
+    for (const PacBio::BAM::PbiReferenceEntry& entry : referenceData.entries_) {
+        if (!result.empty())
+            result.append(",\n");
+        result.append( string{ "    PbiReferenceEntry{" }
+                       + to_string(entry.tId_) + "," + to_string(entry.beginRow_) + "," + to_string(entry.endRow_)
+                       + string{ "}" } );
+    }
+    if (!result.empty())
+        result.append("\n");
+    return result;
+}
+
+template<typename T>
+string printVectorElements(const std::vector<T>& c)
+{
+    stringstream s;
+    for (const auto& e : c)
+        s << e << ",";
+    auto result = s.str();
+    if (!result.empty())
+        result.pop_back(); // remove final comma
+    return result;
+}
+
+template<>
+string printVectorElements(const std::vector<uint8_t>& c)
+{
+    stringstream s;
+    for (const auto& e : c)
+        s << static_cast<uint16_t>(e) << ","; // cast to larger uint, force print as number not character
+    auto result = s.str();
+    if (!result.empty())
+        result.pop_back(); // remove final comma
+    return result;
+}
+
+template<>
+string printVectorElements(const std::vector<int8_t>& c)
+{
+    stringstream s;
+    for (const auto& e : c)
+        s << static_cast<int16_t>(e) << ","; // cast to larger int, force print as number not character
+    auto result = s.str();
+    if (!result.empty())
+        result.pop_back(); // remove final comma
+    return result;
+}
+
+} // namespace pbindexdump
+
+CppFormatter::CppFormatter(const Settings& settings)
+    : IFormatter(settings)
+{ }
+
+void CppFormatter::Run(void)
+{
+    using namespace PacBio::BAM;
+
+    const PbiRawData rawData{ settings_.inputPbiFilename_ };
+    const PbiRawBarcodeData& barcodeData = rawData.BarcodeData();
+    const PbiRawBasicData& basicData     = rawData.BasicData();
+    const PbiRawMappedData& mappedData   = rawData.MappedData();
+    const PbiRawReferenceData& referenceData = rawData.ReferenceData();
+
+    auto version = string{ };
+    switch (rawData.Version()) {
+        case PbiFile::Version_3_0_0 : version = "PbiFile::Version_3_0_0"; break;
+        case PbiFile::Version_3_0_1 : version = "PbiFile::Version_3_0_1"; break;
+        default:
+            throw runtime_error("unsupported PBI version encountered");
+    }
+
+    auto fileSections = string{ "PbiFile::BASIC" };
+    if (rawData.HasBarcodeData())   fileSections += string{ " | PbiFile::BARCODE" };
+    if (rawData.HasMappedData())    fileSections += string{ " | PbiFile::MAPPED" };
+    if (rawData.HasReferenceData()) fileSections += string{ " | PbiFile::REFERENCE" };
+
+    stringstream s;
+    s << "PbiRawData rawData;" << endl
+      << "rawData.Version("      << version             << ");" << endl
+      << "rawData.FileSections(" << fileSections        << ");" << endl
+      << "rawData.NumReads("     << rawData.NumReads()  << ");" << endl
+      << endl
+      << "PbiRawBasicData& basicData = rawData.BasicData();" << endl
+      << "basicData.rgId_       = {" << printVectorElements(basicData.rgId_)       << "};" << endl
+      << "basicData.qStart_     = {" << printVectorElements(basicData.qStart_)     << "};" << endl
+      << "basicData.qEnd_       = {" << printVectorElements(basicData.qEnd_)       << "};" << endl
+      << "basicData.holeNumber_ = {" << printVectorElements(basicData.holeNumber_) << "};" << endl
+      << "basicData.readQual_   = {" << printVectorElements(basicData.readQual_)   << "};" << endl
+      << "basicData.ctxtFlag_   = {" << printVectorElements(basicData.ctxtFlag_)   << "};" << endl
+      << "basicData.fileOffset_ = {" << printVectorElements(basicData.fileOffset_) << "};" << endl
+      << endl;
+
+    if (rawData.HasBarcodeData()) {
+        s << "PbiRawBarcodeData& barcodeData = rawData.BarcodeData();" << endl
+          << "barcodeData.bcForward_ = {" << printVectorElements(barcodeData.bcForward_) << "};" << endl
+          << "barcodeData.bcReverse_ = {" << printVectorElements(barcodeData.bcReverse_) << "};" << endl
+          << "barcodeData.bcQual_    = {" << printVectorElements(barcodeData.bcQual_)    << "};" << endl
+          << endl;
+    }
+
+    if (rawData.HasMappedData()) {
+        s << "PbiRawMappedData& mappedData = rawData.MappedData();" << endl
+          << "mappedData.tId_       = {" << printVectorElements(mappedData.tId_)       << "};" << endl
+          << "mappedData.tStart_    = {" << printVectorElements(mappedData.tStart_)    << "};" << endl
+          << "mappedData.tEnd_      = {" << printVectorElements(mappedData.tEnd_)      << "};" << endl
+          << "mappedData.aStart_    = {" << printVectorElements(mappedData.aStart_)    << "};" << endl
+          << "mappedData.aEnd_      = {" << printVectorElements(mappedData.aEnd_)      << "};" << endl
+          << "mappedData.revStrand_ = {" << printVectorElements(mappedData.revStrand_) << "};" << endl
+          << "mappedData.nM_        = {" << printVectorElements(mappedData.nM_)        << "};" << endl
+          << "mappedData.nMM_       = {" << printVectorElements(mappedData.nMM_)       << "};" << endl
+          << "mappedData.mapQV_     = {" << printVectorElements(mappedData.mapQV_)     << "};" << endl
+          << endl;
+    }
+
+    if (rawData.HasReferenceData()) {
+        s << "PbiRawReferenceData& referenceData = rawData.ReferenceData();" << endl
+          << "referenceData.entries_ = { " << endl
+          << printCppReferenceData(referenceData)
+          << "};" << endl
+          << endl;
+    }
+
+    cout << s.str() << endl;
+}
diff --git a/tools/pbindexdump/src/CppFormatter.h b/tools/pbindexdump/src/CppFormatter.h

new file mode 100644 (file)

index 0000000..c2cda26
--- /dev/null
+++ b/tools/pbindexdump/src/CppFormatter.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT  SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef CPPFORMATTER_H
+#define CPPFORMATTER_H
+
+#include "IFormatter.h"
+
+namespace pbindexdump {
+
+class CppFormatter : public IFormatter
+{
+public:
+    CppFormatter(const Settings& settings);
+    void Run(void);
+};
+
+} // namespace pbindexdump
+
+#endif // CPPFORMATTER_H
diff --git a/tools/pbindexdump/src/IFormatter.h b/tools/pbindexdump/src/IFormatter.h

new file mode 100644 (file)

index 0000000..eb7e79b
--- /dev/null
+++ b/tools/pbindexdump/src/IFormatter.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT  SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef IFORMATTER_H
+#define IFORMATTER_H
+
+#include "Settings.h"
+
+namespace pbindexdump {
+
+class IFormatter
+{
+public:
+    ~IFormatter(void) { }
+
+public:
+    virtual void Run(void) =0;
+
+protected:
+    const Settings& settings_;
+
+protected:
+    IFormatter(const Settings& settings)
+        : settings_(settings)
+    { }
+};
+
+} // namespace pbindexdump
+
+#endif // IFORMATTER_H
diff --git a/tools/pbindexdump/src/JsonFormatter.cpp b/tools/pbindexdump/src/JsonFormatter.cpp

new file mode 100644 (file)

index 0000000..368f659
--- /dev/null
+++ b/tools/pbindexdump/src/JsonFormatter.cpp
@@ -0,0 +1,195 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT  SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "JsonFormatter.h"
+#include "json.hpp"
+#include <pbbam/PbiFile.h>
+#include <iostream>
+#include <sstream>
+using namespace pbindexdump;
+using namespace PacBio::BAM;
+using namespace std;
+
+namespace pbindexdump {
+
+
+} // namespace pbindexdump
+
+JsonFormatter::JsonFormatter(const Settings& settings)
+    : IFormatter(settings)
+    , index_(settings.inputPbiFilename_)
+{ }
+
+void JsonFormatter::FormatMetadata(void)
+{
+    auto version = string{ };
+    switch (index_.Version()) {
+        case PbiFile::Version_3_0_0 : version = "3.0.0"; break;
+        case PbiFile::Version_3_0_1 : version = "3.0.1"; break;
+        default:
+            throw runtime_error("unsupported PBI version encountered");
+    }
+
+    nlohmann::json fileSections;
+    fileSections.push_back("BasicData");
+    if (index_.HasBarcodeData())   fileSections.push_back("BarcodeData");
+    if (index_.HasMappedData())    fileSections.push_back("MappedData");
+    if (index_.HasReferenceData()) fileSections.push_back("ReferenceData");
+
+    json_["version"]      = version;
+    json_["fileSections"] = fileSections;
+    json_["numReads"]     = index_.NumReads();
+}
+
+void JsonFormatter::FormatRaw(void)
+{
+    const PbiRawBasicData& basicData = index_.BasicData();
+    json_["basicData"]["rgId"]       = basicData.rgId_;
+    json_["basicData"]["qStart"]     = basicData.qStart_;
+    json_["basicData"]["qEnd"]       = basicData.qEnd_;
+    json_["basicData"]["holeNumber"] = basicData.holeNumber_;
+    json_["basicData"]["readQual"]   = basicData.readQual_;
+    json_["basicData"]["ctxtFlag"]   = basicData.ctxtFlag_;
+    json_["basicData"]["fileOffset"] = basicData.fileOffset_;
+
+    if (index_.HasBarcodeData()) {
+        const PbiRawBarcodeData& barcodeData = index_.BarcodeData();
+        json_["barcodeData"]["bcForward"] = barcodeData.bcForward_;
+        json_["barcodeData"]["bcReverse"] = barcodeData.bcReverse_;
+        json_["barcodeData"]["bcQuality"] = barcodeData.bcQual_;
+    }
+
+    if (index_.HasMappedData()) {
+        const PbiRawMappedData& mappedData = index_.MappedData();
+
+        // casts to force -1 if unmapped
+        json_["mappedData"]["tId"]    = mappedData.tId_;
+        json_["mappedData"]["tStart"] = mappedData.tStart_;
+        json_["mappedData"]["tEnd"]   = mappedData.tEnd_;
+
+        json_["mappedData"]["aStart"]    = mappedData.aStart_;
+        json_["mappedData"]["aEnd"]      = mappedData.aEnd_;
+        json_["mappedData"]["revStrand"] = mappedData.revStrand_;
+        json_["mappedData"]["nM"]        = mappedData.nM_;
+        json_["mappedData"]["nMM"]       = mappedData.nMM_;
+        json_["mappedData"]["mapQV"]     = mappedData.mapQV_;
+    }
+}
+
+void JsonFormatter::FormatRecords(void)
+{
+    nlohmann::json reads;
+    const uint32_t numReads = index_.NumReads();
+    const bool hasBarcodeData = index_.HasBarcodeData();
+    const bool hasMappedData  = index_.HasMappedData();
+    for (uint32_t i = 0; i < numReads; ++i) {
+
+        nlohmann::json read;
+
+        // common data
+        const PbiRawBasicData& basicData = index_.BasicData();
+        read["rgId"]        = basicData.rgId_[i];
+        read["qStart"]      = basicData.qStart_[i];
+        read["qEnd"]        = basicData.qEnd_[i];
+        read["holeNumber"]  = basicData.holeNumber_[i];
+        read["readQuality"] = basicData.readQual_[i];
+        read["contextFlag"] = basicData.ctxtFlag_[i];
+        read["fileOffset"]  = basicData.fileOffset_[i];
+
+        // barcode data, if present
+        if (hasBarcodeData) {
+            const PbiRawBarcodeData& barcodeData = index_.BarcodeData();
+            read["bcForward"] = barcodeData.bcForward_[i];
+            read["bcReverse"] = barcodeData.bcReverse_[i];
+            read["bcQuality"] = barcodeData.bcQual_[i];
+        }
+
+        // mapping data, if present
+        if (hasMappedData) {
+            const PbiRawMappedData& mappedData = index_.MappedData();
+
+            // casts to force -1 if unmapped
+            read["tId"]    = static_cast<int32_t>(mappedData.tId_[i]);
+            read["tStart"] = static_cast<int32_t>(mappedData.tStart_[i]);
+            read["tEnd"]   = static_cast<int32_t>(mappedData.tEnd_[i]);
+
+            read["aStart"] = mappedData.aStart_[i];
+            read["aEnd"]   = mappedData.aEnd_[i];
+            read["nM"]     = mappedData.nM_[i];
+            read["nMM"]    = mappedData.nMM_[i];
+            read["mapQuality"]    = mappedData.mapQV_[i];
+            read["reverseStrand"] = mappedData.revStrand_[i];
+        }
+
+        reads.push_back(std::move(read));
+    }
+    json_["reads"] = reads;
+}
+
+void JsonFormatter::FormatReferences(void)
+{
+    if (index_.HasReferenceData()) {
+        const PbiRawReferenceData& referenceData = index_.ReferenceData();
+        nlohmann::json references;
+        for (const PbiReferenceEntry& entry : referenceData.entries_) {
+            nlohmann::json element;
+            element["tId"]      = static_cast<int32_t>(entry.tId_);
+            element["beginRow"] = static_cast<int32_t>(entry.beginRow_);
+            element["endRow"]   = static_cast<int32_t>(entry.endRow_);
+            references.push_back(std::move(element));
+        }
+        json_["references"] = references;
+    }
+}
+
+void JsonFormatter::Print(void)
+{
+    cout << json_.dump(settings_.jsonIndentLevel_) << endl;
+}
+
+void JsonFormatter::Run(void)
+{
+    FormatMetadata();
+    FormatReferences();
+
+    if (settings_.jsonRaw_)
+        FormatRaw();
+    else
+        FormatRecords();
+
+    Print();
+}
diff --git a/tools/pbindexdump/src/JsonFormatter.h b/tools/pbindexdump/src/JsonFormatter.h

new file mode 100644 (file)

index 0000000..9bd6911
--- /dev/null
+++ b/tools/pbindexdump/src/JsonFormatter.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT  SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef JSONFORMATTER_H
+#define JSONFORMATTER_H
+
+#include "IFormatter.h"
+#include "json.hpp"
+#include <pbbam/PbiRawData.h>
+
+namespace pbindexdump {
+
+class JsonFormatter : public IFormatter
+{
+public:
+    JsonFormatter(const Settings& settings);
+    void Run(void);
+
+private:
+    void FormatMetadata(void);
+    void FormatReferences(void);
+
+    void FormatRaw(void);
+    void FormatRecords(void);
+
+    void Print(void);
+
+private:
+    PacBio::BAM::PbiRawData index_;
+    nlohmann::json json_;
+};
+
+} // namespace pbindexdump
+
+#endif // JSONFORMATTER_H
diff --git a/tools/pbindexdump/src/PbIndexDump.cpp b/tools/pbindexdump/src/PbIndexDump.cpp

new file mode 100644 (file)

index 0000000..2dc925b
--- /dev/null
+++ b/tools/pbindexdump/src/PbIndexDump.cpp
@@ -0,0 +1,57 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT  SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "PbIndexDump.h"
+#include "CppFormatter.h"
+#include "JsonFormatter.h"
+#include <cassert>
+using namespace pbindexdump;
+using namespace std;
+
+void PbIndexDump::Run(const Settings& settings)
+{
+    std::unique_ptr<IFormatter> formatter(nullptr);
+    if      (settings.format_ == "json") formatter.reset(new JsonFormatter(settings));
+    else if (settings.format_ == "cpp")  formatter.reset(new CppFormatter(settings));
+    else {
+        string msg = { "unsupported output format requested: " };
+        msg += settings.format_;
+        throw runtime_error(msg);
+    }
+    assert(formatter);
+    formatter->Run();
+}
diff --git a/tools/pbindexdump/src/PbIndexDump.h b/tools/pbindexdump/src/PbIndexDump.h

new file mode 100644 (file)

index 0000000..e5ec2dc
--- /dev/null
+++ b/tools/pbindexdump/src/PbIndexDump.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT  SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef PBINDEXDUMP_H
+#define PBINDEXDUMP_H
+
+namespace pbindexdump {
+
+class Settings;
+
+class PbIndexDump
+{
+public:
+    static void Run(const Settings& settings);
+};
+
+} // namespace pbindex
+
+#endif // PBINDEXDUMP_H
diff --git a/tools/pbindexdump/src/PbIndexDumpVersion.h.in b/tools/pbindexdump/src/PbIndexDumpVersion.h.in

new file mode 100644 (file)

index 0000000..ec49612
--- /dev/null
+++ b/tools/pbindexdump/src/PbIndexDumpVersion.h.in
@@ -0,0 +1,49 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef PBINDEXDUMPVERSION_H
+#define PBINDEXDUMPVERSION_H
+
+#include <string>
+
+namespace pbindexdump {
+
+const std::string Version = std::string("@PbIndexDump_VERSION@");
+
+} // namespace pbindexdump
+
+#endif // PBINDEXDUMPVERSION_H
diff --git a/tools/pbindexdump/src/Settings.h b/tools/pbindexdump/src/Settings.h

new file mode 100644 (file)

index 0000000..a520293
--- /dev/null
+++ b/tools/pbindexdump/src/Settings.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT  SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef SETTINGS_H
+#define SETTINGS_H
+
+#include <string>
+#include <vector>
+
+namespace pbindexdump {
+
+class Settings
+{
+public:
+    Settings(void)
+        : format_("json")
+        , jsonIndentLevel_(4)
+        , jsonRaw_(false)
+    { }
+
+public:
+    std::string inputPbiFilename_;
+    std::string format_;
+    int jsonIndentLevel_;
+    bool jsonRaw_;
+    std::vector<std::string> errors_;
+};
+
+} // namespace pbindexdump
+
+#endif // SETTINGS_H
diff --git a/tools/pbindexdump/src/json.hpp b/tools/pbindexdump/src/json.hpp

new file mode 100644 (file)

index 0000000..7e174d7
--- /dev/null
+++ b/tools/pbindexdump/src/json.hpp
@@ -0,0 +1,7295 @@
+/*!
+@mainpage
+
+These pages contain the API documentation of JSON for Modern C++, a C++11
+header-only JSON class.
+
+Class @ref nlohmann::basic_json is a good entry point for the documentation.
+
+@copyright The code is licensed under the [MIT
+           License](http://opensource.org/licenses/MIT):
+           <br>
+           Copyright &copy; 2013-2015 Niels Lohmann.
+           <br>
+           Permission is hereby granted, free of charge, to any person
+           obtaining a copy of this software and associated documentation files
+           (the "Software"), to deal in the Software without restriction,
+           including without limitation the rights to use, copy, modify, merge,
+           publish, distribute, sublicense, and/or sell copies of the Software,
+           and to permit persons to whom the Software is furnished to do so,
+           subject to the following conditions:
+           <br>
+           The above copyright notice and this permission notice shall be
+           included in all copies or substantial portions of the Software.
+           <br>
+           THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+           EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+           MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+           NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+           BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+           ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+           CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+           SOFTWARE.
+
+@author [Niels Lohmann](http://nlohmann.me)
+@see https://github.com/nlohmann/json to download the source code
+*/
+
+#ifndef NLOHMANN_JSON_HPP
+#define NLOHMANN_JSON_HPP
+
+#include <algorithm>
+#include <array>
+#include <ciso646>
+#include <cmath>
+#include <cstdio>
+#include <functional>
+#include <initializer_list>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <map>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+// enable ssize_t on MinGW
+#ifdef __GNUC__
+    #ifdef __MINGW32__
+        #include <sys/types.h>
+    #endif
+#endif
+
+// enable ssize_t for MSVC
+#ifdef _MSC_VER
+    #include <basetsd.h>
+    using ssize_t = SSIZE_T;
+#endif
+
+/*!
+@brief namespace for Niels Lohmann
+@see https://github.com/nlohmann
+*/
+namespace nlohmann
+{
+
+
+/*!
+@brief unnamed namespace with internal helper functions
+*/
+namespace
+{
+/*!
+@brief Helper to determine whether there's a key_type for T.
+@sa http://stackoverflow.com/a/7728728/266378
+*/
+template<typename T>
+struct has_mapped_type
+{
+  private:
+    template<typename C> static char test(typename C::mapped_type*);
+    template<typename C> static int  test(...);
+  public:
+    enum { value = sizeof(test<T>(0)) == sizeof(char) };
+};
+
+/// "equality" comparison for floating point numbers
+template<typename T>
+static bool approx(const T a, const T b)
+{
+    return not (a > b or a < b);
+}
+}
+
+/*!
+@brief a class to store JSON values
+
+@tparam ObjectType type for JSON objects (@c std::map by default; will be used
+in @ref object_t)
+@tparam ArrayType type for JSON arrays (@c std::vector by default; will be used
+in @ref array_t)
+@tparam StringType type for JSON strings and object keys (@c std::string by
+default; will be used in @ref string_t)
+@tparam BooleanType type for JSON booleans (@c `bool` by default; will be used
+in @ref boolean_t)
+@tparam NumberIntegerType type for JSON integer numbers (@c `int64_t` by
+default; will be used in @ref number_integer_t)
+@tparam NumberFloatType type for JSON floating-point numbers (@c `double` by
+default; will be used in @ref number_float_t)
+@tparam AllocatorType type of the allocator to use (@c `std::allocator` by
+default)
+
+@requirement The class satisfies the following concept requirements:
+- Basic
+ - [DefaultConstructible](http://en.cppreference.com/w/cpp/concept/DefaultConstructible):
+   JSON values can be default constructed. The result will be a JSON null value.
+ - [MoveConstructible](http://en.cppreference.com/w/cpp/concept/MoveConstructible):
+   A JSON value can be constructed from an rvalue argument.
+ - [CopyConstructible](http://en.cppreference.com/w/cpp/concept/CopyConstructible):
+   A JSON value can be copy-constrcuted from an lvalue expression.
+ - [MoveAssignable](http://en.cppreference.com/w/cpp/concept/MoveAssignable):
+   A JSON value van be assigned from an rvalue argument.
+ - [CopyAssignable](http://en.cppreference.com/w/cpp/concept/CopyAssignable):
+   A JSON value can be copy-assigned from an lvalue expression.
+ - [Destructible](http://en.cppreference.com/w/cpp/concept/Destructible):
+   JSON values can be destructed.
+- Layout
+ - [StandardLayoutType](http://en.cppreference.com/w/cpp/concept/StandardLayoutType):
+   JSON values have
+   [standard layout](http://en.cppreference.com/w/cpp/language/data_members#Standard_layout):
+   All non-static data members are private and standard layout types, the class
+   has no virtual functions or (virtual) base classes.
+- Library-wide
+ - [EqualityComparable](http://en.cppreference.com/w/cpp/concept/EqualityComparable):
+   JSON values can be compared with `==`, see @ref
+   operator==(const_reference,const_reference).
+ - [LessThanComparable](http://en.cppreference.com/w/cpp/concept/LessThanComparable):
+   JSON values can be compared with `<`, see @ref
+   operator<(const_reference,const_reference).
+ - [Swappable](http://en.cppreference.com/w/cpp/concept/Swappable):
+   Any JSON lvalue or rvalue of can be swapped with any lvalue or rvalue of
+   other compatible types, using unqualified function call @ref swap().
+ - [NullablePointer](http://en.cppreference.com/w/cpp/concept/NullablePointer):
+   JSON values can be compared against `std::nullptr_t` objects which are used
+   to model the `null` value.
+- Container
+ - [Container](http://en.cppreference.com/w/cpp/concept/Container):
+   JSON values can be used like STL containers and provide iterator access.
+ - [ReversibleContainer](http://en.cppreference.com/w/cpp/concept/ReversibleContainer);
+   JSON values can be used like STL containers and provide reverse iterator
+   access.
+
+@internal
+@note ObjectType trick from http://stackoverflow.com/a/9860911
+@endinternal
+
+@see RFC 7159 <http://rfc7159.net/rfc7159>
+*/
+template <
+    template<typename U, typename V, typename... Args> class ObjectType = std::map,
+    template<typename U, typename... Args> class ArrayType = std::vector,
+    class StringType = std::string,
+    class BooleanType = bool,
+    class NumberIntegerType = int64_t,
+    class NumberFloatType = double,
+    template<typename U> class AllocatorType = std::allocator
+    >
+class basic_json
+{
+  private:
+    /// workaround type for MSVC
+    using basic_json_t = basic_json<ObjectType,
+          ArrayType,
+          StringType,
+          BooleanType,
+          NumberIntegerType,
+          NumberFloatType,
+          AllocatorType>;
+
+  public:
+
+    /////////////////////
+    // container types //
+    /////////////////////
+
+    /// @name container types
+    /// @{
+
+    /// the type of elements in a basic_json container
+    using value_type = basic_json;
+
+    /// the type of an element reference
+    using reference = value_type&;
+
+    /// the type of an element const reference
+    using const_reference = const value_type&;
+
+    /// a type to represent differences between iterators
+    using difference_type = std::ptrdiff_t;
+
+    /// a type to represent container sizes
+    using size_type = std::size_t;
+
+    /// the allocator type
+    using allocator_type = AllocatorType<basic_json>;
+
+    /// the type of an element pointer
+    using pointer = typename std::allocator_traits<allocator_type>::pointer;
+    /// the type of an element const pointer
+    using const_pointer = typename std::allocator_traits<allocator_type>::const_pointer;
+
+    // forward declaration
+    template<typename Base> class json_reverse_iterator;
+
+    /// an iterator for a basic_json container
+    class iterator;
+    /// a const iterator for a basic_json container
+    class const_iterator;
+    /// a reverse iterator for a basic_json container
+    using reverse_iterator = json_reverse_iterator<typename basic_json::iterator>;
+    /// a const reverse iterator for a basic_json container
+    using const_reverse_iterator = json_reverse_iterator<typename basic_json::const_iterator>;
+
+    /// @}
+
+
+    /*!
+    @brief returns the allocator associated with the container
+    */
+    static allocator_type get_allocator()
+    {
+        return allocator_type();
+    }
+
+
+    ///////////////////////////
+    // JSON value data types //
+    ///////////////////////////
+
+    /// @name JSON value data types
+    /// @{
+
+    /*!
+    @brief a type for an object
+
+    [RFC 7159](http://rfc7159.net/rfc7159) describes JSON objects as follows:
+    > An object is an unordered collection of zero or more name/value pairs,
+    > where a name is a string and a value is a string, number, boolean, null,
+    > object, or array.
+
+    To store objects in C++, a type is defined by the template parameters @a
+    ObjectType which chooses the container (e.g., `std::map` or
+    `std::unordered_map`), @a StringType which chooses the type of the keys or
+    names, and @a AllocatorType which chooses the allocator to use.
+
+    #### Default type
+
+    With the default values for @a ObjectType (`std::map`), @a StringType
+    (`std::string`), and @a AllocatorType (`std::allocator`), the default value
+    for @a object_t is:
+
+    @code {.cpp}
+    std::map<
+      std::string, // key_type
+      basic_json, // value_type
+      std::less<std::string>, // key_compare
+      std::allocator<std::pair<const std::string, basic_json>> // allocator_type
+    >
+    @endcode
+
+    #### Behavior
+
+    The choice of @a object_t influences the behavior of the JSON class. With
+    the default type, objects have the following behavior:
+
+    - When all names are unique, objects will be interoperable in the sense
+      that all software implementations receiving that object will agree on the
+      name-value mappings.
+    - When the names within an object are not unique, later stored name/value
+      pairs overwrite previously stored name/value pairs, leaving the used
+      names unique. For instance, `{"key": 1}` and `{"key": 2, "key": 1}` will
+      be treated as equal and both stored as `{"key": 1}`.
+    - Internally, name/value pairs are stored in lexicographical order of the
+      names. Objects will also be serialized (see @ref dump) in this order. For
+      instance, `{"b": 1, "a": 2}` and `{"a": 2, "b": 1}` will be stored and
+      serialized as `{"a": 2, "b": 1}`.
+    - When comparing objects, the order of the name/value pairs is irrelevant.
+      This makes objects interoperable in the sense that they will not be
+      affected by these differences. For instance, `{"b": 1, "a": 2}` and
+      `{"a": 2, "b": 1}` will be treated as equal.
+
+    #### Limits
+
+    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
+    > An implementation may set limits on the maximum depth of nesting.
+
+    In this class, the object's limit of nesting is not constraint explicitly.
+    However, a maximum depth of nesting may be introduced by the compiler or
+    runtime environment. A theoretical limit can be queried by calling the @ref
+    max_size function of a JSON object.
+
+    #### Storage
+
+    Objects are stored as pointers in a `basic_json` type. That is, for any
+    access to object values, a pointer of type `object_t*` must be dereferenced.
+
+    @sa array_t
+    */
+    using object_t = ObjectType<StringType,
+          basic_json,
+          std::less<StringType>,
+          AllocatorType<std::pair<const StringType,
+          basic_json>>>;
+
+    /*!
+    @brief a type for an array
+
+    [RFC 7159](http://rfc7159.net/rfc7159) describes JSON arrays as follows:
+    > An array is an ordered sequence of zero or more values.
+
+    To store objects in C++, a type is defined by the template parameters @a
+    ArrayType which chooses the container (e.g., `std::vector` or `std::list`)
+    and @a AllocatorType which chooses the allocator to use.
+
+    #### Default type
+
+    With the default values for @a ArrayType (`std::vector`) and @a
+    AllocatorType (`std::allocator`), the default value for @a array_t is:
+
+    @code {.cpp}
+    std::vector<
+      basic_json, // value_type
+      std::allocator<basic_json> // allocator_type
+    >
+    @endcode
+
+    #### Limits
+
+    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
+    > An implementation may set limits on the maximum depth of nesting.
+
+    In this class, the array's limit of nesting is not constraint explicitly.
+    However, a maximum depth of nesting may be introduced by the compiler or
+    runtime environment. A theoretical limit can be queried by calling the @ref
+    max_size function of a JSON array.
+
+    #### Storage
+
+    Arrays are stored as pointers in a `basic_json` type. That is, for any
+    access to array values, a pointer of type `array_t*` must be dereferenced.
+    */
+    using array_t = ArrayType<basic_json, AllocatorType<basic_json>>;
+
+    /*!
+    @brief a type for a string
+
+    [RFC 7159](http://rfc7159.net/rfc7159) describes JSON strings as follows:
+    > A string is a sequence of zero or more Unicode characters.
+
+    To store objects in C++, a type is defined by the template parameters @a
+    StringType which chooses the container (e.g., `std::string`) to use.
+
+    Unicode values are split by the JSON class into byte-sized characters
+    during deserialization.
+
+    #### Default type
+
+    With the default values for @a StringType (`std::string`), the default
+    value for @a string_t is:
+
+    @code {.cpp}
+    std::string
+    @endcode
+
+    #### String comparison
+
+    [RFC 7159](http://rfc7159.net/rfc7159) states:
+    > Software implementations are typically required to test names of object
+    > members for equality. Implementations that transform the textual
+    > representation into sequences of Unicode code units and then perform the
+    > comparison numerically, code unit by code unit, are interoperable in the
+    > sense that implementations will agree in all cases on equality or
+    > inequality of two strings. For example, implementations that compare
+    > strings with escaped characters unconverted may incorrectly find that
+    > `"a\\b"` and `"a\u005Cb"` are not equal.
+
+    This implementation is interoperable as it does compare strings code unit
+    by code unit.
+
+    #### Storage
+
+    String values are stored as pointers in a `basic_json` type. That is, for
+    any access to string values, a pointer of type `string_t*` must be
+    dereferenced.
+    */
+    using string_t = StringType;
+
+    /*!
+    @brief a type for a boolean
+
+    [RFC 7159](http://rfc7159.net/rfc7159) implicitly describes a boolean as a
+    type which differentiates the two literals `true` and `false`.
+
+    To store objects in C++, a type is defined by the template parameter @a
+    BooleanType which chooses the type to use.
+
+    #### Default type
+
+    With the default values for @a BooleanType (`bool`), the default value for
+    @a boolean_t is:
+
+    @code {.cpp}
+    bool
+    @endcode
+
+    #### Storage
+
+    Boolean values are stored directly inside a `basic_json` type.
+    */
+    using boolean_t = BooleanType;
+
+    /*!
+    @brief a type for a number (integer)
+
+    [RFC 7159](http://rfc7159.net/rfc7159) describes numbers as follows:
+    > The representation of numbers is similar to that used in most programming
+    > languages. A number is represented in base 10 using decimal digits. It
+    > contains an integer component that may be prefixed with an optional minus
+    > sign, which may be followed by a fraction part and/or an exponent part.
+    > Leading zeros are not allowed. (...) Numeric values that cannot be
+    > represented in the grammar below (such as Infinity and NaN) are not
+    > permitted.
+
+    This description includes both integer and floating-point numbers. However,
+    C++ allows more precise storage if it is known whether the number is an
+    integer or a floating-point number. Therefore, two different types, @ref
+    number_integer_t and @ref number_float_t are used.
+
+    To store integer numbers in C++, a type is defined by the template
+    parameter @a NumberIntegerType which chooses the type to use.
+
+    #### Default type
+
+    With the default values for @a NumberIntegerType (`int64_t`), the default
+    value for @a number_integer_t is:
+
+    @code {.cpp}
+    int64_t
+    @endcode
+
+    #### Default behavior
+
+    - The restrictions about leading zeros is not enforced in C++. Instead,
+      leading zeros in integer literals lead to an interpretation as octal
+      number. Internally, the value will be stored as decimal number. For
+      instance, the C++ integer literal `010` will be serialized to `8`. During
+      deserialization, leading zeros yield an error.
+    - Not-a-number (NaN) values will be serialized to `null`.
+
+    #### Limits
+
+    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
+    > An implementation may set limits on the range and precision of numbers.
+
+    When the default type is used, the maximal integer number that can be
+    stored is `9223372036854775807` (INT64_MAX) and the minimal integer number
+    that can be stored is `-9223372036854775808` (INT64_MIN). Integer numbers
+    that are out of range will yield over/underflow when used in a constructor.
+    During deserialization, too large or small integer numbers will be
+    automatically be stored as @ref number_float_t.
+
+    [RFC 7159](http://rfc7159.net/rfc7159) further states:
+    > Note that when such software is used, numbers that are integers and are
+    > in the range \f$[-2^{53}+1, 2^{53}-1]\f$ are interoperable in the sense
+    > that implementations will agree exactly on their numeric values.
+
+    As this range is a subrange of the exactly supported range [INT64_MIN,
+    INT64_MAX], this class's integer type is interoperable.
+
+    #### Storage
+
+    Integer number values are stored directly inside a `basic_json` type.
+    */
+    using number_integer_t = NumberIntegerType;
+
+    /*!
+    @brief a type for a number (floating-point)
+
+    [RFC 7159](http://rfc7159.net/rfc7159) describes numbers as follows:
+    > The representation of numbers is similar to that used in most programming
+    > languages. A number is represented in base 10 using decimal digits. It
+    > contains an integer component that may be prefixed with an optional minus
+    > sign, which may be followed by a fraction part and/or an exponent part.
+    > Leading zeros are not allowed. (...) Numeric values that cannot be
+    > represented in the grammar below (such as Infinity and NaN) are not
+    > permitted.
+
+    This description includes both integer and floating-point numbers. However,
+    C++ allows more precise storage if it is known whether the number is an
+    integer or a floating-point number. Therefore, two different types, @ref
+    number_integer_t and @ref number_float_t are used.
+
+    To store floating-point numbers in C++, a type is defined by the template
+    parameter @a NumberFloatType which chooses the type to use.
+
+    #### Default type
+
+    With the default values for @a NumberFloatType (`double`), the default
+    value for @a number_float_t is:
+
+    @code {.cpp}
+    double
+    @endcode
+
+    #### Default behavior
+
+    - The restrictions about leading zeros is not enforced in C++. Instead,
+      leading zeros in floating-point literals will be ignored. Internally, the
+      value will be stored as decimal number. For instance, the C++
+      floating-point literal `01.2` will be serialized to `1.2`. During
+      deserialization, leading zeros yield an error.
+    - Not-a-number (NaN) values will be serialized to `null`.
+
+    #### Limits
+
+    [RFC 7159](http://rfc7159.net/rfc7159) states:
+    > This specification allows implementations to set limits on the range and
+    > precision of numbers accepted. Since software that implements IEEE
+    > 754-2008 binary64 (double precision) numbers is generally available and
+    > widely used, good interoperability can be achieved by implementations that
+    > expect no more precision or range than these provide, in the sense that
+    > implementations will approximate JSON numbers within the expected
+    > precision.
+
+    This implementation does exactly follow this approach, as it uses double
+    precision floating-point numbers. Note values smaller than
+    `-1.79769313486232e+308` and values greather than `1.79769313486232e+308`
+    will be stored as NaN internally and be serialized to `null`.
+
+    #### Storage
+
+    Floating-point number values are stored directly inside a `basic_json` type.
+    */
+    using number_float_t = NumberFloatType;
+
+    /// @}
+
+
+    ///////////////////////////
+    // JSON type enumeration //
+    ///////////////////////////
+
+    /*!
+    @brief the JSON type enumeration
+
+    This enumeration collects the different JSON types. It is internally used
+    to distinguish the stored values, and the functions is_null, is_object,
+    is_array, is_string, is_boolean, is_number, and is_discarded rely on it.
+    */
+    enum class value_t : uint8_t
+    {
+        null,           ///< null value
+        object,         ///< object (unordered set of name/value pairs)
+        array,          ///< array (ordered collection of values)
+        string,         ///< string value
+        boolean,        ///< boolean value
+        number_integer, ///< number value (integer)
+        number_float,   ///< number value (floating-point)
+        discarded       ///< discarded by the the parser callback function
+    };
+
+
+  private:
+    /// helper for exception-safe object creation
+    template<typename T, typename... Args>
+    static T* create( Args&& ... args )
+    {
+        AllocatorType<T> alloc;
+        auto deleter = [&](T * object)
+        {
+            alloc.deallocate(object, 1);
+        };
+        std::unique_ptr<T, decltype(deleter)> object(alloc.allocate(1), deleter);
+        alloc.construct(object.get(), std::forward<Args>(args)...);
+        return object.release();
+    }
+
+    ////////////////////////
+    // JSON value storage //
+    ////////////////////////
+
+    /// a JSON value
+    union json_value
+    {
+        /// object (stored with pointer to save storage)
+        object_t* object;
+        /// array (stored with pointer to save storage)
+        array_t* array;
+        /// string (stored with pointer to save storage)
+        string_t* string;
+        /// boolean
+        boolean_t boolean;
+        /// number (integer)
+        number_integer_t number_integer;
+        /// number (floating-point)
+        number_float_t number_float;
+
+        /// default constructor (for null values)
+        json_value() noexcept = default;
+        /// constructor for booleans
+        json_value(boolean_t v) noexcept : boolean(v) {}
+        /// constructor for numbers (integer)
+        json_value(number_integer_t v) noexcept : number_integer(v) {}
+        /// constructor for numbers (floating-point)
+        json_value(number_float_t v) noexcept : number_float(v) {}
+        /// constructor for empty values of a given type
+        json_value(value_t t)
+        {
+            switch (t)
+            {
+                case (value_t::null):
+                case (value_t::discarded):
+                {
+                    break;
+                }
+
+                case (value_t::object):
+                {
+                    object = create<object_t>();
+                    break;
+                }
+
+                case (value_t::array):
+                {
+                    array = create<array_t>();
+                    break;
+                }
+
+                case (value_t::string):
+                {
+                    string = create<string_t>("");
+                    break;
+                }
+
+                case (value_t::boolean):
+                {
+                    boolean = boolean_t(false);
+                    break;
+                }
+
+                case (value_t::number_integer):
+                {
+                    number_integer = number_integer_t(0);
+                    break;
+                }
+
+                case (value_t::number_float):
+                {
+                    number_float = number_float_t(0.0);
+                    break;
+                }
+            }
+        }
+
+        /// constructor for strings
+        json_value(const string_t& value)
+        {
+            string = create<string_t>(value);
+        }
+
+        /// constructor for objects
+        json_value(const object_t& value)
+        {
+            object = create<object_t>(value);
+        }
+
+        /// constructor for arrays
+        json_value(const array_t& value)
+        {
+            array = create<array_t>(value);
+        }
+    };
+
+
+  public:
+    //////////////////////////
+    // JSON parser callback //
+    //////////////////////////
+
+    /*!
+    @brief JSON callback events
+
+    This enumeration lists the parser events that can trigger calling a
+    callback function of type @ref parser_callback_t during parsing.
+    */
+    enum class parse_event_t : uint8_t
+    {
+        /// the parser read `{` and started to process a JSON object
+        object_start,
+        /// the parser read `}` and finished processing a JSON object
+        object_end,
+        /// the parser read `[` and started to process a JSON array
+        array_start,
+        /// the parser read `]` and finished processing a JSON array
+        array_end,
+        /// the parser read a key of a value in an object
+        key,
+        /// the parser finished reading a JSON value
+        value
+    };
+
+    /*!
+    @brief per-element parser callback type
+
+    With a parser callback function, the result of parsing a JSON text can be
+    influenced. When passed to @ref parse(std::istream&, parser_callback_t) or
+    @ref parse(const string_t&, parser_callback_t), it is called on certain
+    events (passed as @ref parse_event_t via parameter @a event) with a set
+    recursion depth @a depth and context JSON value @a parsed. The return value
+    of the callback function is a boolean indicating whether the element that
+    emitted the callback shall be kept or not.
+
+    We distinguish six scenarios (determined by the event type) in which the
+    callback function can be called. The following table describes the values
+    of the parameters @a depth, @a event, and @a parsed.
+
+    parameter @a event | description | parameter @a depth | parameter @a parsed
+    ------------------ | ----------- | ------------------ | -------------------
+    parse_event_t::object_start | the parser read `{` and started to process a JSON object | depth of the parent of the JSON object | a JSON value with type discarded
+    parse_event_t::key | the parser read a key of a value in an object | depth of the currently parsed JSON object | a JSON string containing the key
+    parse_event_t::object_end | the parser read `}` and finished processing a JSON object | depth of the parent of the JSON object | the parsed JSON object
+    parse_event_t::array_start | the parser read `[` and started to process a JSON array | depth of the parent of the JSON array | a JSON value with type discarded
+    parse_event_t::array_end | the parser read `]` and finished processing a JSON array | depth of the parent of the JSON array | the parsed JSON array
+    parse_event_t::value | the parser finished reading a JSON value | depth of the value | the parsed JSON value
+
+    Discarding a value (i.e., returning `false`) has different effects depending on the
+    context in which function was called:
+
+    - Discarded values in structured types are skipped. That is, the parser
+      will behave as if the discarded value was never read.
+    - In case a value outside a structured type is skipped, it is replaced with
+      `null`. This case happens if the top-level element is skipped.
+
+    @param[in] depth   the depth of the recursion during parsing
+
+    @param[in] event   an event of type parse_event_t indicating the context in
+    the callback function has been called
+
+    @param[in,out] parsed  the current intermediate parse result; note that
+    writing to this value has no effect for parse_event_t::key events
+
+    @return Whether the JSON value which called the function during parsing
+    should be kept (`true`) or not (`false`). In the latter case, it is either
+    skipped completely or replaced by an empty discarded object.
+
+    @sa @ref parse(std::istream&, parser_callback_t) or
+    @ref parse(const string_t&, parser_callback_t) for examples
+    */
+    using parser_callback_t = std::function<bool(
+                                  int depth, parse_event_t event, basic_json& parsed)>;
+
+
+    //////////////////
+    // constructors //
+    //////////////////
+
+    /*!
+    @brief create an empty value with a given type
+
+    Create an empty JSON value with a given type. The value will be default
+    initialized with an empty value which depends on the type:
+
+    Value type  | initial value
+    ----------- | -------------
+    null        | `null`
+    boolean     | `false`
+    string      | `""`
+    number      | `0`
+    object      | `{}`
+    array       | `[]`
+
+    @param[in] value  the type of the value to create
+
+    @complexity Constant.
+
+    @throw std::bad_alloc if allocation for object, array, or string value
+    fails
+
+    @liveexample{The following code shows the constructor for different @ref
+    value_t values,basic_json__value_t}
+    */
+    basic_json(const value_t value)
+        : m_type(value), m_value(value)
+    {}
+
+    /*!
+    @brief create a null object (implicitly)
+
+    Create a `null` JSON value. This is the implicit version of the `null`
+    value constructor as it takes no parameters.
+
+    @complexity Constant.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is constant.
+    - As postcondition, it holds: `basic_json().empty() == true`.
+
+    @liveexample{The following code shows the constructor for a `null` JSON
+    value.,basic_json}
+
+    @sa basic_json(std::nullptr_t)
+    */
+    basic_json() noexcept = default;
+
+    /*!
+    @brief create a null object (explicitly)
+
+    Create a `null` JSON value. This is the explicitly version of the `null`
+    value constructor as it takes a null pointer as parameter. It allows to
+    create `null` values by explicitly assigning a @c nullptr to a JSON value.
+    The passed null pointer itself is not read - it is only used to choose the
+    right constructor.
+
+    @complexity Constant.
+
+    @liveexample{The following code shows the constructor with null pointer
+    parameter.,basic_json__nullptr_t}
+
+    @sa basic_json()
+    */
+    basic_json(std::nullptr_t) noexcept
+        : basic_json(value_t::null)
+    {}
+
+    /*!
+    @brief create an object (explicit)
+
+    Create an object JSON value with a given content.
+
+    @param[in] value  a value for the object
+
+    @complexity Linear in the size of the passed @a value.
+
+    @throw std::bad_alloc if allocation for object value fails
+
+    @liveexample{The following code shows the constructor with an @ref object_t
+    parameter.,basic_json__object_t}
+
+    @sa basic_json(const CompatibleObjectType&)
+    */
+    basic_json(const object_t& value)
+        : m_type(value_t::object), m_value(value)
+    {}
+
+    /*!
+    @brief create an object (implicit)
+
+    Create an object JSON value with a given content. This constructor allows
+    any type that can be used to construct values of type @ref object_t.
+    Examples include the types `std::map` and `std::unordered_map`.
+
+    @tparam CompatibleObjectType an object type whose `key_type` and
+    `value_type` is compatible to @ref object_t
+
+    @param[in] value  a value for the object
+
+    @complexity Linear in the size of the passed @a value.
+
+    @throw std::bad_alloc if allocation for object value fails
+
+    @liveexample{The following code shows the constructor with several
+    compatible object type parameters.,basic_json__CompatibleObjectType}
+
+    @sa basic_json(const object_t&)
+    */
+    template <class CompatibleObjectType, typename
+              std::enable_if<
+                  std::is_constructible<typename object_t::key_type, typename CompatibleObjectType::key_type>::value and
+                  std::is_constructible<basic_json, typename CompatibleObjectType::mapped_type>::value, int>::type
+              = 0>
+    basic_json(const CompatibleObjectType& value)
+        : m_type(value_t::object)
+    {
+        using std::begin;
+        using std::end;
+        m_value.object = create<object_t>(begin(value), end(value));
+    }
+
+    /*!
+    @brief create an array (explicit)
+
+    Create an array JSON value with a given content.
+
+    @param[in] value  a value for the array
+
+    @complexity Linear in the size of the passed @a value.
+
+    @throw std::bad_alloc if allocation for array value fails
+
+    @liveexample{The following code shows the constructor with an @ref array_t
+    parameter.,basic_json__array_t}
+
+    @sa basic_json(const CompatibleArrayType&)
+    */
+    basic_json(const array_t& value)
+        : m_type(value_t::array), m_value(value)
+    {}
+
+    /*!
+    @brief create an array (implicit)
+
+    Create an array JSON value with a given content. This constructor allows
+    any type that can be used to construct values of type @ref array_t.
+    Examples include the types `std::vector`, `std::list`, and `std::set`.
+
+    @tparam CompatibleArrayType an object type whose `value_type` is compatible
+    to @ref array_t
+
+    @param[in] value  a value for the array
+
+    @complexity Linear in the size of the passed @a value.
+
+    @throw std::bad_alloc if allocation for array value fails
+
+    @liveexample{The following code shows the constructor with several
+    compatible array type parameters.,basic_json__CompatibleArrayType}
+
+    @sa basic_json(const array_t&)
+    */
+    template <class CompatibleArrayType, typename
+              std::enable_if<
+                  not std::is_same<CompatibleArrayType, typename basic_json_t::iterator>::value and
+                  not std::is_same<CompatibleArrayType, typename basic_json_t::const_iterator>::value and
+                  not std::is_same<CompatibleArrayType, typename basic_json_t::reverse_iterator>::value and
+                  not std::is_same<CompatibleArrayType, typename basic_json_t::const_reverse_iterator>::value and
+                  not std::is_same<CompatibleArrayType, typename array_t::iterator>::value and
+                  not std::is_same<CompatibleArrayType, typename array_t::const_iterator>::value and
+                  std::is_constructible<basic_json, typename CompatibleArrayType::value_type>::value, int>::type
+              = 0>
+    basic_json(const CompatibleArrayType& value)
+        : m_type(value_t::array)
+    {
+        using std::begin;
+        using std::end;
+        m_value.array = create<array_t>(begin(value), end(value));
+    }
+
+    /*!
+    @brief create a string (explicit)
+
+    Create an string JSON value with a given content.
+
+    @param[in] value  a value for the string
+
+    @complexity Linear in the size of the passed @a value.
+
+    @throw std::bad_alloc if allocation for string value fails
+
+    @liveexample{The following code shows the constructor with an @ref string_t
+    parameter.,basic_json__string_t}
+
+    @sa basic_json(const typename string_t::value_type*)
+    @sa basic_json(const CompatibleStringType&)
+    */
+    basic_json(const string_t& value)
+        : m_type(value_t::string), m_value(value)
+    {}
+
+    /*!
+    @brief create a string (explicit)
+
+    Create a string JSON value with a given content.
+
+    @param[in] value  a literal value for the string
+
+    @complexity Linear in the size of the passed @a value.
+
+    @throw std::bad_alloc if allocation for string value fails
+
+    @liveexample{The following code shows the constructor with string literal
+    parameter.,basic_json__string_t_value_type}
+
+    @sa basic_json(const string_t&)
+    @sa basic_json(const CompatibleStringType&)
+    */
+    basic_json(const typename string_t::value_type* value)
+        : basic_json(string_t(value))
+    {}
+
+    /*!
+    @brief create a string (implicit)
+
+    Create a string JSON value with a given content.
+
+    @param[in] value  a value for the string
+
+    @tparam CompatibleStringType an string type which is compatible to @ref
+    string_t
+
+    @complexity Linear in the size of the passed @a value.
+
+    @throw std::bad_alloc if allocation for string value fails
+
+    @liveexample{The following code shows the construction of a string value
+    from a compatible type.,basic_json__CompatibleStringType}
+
+    @sa basic_json(const string_t&)
+    */
+    template <class CompatibleStringType, typename
+              std::enable_if<
+                  std::is_constructible<string_t, CompatibleStringType>::value, int>::type
+              = 0>
+    basic_json(const CompatibleStringType& value)
+        : basic_json(string_t(value))
+    {}
+
+    /*!
+    @brief create a boolean (explicit)
+
+    Creates a JSON boolean type from a given value.
+
+    @param[in] value  a boolean value to store
+
+    @complexity Constant.
+
+    @liveexample{The example below demonstrates boolean
+    values.,basic_json__boolean_t}
+    */
+    basic_json(boolean_t value)
+        : m_type(value_t::boolean), m_value(value)
+    {}
+
+    /*!
+    @brief create an integer number (explicit)
+
+    Create an interger number JSON value with a given content.
+
+    @tparam T  helper type to compare number_integer_t and int (not visible in)
+    the interface.
+
+    @param[in] value  an integer to create a JSON number from
+
+    @note This constructor would have the same signature as @ref
+    basic_json(const int value), so we need to switch this one off in case
+    number_integer_t is the same as int. This is done via the helper type @a T.
+
+    @complexity Constant.
+
+    @liveexample{The example below shows the construction of a JSON integer
+    number value.,basic_json__number_integer_t}
+
+    @sa basic_json(const int)
+    */
+    template<typename T,
+             typename std::enable_if<
+                 not (std::is_same<T, int>::value)
+                 and std::is_same<T, number_integer_t>::value
+                 , int>::type = 0>
+    basic_json(const number_integer_t value)
+        : m_type(value_t::number_integer), m_value(value)
+    {}
+
+    /*!
+    @brief create an integer number from an enum type (explicit)
+
+    Create an integer number JSON value with a given content.
+
+    @param[in] value  an integer to create a JSON number from
+
+    @note This constructor allows to pass enums directly to a constructor. As
+    C++ has no way of specifying the type of an anonymous enum explicitly, we
+    can only rely on the fact that such values implicitly convert to int. As
+    int may already be the same type of number_integer_t, we may need to switch
+    off the constructor @ref basic_json(const number_integer_t).
+
+    @complexity Constant.
+
+    @liveexample{The example below shows the construction of a JSON integer
+    number value from an anonymous enum.,basic_json__const_int}
+
+    @sa basic_json(const number_integer_t)
+    */
+    basic_json(const int value)
+        : m_type(value_t::number_integer),
+          m_value(static_cast<number_integer_t>(value))
+    {}
+
+    /*!
+    @brief create an integer number (implicit)
+
+    Create an integer number JSON value with a given content. This constructor
+    allows any type that can be used to construct values of type @ref
+    number_integer_t. Examples may include the types `int`, `int32_t`, or
+    `short`.
+
+    @tparam CompatibleNumberIntegerType an integer type which is compatible to
+    @ref number_integer_t.
+
+    @param[in] value  an integer to create a JSON number from
+
+    @complexity Constant.
+
+    @liveexample{The example below shows the construction of several JSON
+    integer number values from compatible
+    types.,basic_json__CompatibleIntegerNumberType}
+
+    @sa basic_json(const number_integer_t)
+    */
+    template<typename CompatibleNumberIntegerType, typename
+             std::enable_if<
+                 std::is_constructible<number_integer_t, CompatibleNumberIntegerType>::value and
+                 std::numeric_limits<CompatibleNumberIntegerType>::is_integer, CompatibleNumberIntegerType>::type
+             = 0>
+    basic_json(const CompatibleNumberIntegerType value) noexcept
+        : m_type(value_t::number_integer),
+          m_value(static_cast<number_integer_t>(value))
+    {}
+
+    /*!
+    @brief create a floating-point number (explicit)
+
+    Create a floating-point number JSON value with a given content.
+
+    @param[in] value  a floating-point value to create a JSON number from
+
+    @note RFC 7159 <http://www.rfc-editor.org/rfc/rfc7159.txt>, section 6
+    disallows NaN values:
+    > Numeric values that cannot be represented in the grammar below (such
+    > as Infinity and NaN) are not permitted.
+    In case the parameter @a value is not a number, a JSON null value is
+    created instead.
+
+    @complexity Constant.
+
+    @liveexample{The following example creates several floating-point
+    values.,basic_json__number_float_t}
+    */
+    basic_json(const number_float_t value)
+        : m_type(value_t::number_float), m_value(value)
+    {
+        // replace infinity and NAN by null
+        if (not std::isfinite(value))
+        {
+            m_type = value_t::null;
+            m_value = json_value();
+        }
+    }
+
+    /*!
+    @brief create an floating-point number (implicit)
+
+    Create an floating-point number JSON value with a given content. This
+    constructor allows any type that can be used to construct values of type
+    @ref number_float_t. Examples may include the types `float`.
+
+    @tparam CompatibleNumberFloatType a floating-point type which is compatible
+    to @ref number_float_t.
+
+    @param[in] value  a floating-point to create a JSON number from
+
+    @note RFC 7159 <http://www.rfc-editor.org/rfc/rfc7159.txt>, section 6
+    disallows NaN values:
+    > Numeric values that cannot be represented in the grammar below (such
+    > as Infinity and NaN) are not permitted.
+    In case the parameter @a value is not a number, a JSON null value is
+    created instead.
+
+    @complexity Constant.
+
+    @liveexample{The example below shows the construction of several JSON
+    floating-point number values from compatible
+    types.,basic_json__CompatibleNumberFloatType}
+
+    @sa basic_json(const number_float_t)
+    */
+    template<typename CompatibleNumberFloatType, typename = typename
+             std::enable_if<
+                 std::is_constructible<number_float_t, CompatibleNumberFloatType>::value and
+                 std::is_floating_point<CompatibleNumberFloatType>::value>::type
+             >
+    basic_json(const CompatibleNumberFloatType value) noexcept
+        : basic_json(number_float_t(value))
+    {}
+
+    /*!
+    @brief create a container (array or object) from an initializer list
+
+    Creates a JSON value of type array or object from the passed initializer
+    list @a init. In case @a type_deduction is `true` (default), the type of
+    the JSON value to be created is deducted from the initializer list @a init
+    according to the following rules:
+
+    1. If the list is empty, an empty JSON object value `{}` is created.
+    2. If the list consists of pairs whose first element is a string, a JSON
+    object value is created where the first elements of the pairs are treated
+    as keys and the second elements are as values.
+    3. In all other cases, an array is created.
+
+    The rules aim to create the best fit between a C++ initializer list and
+    JSON values. The ratioinale is as follows:
+
+    1. The empty initializer list is written as `{}` which is exactly an empty
+    JSON object.
+    2. C++ has now way of describing mapped types other than to list a list of
+    pairs. As JSON requires that keys must be of type string, rule 2 is the
+    weakest constraint one can pose on initializer lists to interpret them as
+    an object.
+    3. In all other cases, the initializer list could not be interpreted as
+    JSON object type, so interpreting it as JSON array type is safe.
+
+    With the rules described above, the following JSON values cannot be
+    expressed by an initializer list:
+
+    - the empty array (`[]`): use @ref array(std::initializer_list<basic_json>)
+      with an empty initializer list in this case
+    - arrays whose elements satisfy rule 2: use @ref
+      array(std::initializer_list<basic_json>) with the same initializer list
+      in this case
+
+    @note When used without parentheses around an empty initializer list, @ref
+    basic_json() is called instead of this function, yielding the JSON null
+    value.
+
+    @param[in] init  initializer list with JSON values
+
+    @param[in] type_deduction internal parameter; when set to `true`, the type
+    of the JSON value is deducted from the initializer list @a init; when set
+    to `false`, the type provided via @a manual_type is forced. This mode is
+    used by the functions @ref array(std::initializer_list<basic_json>) and
+    @ref object(std::initializer_list<basic_json>).
+
+    @param[in] manual_type internal parameter; when @a type_deduction is set to
+    `false`, the created JSON value will use the provided type (only @ref
+    value_t::array and @ref value_t::object are valid); when @a type_deduction
+    is set to `true`, this parameter has no effect
+
+    @throw std::domain_error if @a type_deduction is `false`, @a manual_type is
+    `value_t::object`, but @a init contains an element which is not a pair
+    whose first element is a string
+
+    @complexity Linear in the size of the initializer list @a init.
+
+    @liveexample{The example below shows how JSON values are created from
+    initializer lists,basic_json__list_init_t}
+
+    @sa basic_json array(std::initializer_list<basic_json>) - create a JSON
+    array value from an initializer list
+    @sa basic_json object(std::initializer_list<basic_json>) - create a JSON
+    object value from an initializer list
+    */
+    basic_json(std::initializer_list<basic_json> init,
+               bool type_deduction = true,
+               value_t manual_type = value_t::array)
+    {
+        // the initializer list could describe an object
+        bool is_object = true;
+
+        // check if each element is an array with two elements whose first element
+        // is a string
+        for (const auto& element : init)
+        {
+            if (element.m_type != value_t::array or element.size() != 2
+                    or element[0].m_type != value_t::string)
+            {
+                // we found an element that makes it impossible to use the
+                // initializer list as object
+                is_object = false;
+                break;
+            }
+        }
+
+        // adjust type if type deduction is not wanted
+        if (not type_deduction)
+        {
+            // if array is wanted, do not create an object though possible
+            if (manual_type == value_t::array)
+            {
+                is_object = false;
+            }
+
+            // if object is wanted but impossible, throw an exception
+            if (manual_type == value_t::object and not is_object)
+            {
+                throw std::domain_error("cannot create object from initializer list");
+            }
+        }
+
+        if (is_object)
+        {
+            // the initializer list is a list of pairs -> create object
+            m_type = value_t::object;
+            m_value = value_t::object;
+
+            for (auto& element : init)
+            {
+                m_value.object->emplace(std::move(*(element[0].m_value.string)), std::move(element[1]));
+            }
+        }
+        else
+        {
+            // the initializer list describes an array -> create array
+            m_type = value_t::array;
+            m_value.array = create<array_t>(std::move(init));
+        }
+    }
+
+    /*!
+    @brief explicitly create an array from an initializer list
+
+    Creates a JSON array value from a given initializer list. That is, given a
+    list of values `a, b, c`, creates the JSON value `[a, b, c]`. If the
+    initializer list is empty, the empty array `[]` is created.
+
+    @note This function is only needed to express two edge cases that cannot be
+    realized with the initializer list constructor (@ref
+    basic_json(std::initializer_list<basic_json>, bool, value_t)). These cases
+    are:
+    1. creating an array whose elements are all pairs whose first element is a
+    string - in this case, the initializer list constructor would create an
+    object, taking the first elements as keys
+    2. creating an empty array - passing the empty initializer list to the
+    initializer list constructor yields an empty object
+
+    @param[in] init  initializer list with JSON values to create an array from
+    (optional)
+
+    @return JSON array value
+
+    @complexity Linear in the size of @a init.
+
+    @liveexample{The following code shows an example for the @ref array
+    function.,array}
+
+    @sa basic_json(std::initializer_list<basic_json>, bool, value_t) - create a
+    JSON value from an initializer list
+    @sa basic_json object(std::initializer_list<basic_json>) - create a JSON
+    object value from an initializer list
+    */
+    static basic_json array(std::initializer_list<basic_json> init =
+                                std::initializer_list<basic_json>())
+    {
+        return basic_json(init, false, value_t::array);
+    }
+
+    /*!
+    @brief explicitly create an object from an initializer list
+
+    Creates a JSON object value from a given initializer list. The initializer
+    lists elements must be pairs, and their first elments must be strings. If
+    the initializer list is empty, the empty object `{}` is created.
+
+    @note This function is only added for symmetry reasons. In contrast to the
+    related function @ref basic_json array(std::initializer_list<basic_json>),
+    there are no cases which can only be expressed by this function. That is,
+    any initializer list @a init can also be passed to the initializer list
+    constructor @ref basic_json(std::initializer_list<basic_json>, bool,
+    value_t).
+
+    @param[in] init  initializer list to create an object from (optional)
+
+    @return JSON object value
+
+    @throw std::domain_error if @a init is not a pair whose first elements are
+    strings; thrown by @ref basic_json(std::initializer_list<basic_json>, bool,
+    value_t)
+
+    @complexity Linear in the size of @a init.
+
+    @liveexample{The following code shows an example for the @ref object
+    function.,object}
+
+    @sa basic_json(std::initializer_list<basic_json>, bool, value_t) - create a
+    JSON value from an initializer list
+    @sa basic_json array(std::initializer_list<basic_json>) - create a JSON
+    array value from an initializer list
+    */
+    static basic_json object(std::initializer_list<basic_json> init =
+                                 std::initializer_list<basic_json>())
+    {
+        return basic_json(init, false, value_t::object);
+    }
+
+    /*!
+    @brief construct an array with count copies of given value
+
+    Constructs a JSON array value by creating @a count copies of a passed
+    value. In case @a count is `0`, an empty array is created. As postcondition,
+    `std::distance(begin(),end()) == count` holds.
+
+    @param[in] count  the number of JSON copies of @a value to create
+    @param[in] value  the JSON value to copy
+
+    @complexity Linear in @a count.
+
+    @liveexample{The following code shows examples for the @ref
+    basic_json(size_type\, const basic_json&)
+    constructor.,basic_json__size_type_basic_json}
+    */
+    basic_json(size_type count, const basic_json& value)
+        : m_type(value_t::array)
+    {
+        m_value.array = create<array_t>(count, value);
+    }
+
+    /*!
+    @brief construct a JSON container given an iterator range
+
+    Constructs the JSON value with the contents of the range `[first, last)`.
+    The semantics depends on the different types a JSON value can have:
+    - In case of primitive types (number, boolean, or string), @a first must
+      be `begin()` and @a last must be `end()`. In this case, the value is
+      copied. Otherwise, std::out_of_range is thrown.
+    - In case of structured types (array, object), the constructor behaves
+      as similar versions for `std::vector`.
+    - In case of a null type, std::domain_error is thrown.
+
+    @tparam InputIT an input iterator type (@ref iterator or @ref
+    const_iterator)
+
+    @param[in] first begin of the range to copy from (included)
+    @param[in] last end of the range to copy from (excluded)
+
+    @throw std::domain_error if iterators are not compatible; that is, do not
+    belong to the same JSON value
+    @throw std::out_of_range if iterators are for a primitive type (number,
+    boolean, or string) where an out of range error can be detected easily
+    @throw std::bad_alloc if allocation for object, array, or string fails
+    @throw std::domain_error if called with a null value
+
+    @complexity Linear in distance between @a first and @a last.
+
+    @liveexample{The example below shows several ways to create JSON values by
+    specifying a subrange with iterators.,basic_json__InputIt_InputIt}
+    */
+    template <class InputIT, typename
+              std::enable_if<
+                  std::is_same<InputIT, typename basic_json_t::iterator>::value or
+                  std::is_same<InputIT, typename basic_json_t::const_iterator>::value
+                  , int>::type
+              = 0>
+    basic_json(InputIT first, InputIT last) : m_type(first.m_object->m_type)
+    {
+        // make sure iterator fits the current value
+        if (first.m_object != last.m_object)
+        {
+            throw std::domain_error("iterators are not compatible");
+        }
+
+        // check if iterator range is complete for primitive values
+        switch (m_type)
+        {
+            case value_t::number_integer:
+            case value_t::number_float:
+            case value_t::boolean:
+            case value_t::string:
+            {
+                if (not first.m_it.primitive_iterator.is_begin() or not last.m_it.primitive_iterator.is_end())
+                {
+                    throw std::out_of_range("iterators out of range");
+                }
+                break;
+            }
+
+            default:
+            {
+                break;
+            }
+        }
+
+        switch (m_type)
+        {
+            case value_t::number_integer:
+            {
+                m_value.number_integer = first.m_object->m_value.number_integer;
+                break;
+            }
+
+            case value_t::number_float:
+            {
+                m_value.number_float = first.m_object->m_value.number_float;
+                break;
+            }
+
+            case value_t::boolean:
+            {
+                m_value.boolean = first.m_object->m_value.boolean;
+                break;
+            }
+
+            case value_t::string:
+            {
+                m_value = *first.m_object->m_value.string;
+                break;
+            }
+
+            case value_t::object:
+            {
+                m_value.object = create<object_t>(first.m_it.object_iterator, last.m_it.object_iterator);
+                break;
+            }
+
+            case value_t::array:
+            {
+                m_value.array = create<array_t>(first.m_it.array_iterator, last.m_it.array_iterator);
+                break;
+            }
+
+            default:
+            {
+                throw std::domain_error("cannot use construct with iterators from " + first.m_object->type_name());
+            }
+        }
+    }
+
+    ///////////////////////////////////////
+    // other constructors and destructor //
+    ///////////////////////////////////////
+
+    /*!
+    @brief copy constructor
+
+    Creates a copy of a given JSON value.
+
+    @param[in] other  the JSON value to copy
+
+    @complexity Linear in the size of @a other.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is linear.
+    - As postcondition, it holds: `other == basic_json(other)`.
+
+    @throw std::bad_alloc if allocation for object, array, or string fails.
+
+    @liveexample{The following code shows an example for the copy
+    constructor.,basic_json__basic_json}
+    */
+    basic_json(const basic_json& other)
+        : m_type(other.m_type)
+    {
+        switch (m_type)
+        {
+            case (value_t::null):
+            case (value_t::discarded):
+            {
+                break;
+            }
+
+            case (value_t::object):
+            {
+                m_value = *other.m_value.object;
+                break;
+            }
+
+            case (value_t::array):
+            {
+                m_value = *other.m_value.array;
+                break;
+            }
+
+            case (value_t::string):
+            {
+                m_value = *other.m_value.string;
+                break;
+            }
+
+            case (value_t::boolean):
+            {
+                m_value = other.m_value.boolean;
+                break;
+            }
+
+            case (value_t::number_integer):
+            {
+                m_value = other.m_value.number_integer;
+                break;
+            }
+
+            case (value_t::number_float):
+            {
+                m_value = other.m_value.number_float;
+                break;
+            }
+        }
+    }
+
+    /*!
+    @brief move constructor
+
+    Move constructor. Constructs a JSON value with the contents of the given
+    value @a other using move semantics. It "steals" the resources from @a
+    other and leaves it as JSON null value.
+
+    @param[in,out] other  value to move to this object
+
+    @post @a other is a JSON null value
+
+    @complexity Constant.
+
+    @liveexample{The code below shows the move constructor explicitly called
+    via std::move.,basic_json__moveconstructor}
+    */
+    basic_json(basic_json&& other) noexcept
+        : m_type(std::move(other.m_type)),
+          m_value(std::move(other.m_value))
+    {
+        // invalidate payload
+        other.m_type = value_t::null;
+        other.m_value = {};
+    }
+
+    /*!
+    @brief copy assignment
+
+    Copy assignment operator. Copies a JSON value via the "copy and swap"
+    strategy: It is expressed in terms of the copy constructor, destructor, and
+    the swap() member function.
+
+    @param[in] other  value to copy from
+
+    @complexity Linear.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is linear.
+
+    @liveexample{The code below shows and example for the copy assignment. It
+    creates a copy of value `a` which is then swapped with `b`. Finally\, the
+    copy of `a` (which is the null value after the swap) is
+    destroyed.,basic_json__copyassignment}
+    */
+    reference& operator=(basic_json other) noexcept (
+        std::is_nothrow_move_constructible<value_t>::value and
+        std::is_nothrow_move_assignable<value_t>::value and
+        std::is_nothrow_move_constructible<json_value>::value and
+        std::is_nothrow_move_assignable<json_value>::value
+    )
+    {
+        using std::swap;
+        swap(m_type, other.m_type);
+        swap(m_value, other.m_value);
+        return *this;
+    }
+
+    /*!
+    @brief destructor
+
+    Destroys the JSON value and frees all allocated memory.
+
+    @complexity Linear.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is linear.
+    - All stored elements are destroyed and all memory is freed.
+    */
+    ~basic_json()
+    {
+        switch (m_type)
+        {
+            case (value_t::object):
+            {
+                AllocatorType<object_t> alloc;
+                alloc.destroy(m_value.object);
+                alloc.deallocate(m_value.object, 1);
+                break;
+            }
+
+            case (value_t::array):
+            {
+                AllocatorType<array_t> alloc;
+                alloc.destroy(m_value.array);
+                alloc.deallocate(m_value.array, 1);
+                break;
+            }
+
+            case (value_t::string):
+            {
+                AllocatorType<string_t> alloc;
+                alloc.destroy(m_value.string);
+                alloc.deallocate(m_value.string, 1);
+                break;
+            }
+
+            default:
+            {
+                // all other types need no specific destructor
+                break;
+            }
+        }
+    }
+
+
+  public:
+    ///////////////////////
+    // object inspection //
+    ///////////////////////
+
+    /// @name object inspection
+    /// @{
+
+    /*!
+    @brief serialization
+
+    Serialization function for JSON values. The function tries to mimick
+    Python's @p json.dumps() function, and currently supports its @p indent
+    parameter.
+
+    @param[in] indent if indent is nonnegative, then array elements and object
+    members will be pretty-printed with that indent level. An indent level of 0
+    will only insert newlines. -1 (the default) selects the most compact
+    representation
+
+    @return string containing the serialization of the JSON value
+
+    @complexity Linear.
+
+    @liveexample{The following example shows the effect of different @a indent
+    parameters to the result of the serializaion.,dump}
+
+    @see https://docs.python.org/2/library/json.html#json.dump
+    */
+    string_t dump(const int indent = -1) const
+    {
+        std::stringstream ss;
+
+        if (indent >= 0)
+        {
+            dump(ss, true, static_cast<unsigned int>(indent));
+        }
+        else
+        {
+            dump(ss, false, 0);
+        }
+
+        return ss.str();
+    }
+
+    /*!
+    @brief return the type of the JSON value (explicit)
+
+    Return the type of the JSON value as a value from the @ref value_t
+    enumeration.
+
+    @return the type of the JSON value
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref type() for all JSON
+    types.,type}
+    */
+    value_t type() const noexcept
+    {
+        return m_type;
+    }
+
+    /*!
+    @brief return whether type is primitive
+
+    This function returns true iff the JSON type is primitive (string, number,
+    boolean, or null).
+
+    @return `true` if type is primitive (string, number, boolean, or null),
+    `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_primitive for all JSON
+    types.,is_primitive}
+    */
+    bool is_primitive() const noexcept
+    {
+        return is_null() or is_string() or is_boolean() or is_number();
+    }
+
+    /*!
+    @brief return whether type is structured
+
+    This function returns true iff the JSON type is structured (array or
+    object).
+
+    @return `true` if type is structured (array or object), `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_structured for all JSON
+    types.,is_structured}
+    */
+    bool is_structured() const noexcept
+    {
+        return is_array() or is_object();
+    }
+
+    /*!
+    @brief return whether value is null
+
+    This function returns true iff the JSON value is null.
+
+    @return `true` if type is null, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_null for all JSON
+    types.,is_null}
+    */
+    bool is_null() const noexcept
+    {
+        return m_type == value_t::null;
+    }
+
+    /*!
+    @brief return whether value is a boolean
+
+    This function returns true iff the JSON value is a boolean.
+
+    @return `true` if type is boolean, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_boolean for all JSON
+    types.,is_boolean}
+    */
+    bool is_boolean() const noexcept
+    {
+        return m_type == value_t::boolean;
+    }
+
+    /*!
+    @brief return whether value is a number
+
+    This function returns true iff the JSON value is a number. This includes
+    both integer and floating-point values.
+
+    @return `true` if type is number, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_number for all JSON
+    types.,is_number}
+    */
+    bool is_number() const noexcept
+    {
+        return is_number_integer() or is_number_float();
+    }
+
+    /*!
+    @brief return whether value is an integer number
+
+    This function returns true iff the JSON value is an integer number. This
+    excludes floating-point values.
+
+    @return `true` if type is an integer number, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_number_integer for all
+    JSON types.,is_number_integer}
+    */
+    bool is_number_integer() const noexcept
+    {
+        return m_type == value_t::number_integer;
+    }
+
+    /*!
+    @brief return whether value is a floating-point number
+
+    This function returns true iff the JSON value is a floating-point number.
+    This excludes integer values.
+
+    @return `true` if type is a floating-point number, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_number_float for all
+    JSON types.,is_number_float}
+    */
+    bool is_number_float() const noexcept
+    {
+        return m_type == value_t::number_float;
+    }
+
+    /*!
+    @brief return whether value is an object
+
+    This function returns true iff the JSON value is an object.
+
+    @return `true` if type is object, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_object for all JSON
+    types.,is_object}
+    */
+    bool is_object() const noexcept
+    {
+        return m_type == value_t::object;
+    }
+
+    /*!
+    @brief return whether value is an array
+
+    This function returns true iff the JSON value is an array.
+
+    @return `true` if type is array, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_array for all JSON
+    types.,is_array}
+    */
+    bool is_array() const noexcept
+    {
+        return m_type == value_t::array;
+    }
+
+    /*!
+    @brief return whether value is a string
+
+    This function returns true iff the JSON value is a string.
+
+    @return `true` if type is string, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_string for all JSON
+    types.,is_string}
+    */
+    bool is_string() const noexcept
+    {
+        return m_type == value_t::string;
+    }
+
+    /*!
+    @brief return whether value is discarded
+
+    This function returns true iff the JSON value was discarded during parsing
+    with a callback function (see @ref parser_callback_t).
+
+    @note This function will always be `false` for JSON values after parsing.
+    That is, discarded values can only occur during parsing, but will be
+    removed when inside a structured value or replaced by null in other cases.
+
+    @return `true` if type is discarded, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_discarded for all JSON
+    types.,is_discarded}
+    */
+    bool is_discarded() const noexcept
+    {
+        return m_type == value_t::discarded;
+    }
+
+    /*!
+    @brief return the type of the JSON value (implicit)
+
+    Implicitly return the type of the JSON value as a value from the @ref
+    value_t enumeration.
+
+    @return the type of the JSON value
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies the value_t operator for all
+    JSON types.,operator__value_t}
+    */
+    operator value_t() const noexcept
+    {
+        return m_type;
+    }
+
+    /// @}
+
+  private:
+    //////////////////
+    // value access //
+    //////////////////
+
+    /// get an object (explicit)
+    template <class T, typename
+              std::enable_if<
+                  std::is_convertible<typename object_t::key_type, typename T::key_type>::value and
+                  std::is_convertible<basic_json_t, typename T::mapped_type>::value
+                  , int>::type = 0>
+    T get_impl(T*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::object):
+            {
+                return T(m_value.object->begin(), m_value.object->end());
+            }
+            default:
+            {
+                throw std::domain_error("type must be object, but is " + type_name());
+            }
+        }
+    }
+
+    /// get an object (explicit)
+    object_t get_impl(object_t*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::object):
+            {
+                return *(m_value.object);
+            }
+            default:
+            {
+                throw std::domain_error("type must be object, but is " + type_name());
+            }
+        }
+    }
+
+    /// get an array (explicit)
+    template <class T, typename
+              std::enable_if<
+                  std::is_convertible<basic_json_t, typename T::value_type>::value and
+                  not std::is_same<basic_json_t, typename T::value_type>::value and
+                  not std::is_arithmetic<T>::value and
+                  not std::is_convertible<std::string, T>::value and
+                  not has_mapped_type<T>::value
+                  , int>::type = 0>
+    T get_impl(T*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::array):
+            {
+                T to_vector;
+                std::transform(m_value.array->begin(), m_value.array->end(),
+                               std::inserter(to_vector, to_vector.end()), [](basic_json i)
+                {
+                    return i.get<typename T::value_type>();
+                });
+                return to_vector;
+            }
+            default:
+            {
+                throw std::domain_error("type must be array, but is " + type_name());
+            }
+        }
+    }
+
+    /// get an array (explicit)
+    template <class T, typename
+              std::enable_if<
+                  std::is_convertible<basic_json_t, T>::value and
+                  not std::is_same<basic_json_t, T>::value
+                  , int>::type = 0>
+    std::vector<T> get_impl(std::vector<T>*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::array):
+            {
+                std::vector<T> to_vector;
+                to_vector.reserve(m_value.array->size());
+                std::transform(m_value.array->begin(), m_value.array->end(),
+                               std::inserter(to_vector, to_vector.end()), [](basic_json i)
+                {
+                    return i.get<T>();
+                });
+                return to_vector;
+            }
+            default:
+            {
+                throw std::domain_error("type must be array, but is " + type_name());
+            }
+        }
+    }
+
+    /// get an array (explicit)
+    template <class T, typename
+              std::enable_if<
+                  std::is_same<basic_json, typename T::value_type>::value and
+                  not has_mapped_type<T>::value
+                  , int>::type = 0>
+    T get_impl(T*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::array):
+            {
+                return T(m_value.array->begin(), m_value.array->end());
+            }
+            default:
+            {
+                throw std::domain_error("type must be array, but is " + type_name());
+            }
+        }
+    }
+
+    /// get an array (explicit)
+    array_t get_impl(array_t*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::array):
+            {
+                return *(m_value.array);
+            }
+            default:
+            {
+                throw std::domain_error("type must be array, but is " + type_name());
+            }
+        }
+    }
+
+    /// get a string (explicit)
+    template <typename T, typename
+              std::enable_if<
+                  std::is_convertible<string_t, T>::value
+                  , int>::type = 0>
+    T get_impl(T*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::string):
+            {
+                return *m_value.string;
+            }
+            default:
+            {
+                throw std::domain_error("type must be string, but is " + type_name());
+            }
+        }
+    }
+
+    /// get a number (explicit)
+    template<typename T, typename
+             std::enable_if<
+                 std::is_arithmetic<T>::value
+                 , int>::type = 0>
+    T get_impl(T*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::number_integer):
+            {
+                return static_cast<T>(m_value.number_integer);
+            }
+            case (value_t::number_float):
+            {
+                return static_cast<T>(m_value.number_float);
+            }
+            default:
+            {
+                throw std::domain_error("type must be number, but is " + type_name());
+            }
+        }
+    }
+
+    /// get a boolean (explicit)
+    boolean_t get_impl(boolean_t*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::boolean):
+            {
+                return m_value.boolean;
+            }
+            default:
+            {
+                throw std::domain_error("type must be boolean, but is " + type_name());
+            }
+        }
+    }
+
+    /// get a pointer to the value (object)
+    object_t* get_impl_ptr(object_t*) noexcept
+    {
+        return is_object() ? m_value.object : nullptr;
+    }
+
+    /// get a pointer to the value (object)
+    const object_t* get_impl_ptr(const object_t*) const noexcept
+    {
+        return is_object() ? m_value.object : nullptr;
+    }
+
+    /// get a pointer to the value (array)
+    array_t* get_impl_ptr(array_t*) noexcept
+    {
+        return is_array() ? m_value.array : nullptr;
+    }
+
+    /// get a pointer to the value (array)
+    const array_t* get_impl_ptr(const array_t*) const noexcept
+    {
+        return is_array() ? m_value.array : nullptr;
+    }
+
+    /// get a pointer to the value (string)
+    string_t* get_impl_ptr(string_t*) noexcept
+    {
+        return is_string() ? m_value.string : nullptr;
+    }
+
+    /// get a pointer to the value (string)
+    const string_t* get_impl_ptr(const string_t*) const noexcept
+    {
+        return is_string() ? m_value.string : nullptr;
+    }
+
+    /// get a pointer to the value (boolean)
+    boolean_t* get_impl_ptr(boolean_t*) noexcept
+    {
+        return is_boolean() ? &m_value.boolean : nullptr;
+    }
+
+    /// get a pointer to the value (boolean)
+    const boolean_t* get_impl_ptr(const boolean_t*) const noexcept
+    {
+        return is_boolean() ? &m_value.boolean : nullptr;
+    }
+
+    /// get a pointer to the value (integer number)
+    number_integer_t* get_impl_ptr(number_integer_t*) noexcept
+    {
+        return is_number_integer() ? &m_value.number_integer : nullptr;
+    }
+
+    /// get a pointer to the value (integer number)
+    const number_integer_t* get_impl_ptr(const number_integer_t*) const noexcept
+    {
+        return is_number_integer() ? &m_value.number_integer : nullptr;
+    }
+
+    /// get a pointer to the value (floating-point number)
+    number_float_t* get_impl_ptr(number_float_t*) noexcept
+    {
+        return is_number_float() ? &m_value.number_float : nullptr;
+    }
+
+    /// get a pointer to the value (floating-point number)
+    const number_float_t* get_impl_ptr(const number_float_t*) const noexcept
+    {
+        return is_number_float() ? &m_value.number_float : nullptr;
+    }
+
+  public:
+
+    /// @name value access
+    /// @{
+
+    /*!
+    @brief get a value (explicit)
+
+    Explicit type conversion between the JSON value and a compatible value.
+
+    @tparam ValueType non-pointer type compatible to the JSON value, for
+    instance `int` for JSON integer numbers, `bool` for JSON booleans, or
+    `std::vector` types for JSON arrays
+
+    @return copy of the JSON value, converted to type @a ValueType
+
+    @throw std::domain_error in case passed type @a ValueType is incompatible
+    to JSON
+
+    @complexity Linear in the size of the JSON value.
+
+    @liveexample{The example below shows serveral conversions from JSON values
+    to other types. There a few things to note: (1) Floating-point numbers can
+    be converted to integers\, (2) A JSON array can be converted to a standard
+    `std::vector<short>`\, (3) A JSON object can be converted to C++
+    assiciative containers such as `std::unordered_map<std::string\,
+    json>`.,get__ValueType_const}
+
+    @internal
+    The idea of using a casted null pointer to choose the correct
+    implementation is from <http://stackoverflow.com/a/8315197/266378>.
+    @endinternal
+
+    @sa @ref operator ValueType() const for implicit conversion
+    @sa @ref get() for pointer-member access
+    */
+    template<typename ValueType, typename
+             std::enable_if<
+                 not std::is_pointer<ValueType>::value
+                 , int>::type = 0>
+    ValueType get() const
+    {
+        return get_impl(static_cast<ValueType*>(nullptr));
+    }
+
+    /*!
+    @brief get a pointer value (explicit)
+
+    Explicit pointer access to the internally stored JSON value. No copies are
+    made.
+
+    @warning Writing data to the pointee of the result yields an undefined
+    state.
+
+    @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref
+    object_t, @ref string_t, @ref boolean_t, @ref number_integer_t, or @ref
+    number_float_t.
+
+    @return pointer to the internally stored JSON value if the requested pointer
+    type @a PointerType fits to the JSON value; `nullptr` otherwise
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how pointers to internal values of a
+    JSON value can be requested. Note that no type conversions are made and a
+    `nullptr` is returned if the value and the requested pointer type does not
+    match.,get__PointerType}
+
+    @sa @ref get_ptr() for explicit pointer-member access
+    */
+    template<typename PointerType, typename
+             std::enable_if<
+                 std::is_pointer<PointerType>::value
+                 , int>::type = 0>
+    PointerType get() noexcept
+    {
+        // delegate the call to get_ptr
+        return get_ptr<PointerType>();
+    }
+
+    /*!
+    @brief get a pointer value (explicit)
+    @copydoc get()
+    */
+    template<typename PointerType, typename
+             std::enable_if<
+                 std::is_pointer<PointerType>::value
+                 , int>::type = 0>
+    const PointerType get() const noexcept
+    {
+        // delegate the call to get_ptr
+        return get_ptr<PointerType>();
+    }
+
+    /*!
+    @brief get a pointer value (implicit)
+
+    Implict pointer access to the internally stored JSON value. No copies are
+    made.
+
+    @warning Writing data to the pointee of the result yields an undefined
+    state.
+
+    @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref
+    object_t, @ref string_t, @ref boolean_t, @ref number_integer_t, or @ref
+    number_float_t.
+
+    @return pointer to the internally stored JSON value if the requested pointer
+    type @a PointerType fits to the JSON value; `nullptr` otherwise
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how pointers to internal values of a
+    JSON value can be requested. Note that no type conversions are made and a
+    `nullptr` is returned if the value and the requested pointer type does not
+    match.,get_ptr}
+    */
+    template<typename PointerType, typename
+             std::enable_if<
+                 std::is_pointer<PointerType>::value
+                 , int>::type = 0>
+    PointerType get_ptr() noexcept
+    {
+        // delegate the call to get_impl_ptr<>()
+        return get_impl_ptr(static_cast<PointerType>(nullptr));
+    }
+
+    /*!
+    @brief get a pointer value (implicit)
+    @copydoc get_ptr()
+    */
+    template<typename PointerType, typename
+             std::enable_if<
+                 std::is_pointer<PointerType>::value
+                 and std::is_const<PointerType>::value
+                 , int>::type = 0>
+    const PointerType get_ptr() const noexcept
+    {
+        // delegate the call to get_impl_ptr<>() const
+        return get_impl_ptr(static_cast<const PointerType>(nullptr));
+    }
+
+    /*!
+    @brief get a value (implicit)
+
+    Implict type conversion between the JSON value and a compatible value. The
+    call is realized by calling @ref get() const.
+
+    @tparam ValueType non-pointer type compatible to the JSON value, for
+    instance `int` for JSON integer numbers, `bool` for JSON booleans, or
+    `std::vector` types for JSON arrays
+
+    @return copy of the JSON value, converted to type @a ValueType
+
+    @throw std::domain_error in case passed type @a ValueType is incompatible
+    to JSON, thrown by @ref get() const
+
+    @complexity Linear in the size of the JSON value.
+
+    @liveexample{The example below shows serveral conversions from JSON values
+    to other types. There a few things to note: (1) Floating-point numbers can
+    be converted to integers\, (2) A JSON array can be converted to a standard
+    `std::vector<short>`\, (3) A JSON object can be converted to C++
+    assiciative containers such as `std::unordered_map<std::string\,
+    json>`.,operator__ValueType}
+    */
+    template<typename ValueType, typename
+             std::enable_if<
+                 not std::is_pointer<ValueType>::value
+                 , int>::type = 0>
+    operator ValueType() const
+    {
+        // delegate the call to get<>() const
+        return get<ValueType>();
+    }
+
+    /// @}
+
+
+    ////////////////////
+    // element access //
+    ////////////////////
+
+    /// @name element access
+    /// @{
+
+    /*!
+    @brief access specified array element with bounds checking
+
+    Returns a reference to the element at specified location @a idx, with
+    bounds checking.
+
+    @param[in] idx  index of the element to access
+
+    @return reference to the element at index @a idx
+
+    @throw std::domain_error if JSON is not an array
+    @throw std::out_of_range if the index @a idx is out of range of the array;
+    that is, `idx >= size()`
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how array elements can be read and
+    written using at.,at__size_type}
+    */
+    reference at(size_type idx)
+    {
+        // at only works for arrays
+        if (m_type != value_t::array)
+        {
+            throw std::domain_error("cannot use at() with " + type_name());
+        }
+
+        return m_value.array->at(idx);
+    }
+
+    /*!
+    @brief access specified array element with bounds checking
+
+    Returns a const reference to the element at specified location @a idx, with
+    bounds checking.
+
+    @param[in] idx  index of the element to access
+
+    @return const reference to the element at index @a idx
+
+    @throw std::domain_error if JSON is not an array
+    @throw std::out_of_range if the index @a idx is out of range of the array;
+    that is, `idx >= size()`
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how array elements can be read using
+    at.,at__size_type_const}
+    */
+    const_reference at(size_type idx) const
+    {
+        // at only works for arrays
+        if (m_type != value_t::array)
+        {
+            throw std::domain_error("cannot use at() with " + type_name());
+        }
+
+        return m_value.array->at(idx);
+    }
+
+    /*!
+    @brief access specified object element with bounds checking
+
+    Returns a reference to the element at with specified key @a key, with
+    bounds checking.
+
+    @param[in] key  key of the element to access
+
+    @return reference to the element at key @a key
+
+    @throw std::domain_error if JSON is not an object
+    @throw std::out_of_range if the key @a key is is not stored in the object;
+    that is, `find(key) == end()`
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be read and
+    written using at.,at__object_t_key_type}
+    */
+    reference at(const typename object_t::key_type& key)
+    {
+        // at only works for objects
+        if (m_type != value_t::object)
+        {
+            throw std::domain_error("cannot use at() with " + type_name());
+        }
+
+        return m_value.object->at(key);
+    }
+
+    /*!
+    @brief access specified object element with bounds checking
+
+    Returns a const reference to the element at with specified key @a key, with
+    bounds checking.
+
+    @param[in] key  key of the element to access
+
+    @return const reference to the element at key @a key
+
+    @throw std::domain_error if JSON is not an object
+    @throw std::out_of_range if the key @a key is is not stored in the object;
+    that is, `find(key) == end()`
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be read using
+    at.,at__object_t_key_type_const}
+    */
+    const_reference at(const typename object_t::key_type& key) const
+    {
+        // at only works for objects
+        if (m_type != value_t::object)
+        {
+            throw std::domain_error("cannot use at() with " + type_name());
+        }
+
+        return m_value.object->at(key);
+    }
+
+    /*!
+    @brief access specified array element
+
+    Returns a reference to the element at specified location @a idx.
+
+    @note If @a idx is beyond the range of the array (i.e., `idx >= size()`),
+    then the array is silently filled up with `null` values to make `idx` a
+    valid reference to the last stored element.
+
+    @param[in] idx  index of the element to access
+
+    @return reference to the element at index @a idx
+
+    @throw std::domain_error if JSON is not an array or null
+
+    @complexity Constant if @a idx is in the range of the array. Otherwise
+    linear in `idx - size()`.
+
+    @liveexample{The example below shows how array elements can be read and
+    written using [] operator. Note the addition of `null`
+    values.,operatorarray__size_type}
+    */
+    reference operator[](size_type idx)
+    {
+        // implicitly convert null to object
+        if (m_type == value_t::null)
+        {
+            m_type = value_t::array;
+            m_value.array = create<array_t>();
+        }
+
+        // [] only works for arrays
+        if (m_type != value_t::array)
+        {
+            throw std::domain_error("cannot use operator[] with " + type_name());
+        }
+
+        for (size_t i = m_value.array->size(); i <= idx; ++i)
+        {
+            m_value.array->push_back(basic_json());
+        }
+
+        return m_value.array->operator[](idx);
+    }
+
+    /*!
+    @brief access specified array element
+
+    Returns a const reference to the element at specified location @a idx.
+
+    @param[in] idx  index of the element to access
+
+    @return const reference to the element at index @a idx
+
+    @throw std::domain_error if JSON is not an array
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how array elements can be read using
+    the [] operator.,operatorarray__size_type_const}
+    */
+    const_reference operator[](size_type idx) const
+    {
+        // at only works for arrays
+        if (m_type != value_t::array)
+        {
+            throw std::domain_error("cannot use operator[] with " + type_name());
+        }
+
+        return m_value.array->operator[](idx);
+    }
+
+    /*!
+    @brief access specified object element
+
+    Returns a reference to the element at with specified key @a key.
+
+    @note If @a key is not found in the object, then it is silently added to
+    the object and filled with a `null` value to make `key` a valid reference.
+    In case the value was `null` before, it is converted to an object.
+
+    @param[in] key  key of the element to access
+
+    @return reference to the element at key @a key
+
+    @throw std::domain_error if JSON is not an object or null
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be read and
+    written using the [] operator.,operatorarray__key_type}
+    */
+    reference operator[](const typename object_t::key_type& key)
+    {
+        // implicitly convert null to object
+        if (m_type == value_t::null)
+        {
+            m_type = value_t::object;
+            m_value.object = create<object_t>();
+        }
+
+        // [] only works for objects
+        if (m_type != value_t::object)
+        {
+            throw std::domain_error("cannot use operator[] with " + type_name());
+        }
+
+        return m_value.object->operator[](key);
+    }
+
+    /*!
+    @brief access specified object element
+
+    Returns a reference to the element at with specified key @a key.
+
+    @param[in] key  key of the element to access
+
+    @return reference to the element at key @a key
+
+    @throw std::domain_error if JSON is not an object or null
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be read using
+    the [] operator.,operatorarray__key_type_const}
+    */
+    const_reference operator[](const typename object_t::key_type& key) const
+    {
+        // at only works for objects
+        if (m_type != value_t::object)
+        {
+            throw std::domain_error("cannot use operator[] with " + type_name());
+        }
+
+        return m_value.object->operator[](key);
+    }
+
+    /*!
+    @brief access specified object element
+
+    Returns a reference to the element at with specified key @a key.
+
+    @note If @a key is not found in the object, then it is silently added to
+    the object and filled with a `null` value to make `key` a valid reference.
+    In case the value was `null` before, it is converted to an object.
+
+    @note This function is required for compatibility reasons with Clang.
+
+    @param[in] key  key of the element to access
+
+    @return reference to the element at key @a key
+
+    @throw std::domain_error if JSON is not an object or null
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be read and
+    written using the [] operator.,operatorarray__key_type}
+    */
+    template<typename T, std::size_t n>
+    reference operator[](const T (&key)[n])
+    {
+        // implicitly convert null to object
+        if (m_type == value_t::null)
+        {
+            m_type = value_t::object;
+            m_value = value_t::object;
+        }
+
+        // at only works for objects
+        if (m_type != value_t::object)
+        {
+            throw std::domain_error("cannot use operator[] with " + type_name());
+        }
+
+        return m_value.object->operator[](key);
+    }
+
+    /*!
+    @brief access specified object element
+
+    Returns a reference to the element at with specified key @a key.
+
+    @note This function is required for compatibility reasons with Clang.
+
+    @param[in] key  key of the element to access
+
+    @return reference to the element at key @a key
+
+    @throw std::domain_error if JSON is not an object or null
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be read using
+    the [] operator.,operatorarray__key_type_const}
+    */
+    template<typename T, std::size_t n>
+    const_reference operator[](const T (&key)[n]) const
+    {
+        // at only works for objects
+        if (m_type != value_t::object)
+        {
+            throw std::domain_error("cannot use operator[] with " + type_name());
+        }
+
+        return m_value.object->operator[](key);
+    }
+
+    /*!
+    @brief access the first element
+
+    Returns a reference to the first element in the container. For a JSON
+    container `c`, the expression `c.front()` is equivalent to `*c.begin()`.
+
+    @return In case of a structured type (array or object), a reference to the
+    first element is returned. In cast of number, string, or boolean values, a
+    reference to the value is returned.
+
+    @complexity Constant.
+
+    @note Calling `front` on an empty container is undefined.
+
+    @throw std::out_of_range when called on null value
+
+    @liveexample{The following code shows an example for @ref front.,front}
+    */
+    reference front()
+    {
+        return *begin();
+    }
+
+    /*!
+    @copydoc basic_json::front()
+    */
+    const_reference front() const
+    {
+        return *cbegin();
+    }
+
+    /*!
+    @brief access the last element
+
+    Returns a reference to the last element in the container. For a JSON
+    container `c`, the expression `c.back()` is equivalent to `{ auto tmp =
+    c.end(); --tmp; return *tmp; }`.
+
+    @return In case of a structured type (array or object), a reference to the
+    last element is returned. In cast of number, string, or boolean values, a
+    reference to the value is returned.
+
+    @complexity Constant.
+
+    @note Calling `back` on an empty container is undefined.
+
+    @throw std::out_of_range when called on null value.
+
+    @liveexample{The following code shows an example for @ref back.,back}
+    */
+    reference back()
+    {
+        auto tmp = end();
+        --tmp;
+        return *tmp;
+    }
+
+    /*!
+    @copydoc basic_json::back()
+    */
+    const_reference back() const
+    {
+        auto tmp = cend();
+        --tmp;
+        return *tmp;
+    }
+
+    /*!
+    @brief remove element given an iterator
+
+    Removes the element specified by iterator @a pos. Invalidates iterators and
+    references at or after the point of the erase, including the end()
+    iterator. The iterator @a pos must be valid and dereferenceable. Thus the
+    end() iterator (which is valid, but is not dereferencable) cannot be used
+    as a value for @a pos.
+
+    If called on a primitive type other than null, the resulting JSON value
+    will be `null`.
+
+    @param[in] pos iterator to the element to remove
+    @return Iterator following the last removed element. If the iterator @a pos
+    refers to the last element, the end() iterator is returned.
+
+    @tparam InteratorType an @ref iterator or @ref const_iterator
+
+    @throw std::domain_error if called on a `null` value
+    @throw std::domain_error if called on an iterator which does not belong to
+    the current JSON value
+    @throw std::out_of_range if called on a primitive type with invalid iterator
+    (i.e., any iterator which is not end())
+
+    @complexity The complexity depends on the type:
+    - objects: amortized constant
+    - arrays: linear in distance between pos and the end of the container
+    - strings: linear in the length of the string
+    - other types: constant
+
+    @liveexample{The example shows the result of erase for different JSON
+    types.,erase__IteratorType}
+    */
+    template <class InteratorType, typename
+              std::enable_if<
+                  std::is_same<InteratorType, typename basic_json_t::iterator>::value or
+                  std::is_same<InteratorType, typename basic_json_t::const_iterator>::value
+                  , int>::type
+              = 0>
+    InteratorType erase(InteratorType pos)
+    {
+        // make sure iterator fits the current value
+        if (this != pos.m_object)
+        {
+            throw std::domain_error("iterator does not fit current value");
+        }
+
+        InteratorType result = end();
+
+        switch (m_type)
+        {
+            case value_t::number_integer:
+            case value_t::number_float:
+            case value_t::boolean:
+            case value_t::string:
+            {
+                if (not pos.m_it.primitive_iterator.is_begin())
+                {
+                    throw std::out_of_range("iterator out of range");
+                }
+
+                if (m_type == value_t::string)
+                {
+                    delete m_value.string;
+                    m_value.string = nullptr;
+                }
+
+                m_type = value_t::null;
+                break;
+            }
+
+            case value_t::object:
+            {
+                result.m_it.object_iterator = m_value.object->erase(pos.m_it.object_iterator);
+                break;
+            }
+
+            case value_t::array:
+            {
+                result.m_it.array_iterator = m_value.array->erase(pos.m_it.array_iterator);
+                break;
+            }
+
+            default:
+            {
+                throw std::domain_error("cannot use erase() with " + type_name());
+            }
+        }
+
+        return result;
+    }
+
+    /*!
+    @brief remove elements given an iterator range
+
+    Removes the element specified by the range `[first; last)`. Invalidates
+    iterators and references at or after the point of the erase, including the
+    end() iterator. The iterator @a first does not need to be dereferenceable
+    if `first == last`: erasing an empty range is a no-op.
+
+    If called on a primitive type other than null, the resulting JSON value
+    will be `null`.
+
+    @param[in] first iterator to the beginning of the range to remove
+    @param[in] last iterator past the end of the range to remove
+    @return Iterator following the last removed element. If the iterator @a
+    second refers to the last element, the end() iterator is returned.
+
+    @tparam InteratorType an @ref iterator or @ref const_iterator
+
+    @throw std::domain_error if called on a `null` value
+    @throw std::domain_error if called on iterators which does not belong to
+    the current JSON value
+    @throw std::out_of_range if called on a primitive type with invalid iterators
+    (i.e., if `first != begin()` and `last != end()`)
+
+    @complexity The complexity depends on the type:
+    - objects: `log(size()) + std::distance(first, last)`
+    - arrays: linear in the distance between @a first and @a last, plus linear
+      in the distance between @a last and end of the container
+    - strings: linear in the length of the string
+    - other types: constant
+
+    @liveexample{The example shows the result of erase for different JSON
+    types.,erase__IteratorType_IteratorType}
+    */
+    template <class InteratorType, typename
+              std::enable_if<
+                  std::is_same<InteratorType, typename basic_json_t::iterator>::value or
+                  std::is_same<InteratorType, typename basic_json_t::const_iterator>::value
+                  , int>::type
+              = 0>
+    InteratorType erase(InteratorType first, InteratorType last)
+    {
+        // make sure iterator fits the current value
+        if (this != first.m_object or this != last.m_object)
+        {
+            throw std::domain_error("iterators do not fit current value");
+        }
+
+        InteratorType result = end();
+
+        switch (m_type)
+        {
+            case value_t::number_integer:
+            case value_t::number_float:
+            case value_t::boolean:
+            case value_t::string:
+            {
+                if (not first.m_it.primitive_iterator.is_begin() or not last.m_it.primitive_iterator.is_end())
+                {
+                    throw std::out_of_range("iterators out of range");
+                }
+
+                if (m_type == value_t::string)
+                {
+                    delete m_value.string;
+                    m_value.string = nullptr;
+                }
+
+                m_type = value_t::null;
+                break;
+            }
+
+            case value_t::object:
+            {
+                result.m_it.object_iterator = m_value.object->erase(first.m_it.object_iterator,
+                                              last.m_it.object_iterator);
+                break;
+            }
+
+            case value_t::array:
+            {
+                result.m_it.array_iterator = m_value.array->erase(first.m_it.array_iterator,
+                                             last.m_it.array_iterator);
+                break;
+            }
+
+            default:
+            {
+                throw std::domain_error("cannot use erase with " + type_name());
+            }
+        }
+
+        return result;
+    }
+
+    /*!
+    @brief remove element from a JSON object given a key
+
+    Removes elements from a JSON object with the key value @a key.
+
+    @param[in] key value of the elements to remove
+
+    @return Number of elements removed. If ObjectType is the default `std::map`
+    type, the return value will always be `0` (@a key was not found) or `1` (@a
+    key was found).
+
+    @throw std::domain_error when called on a type other than JSON object
+
+    @complexity `log(size()) + count(key)`
+
+    @liveexample{The example shows the effect of erase.,erase__key_type}
+    */
+    size_type erase(const typename object_t::key_type& key)
+    {
+        // this erase only works for objects
+        if (m_type != value_t::object)
+        {
+            throw std::domain_error("cannot use erase() with " + type_name());
+        }
+
+        return m_value.object->erase(key);
+    }
+
+    /*!
+    @brief remove element from a JSON array given an index
+
+    Removes element from a JSON array at the index @a idx.
+
+    @param[in] idx index of the element to remove
+
+    @throw std::domain_error when called on a type other than JSON array
+    @throw std::out_of_range when `idx >= size()`
+
+    @complexity Linear in distance between @a idx and the end of the container.
+
+    @liveexample{The example shows the effect of erase.,erase__size_type}
+    */
+    void erase(const size_type idx)
+    {
+        // this erase only works for arrays
+        if (m_type != value_t::array)
+        {
+            throw std::domain_error("cannot use erase() with " + type_name());
+        }
+
+        if (idx >= size())
+        {
+            throw std::out_of_range("index out of range");
+        }
+
+        m_value.array->erase(m_value.array->begin() + static_cast<difference_type>(idx));
+    }
+
+    /*!
+    @brief find an element in a JSON object
+
+    Finds an element in a JSON object with key equivalent to @a key. If the
+    element is not found or the JSON value is not an object, end() is returned.
+
+    @param[in] key key value of the element to search for
+
+    @return Iterator to an element with key equivalent to @a key. If no such
+    element is found, past-the-end (see end()) iterator is returned.
+
+    @complexity Logarithmic in the size of the JSON object.
+
+    @liveexample{The example shows how find is used.,find__key_type}
+    */
+    iterator find(typename object_t::key_type key)
+    {
+        auto result = end();
+
+        if (m_type == value_t::object)
+        {
+            result.m_it.object_iterator = m_value.object->find(key);
+        }
+
+        return result;
+    }
+
+    /*!
+    @brief find an element in a JSON object
+    @copydoc find(typename object_t::key_type)
+    */
+    const_iterator find(typename object_t::key_type key) const
+    {
+        auto result = cend();
+
+        if (m_type == value_t::object)
+        {
+            result.m_it.object_iterator = m_value.object->find(key);
+        }
+
+        return result;
+    }
+
+    /*!
+    @brief returns the number of occurrences of a key in a JSON object
+
+    Returns the number of elements with key @a key. If ObjectType is the
+    default `std::map` type, the return value will always be `0` (@a key was
+    not found) or `1` (@a key was found).
+
+    @param[in] key key value of the element to count
+
+    @return Number of elements with key @a key. If the JSON value is not an
+    object, the return value will be `0`.
+
+    @complexity Logarithmic in the size of the JSON object.
+
+    @liveexample{The example shows how count is used.,count}
+    */
+    size_type count(typename object_t::key_type key) const
+    {
+        // return 0 for all nonobject types
+        return (m_type == value_t::object) ? m_value.object->count(key) : 0;
+    }
+
+    /// @}
+
+
+    ///////////////
+    // iterators //
+    ///////////////
+
+    /// @name iterators
+    /// @{
+
+    /*!
+    @brief returns an iterator to the first element
+
+    Returns an iterator to the first element.
+
+    @image html range-begin-end.svg "Illustration from cppreference.com"
+
+    @return iterator to the first element
+
+    @complexity Constant.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is constant.
+
+    @liveexample{The following code shows an example for @ref begin.,begin}
+    */
+    iterator begin()
+    {
+        iterator result(this);
+        result.set_begin();
+        return result;
+    }
+
+    /*!
+    @copydoc basic_json::cbegin()
+    */
+    const_iterator begin() const
+    {
+        return cbegin();
+    }
+
+    /*!
+    @brief returns a const iterator to the first element
+
+    Returns a const iterator to the first element.
+
+    @image html range-begin-end.svg "Illustration from cppreference.com"
+
+    @return const iterator to the first element
+
+    @complexity Constant.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is constant.
+    - Has the semantics of `const_cast<const basic_json&>(*this).begin()`.
+
+    @liveexample{The following code shows an example for @ref cbegin.,cbegin}
+    */
+    const_iterator cbegin() const
+    {
+        const_iterator result(this);
+        result.set_begin();
+        return result;
+    }
+
+    /*!
+    @brief returns an iterator to one past the last element
+
+    Returns an iterator to one past the last element.
+
+    @image html range-begin-end.svg "Illustration from cppreference.com"
+
+    @return iterator one past the last element
+
+    @complexity Constant.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is constant.
+
+    @liveexample{The following code shows an example for @ref end.,end}
+    */
+    iterator end()
+    {
+        iterator result(this);
+        result.set_end();
+        return result;
+    }
+
+    /*!
+    @copydoc basic_json::cend()
+    */
+    const_iterator end() const
+    {
+        return cend();
+    }
+
+    /*!
+    @brief returns a const iterator to one past the last element
+
+    Returns a const iterator to one past the last element.
+
+    @image html range-begin-end.svg "Illustration from cppreference.com"
+
+    @return const iterator one past the last element
+
+    @complexity Constant.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is constant.
+    - Has the semantics of `const_cast<const basic_json&>(*this).end()`.
+
+    @liveexample{The following code shows an example for @ref cend.,cend}
+    */
+    const_iterator cend() const
+    {
+        const_iterator result(this);
+        result.set_end();
+        return result;
+    }
+
+    /*!
+    @brief returns an iterator to the reverse-beginning
+
+    Returns an iterator to the reverse-beginning; that is, the last element.
+
+    @image html range-rbegin-rend.svg "Illustration from cppreference.com"
+
+    @complexity Constant.
+
+    @requirement This function satisfies the ReversibleContainer requirements:
+    - The complexity is constant.
+    - Has the semantics of `reverse_iterator(end())`.
+
+    @liveexample{The following code shows an example for @ref rbegin.,rbegin}
+    */
+    reverse_iterator rbegin()
+    {
+        return reverse_iterator(end());
+    }
+
+    /*!
+    @copydoc basic_json::crbegin()
+    */
+    const_reverse_iterator rbegin() const
+    {
+        return crbegin();
+    }
+
+    /*!
+    @brief returns an iterator to the reverse-end
+
+    Returns an iterator to the reverse-end; that is, one before the first
+    element.
+
+    @image html range-rbegin-rend.svg "Illustration from cppreference.com"
+
+    @complexity Constant.
+
+    @requirement This function satisfies the ReversibleContainer requirements:
+    - The complexity is constant.
+    - Has the semantics of `reverse_iterator(begin())`.
+
+    @liveexample{The following code shows an example for @ref rend.,rend}
+    */
+    reverse_iterator rend()
+    {
+        return reverse_iterator(begin());
+    }
+
+    /*!
+    @copydoc basic_json::crend()
+    */
+    const_reverse_iterator rend() const
+    {
+        return crend();
+    }
+
+    /*!
+    @brief returns a const reverse iterator to the last element
+
+    Returns a const iterator to the reverse-beginning; that is, the last
+    element.
+
+    @image html range-rbegin-rend.svg "Illustration from cppreference.com"
+
+    @complexity Constant.
+
+    @requirement This function satisfies the ReversibleContainer requirements:
+    - The complexity is constant.
+    - Has the semantics of `const_cast<const basic_json&>(*this).rbegin()`.
+
+    @liveexample{The following code shows an example for @ref crbegin.,crbegin}
+    */
+    const_reverse_iterator crbegin() const
+    {
+        return const_reverse_iterator(cend());
+    }
+
+    /*!
+    @brief returns a const reverse iterator to one before the first
+
+    Returns a const reverse iterator to the reverse-end; that is, one before
+    the first element.
+
+    @image html range-rbegin-rend.svg "Illustration from cppreference.com"
+
+    @complexity Constant.
+
+    @requirement This function satisfies the ReversibleContainer requirements:
+    - The complexity is constant.
+    - Has the semantics of `const_cast<const basic_json&>(*this).rend()`.
+
+    @liveexample{The following code shows an example for @ref crend.,crend}
+    */
+    const_reverse_iterator crend() const
+    {
+        return const_reverse_iterator(cbegin());
+    }
+
+    /// @}
+
+
+    //////////////
+    // capacity //
+    //////////////
+
+    /// @name capacity
+    /// @{
+
+    /*!
+    @brief checks whether the container is empty
+
+    Checks if a JSON value has no elements.
+
+    @return The return value depends on the different types and is
+            defined as follows:
+            Value type  | return value
+            ----------- | -------------
+            null        | @c true
+            boolean     | @c false
+            string      | @c false
+            number      | @c false
+            object      | result of function object_t::empty()
+            array       | result of function array_t::empty()
+
+    @complexity Constant, as long as @ref array_t and @ref object_t satisfy the
+                Container concept; that is, their empty() functions have
+                constant complexity.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is constant.
+    - Has the semantics of `begin() == end()`.
+
+    @liveexample{The following code uses @ref empty to check if a @ref json
+    object contains any elements.,empty}
+    */
+    bool empty() const noexcept
+    {
+        switch (m_type)
+        {
+            case (value_t::null):
+            {
+                return true;
+            }
+
+            case (value_t::array):
+            {
+                return m_value.array->empty();
+            }
+
+            case (value_t::object):
+            {
+                return m_value.object->empty();
+            }
+
+            default:
+            {
+                // all other types are nonempty
+                return false;
+            }
+        }
+    }
+
+    /*!
+    @brief returns the number of elements
+
+    Returns the number of elements in a JSON value.
+
+    @return The return value depends on the different types and is
+            defined as follows:
+            Value type  | return value
+            ----------- | -------------
+            null        | @c 0
+            boolean     | @c 1
+            string      | @c 1
+            number      | @c 1
+            object      | result of function object_t::size()
+            array       | result of function array_t::size()
+
+    @complexity Constant, as long as @ref array_t and @ref object_t satisfy the
+                Container concept; that is, their size() functions have
+                constant complexity.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is constant.
+    - Has the semantics of `std::distance(begin(), end())`.
+
+    @liveexample{The following code calls @ref size on the different value
+    types.,size}
+    */
+    size_type size() const noexcept
+    {
+        switch (m_type)
+        {
+            case (value_t::null):
+            {
+                return 0;
+            }
+
+            case (value_t::array):
+            {
+                return m_value.array->size();
+            }
+
+            case (value_t::object):
+            {
+                return m_value.object->size();
+            }
+
+            default:
+            {
+                // all other types have size 1
+                return 1;
+            }
+        }
+    }
+
+    /*!
+    @brief returns the maximum possible number of elements
+
+    Returns the maximum number of elements a JSON value is able to hold due to
+    system or library implementation limitations, i.e. `std::distance(begin(),
+    end())` for the JSON value.
+
+    @return The return value depends on the different types and is
+            defined as follows:
+            Value type  | return value
+            ----------- | -------------
+            null        | @c 0 (same as size())
+            boolean     | @c 1 (same as size())
+            string      | @c 1 (same as size())
+            number      | @c 1 (same as size())
+            object      | result of function object_t::max_size()
+            array       | result of function array_t::max_size()
+
+    @complexity Constant, as long as @ref array_t and @ref object_t satisfy the
+                Container concept; that is, their max_size() functions have
+                constant complexity.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is constant.
+    - Has the semantics of returning `b.size()` where `b` is the largest
+      possible JSON value.
+
+    @liveexample{The following code calls @ref max_size on the different value
+    types. Note the output is implementation specific.,max_size}
+    */
+    size_type max_size() const noexcept
+    {
+        switch (m_type)
+        {
+            case (value_t::array):
+            {
+                return m_value.array->max_size();
+            }
+
+            case (value_t::object):
+            {
+                return m_value.object->max_size();
+            }
+
+            default:
+            {
+                // all other types have max_size() == size()
+                return size();
+            }
+        }
+    }
+
+    /// @}
+
+
+    ///////////////
+    // modifiers //
+    ///////////////
+
+    /// @name modifiers
+    /// @{
+
+    /*!
+    @brief clears the contents
+
+    Clears the content of a JSON value and resets it to the default value as
+    if @ref basic_json(value_t) would have been called:
+
+    Value type  | initial value
+    ----------- | -------------
+    null        | `null`
+    boolean     | `false`
+    string      | `""`
+    number      | `0`
+    object      | `{}`
+    array       | `[]`
+
+    @note Floating-point numbers are set to `0.0` which will be serialized to
+    `0`. The vale type remains @ref number_float_t.
+
+    @complexity Linear in the size of the JSON value.
+
+    @liveexample{The example below shows the effect of @ref clear to different
+    JSON types.,clear}
+    */
+    void clear() noexcept
+    {
+        switch (m_type)
+        {
+            case (value_t::null):
+            case (value_t::discarded):
+            {
+                break;
+            }
+
+            case (value_t::number_integer):
+            {
+                m_value.number_integer = 0;
+                break;
+            }
+
+            case (value_t::number_float):
+            {
+                m_value.number_float = 0.0;
+                break;
+            }
+
+            case (value_t::boolean):
+            {
+                m_value.boolean = false;
+                break;
+            }
+
+            case (value_t::string):
+            {
+                m_value.string->clear();
+                break;
+            }
+
+            case (value_t::array):
+            {
+                m_value.array->clear();
+                break;
+            }
+
+            case (value_t::object):
+            {
+                m_value.object->clear();
+                break;
+            }
+        }
+    }
+
+    /*!
+    @brief add an object to an array
+
+    Appends the given element @a value to the end of the JSON value. If the
+    function is called on a JSON null value, an empty array is created before
+    appending @a value.
+
+    @param value the value to add to the JSON array
+
+    @throw std::domain_error when called on a type other than JSON array or null
+
+    @complexity Amortized constant.
+
+    @liveexample{The example shows how `push_back` and `+=` can be used to add
+    elements to a JSON array. Note how the `null` value was silently converted
+    to a JSON array.,push_back}
+    */
+    void push_back(basic_json&& value)
+    {
+        // push_back only works for null objects or arrays
+        if (not(m_type == value_t::null or m_type == value_t::array))
+        {
+            throw std::domain_error("cannot use push_back() with " + type_name());
+        }
+
+        // transform null object into an array
+        if (m_type == value_t::null)
+        {
+            m_type = value_t::array;
+            m_value = value_t::array;
+        }
+
+        // add element to array (move semantics)
+        m_value.array->push_back(std::move(value));
+        // invalidate object
+        value.m_type = value_t::null;
+    }
+
+    /*!
+    @brief add an object to an array
+    @copydoc push_back(basic_json&&)
+    */
+    reference operator+=(basic_json&& value)
+    {
+        push_back(std::move(value));
+        return *this;
+    }
+
+    /*!
+    @brief add an object to an array
+    @copydoc push_back(basic_json&&)
+    */
+    void push_back(const basic_json& value)
+    {
+        // push_back only works for null objects or arrays
+        if (not(m_type == value_t::null or m_type == value_t::array))
+        {
+            throw std::domain_error("cannot use push_back() with " + type_name());
+        }
+
+        // transform null object into an array
+        if (m_type == value_t::null)
+        {
+            m_type = value_t::array;
+            m_value = value_t::array;
+        }
+
+        // add element to array
+        m_value.array->push_back(value);
+    }
+
+    /*!
+    @brief add an object to an array
+    @copydoc push_back(basic_json&&)
+    */
+    reference operator+=(const basic_json& value)
+    {
+        push_back(value);
+        return *this;
+    }
+
+    /*!
+    @brief add an object to an object
+
+    Inserts the given element @a value to the JSON object. If the function is
+    called on a JSON null value, an empty object is created before inserting @a
+    value.
+
+    @param[in] value the value to add to the JSON object
+
+    @throw std::domain_error when called on a type other than JSON object or
+    null
+
+    @complexity Logarithmic in the size of the container, O(log(`size()`)).
+
+    @liveexample{The example shows how `push_back` and `+=` can be used to add
+    elements to a JSON object. Note how the `null` value was silently converted
+    to a JSON object.,push_back__object_t__value}
+    */
+    void push_back(const typename object_t::value_type& value)
+    {
+        // push_back only works for null objects or objects
+        if (not(m_type == value_t::null or m_type == value_t::object))
+        {
+            throw std::domain_error("cannot use push_back() with " + type_name());
+        }
+
+        // transform null object into an object
+        if (m_type == value_t::null)
+        {
+            m_type = value_t::object;
+            m_value = value_t::object;
+        }
+
+        // add element to array
+        m_value.object->insert(value);
+    }
+
+    /*!
+    @brief add an object to an object
+    @copydoc push_back(const typename object_t::value_type&)
+    */
+    reference operator+=(const typename object_t::value_type& value)
+    {
+        push_back(value);
+        return operator[](value.first);
+    }
+
+    /*!
+    @brief inserts element
+
+    Inserts element @a value before iterator @a pos.
+
+    @param[in] pos iterator before which the content will be inserted; may be
+    the end() iterator
+    @param[in] value element to insert
+    @return iterator pointing to the inserted @a value.
+
+    @throw std::domain_error if called on JSON values other than arrays
+    @throw std::domain_error if @a pos is not an iterator of *this
+
+    @complexity Constant plus linear in the distance between pos and end of the
+    container.
+
+    @liveexample{The example shows how insert is used.,insert}
+    */
+    iterator insert(const_iterator pos, const basic_json& value)
+    {
+        // insert only works for arrays
+        if (m_type != value_t::array)
+        {
+            throw std::domain_error("cannot use insert() with " + type_name());
+        }
+
+        // check if iterator pos fits to this JSON value
+        if (pos.m_object != this)
+        {
+            throw std::domain_error("iterator does not fit current value");
+        }
+
+        // insert to array and return iterator
+        iterator result(this);
+        result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator, value);
+        return result;
+    }
+
+    /*!
+    @brief inserts element
+    @copydoc insert(const_iterator, const basic_json&)
+    */
+    iterator insert(const_iterator pos, basic_json&& value)
+    {
+        return insert(pos, value);
+    }
+
+    /*!
+    @brief inserts elements
+
+    Inserts @a count copies of @a value before iterator @a pos.
+
+    @param[in] pos iterator before which the content will be inserted; may be
+    the end() iterator
+    @param[in] count number of copies of @a value to insert
+    @param[in] value element to insert
+    @return iterator pointing to the first element inserted, or @a pos if
+    `count==0`
+
+    @throw std::domain_error if called on JSON values other than arrays
+    @throw std::domain_error if @a pos is not an iterator of *this
+
+    @complexity Linear in @a count plus linear in the distance between @a pos
+    and end of the container.
+
+    @liveexample{The example shows how insert is used.,insert__count}
+    */
+    iterator insert(const_iterator pos, size_type count, const basic_json& value)
+    {
+        // insert only works for arrays
+        if (m_type != value_t::array)
+        {
+            throw std::domain_error("cannot use insert() with " + type_name());
+        }
+
+        // check if iterator pos fits to this JSON value
+        if (pos.m_object != this)
+        {
+            throw std::domain_error("iterator does not fit current value");
+        }
+
+        // insert to array and return iterator
+        iterator result(this);
+        result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator, count, value);
+        return result;
+    }
+
+    /*!
+    @brief inserts elements
+
+    Inserts elements from range `[first, last)` before iterator @a pos.
+
+    @param[in] pos iterator before which the content will be inserted; may be
+    the end() iterator
+    @param[in] first begin of the range of elements to insert
+    @param[in] last end of the range of elements to insert
+
+    @throw std::domain_error if called on JSON values other than arrays
+    @throw std::domain_error if @a pos is not an iterator of *this
+    @throw std::domain_error if @a first and @a last do not belong to the same
+    JSON value
+    @throw std::domain_error if @a first or @a last are iterators into
+    container for which insert is called
+    @return iterator pointing to the first element inserted, or @a pos if
+    `first==last`
+
+    @complexity Linear in `std::distance(first, last)` plus linear in the
+    distance between @a pos and end of the container.
+
+    @liveexample{The example shows how insert is used.,insert__range}
+    */
+    iterator insert(const_iterator pos, const_iterator first, const_iterator last)
+    {
+        // insert only works for arrays
+        if (m_type != value_t::array)
+        {
+            throw std::domain_error("cannot use insert() with " + type_name());
+        }
+
+        // check if iterator pos fits to this JSON value
+        if (pos.m_object != this)
+        {
+            throw std::domain_error("iterator does not fit current value");
+        }
+
+        if (first.m_object != last.m_object)
+        {
+            throw std::domain_error("iterators does not fit");
+        }
+
+        if (first.m_object == this or last.m_object == this)
+        {
+            throw std::domain_error("passed iterators may not belong to container");
+        }
+
+        // insert to array and return iterator
+        iterator result(this);
+        result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator,
+                                     first.m_it.array_iterator, last.m_it.array_iterator);
+        return result;
+    }
+
+    /*!
+    @brief inserts elements
+
+    Inserts elements from initializer list @a ilist before iterator @a pos.
+
+    @param[in] pos iterator before which the content will be inserted; may be
+    the end() iterator
+    @param[in] ilist initializer list to insert the values from
+
+    @throw std::domain_error if called on JSON values other than arrays
+    @throw std::domain_error if @a pos is not an iterator of *this
+    @return iterator pointing to the first element inserted, or @a pos if
+    `ilist` is empty
+
+    @complexity Linear in `ilist.size()` plus linear in the distance between @a
+    pos and end of the container.
+
+    @liveexample{The example shows how insert is used.,insert__ilist}
+    */
+    iterator insert(const_iterator pos, std::initializer_list<basic_json> ilist)
+    {
+        // insert only works for arrays
+        if (m_type != value_t::array)
+        {
+            throw std::domain_error("cannot use insert() with " + type_name());
+        }
+
+        // check if iterator pos fits to this JSON value
+        if (pos.m_object != this)
+        {
+            throw std::domain_error("iterator does not fit current value");
+        }
+
+        // insert to array and return iterator
+        iterator result(this);
+        result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator, ilist);
+        return result;
+    }
+
+    /*!
+    @brief exchanges the values
+
+    Exchanges the contents of the JSON value with those of @a other. Does not
+    invoke any move, copy, or swap operations on individual elements. All
+    iterators and references remain valid. The past-the-end iterator is
+    invalidated.
+
+    @param[in,out] other JSON value to exchange the contents with
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how JSON arrays can be
+    swapped.,swap__reference}
+    */
+    void swap(reference other) noexcept (
+        std::is_nothrow_move_constructible<value_t>::value and
+        std::is_nothrow_move_assignable<value_t>::value and
+        std::is_nothrow_move_constructible<json_value>::value and
+        std::is_nothrow_move_assignable<json_value>::value
+    )
+    {
+        std::swap(m_type, other.m_type);
+        std::swap(m_value, other.m_value);
+    }
+
+    /*!
+    @brief exchanges the values
+
+    Exchanges the contents of a JSON array with those of @a other. Does not
+    invoke any move, copy, or swap operations on individual elements. All
+    iterators and references remain valid. The past-the-end iterator is
+    invalidated.
+
+    @param[in,out] other array to exchange the contents with
+
+    @throw std::domain_error when JSON value is not an array
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how JSON values can be
+    swapped.,swap__array_t}
+    */
+    void swap(array_t& other)
+    {
+        // swap only works for arrays
+        if (m_type != value_t::array)
+        {
+            throw std::domain_error("cannot use swap() with " + type_name());
+        }
+
+        // swap arrays
+        std::swap(*(m_value.array), other);
+    }
+
+    /*!
+    @brief exchanges the values
+
+    Exchanges the contents of a JSON object with those of @a other. Does not
+    invoke any move, copy, or swap operations on individual elements. All
+    iterators and references remain valid. The past-the-end iterator is
+    invalidated.
+
+    @param[in,out] other object to exchange the contents with
+
+    @throw std::domain_error when JSON value is not an object
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how JSON values can be
+    swapped.,swap__object_t}
+    */
+    void swap(object_t& other)
+    {
+        // swap only works for objects
+        if (m_type != value_t::object)
+        {
+            throw std::domain_error("cannot use swap() with " + type_name());
+        }
+
+        // swap objects
+        std::swap(*(m_value.object), other);
+    }
+
+    /*!
+    @brief exchanges the values
+
+    Exchanges the contents of a JSON string with those of @a other. Does not
+    invoke any move, copy, or swap operations on individual elements. All
+    iterators and references remain valid. The past-the-end iterator is
+    invalidated.
+
+    @param[in,out] other string to exchange the contents with
+
+    @throw std::domain_error when JSON value is not a string
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how JSON values can be
+    swapped.,swap__string_t}
+    */
+    void swap(string_t& other)
+    {
+        // swap only works for strings
+        if (m_type != value_t::string)
+        {
+            throw std::domain_error("cannot use swap() with " + type_name());
+        }
+
+        // swap strings
+        std::swap(*(m_value.string), other);
+    }
+
+    /// @}
+
+
+    //////////////////////////////////////////
+    // lexicographical comparison operators //
+    //////////////////////////////////////////
+
+    /// @name lexicographical comparison operators
+    /// @{
+
+  private:
+    /*!
+    @brief comparison operator for JSON types
+
+    Returns an ordering that is similar to Python:
+    - order: null < boolean < number < object < array < string
+    - furthermore, each type is not smaller than itself
+    */
+    friend bool operator<(const value_t lhs, const value_t rhs)
+    {
+        static constexpr std::array<uint8_t, 7> order = {{
+                0, // null
+                3, // object
+                4, // array
+                5, // string
+                1, // boolean
+                2, // integer
+                2  // float
+            }
+        };
+
+        // discarded values are not comparable
+        if (lhs == value_t::discarded or rhs == value_t::discarded)
+        {
+            return false;
+        }
+
+        return order[static_cast<std::size_t>(lhs)] < order[static_cast<std::size_t>(rhs)];
+    }
+
+  public:
+    /*!
+    @brief comparison: equal
+
+    Compares two JSON values for equality according to the following rules:
+    - Two JSON values are equal if (1) they are from the same type and (2)
+      their stored values are the same.
+    - Integer and floating-point numbers are automatically converted before
+      comparison. Floating-point numbers are compared indirectly: two
+      floating-point numbers `f1` and `f2` are considered equal if neither
+      `f1 > f2` nor `f2 > f1` holds.
+    - Two JSON null values are equal.
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether the values @a lhs and @a rhs are equal
+
+    @complexity Linear.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__equal}
+    */
+    friend bool operator==(const_reference lhs, const_reference rhs) noexcept
+    {
+        const auto lhs_type = lhs.type();
+        const auto rhs_type = rhs.type();
+
+        if (lhs_type == rhs_type)
+        {
+            switch (lhs_type)
+            {
+                case (value_t::array):
+                    return *lhs.m_value.array == *rhs.m_value.array;
+                case (value_t::object):
+                    return *lhs.m_value.object == *rhs.m_value.object;
+                case (value_t::null):
+                    return true;
+                case (value_t::string):
+                    return *lhs.m_value.string == *rhs.m_value.string;
+                case (value_t::boolean):
+                    return lhs.m_value.boolean == rhs.m_value.boolean;
+                case (value_t::number_integer):
+                    return lhs.m_value.number_integer == rhs.m_value.number_integer;
+                case (value_t::number_float):
+                    return approx(lhs.m_value.number_float, rhs.m_value.number_float);
+                case (value_t::discarded):
+                    return false;
+            }
+        }
+        else if (lhs_type == value_t::number_integer and rhs_type == value_t::number_float)
+        {
+            return approx(static_cast<number_float_t>(lhs.m_value.number_integer),
+                          rhs.m_value.number_float);
+        }
+        else if (lhs_type == value_t::number_float and rhs_type == value_t::number_integer)
+        {
+            return approx(lhs.m_value.number_float,
+                          static_cast<number_float_t>(rhs.m_value.number_integer));
+        }
+        return false;
+    }
+
+    /*!
+    @brief comparison: equal
+
+    The functions compares the given JSON value against a null pointer. As the
+    null pointer can be used to initialize a JSON value to null, a comparison
+    of JSON value @a v with a null pointer should be equivalent to call
+    `v.is_null()`.
+
+    @param[in] v  JSON value to consider
+    @return whether @a v is null
+
+    @complexity Constant.
+
+    @liveexample{The example compares several JSON types to the null pointer.
+    ,operator__equal__nullptr_t}
+    */
+    friend bool operator==(const_reference v, std::nullptr_t) noexcept
+    {
+        return v.is_null();
+    }
+
+    /*!
+    @brief comparison: equal
+    @copydoc operator==(const_reference, std::nullptr_t)
+    */
+    friend bool operator==(std::nullptr_t, const_reference v) noexcept
+    {
+        return v.is_null();
+    }
+
+    /*!
+    @brief comparison: not equal
+
+    Compares two JSON values for inequality by calculating `not (lhs == rhs)`.
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether the values @a lhs and @a rhs are not equal
+
+    @complexity Linear.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__notequal}
+    */
+    friend bool operator!=(const_reference lhs, const_reference rhs) noexcept
+    {
+        return not (lhs == rhs);
+    }
+
+    /*!
+    @brief comparison: not equal
+
+    The functions compares the given JSON value against a null pointer. As the
+    null pointer can be used to initialize a JSON value to null, a comparison
+    of JSON value @a v with a null pointer should be equivalent to call
+    `not v.is_null()`.
+
+    @param[in] v  JSON value to consider
+    @return whether @a v is not null
+
+    @complexity Constant.
+
+    @liveexample{The example compares several JSON types to the null pointer.
+    ,operator__notequal__nullptr_t}
+    */
+    friend bool operator!=(const_reference v, std::nullptr_t) noexcept
+    {
+        return not v.is_null();
+    }
+
+    /*!
+    @brief comparison: not equal
+    @copydoc operator!=(const_reference, std::nullptr_t)
+    */
+    friend bool operator!=(std::nullptr_t, const_reference v) noexcept
+    {
+        return not v.is_null();
+    }
+
+    /*!
+    @brief comparison: less than
+
+    Compares whether one JSON value @a lhs is less than another JSON value @a
+    rhs according to the following rules:
+    - If @a lhs and @a rhs have the same type, the values are compared using
+      the default `<` operator.
+    - Integer and floating-point numbers are automatically converted before
+      comparison
+    - In case @a lhs and @a rhs have different types, the values are ignored
+      and the order of the types is considered, see
+      @ref operator<(const value_t, const value_t).
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether @a lhs is less than @a rhs
+
+    @complexity Linear.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__less}
+    */
+    friend bool operator<(const_reference lhs, const_reference rhs) noexcept
+    {
+        const auto lhs_type = lhs.type();
+        const auto rhs_type = rhs.type();
+
+        if (lhs_type == rhs_type)
+        {
+            switch (lhs_type)
+            {
+                case (value_t::array):
+                    return *lhs.m_value.array < *rhs.m_value.array;
+                case (value_t::object):
+                    return *lhs.m_value.object < *rhs.m_value.object;
+                case (value_t::null):
+                    return false;
+                case (value_t::string):
+                    return *lhs.m_value.string < *rhs.m_value.string;
+                case (value_t::boolean):
+                    return lhs.m_value.boolean < rhs.m_value.boolean;
+                case (value_t::number_integer):
+                    return lhs.m_value.number_integer < rhs.m_value.number_integer;
+                case (value_t::number_float):
+                    return lhs.m_value.number_float < rhs.m_value.number_float;
+                case (value_t::discarded):
+                    return false;
+            }
+        }
+        else if (lhs_type == value_t::number_integer and rhs_type == value_t::number_float)
+        {
+            return static_cast<number_float_t>(lhs.m_value.number_integer) <
+                   rhs.m_value.number_float;
+        }
+        else if (lhs_type == value_t::number_float and rhs_type == value_t::number_integer)
+        {
+            return lhs.m_value.number_float <
+                   static_cast<number_float_t>(rhs.m_value.number_integer);
+        }
+
+        // We only reach this line if we cannot compare values. In that case,
+        // we compare types. Note we have to call the operator explicitly,
+        // because MSVC has problems otherwise.
+        return operator<(lhs_type, rhs_type);
+    }
+
+    /*!
+    @brief comparison: less than or equal
+
+    Compares whether one JSON value @a lhs is less than or equal to another
+    JSON value by calculating `not (rhs < lhs)`.
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether @a lhs is less than or equal to @a rhs
+
+    @complexity Linear.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__greater}
+    */
+    friend bool operator<=(const_reference lhs, const_reference rhs) noexcept
+    {
+        return not (rhs < lhs);
+    }
+
+    /*!
+    @brief comparison: greater than
+
+    Compares whether one JSON value @a lhs is greater than another
+    JSON value by calculating `not (lhs <= rhs)`.
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether @a lhs is greater than to @a rhs
+
+    @complexity Linear.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__lessequal}
+    */
+    friend bool operator>(const_reference lhs, const_reference rhs) noexcept
+    {
+        return not (lhs <= rhs);
+    }
+
+    /*!
+    @brief comparison: greater than or equal
+
+    Compares whether one JSON value @a lhs is greater than or equal to another
+    JSON value by calculating `not (lhs < rhs)`.
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether @a lhs is greater than or equal to @a rhs
+
+    @complexity Linear.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__greaterequal}
+    */
+    friend bool operator>=(const_reference lhs, const_reference rhs) noexcept
+    {
+        return not (lhs < rhs);
+    }
+
+    /// @}
+
+
+    ///////////////////
+    // serialization //
+    ///////////////////
+
+    /// @name serialization
+    /// @{
+
+    /*!
+    @brief serialize to stream
+
+    Serialize the given JSON value @a j to the output stream @a o. The JSON
+    value will be serialized using the @ref dump member function. The
+    indentation of the output can be controlled with the member variable
+    `width` of the output stream @a o. For instance, using the manipulator
+    `std::setw(4)` on @a o sets the indentation level to `4` and the
+    serialization result is the same as calling `dump(4)`.
+
+    @param[in,out] o  stream to serialize to
+    @param[in] j  JSON value to serialize
+
+    @return the stream @a o
+
+    @complexity Linear.
+
+    @liveexample{The example below shows the serialization with different
+    parameters to `width` to adjust the indentation level.,operator_serialize}
+    */
+    friend std::ostream& operator<<(std::ostream& o, const basic_json& j)
+    {
+        // read width member and use it as indentation parameter if nonzero
+        const bool pretty_print = (o.width() > 0);
+        const auto indentation = (pretty_print ? o.width() : 0);
+
+        // reset width to 0 for subsequent calls to this stream
+        o.width(0);
+
+        // do the actual serialization
+        j.dump(o, pretty_print, static_cast<unsigned int>(indentation));
+        return o;
+    }
+
+    /*!
+    @brief serialize to stream
+    @copydoc operator<<(std::ostream&, const basic_json&)
+    */
+    friend std::ostream& operator>>(const basic_json& j, std::ostream& o)
+    {
+        return o << j;
+    }
+
+    /// @}
+
+
+    /////////////////////
+    // deserialization //
+    /////////////////////
+
+    /// @name deserialization
+    /// @{
+
+    /*!
+    @brief deserialize from string
+
+    @param[in] s  string to read a serialized JSON value from
+    @param[in] cb a parser callback function of type @ref parser_callback_t
+    which is used to control the deserialization by filtering unwanted values
+    (optional)
+
+    @return result of the deserialization
+
+    @complexity Linear in the length of the input. The parser is a predictive
+    LL(1) parser. The complexity can be higher if the parser callback function
+    @a cb has a super-linear complexity.
+
+    @liveexample{The example below demonstrates the parse function with and
+    without callback function.,parse__string__parser_callback_t}
+
+    @sa parse(std::istream&, parser_callback_t) for a version that reads from
+    an input stream
+    */
+    static basic_json parse(const string_t& s, parser_callback_t cb = nullptr)
+    {
+        return parser(s, cb).parse();
+    }
+
+    /*!
+    @brief deserialize from stream
+
+    @param[in,out] i  stream to read a serialized JSON value from
+    @param[in] cb a parser callback function of type @ref parser_callback_t
+    which is used to control the deserialization by filtering unwanted values
+    (optional)
+
+    @return result of the deserialization
+
+    @complexity Linear in the length of the input. The parser is a predictive
+    LL(1) parser. The complexity can be higher if the parser callback function
+    @a cb has a super-linear complexity.
+
+    @liveexample{The example below demonstrates the parse function with and
+    without callback function.,parse__istream__parser_callback_t}
+
+    @sa parse(const string_t&, parser_callback_t) for a version that reads
+    from a string
+    */
+    static basic_json parse(std::istream& i, parser_callback_t cb = nullptr)
+    {
+        return parser(i, cb).parse();
+    }
+
+    static basic_json parse(std::istream&& i, parser_callback_t cb = nullptr)
+    {
+        return parser(i, cb).parse();
+    }
+
+    /*!
+    @brief deserialize from stream
+
+    Deserializes an input stream to a JSON value.
+
+    @param[in,out] i  input stream to read a serialized JSON value from
+    @param[in,out] j  JSON value to write the deserialized input to
+
+    @throw std::invalid_argument in case of parse errors
+
+    @complexity Linear in the length of the input. The parser is a predictive
+    LL(1) parser.
+
+    @liveexample{The example below shows how a JSON value is constructed by
+    reading a serialization from a stream.,operator_deserialize}
+
+    @sa parse(std::istream&, parser_callback_t) for a variant with a parser
+    callback function to filter values while parsing
+    */
+    friend std::istream& operator<<(basic_json& j, std::istream& i)
+    {
+        j = parser(i).parse();
+        return i;
+    }
+
+    /*!
+    @brief deserialize from stream
+    @copydoc operator<<(basic_json&, std::istream&)
+    */
+    friend std::istream& operator>>(std::istream& i, basic_json& j)
+    {
+        j = parser(i).parse();
+        return i;
+    }
+
+    /// @}
+
+
+  private:
+    ///////////////////////////
+    // convenience functions //
+    ///////////////////////////
+
+    /// return the type as string
+    string_t type_name() const
+    {
+        switch (m_type)
+        {
+            case (value_t::null):
+            {
+                return "null";
+            }
+
+            case (value_t::object):
+            {
+                return "object";
+            }
+
+            case (value_t::array):
+            {
+                return "array";
+            }
+
+            case (value_t::string):
+            {
+                return "string";
+            }
+
+            case (value_t::boolean):
+            {
+                return "boolean";
+            }
+
+            case (value_t::discarded):
+            {
+                return "discarded";
+            }
+
+            default:
+            {
+                return "number";
+            }
+        }
+    }
+
+    /*!
+    @brief calculates the extra space to escape a JSON string
+
+    @param[in] s  the string to escape
+    @return the number of characters required to escape string @a s
+
+    @complexity Linear in the length of string @a s.
+    */
+    static std::size_t extra_space(const string_t& s) noexcept
+    {
+        std::size_t result = 0;
+
+        for (const auto& c : s)
+        {
+            switch (c)
+            {
+                case '"':
+                case '\\':
+                case '\b':
+                case '\f':
+                case '\n':
+                case '\r':
+                case '\t':
+                {
+                    // from c (1 byte) to \x (2 bytes)
+                    result += 1;
+                    break;
+                }
+
+                default:
+                {
+                    if (c >= 0x00 and c <= 0x1f)
+                    {
+                        // from c (1 byte) to \uxxxx (6 bytes)
+                        result += 5;
+                    }
+                    break;
+                }
+            }
+        }
+
+        return result;
+    }
+
+    /*!
+    @brief escape a string
+
+    Escape a string by replacing certain special characters by a sequence of an
+    escape character (backslash) and another character and other control
+    characters by a sequence of "\u" followed by a four-digit hex
+    representation.
+
+    @param[in] s  the string to escape
+    @return  the escaped string
+
+    @complexity Linear in the length of string @a s.
+    */
+    static string_t escape_string(const string_t& s) noexcept
+    {
+        const auto space = extra_space(s);
+        if (space == 0)
+        {
+            return s;
+        }
+
+        // create a result string of necessary size
+        string_t result(s.size() + space, '\\');
+        std::size_t pos = 0;
+
+        for (const auto& c : s)
+        {
+            switch (c)
+            {
+                // quotation mark (0x22)
+                case '"':
+                {
+                    result[pos + 1] = '"';
+                    pos += 2;
+                    break;
+                }
+
+                // reverse solidus (0x5c)
+                case '\\':
+                {
+                    // nothing to change
+                    pos += 2;
+                    break;
+                }
+
+                // backspace (0x08)
+                case '\b':
+                {
+                    result[pos + 1] = 'b';
+                    pos += 2;
+                    break;
+                }
+
+                // formfeed (0x0c)
+                case '\f':
+                {
+                    result[pos + 1] = 'f';
+                    pos += 2;
+                    break;
+                }
+
+                // newline (0x0a)
+                case '\n':
+                {
+                    result[pos + 1] = 'n';
+                    pos += 2;
+                    break;
+                }
+
+                // carriage return (0x0d)
+                case '\r':
+                {
+                    result[pos + 1] = 'r';
+                    pos += 2;
+                    break;
+                }
+
+                // horizontal tab (0x09)
+                case '\t':
+                {
+                    result[pos + 1] = 't';
+                    pos += 2;
+                    break;
+                }
+
+                default:
+                {
+                    if (c >= 0x00 and c <= 0x1f)
+                    {
+                        // print character c as \uxxxx
+                        sprintf(&result[pos + 1], "u%04x", int(c));
+                        pos += 6;
+                        // overwrite trailing null character
+                        result[pos] = '\\';
+                    }
+                    else
+                    {
+                        // all other characters are added as-is
+                        result[pos++] = c;
+                    }
+                    break;
+                }
+            }
+        }
+
+        return result;
+    }
+
+    /*!
+    @brief internal implementation of the serialization function
+
+    This function is called by the public member function dump and organizes
+    the serializaion internally. The indentation level is propagated as
+    additional parameter. In case of arrays and objects, the function is called
+    recursively. Note that
+
+    - strings and object keys are escaped using escape_string()
+    - integer numbers are converted implictly via operator<<
+    - floating-point numbers are converted to a string using "%g" format
+
+    @param[out] o              stream to write to
+    @param[in] pretty_print    whether the output shall be pretty-printed
+    @param[in] indent_step     the indent level
+    @param[in] current_indent  the current indent level (only used internally)
+    */
+    void dump(std::ostream& o, const bool pretty_print, const unsigned int indent_step,
+              const unsigned int current_indent = 0) const
+    {
+        // variable to hold indentation for recursive calls
+        unsigned int new_indent = current_indent;
+
+        switch (m_type)
+        {
+            case (value_t::object):
+            {
+                if (m_value.object->empty())
+                {
+                    o << "{}";
+                    return;
+                }
+
+                o << "{";
+
+                // increase indentation
+                if (pretty_print)
+                {
+                    new_indent += indent_step;
+                    o << "\n";
+                }
+
+                for (auto i = m_value.object->cbegin(); i != m_value.object->cend(); ++i)
+                {
+                    if (i != m_value.object->cbegin())
+                    {
+                        o << (pretty_print ? ",\n" : ",");
+                    }
+                    o << string_t(new_indent, ' ') << "\""
+                      << escape_string(i->first) << "\":"
+                      << (pretty_print ? " " : "");
+                    i->second.dump(o, pretty_print, indent_step, new_indent);
+                }
+
+                // decrease indentation
+                if (pretty_print)
+                {
+                    new_indent -= indent_step;
+                    o << "\n";
+                }
+
+                o << string_t(new_indent, ' ') + "}";
+                return;
+            }
+
+            case (value_t::array):
+            {
+                if (m_value.array->empty())
+                {
+                    o << "[]";
+                    return;
+                }
+
+                o << "[";
+
+                // increase indentation
+                if (pretty_print)
+                {
+                    new_indent += indent_step;
+                    o << "\n";
+                }
+
+                for (auto i = m_value.array->cbegin(); i != m_value.array->cend(); ++i)
+                {
+                    if (i != m_value.array->cbegin())
+                    {
+                        o << (pretty_print ? ",\n" : ",");
+                    }
+                    o << string_t(new_indent, ' ');
+                    i->dump(o, pretty_print, indent_step, new_indent);
+                }
+
+                // decrease indentation
+                if (pretty_print)
+                {
+                    new_indent -= indent_step;
+                    o << "\n";
+                }
+
+                o << string_t(new_indent, ' ') << "]";
+                return;
+            }
+
+            case (value_t::string):
+            {
+                o << string_t("\"") << escape_string(*m_value.string) << "\"";
+                return;
+            }
+
+            case (value_t::boolean):
+            {
+                o << (m_value.boolean ? "true" : "false");
+                return;
+            }
+
+            case (value_t::number_integer):
+            {
+                o << m_value.number_integer;
+                return;
+            }
+
+            case (value_t::number_float):
+            {
+                // 15 digits of precision allows round-trip IEEE 754
+                // string->double->string; to be safe, we read this value from
+                // std::numeric_limits<number_float_t>::digits10
+                o << std::setprecision(std::numeric_limits<number_float_t>::digits10) << m_value.number_float;
+                return;
+            }
+
+            case (value_t::discarded):
+            {
+                o << "<discarded>";
+                return;
+            }
+
+            default:
+            {
+                o << "null";
+                return;
+            }
+        }
+    }
+
+  private:
+    //////////////////////
+    // member variables //
+    //////////////////////
+
+    /// the type of the current element
+    value_t m_type = value_t::null;
+
+    /// the value of the current element
+    json_value m_value = {};
+
+
+  private:
+    ///////////////
+    // iterators //
+    ///////////////
+
+    /*!
+    @brief an iterator for primitive JSON types
+
+    This class models an iterator for primitive JSON types (boolean, number,
+    string). It's only purpose is to allow the iterator/const_iterator classes
+    to "iterate" over primitive values. Internally, the iterator is modeled by
+    a `difference_type` variable. Value begin_value (`0`) models the begin,
+    end_value (`1`) models past the end.
+    */
+    class primitive_iterator_t
+    {
+      public:
+        /// set iterator to a defined beginning
+        void set_begin()
+        {
+            m_it = begin_value;
+        }
+
+        /// set iterator to a defined past the end
+        void set_end()
+        {
+            m_it = end_value;
+        }
+
+        /// return whether the iterator can be dereferenced
+        bool is_begin() const
+        {
+            return (m_it == begin_value);
+        }
+
+        /// return whether the iterator is at end
+        bool is_end() const
+        {
+            return (m_it == end_value);
+        }
+
+        /// return reference to the value to change and compare
+        operator difference_type& ()
+        {
+            return m_it;
+        }
+
+        /// return value to compare
+        operator difference_type () const
+        {
+            return m_it;
+        }
+
+      private:
+        static constexpr difference_type begin_value = 0;
+        static constexpr difference_type end_value = begin_value + 1;
+
+        /// iterator as signed integer type
+        difference_type m_it = std::numeric_limits<std::ptrdiff_t>::min();
+    };
+
+    /*!
+    @brief an iterator value
+
+    @note This structure could easily be a union, but MSVC currently does not
+    allow unions members with complex constructors, see
+    https://github.com/nlohmann/json/pull/105.
+    */
+    struct internal_iterator
+    {
+        /// iterator for JSON objects
+        typename object_t::iterator object_iterator;
+        /// iterator for JSON arrays
+        typename array_t::iterator array_iterator;
+        /// generic iterator for all other types
+        primitive_iterator_t primitive_iterator;
+
+        /// create an uninitialized internal_iterator
+        internal_iterator()
+            : object_iterator(), array_iterator(), primitive_iterator()
+        {}
+    };
+
+  public:
+    /*!
+    @brief a const random access iterator for the @ref basic_json class
+
+    This class implements a const iterator for the @ref basic_json class. From
+    this class, the @ref iterator class is derived.
+
+    @requirement The class satisfies the following concept requirements:
+    - [RandomAccessIterator](http://en.cppreference.com/w/cpp/concept/RandomAccessIterator):
+      The iterator that can be moved to point (forward and backward) to any
+      element in constant time.
+    */
+    class const_iterator : public std::iterator<std::random_access_iterator_tag, const basic_json>
+    {
+        /// allow basic_json to access private members
+        friend class basic_json;
+
+      public:
+        /// the type of the values when the iterator is dereferenced
+        using value_type = typename basic_json::value_type;
+        /// a type to represent differences between iterators
+        using difference_type = typename basic_json::difference_type;
+        /// defines a pointer to the type iterated over (value_type)
+        using pointer = typename basic_json::const_pointer;
+        /// defines a reference to the type iterated over (value_type)
+        using reference = typename basic_json::const_reference;
+        /// the category of the iterator
+        using iterator_category = std::bidirectional_iterator_tag;
+
+        /// default constructor
+        const_iterator() = default;
+
+        /// constructor for a given JSON instance
+        const_iterator(pointer object) : m_object(object)
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    m_it.object_iterator = typename object_t::iterator();
+                    break;
+                }
+                case (basic_json::value_t::array):
+                {
+                    m_it.array_iterator = typename array_t::iterator();
+                    break;
+                }
+                default:
+                {
+                    m_it.primitive_iterator = primitive_iterator_t();
+                    break;
+                }
+            }
+        }
+
+        /// copy constructor given a nonconst iterator
+        const_iterator(const iterator& other) : m_object(other.m_object)
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    m_it.object_iterator = other.m_it.object_iterator;
+                    break;
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    m_it.array_iterator = other.m_it.array_iterator;
+                    break;
+                }
+
+                default:
+                {
+                    m_it.primitive_iterator = other.m_it.primitive_iterator;
+                    break;
+                }
+            }
+        }
+
+        /// copy constructor
+        const_iterator(const const_iterator& other) noexcept
+            : m_object(other.m_object), m_it(other.m_it)
+        {}
+
+        /// copy assignment
+        const_iterator& operator=(const_iterator other) noexcept(
+            std::is_nothrow_move_constructible<pointer>::value and
+            std::is_nothrow_move_assignable<pointer>::value and
+            std::is_nothrow_move_constructible<internal_iterator>::value and
+            std::is_nothrow_move_assignable<internal_iterator>::value
+        )
+        {
+            std::swap(m_object, other.m_object);
+            std::swap(m_it, other.m_it);
+            return *this;
+        }
+
+      private:
+        /// set the iterator to the first value
+        void set_begin()
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    m_it.object_iterator = m_object->m_value.object->begin();
+                    break;
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    m_it.array_iterator = m_object->m_value.array->begin();
+                    break;
+                }
+
+                case (basic_json::value_t::null):
+                {
+                    // set to end so begin()==end() is true: null is empty
+                    m_it.primitive_iterator.set_end();
+                    break;
+                }
+
+                default:
+                {
+                    m_it.primitive_iterator.set_begin();
+                    break;
+                }
+            }
+        }
+
+        /// set the iterator past the last value
+        void set_end()
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    m_it.object_iterator = m_object->m_value.object->end();
+                    break;
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    m_it.array_iterator = m_object->m_value.array->end();
+                    break;
+                }
+
+                default:
+                {
+                    m_it.primitive_iterator.set_end();
+                    break;
+                }
+            }
+        }
+
+      public:
+        /// return a reference to the value pointed to by the iterator
+        reference operator*() const
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    return m_it.object_iterator->second;
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    return *m_it.array_iterator;
+                }
+
+                case (basic_json::value_t::null):
+                {
+                    throw std::out_of_range("cannot get value");
+                }
+
+                default:
+                {
+                    if (m_it.primitive_iterator.is_begin())
+                    {
+                        return *m_object;
+                    }
+                    else
+                    {
+                        throw std::out_of_range("cannot get value");
+                    }
+                }
+            }
+        }
+
+        /// dereference the iterator
+        pointer operator->() const
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    return &(m_it.object_iterator->second);
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    return &*m_it.array_iterator;
+                }
+
+                default:
+                {
+                    if (m_it.primitive_iterator.is_begin())
+                    {
+                        return m_object;
+                    }
+                    else
+                    {
+                        throw std::out_of_range("cannot get value");
+                    }
+                }
+            }
+        }
+
+        /// post-increment (it++)
+        const_iterator operator++(int)
+        {
+            auto result = *this;
+            ++(*this);
+
+            return result;
+        }
+
+        /// pre-increment (++it)
+        const_iterator& operator++()
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    ++m_it.object_iterator;
+                    break;
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    ++m_it.array_iterator;
+                    break;
+                }
+
+                default:
+                {
+                    ++m_it.primitive_iterator;
+                    break;
+                }
+            }
+
+            return *this;
+        }
+
+        /// post-decrement (it--)
+        const_iterator operator--(int)
+        {
+            auto result = *this;
+            --(*this);
+
+            return result;
+        }
+
+        /// pre-decrement (--it)
+        const_iterator& operator--()
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    --m_it.object_iterator;
+                    break;
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    --m_it.array_iterator;
+                    break;
+                }
+
+                default:
+                {
+                    --m_it.primitive_iterator;
+                    break;
+                }
+            }
+
+            return *this;
+        }
+
+        /// comparison: equal
+        bool operator==(const const_iterator& other) const
+        {
+            // if objects are not the same, the comparison is undefined
+            if (m_object != other.m_object)
+            {
+                throw std::domain_error("cannot compare iterators of different containers");
+            }
+
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    return (m_it.object_iterator == other.m_it.object_iterator);
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    return (m_it.array_iterator == other.m_it.array_iterator);
+                }
+
+                default:
+                {
+                    return (m_it.primitive_iterator == other.m_it.primitive_iterator);
+                }
+            }
+        }
+
+        /// comparison: not equal
+        bool operator!=(const const_iterator& other) const
+        {
+            return not operator==(other);
+        }
+
+        /// comparison: smaller
+        bool operator<(const const_iterator& other) const
+        {
+            // if objects are not the same, the comparison is undefined
+            if (m_object != other.m_object)
+            {
+                throw std::domain_error("cannot compare iterators of different containers");
+            }
+
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    throw std::domain_error("cannot use operator< for object iterators");
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    return (m_it.array_iterator < other.m_it.array_iterator);
+                }
+
+                default:
+                {
+                    return (m_it.primitive_iterator < other.m_it.primitive_iterator);
+                }
+            }
+        }
+
+        /// comparison: less than or equal
+        bool operator<=(const const_iterator& other) const
+        {
+            return not other.operator < (*this);
+        }
+
+        /// comparison: greater than
+        bool operator>(const const_iterator& other) const
+        {
+            return not operator<=(other);
+        }
+
+        /// comparison: greater than or equal
+        bool operator>=(const const_iterator& other) const
+        {
+            return not operator<(other);
+        }
+
+        /// add to iterator
+        const_iterator& operator+=(difference_type i)
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    throw std::domain_error("cannot use operator+= for object iterators");
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    m_it.array_iterator += i;
+                    break;
+                }
+
+                default:
+                {
+                    m_it.primitive_iterator += i;
+                    break;
+                }
+            }
+
+            return *this;
+        }
+
+        /// subtract from iterator
+        const_iterator& operator-=(difference_type i)
+        {
+            return operator+=(-i);
+        }
+
+        /// add to iterator
+        const_iterator operator+(difference_type i)
+        {
+            auto result = *this;
+            result += i;
+            return result;
+        }
+
+        /// subtract from iterator
+        const_iterator operator-(difference_type i)
+        {
+            auto result = *this;
+            result -= i;
+            return result;
+        }
+
+        /// return difference
+        difference_type operator-(const const_iterator& other) const
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    throw std::domain_error("cannot use operator- for object iterators");
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    return m_it.array_iterator - other.m_it.array_iterator;
+                }
+
+                default:
+                {
+                    return m_it.primitive_iterator - other.m_it.primitive_iterator;
+                }
+            }
+        }
+
+        /// access to successor
+        reference operator[](difference_type n) const
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    throw std::domain_error("cannot use operator[] for object iterators");
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    return *(m_it.array_iterator + n);
+                }
+
+                case (basic_json::value_t::null):
+                {
+                    throw std::out_of_range("cannot get value");
+                }
+
+                default:
+                {
+                    if (m_it.primitive_iterator == -n)
+                    {
+                        return *m_object;
+                    }
+                    else
+                    {
+                        throw std::out_of_range("cannot get value");
+                    }
+                }
+            }
+        }
+
+        /// return the key of an object iterator
+        typename object_t::key_type key() const
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    return m_it.object_iterator->first;
+                }
+
+                default:
+                {
+                    throw std::domain_error("cannot use key() for non-object iterators");
+                }
+            }
+        }
+
+        /// return the value of an iterator
+        reference value() const
+        {
+            return operator*();
+        }
+
+      private:
+        /// associated JSON instance
+        pointer m_object = nullptr;
+        /// the actual iterator of the associated instance
+        internal_iterator m_it = internal_iterator();
+    };
+
+    /*!
+    @brief a mutable random access iterator for the @ref basic_json class
+
+    @requirement The class satisfies the following concept requirements:
+    - [RandomAccessIterator](http://en.cppreference.com/w/cpp/concept/RandomAccessIterator):
+      The iterator that can be moved to point (forward and backward) to any
+      element in constant time.
+    - [OutputIterator](http://en.cppreference.com/w/cpp/concept/OutputIterator):
+      It is possible to write to the pointed-to element.
+    */
+    class iterator : public const_iterator
+    {
+      public:
+        using base_iterator = const_iterator;
+        using pointer = typename basic_json::pointer;
+        using reference = typename basic_json::reference;
+
+        /// default constructor
+        iterator() = default;
+
+        /// constructor for a given JSON instance
+        iterator(pointer object) noexcept : base_iterator(object)
+        {}
+
+        /// copy constructor
+        iterator(const iterator& other) noexcept
+            : base_iterator(other)
+        {}
+
+        /// copy assignment
+        iterator& operator=(iterator other) noexcept(
+            std::is_nothrow_move_constructible<pointer>::value and
+            std::is_nothrow_move_assignable<pointer>::value and
+            std::is_nothrow_move_constructible<internal_iterator>::value and
+            std::is_nothrow_move_assignable<internal_iterator>::value
+        )
+        {
+            base_iterator::operator=(other);
+            return *this;
+        }
+
+        /// return a reference to the value pointed to by the iterator
+        reference operator*()
+        {
+            return const_cast<reference>(base_iterator::operator*());
+        }
+
+        /// dereference the iterator
+        pointer operator->()
+        {
+            return const_cast<pointer>(base_iterator::operator->());
+        }
+
+        /// post-increment (it++)
+        iterator operator++(int)
+        {
+            iterator result = *this;
+            base_iterator::operator++();
+            return result;
+        }
+
+        /// pre-increment (++it)
+        iterator& operator++()
+        {
+            base_iterator::operator++();
+            return *this;
+        }
+
+        /// post-decrement (it--)
+        iterator operator--(int)
+        {
+            iterator result = *this;
+            base_iterator::operator--();
+            return result;
+        }
+
+        /// pre-decrement (--it)
+        iterator& operator--()
+        {
+            base_iterator::operator--();
+            return *this;
+        }
+
+        /// add to iterator
+        iterator& operator+=(difference_type i)
+        {
+            base_iterator::operator+=(i);
+            return *this;
+        }
+
+        /// subtract from iterator
+        iterator& operator-=(difference_type i)
+        {
+            base_iterator::operator-=(i);
+            return *this;
+        }
+
+        /// add to iterator
+        iterator operator+(difference_type i)
+        {
+            auto result = *this;
+            result += i;
+            return result;
+        }
+
+        /// subtract from iterator
+        iterator operator-(difference_type i)
+        {
+            auto result = *this;
+            result -= i;
+            return result;
+        }
+
+        difference_type operator-(const iterator& other) const
+        {
+            return base_iterator::operator-(other);
+        }
+
+        /// access to successor
+        reference operator[](difference_type n) const
+        {
+            return const_cast<reference>(base_iterator::operator[](n));
+        }
+
+        /// return the value of an iterator
+        reference value() const
+        {
+            return const_cast<reference>(base_iterator::value());
+        }
+    };
+
+    /*!
+    @brief a template for a reverse iterator class
+
+    @tparam Base the base iterator type to reverse. Valid types are @ref
+    iterator (to create @ref reverse_iterator) and @ref const_iterator (to
+    create @ref const_reverse_iterator).
+
+    @requirement The class satisfies the following concept requirements:
+    - [RandomAccessIterator](http://en.cppreference.com/w/cpp/concept/RandomAccessIterator):
+      The iterator that can be moved to point (forward and backward) to any
+      element in constant time.
+    - [OutputIterator](http://en.cppreference.com/w/cpp/concept/OutputIterator):
+      It is possible to write to the pointed-to element (only if @a Base is
+      @ref iterator).
+    */
+    template<typename Base>
+    class json_reverse_iterator : public std::reverse_iterator<Base>
+    {
+      public:
+        /// shortcut to the reverse iterator adaptor
+        using base_iterator = std::reverse_iterator<Base>;
+        /// the reference type for the pointed-to element
+        using reference = typename Base::reference;
+
+        /// create reverse iterator from iterator
+        json_reverse_iterator(const typename base_iterator::iterator_type& it)
+            : base_iterator(it) {}
+
+        /// create reverse iterator from base class
+        json_reverse_iterator(const base_iterator& it) : base_iterator(it) {}
+
+        /// post-increment (it++)
+        json_reverse_iterator operator++(int)
+        {
+            return base_iterator::operator++(1);
+        }
+
+        /// pre-increment (++it)
+        json_reverse_iterator& operator++()
+        {
+            base_iterator::operator++();
+            return *this;
+        }
+
+        /// post-decrement (it--)
+        json_reverse_iterator operator--(int)
+        {
+            return base_iterator::operator--(1);
+        }
+
+        /// pre-decrement (--it)
+        json_reverse_iterator& operator--()
+        {
+            base_iterator::operator--();
+            return *this;
+        }
+
+        /// add to iterator
+        json_reverse_iterator& operator+=(difference_type i)
+        {
+            base_iterator::operator+=(i);
+            return *this;
+        }
+
+        /// add to iterator
+        json_reverse_iterator operator+(difference_type i) const
+        {
+            auto result = *this;
+            result += i;
+            return result;
+        }
+
+        /// subtract from iterator
+        json_reverse_iterator operator-(difference_type i) const
+        {
+            auto result = *this;
+            result -= i;
+            return result;
+        }
+
+        /// return difference
+        difference_type operator-(const json_reverse_iterator& other) const
+        {
+            return this->base() - other.base();
+        }
+
+        /// access to successor
+        reference operator[](difference_type n) const
+        {
+            return *(this->operator+(n));
+        }
+
+        /// return the key of an object iterator
+        typename object_t::key_type key() const
+        {
+            auto it = --this->base();
+            return it.key();
+        }
+
+        /// return the value of an iterator
+        reference value() const
+        {
+            auto it = --this->base();
+            return it.operator * ();
+        }
+    };
+
+    /*!
+    @brief wrapper to access iterator member functions in range-based for
+
+    This class allows to access @ref key() and @ref value() during range-based
+    for loops. In these loops, a reference to the JSON values is returned, so
+    there is no access to the underlying iterator.
+    */
+    class iterator_wrapper
+    {
+      private:
+        /// the container to iterate
+        basic_json& container;
+        /// the type of the iterator to use while iteration
+        using json_iterator = decltype(std::begin(container));
+
+        /// internal iterator wrapper
+        class iterator_wrapper_internal
+        {
+          private:
+            /// the iterator
+            json_iterator anchor;
+            /// an index for arrays
+            size_t array_index = 0;
+
+          public:
+            /// construct wrapper given an iterator
+            iterator_wrapper_internal(json_iterator i) : anchor(i)
+            {}
+
+            /// dereference operator (needed for range-based for)
+            iterator_wrapper_internal& operator*()
+            {
+                return *this;
+            }
+
+            /// increment operator (needed for range-based for)
+            iterator_wrapper_internal& operator++()
+            {
+                ++anchor;
+                ++array_index;
+
+                return *this;
+            }
+
+            /// inequality operator (needed for range-based for)
+            bool operator!= (const iterator_wrapper_internal& o)
+            {
+                return anchor != o.anchor;
+            }
+
+            /// stream operator
+            friend std::ostream& operator<<(std::ostream& o, const iterator_wrapper_internal& w)
+            {
+                return o << w.value();
+            }
+
+            /// return key of the iterator
+            typename basic_json::string_t key() const
+            {
+                switch (anchor.m_object->type())
+                {
+                    /// use integer array index as key
+                    case (value_t::array):
+                    {
+                        return std::to_string(array_index);
+                    }
+
+                    /// use key from the object
+                    case (value_t::object):
+                    {
+                        return anchor.key();
+                    }
+
+                    /// use an empty key for all primitive types
+                    default:
+                    {
+                        return "";
+                    }
+                }
+            }
+
+            /// return value of the iterator
+            typename json_iterator::reference value() const
+            {
+                return anchor.value();
+            }
+        };
+
+      public:
+        /// construct iterator wrapper from a container
+        iterator_wrapper(basic_json& cont)
+            : container(cont)
+        {}
+
+        /// return iterator begin (needed for range-based for)
+        iterator_wrapper_internal begin()
+        {
+            return iterator_wrapper_internal(container.begin());
+        }
+
+        /// return iterator end (needed for range-based for)
+        iterator_wrapper_internal end()
+        {
+            return iterator_wrapper_internal(container.end());
+        }
+    };
+
+  private:
+    //////////////////////
+    // lexer and parser //
+    //////////////////////
+
+    /*!
+    @brief lexical analysis
+
+    This class organizes the lexical analysis during JSON deserialization. The
+    core of it is a scanner generated by re2c <http://re2c.org> that processes
+    a buffer and recognizes tokens according to RFC 7159.
+    */
+    class lexer
+    {
+      public:
+        /// token types for the parser
+        enum class token_type
+        {
+            uninitialized,    ///< indicating the scanner is uninitialized
+            literal_true,     ///< the "true" literal
+            literal_false,    ///< the "false" literal
+            literal_null,     ///< the "null" literal
+            value_string,     ///< a string - use get_string() for actual value
+            value_number,     ///< a number - use get_number() for actual value
+            begin_array,      ///< the character for array begin "["
+            begin_object,     ///< the character for object begin "{"
+            end_array,        ///< the character for array end "]"
+            end_object,       ///< the character for object end "}"
+            name_separator,   ///< the name separator ":"
+            value_separator,  ///< the value separator ","
+            parse_error,      ///< indicating a parse error
+            end_of_input      ///< indicating the end of the input buffer
+        };
+
+        /// the char type to use in the lexer
+        using lexer_char_t = unsigned char;
+
+        /// constructor with a given buffer
+        explicit lexer(const string_t& s) noexcept
+            : m_stream(nullptr), m_buffer(s)
+        {
+            m_content = reinterpret_cast<const lexer_char_t*>(s.c_str());
+            m_start = m_cursor = m_content;
+            m_limit = m_content + s.size();
+        }
+        explicit lexer(std::istream* s) noexcept
+            : m_stream(s), m_buffer()
+        {
+            getline(*m_stream, m_buffer);
+            m_content = reinterpret_cast<const lexer_char_t*>(m_buffer.c_str());
+            m_start = m_cursor = m_content;
+            m_limit = m_content + m_buffer.size();
+        }
+
+        /// default constructor
+        lexer() = default;
+
+        // switch of unwanted functions
+        lexer(const lexer&) = delete;
+        lexer operator=(const lexer&) = delete;
+
+        /*!
+        @brief create a string from a Unicode code point
+
+        @param[in] codepoint1  the code point (can be high surrogate)
+        @param[in] codepoint2  the code point (can be low surrogate or 0)
+        @return string representation of the code point
+        @throw std::out_of_range if code point is >0x10ffff
+        @throw std::invalid_argument if the low surrogate is invalid
+
+        @see <http://en.wikipedia.org/wiki/UTF-8#Sample_code>
+        */
+        static string_t to_unicode(const std::size_t codepoint1,
+                                   const std::size_t codepoint2 = 0)
+        {
+            string_t result;
+
+            // calculate the codepoint from the given code points
+            std::size_t codepoint = codepoint1;
+
+            // check if codepoint1 is a high surrogate
+            if (codepoint1 >= 0xD800 and codepoint1 <= 0xDBFF)
+            {
+                // check if codepoint2 is a low surrogate
+                if (codepoint2 >= 0xDC00 and codepoint2 <= 0xDFFF)
+                {
+                    codepoint =
+                        // high surrogate occupies the most significant 22 bits
+                        (codepoint1 << 10)
+                        // low surrogate occupies the least significant 15 bits
+                        + codepoint2
+                        // there is still the 0xD800, 0xDC00 and 0x10000 noise
+                        // in the result so we have to substract with:
+                        // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
+                        - 0x35FDC00;
+                }
+                else
+                {
+                    throw std::invalid_argument("missing or wrong low surrogate");
+                }
+            }
+
+            if (codepoint < 0x80)
+            {
+                // 1-byte characters: 0xxxxxxx (ASCII)
+                result.append(1, static_cast<typename string_t::value_type>(codepoint));
+            }
+            else if (codepoint <= 0x7ff)
+            {
+                // 2-byte characters: 110xxxxx 10xxxxxx
+                result.append(1, static_cast<typename string_t::value_type>(0xC0 | ((codepoint >> 6) & 0x1F)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
+            }
+            else if (codepoint <= 0xffff)
+            {
+                // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
+                result.append(1, static_cast<typename string_t::value_type>(0xE0 | ((codepoint >> 12) & 0x0F)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
+            }
+            else if (codepoint <= 0x10ffff)
+            {
+                // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                result.append(1, static_cast<typename string_t::value_type>(0xF0 | ((codepoint >> 18) & 0x07)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 12) & 0x3F)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
+            }
+            else
+            {
+                throw std::out_of_range("code points above 0x10FFFF are invalid");
+            }
+
+            return result;
+        }
+
+        /// return name of values of type token_type
+        static std::string token_type_name(token_type t)
+        {
+            switch (t)
+            {
+                case (token_type::uninitialized):
+                    return "<uninitialized>";
+                case (token_type::literal_true):
+                    return "true literal";
+                case (token_type::literal_false):
+                    return "false literal";
+                case (token_type::literal_null):
+                    return "null literal";
+                case (token_type::value_string):
+                    return "string literal";
+                case (token_type::value_number):
+                    return "number literal";
+                case (token_type::begin_array):
+                    return "[";
+                case (token_type::begin_object):
+                    return "{";
+                case (token_type::end_array):
+                    return "]";
+                case (token_type::end_object):
+                    return "}";
+                case (token_type::name_separator):
+                    return ":";
+                case (token_type::value_separator):
+                    return ",";
+                case (token_type::end_of_input):
+                    return "<end of input>";
+                default:
+                    return "<parse error>";
+            }
+        }
+
+        /*!
+        This function implements a scanner for JSON. It is specified using
+        regular expressions that try to follow RFC 7159 as close as possible.
+        These regular expressions are then translated into a deterministic
+        finite automaton (DFA) by the tool re2c <http://re2c.org>. As a result,
+        the translated code for this function consists of a large block of code
+        with goto jumps.
+
+        @return the class of the next token read from the buffer
+        */
+        token_type scan() noexcept
+        {
+            // pointer for backtracking information
+            m_marker = nullptr;
+
+            // remember the begin of the token
+            m_start = m_cursor;
+
+
+            {
+                lexer_char_t yych;
+                unsigned int yyaccept = 0;
+                static const unsigned char yybm[] =
+                {
+                    0,   0,   0,   0,   0,   0,   0,   0,
+                    0,  32,  32,   0,   0,  32,   0,   0,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    96,  64,   0,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    192, 192, 192, 192, 192, 192, 192, 192,
+                    192, 192,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,   0,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                };
+
+                if ((m_limit - m_cursor) < 5)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+                if (yych <= '9')
+                {
+                    if (yych <= ' ')
+                    {
+                        if (yych <= '\n')
+                        {
+                            if (yych <= 0x00)
+                            {
+                                goto basic_json_parser_27;
+                            }
+                            if (yych <= 0x08)
+                            {
+                                goto basic_json_parser_29;
+                            }
+                            if (yych >= '\n')
+                            {
+                                goto basic_json_parser_4;
+                            }
+                        }
+                        else
+                        {
+                            if (yych == '\r')
+                            {
+                                goto basic_json_parser_2;
+                            }
+                            if (yych <= 0x1F)
+                            {
+                                goto basic_json_parser_29;
+                            }
+                        }
+                    }
+                    else
+                    {
+                        if (yych <= ',')
+                        {
+                            if (yych == '"')
+                            {
+                                goto basic_json_parser_26;
+                            }
+                            if (yych <= '+')
+                            {
+                                goto basic_json_parser_29;
+                            }
+                            goto basic_json_parser_14;
+                        }
+                        else
+                        {
+                            if (yych <= '-')
+                            {
+                                goto basic_json_parser_22;
+                            }
+                            if (yych <= '/')
+                            {
+                                goto basic_json_parser_29;
+                            }
+                            if (yych <= '0')
+                            {
+                                goto basic_json_parser_23;
+                            }
+                            goto basic_json_parser_25;
+                        }
+                    }
+                }
+                else
+                {
+                    if (yych <= 'm')
+                    {
+                        if (yych <= '\\')
+                        {
+                            if (yych <= ':')
+                            {
+                                goto basic_json_parser_16;
+                            }
+                            if (yych == '[')
+                            {
+                                goto basic_json_parser_6;
+                            }
+                            goto basic_json_parser_29;
+                        }
+                        else
+                        {
+                            if (yych <= ']')
+                            {
+                                goto basic_json_parser_8;
+                            }
+                            if (yych == 'f')
+                            {
+                                goto basic_json_parser_21;
+                            }
+                            goto basic_json_parser_29;
+                        }
+                    }
+                    else
+                    {
+                        if (yych <= 'z')
+                        {
+                            if (yych <= 'n')
+                            {
+                                goto basic_json_parser_18;
+                            }
+                            if (yych == 't')
+                            {
+                                goto basic_json_parser_20;
+                            }
+                            goto basic_json_parser_29;
+                        }
+                        else
+                        {
+                            if (yych <= '{')
+                            {
+                                goto basic_json_parser_10;
+                            }
+                            if (yych == '}')
+                            {
+                                goto basic_json_parser_12;
+                            }
+                            goto basic_json_parser_29;
+                        }
+                    }
+                }
+basic_json_parser_2:
+                ++m_cursor;
+                yych = *m_cursor;
+                goto basic_json_parser_5;
+basic_json_parser_3:
+                {
+                    return scan();
+                }
+basic_json_parser_4:
+                ++m_cursor;
+                if (m_limit <= m_cursor)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+basic_json_parser_5:
+                if (yybm[0 + yych] & 32)
+                {
+                    goto basic_json_parser_4;
+                }
+                goto basic_json_parser_3;
+basic_json_parser_6:
+                ++m_cursor;
+                {
+                    return token_type::begin_array;
+                }
+basic_json_parser_8:
+                ++m_cursor;
+                {
+                    return token_type::end_array;
+                }
+basic_json_parser_10:
+                ++m_cursor;
+                {
+                    return token_type::begin_object;
+                }
+basic_json_parser_12:
+                ++m_cursor;
+                {
+                    return token_type::end_object;
+                }
+basic_json_parser_14:
+                ++m_cursor;
+                {
+                    return token_type::value_separator;
+                }
+basic_json_parser_16:
+                ++m_cursor;
+                {
+                    return token_type::name_separator;
+                }
+basic_json_parser_18:
+                yyaccept = 0;
+                yych = *(m_marker = ++m_cursor);
+                if (yych == 'u')
+                {
+                    goto basic_json_parser_59;
+                }
+basic_json_parser_19:
+                {
+                    return token_type::parse_error;
+                }
+basic_json_parser_20:
+                yyaccept = 0;
+                yych = *(m_marker = ++m_cursor);
+                if (yych == 'r')
+                {
+                    goto basic_json_parser_55;
+                }
+                goto basic_json_parser_19;
+basic_json_parser_21:
+                yyaccept = 0;
+                yych = *(m_marker = ++m_cursor);
+                if (yych == 'a')
+                {
+                    goto basic_json_parser_50;
+                }
+                goto basic_json_parser_19;
+basic_json_parser_22:
+                yych = *++m_cursor;
+                if (yych <= '/')
+                {
+                    goto basic_json_parser_19;
+                }
+                if (yych <= '0')
+                {
+                    goto basic_json_parser_49;
+                }
+                if (yych <= '9')
+                {
+                    goto basic_json_parser_40;
+                }
+                goto basic_json_parser_19;
+basic_json_parser_23:
+                yyaccept = 1;
+                yych = *(m_marker = ++m_cursor);
+                if (yych <= 'D')
+                {
+                    if (yych == '.')
+                    {
+                        goto basic_json_parser_42;
+                    }
+                }
+                else
+                {
+                    if (yych <= 'E')
+                    {
+                        goto basic_json_parser_43;
+                    }
+                    if (yych == 'e')
+                    {
+                        goto basic_json_parser_43;
+                    }
+                }
+basic_json_parser_24:
+                {
+                    return token_type::value_number;
+                }
+basic_json_parser_25:
+                yyaccept = 1;
+                yych = *(m_marker = ++m_cursor);
+                goto basic_json_parser_41;
+basic_json_parser_26:
+                yyaccept = 0;
+                yych = *(m_marker = ++m_cursor);
+                if (yych <= 0x0F)
+                {
+                    goto basic_json_parser_19;
+                }
+                goto basic_json_parser_31;
+basic_json_parser_27:
+                ++m_cursor;
+                {
+                    return token_type::end_of_input;
+                }
+basic_json_parser_29:
+                yych = *++m_cursor;
+                goto basic_json_parser_19;
+basic_json_parser_30:
+                ++m_cursor;
+                if (m_limit <= m_cursor)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+basic_json_parser_31:
+                if (yybm[0 + yych] & 64)
+                {
+                    goto basic_json_parser_30;
+                }
+                if (yych <= 0x0F)
+                {
+                    goto basic_json_parser_32;
+                }
+                if (yych <= '"')
+                {
+                    goto basic_json_parser_34;
+                }
+                goto basic_json_parser_33;
+basic_json_parser_32:
+                m_cursor = m_marker;
+                if (yyaccept == 0)
+                {
+                    goto basic_json_parser_19;
+                }
+                else
+                {
+                    goto basic_json_parser_24;
+                }
+basic_json_parser_33:
+                ++m_cursor;
+                if (m_limit <= m_cursor)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+                if (yych <= 'e')
+                {
+                    if (yych <= '/')
+                    {
+                        if (yych == '"')
+                        {
+                            goto basic_json_parser_30;
+                        }
+                        if (yych <= '.')
+                        {
+                            goto basic_json_parser_32;
+                        }
+                        goto basic_json_parser_30;
+                    }
+                    else
+                    {
+                        if (yych <= '\\')
+                        {
+                            if (yych <= '[')
+                            {
+                                goto basic_json_parser_32;
+                            }
+                            goto basic_json_parser_30;
+                        }
+                        else
+                        {
+                            if (yych == 'b')
+                            {
+                                goto basic_json_parser_30;
+                            }
+                            goto basic_json_parser_32;
+                        }
+                    }
+                }
+                else
+                {
+                    if (yych <= 'q')
+                    {
+                        if (yych <= 'f')
+                        {
+                            goto basic_json_parser_30;
+                        }
+                        if (yych == 'n')
+                        {
+                            goto basic_json_parser_30;
+                        }
+                        goto basic_json_parser_32;
+                    }
+                    else
+                    {
+                        if (yych <= 's')
+                        {
+                            if (yych <= 'r')
+                            {
+                                goto basic_json_parser_30;
+                            }
+                            goto basic_json_parser_32;
+                        }
+                        else
+                        {
+                            if (yych <= 't')
+                            {
+                                goto basic_json_parser_30;
+                            }
+                            if (yych <= 'u')
+                            {
+                                goto basic_json_parser_36;
+                            }
+                            goto basic_json_parser_32;
+                        }
+                    }
+                }
+basic_json_parser_34:
+                ++m_cursor;
+                {
+                    return token_type::value_string;
+                }
+basic_json_parser_36:
+                ++m_cursor;
+                if (m_limit <= m_cursor)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+                if (yych <= '@')
+                {
+                    if (yych <= '/')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych >= ':')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                }
+                else
+                {
+                    if (yych <= 'F')
+                    {
+                        goto basic_json_parser_37;
+                    }
+                    if (yych <= '`')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych >= 'g')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                }
+basic_json_parser_37:
+                ++m_cursor;
+                if (m_limit <= m_cursor)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+                if (yych <= '@')
+                {
+                    if (yych <= '/')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych >= ':')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                }
+                else
+                {
+                    if (yych <= 'F')
+                    {
+                        goto basic_json_parser_38;
+                    }
+                    if (yych <= '`')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych >= 'g')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                }
+basic_json_parser_38:
+                ++m_cursor;
+                if (m_limit <= m_cursor)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+                if (yych <= '@')
+                {
+                    if (yych <= '/')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych >= ':')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                }
+                else
+                {
+                    if (yych <= 'F')
+                    {
+                        goto basic_json_parser_39;
+                    }
+                    if (yych <= '`')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych >= 'g')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                }
+basic_json_parser_39:
+                ++m_cursor;
+                if (m_limit <= m_cursor)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+                if (yych <= '@')
+                {
+                    if (yych <= '/')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych <= '9')
+                    {
+                        goto basic_json_parser_30;
+                    }
+                    goto basic_json_parser_32;
+                }
+                else
+                {
+                    if (yych <= 'F')
+                    {
+                        goto basic_json_parser_30;
+                    }
+                    if (yych <= '`')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych <= 'f')
+                    {
+                        goto basic_json_parser_30;
+                    }
+                    goto basic_json_parser_32;
+                }
+basic_json_parser_40:
+                yyaccept = 1;
+                m_marker = ++m_cursor;
+                if ((m_limit - m_cursor) < 3)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+basic_json_parser_41:
+                if (yybm[0 + yych] & 128)
+                {
+                    goto basic_json_parser_40;
+                }
+                if (yych <= 'D')
+                {
+                    if (yych != '.')
+                    {
+                        goto basic_json_parser_24;
+                    }
+                }
+                else
+                {
+                    if (yych <= 'E')
+                    {
+                        goto basic_json_parser_43;
+                    }
+                    if (yych == 'e')
+                    {
+                        goto basic_json_parser_43;
+                    }
+                    goto basic_json_parser_24;
+                }
+basic_json_parser_42:
+                yych = *++m_cursor;
+                if (yych <= '/')
+                {
+                    goto basic_json_parser_32;
+                }
+                if (yych <= '9')
+                {
+                    goto basic_json_parser_47;
+                }
+                goto basic_json_parser_32;
+basic_json_parser_43:
+                yych = *++m_cursor;
+                if (yych <= ',')
+                {
+                    if (yych != '+')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                }
+                else
+                {
+                    if (yych <= '-')
+                    {
+                        goto basic_json_parser_44;
+                    }
+                    if (yych <= '/')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych <= '9')
+                    {
+                        goto basic_json_parser_45;
+                    }
+                    goto basic_json_parser_32;
+                }
+basic_json_parser_44:
+                yych = *++m_cursor;
+                if (yych <= '/')
+                {
+                    goto basic_json_parser_32;
+                }
+                if (yych >= ':')
+                {
+                    goto basic_json_parser_32;
+                }
+basic_json_parser_45:
+                ++m_cursor;
+                if (m_limit <= m_cursor)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+                if (yych <= '/')
+                {
+                    goto basic_json_parser_24;
+                }
+                if (yych <= '9')
+                {
+                    goto basic_json_parser_45;
+                }
+                goto basic_json_parser_24;
+basic_json_parser_47:
+                yyaccept = 1;
+                m_marker = ++m_cursor;
+                if ((m_limit - m_cursor) < 3)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+                if (yych <= 'D')
+                {
+                    if (yych <= '/')
+                    {
+                        goto basic_json_parser_24;
+                    }
+                    if (yych <= '9')
+                    {
+                        goto basic_json_parser_47;
+                    }
+                    goto basic_json_parser_24;
+                }
+                else
+                {
+                    if (yych <= 'E')
+                    {
+                        goto basic_json_parser_43;
+                    }
+                    if (yych == 'e')
+                    {
+                        goto basic_json_parser_43;
+                    }
+                    goto basic_json_parser_24;
+                }
+basic_json_parser_49:
+                yyaccept = 1;
+                yych = *(m_marker = ++m_cursor);
+                if (yych <= 'D')
+                {
+                    if (yych == '.')
+                    {
+                        goto basic_json_parser_42;
+                    }
+                    goto basic_json_parser_24;
+                }
+                else
+                {
+                    if (yych <= 'E')
+                    {
+                        goto basic_json_parser_43;
+                    }
+                    if (yych == 'e')
+                    {
+                        goto basic_json_parser_43;
+                    }
+                    goto basic_json_parser_24;
+                }
+basic_json_parser_50:
+                yych = *++m_cursor;
+                if (yych != 'l')
+                {
+                    goto basic_json_parser_32;
+                }
+                yych = *++m_cursor;
+                if (yych != 's')
+                {
+                    goto basic_json_parser_32;
+                }
+                yych = *++m_cursor;
+                if (yych != 'e')
+                {
+                    goto basic_json_parser_32;
+                }
+                ++m_cursor;
+                {
+                    return token_type::literal_false;
+                }
+basic_json_parser_55:
+                yych = *++m_cursor;
+                if (yych != 'u')
+                {
+                    goto basic_json_parser_32;
+                }
+                yych = *++m_cursor;
+                if (yych != 'e')
+                {
+                    goto basic_json_parser_32;
+                }
+                ++m_cursor;
+                {
+                    return token_type::literal_true;
+                }
+basic_json_parser_59:
+                yych = *++m_cursor;
+                if (yych != 'l')
+                {
+                    goto basic_json_parser_32;
+                }
+                yych = *++m_cursor;
+                if (yych != 'l')
+                {
+                    goto basic_json_parser_32;
+                }
+                ++m_cursor;
+                {
+                    return token_type::literal_null;
+                }
+            }
+
+
+        }
+
+        /// append data from the stream to the internal buffer
+        void yyfill() noexcept
+        {
+            if (not m_stream or not * m_stream)
+            {
+                return;
+            }
+
+            const ssize_t offset_start = m_start - m_content;
+            const ssize_t offset_marker = m_marker - m_start;
+            const ssize_t offset_cursor = m_cursor - m_start;
+
+            m_buffer.erase(0, static_cast<size_t>(offset_start));
+            std::string line;
+            std::getline(*m_stream, line);
+            m_buffer += "\n" + line; // add line with newline symbol
+
+            m_content = reinterpret_cast<const lexer_char_t*>(m_buffer.c_str());
+            m_start  = m_content;
+            m_marker = m_start + offset_marker;
+            m_cursor = m_start + offset_cursor;
+            m_limit  = m_start + m_buffer.size() - 1;
+        }
+
+        /// return string representation of last read token
+        string_t get_token() const noexcept
+        {
+            return string_t(reinterpret_cast<typename string_t::const_pointer>(m_start),
+                            static_cast<size_t>(m_cursor - m_start));
+        }
+
+        /*!
+        @brief return string value for string tokens
+
+        The function iterates the characters between the opening and closing
+        quotes of the string value. The complete string is the range
+        [m_start,m_cursor). Consequently, we iterate from m_start+1 to
+        m_cursor-1.
+
+        We differentiate two cases:
+
+        1. Escaped characters. In this case, a new character is constructed
+           according to the nature of the escape. Some escapes create new
+           characters (e.g., @c "\\n" is replaced by @c "\n"), some are copied
+           as is (e.g., @c "\\\\"). Furthermore, Unicode escapes of the shape
+           @c "\\uxxxx" need special care. In this case, to_unicode takes care
+           of the construction of the values.
+        2. Unescaped characters are copied as is.
+
+        @return string value of current token without opening and closing quotes
+        @throw std::out_of_range if to_unicode fails
+        */
+        string_t get_string() const
+        {
+            string_t result;
+            result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
+
+            // iterate the result between the quotes
+            for (const lexer_char_t* i = m_start + 1; i < m_cursor - 1; ++i)
+            {
+                // process escaped characters
+                if (*i == '\\')
+                {
+                    // read next character
+                    ++i;
+
+                    switch (*i)
+                    {
+                        // the default escapes
+                        case 't':
+                        {
+                            result += "\t";
+                            break;
+                        }
+                        case 'b':
+                        {
+                            result += "\b";
+                            break;
+                        }
+                        case 'f':
+                        {
+                            result += "\f";
+                            break;
+                        }
+                        case 'n':
+                        {
+                            result += "\n";
+                            break;
+                        }
+                        case 'r':
+                        {
+                            result += "\r";
+                            break;
+                        }
+                        case '\\':
+                        {
+                            result += "\\";
+                            break;
+                        }
+                        case '/':
+                        {
+                            result += "/";
+                            break;
+                        }
+                        case '"':
+                        {
+                            result += "\"";
+                            break;
+                        }
+
+                        // unicode
+                        case 'u':
+                        {
+                            // get code xxxx from uxxxx
+                            auto codepoint = std::strtoul(std::string(reinterpret_cast<typename string_t::const_pointer>(i + 1),
+                                                          4).c_str(), nullptr, 16);
+
+                            // check if codepoint is a high surrogate
+                            if (codepoint >= 0xD800 and codepoint <= 0xDBFF)
+                            {
+                                // make sure there is a subsequent unicode
+                                if ((i + 6 >= m_limit) or * (i + 5) != '\\' or * (i + 6) != 'u')
+                                {
+                                    throw std::invalid_argument("missing low surrogate");
+                                }
+
+                                // get code yyyy from uxxxx\uyyyy
+                                auto codepoint2 = std::strtoul(std::string(reinterpret_cast<typename string_t::const_pointer>
+                                                               (i + 7), 4).c_str(), nullptr, 16);
+                                result += to_unicode(codepoint, codepoint2);
+                                // skip the next 11 characters (xxxx\uyyyy)
+                                i += 11;
+                            }
+                            else
+                            {
+                                // add unicode character(s)
+                                result += to_unicode(codepoint);
+                                // skip the next four characters (xxxx)
+                                i += 4;
+                            }
+                            break;
+                        }
+                    }
+                }
+                else
+                {
+                    // all other characters are just copied to the end of the
+                    // string
+                    result.append(1, static_cast<typename string_t::value_type>(*i));
+                }
+            }
+
+            return result;
+        }
+
+        /*!
+        @brief return number value for number tokens
+
+        This function translates the last token into a floating point number.
+        The pointer m_start points to the beginning of the parsed number. We
+        pass this pointer to std::strtod which sets endptr to the first
+        character past the converted number. If this pointer is not the same as
+        m_cursor, then either more or less characters have been used during the
+        comparison. This can happen for inputs like "01" which will be treated
+        like number 0 followed by number 1.
+
+        @return the result of the number conversion or NAN if the conversion
+        read past the current token. The latter case needs to be treated by the
+        caller function.
+
+        @throw std::range_error if passed value is out of range
+        */
+        long double get_number() const
+        {
+            // conversion
+            typename string_t::value_type* endptr;
+            const auto float_val = std::strtold(reinterpret_cast<typename string_t::const_pointer>(m_start),
+                                                &endptr);
+
+            // return float_val if the whole number was translated and NAN
+            // otherwise
+            return (reinterpret_cast<lexer_char_t*>(endptr) == m_cursor) ? float_val : NAN;
+        }
+
+      private:
+        /// optional input stream
+        std::istream* m_stream;
+        /// the buffer
+        string_t m_buffer;
+        /// the buffer pointer
+        const lexer_char_t* m_content = nullptr;
+        /// pointer to the beginning of the current symbol
+        const lexer_char_t* m_start = nullptr;
+        /// pointer for backtracking information
+        const lexer_char_t* m_marker = nullptr;
+        /// pointer to the current symbol
+        const lexer_char_t* m_cursor = nullptr;
+        /// pointer to the end of the buffer
+        const lexer_char_t* m_limit = nullptr;
+    };
+
+    /*!
+    @brief syntax analysis
+    */
+    class parser
+    {
+      public:
+        /// constructor for strings
+        parser(const string_t& s, parser_callback_t cb = nullptr)
+            : callback(cb), m_lexer(s)
+        {
+            // read first token
+            get_token();
+        }
+
+        /// a parser reading from an input stream
+        parser(std::istream& _is, parser_callback_t cb = nullptr)
+            : callback(cb), m_lexer(&_is)
+        {
+            // read first token
+            get_token();
+        }
+
+        /// public parser interface
+        basic_json parse()
+        {
+            basic_json result = parse_internal(true);
+
+            expect(lexer::token_type::end_of_input);
+
+            // return parser result and replace it with null in case the
+            // top-level value was discarded by the callback function
+            return result.is_discarded() ? basic_json() : result;
+        }
+
+      private:
+        /// the actual parser
+        basic_json parse_internal(bool keep)
+        {
+            auto result = basic_json(value_t::discarded);
+
+            switch (last_token)
+            {
+                case (lexer::token_type::begin_object):
+                {
+                    if (keep and (not callback or (keep = callback(depth++, parse_event_t::object_start, result))))
+                    {
+                        // explicitly set result to object to cope with {}
+                        result.m_type = value_t::object;
+                        result.m_value = json_value(value_t::object);
+                    }
+
+                    // read next token
+                    get_token();
+
+                    // closing } -> we are done
+                    if (last_token == lexer::token_type::end_object)
+                    {
+                        get_token();
+                        if (keep and callback and not callback(--depth, parse_event_t::object_end, result))
+                        {
+                            result = basic_json(value_t::discarded);
+                        }
+                        return result;
+                    }
+
+                    // no comma is expected here
+                    unexpect(lexer::token_type::value_separator);
+
+                    // otherwise: parse key-value pairs
+                    do
+                    {
+                        // ugly, but could be fixed with loop reorganization
+                        if (last_token == lexer::token_type::value_separator)
+                        {
+                            get_token();
+                        }
+
+                        // store key
+                        expect(lexer::token_type::value_string);
+                        const auto key = m_lexer.get_string();
+
+                        bool keep_tag = false;
+                        if (keep)
+                        {
+                            if (callback)
+                            {
+                                basic_json k(key);
+                                keep_tag = callback(depth, parse_event_t::key, k);
+                            }
+                            else
+                            {
+                                keep_tag = true;
+                            }
+                        }
+
+                        // parse separator (:)
+                        get_token();
+                        expect(lexer::token_type::name_separator);
+
+                        // parse and add value
+                        get_token();
+                        auto value = parse_internal(keep);
+                        if (keep and keep_tag and not value.is_discarded())
+                        {
+                            result[key] = std::move(value);
+                        }
+                    }
+                    while (last_token == lexer::token_type::value_separator);
+
+                    // closing }
+                    expect(lexer::token_type::end_object);
+                    get_token();
+                    if (keep and callback and not callback(--depth, parse_event_t::object_end, result))
+                    {
+                        result = basic_json(value_t::discarded);
+                    }
+
+                    return result;
+                }
+
+                case (lexer::token_type::begin_array):
+                {
+                    if (keep and (not callback or (keep = callback(depth++, parse_event_t::array_start, result))))
+                    {
+                        // explicitly set result to object to cope with []
+                        result.m_type = value_t::array;
+                        result.m_value = json_value(value_t::array);
+                    }
+
+                    // read next token
+                    get_token();
+
+                    // closing ] -> we are done
+                    if (last_token == lexer::token_type::end_array)
+                    {
+                        get_token();
+                        if (callback and not callback(--depth, parse_event_t::array_end, result))
+                        {
+                            result = basic_json(value_t::discarded);
+                        }
+                        return result;
+                    }
+
+                    // no comma is expected here
+                    unexpect(lexer::token_type::value_separator);
+
+                    // otherwise: parse values
+                    do
+                    {
+                        // ugly, but could be fixed with loop reorganization
+                        if (last_token == lexer::token_type::value_separator)
+                        {
+                            get_token();
+                        }
+
+                        // parse value
+                        auto value = parse_internal(keep);
+                        if (keep and not value.is_discarded())
+                        {
+                            result.push_back(std::move(value));
+                        }
+                    }
+                    while (last_token == lexer::token_type::value_separator);
+
+                    // closing ]
+                    expect(lexer::token_type::end_array);
+                    get_token();
+                    if (keep and callback and not callback(--depth, parse_event_t::array_end, result))
+                    {
+                        result = basic_json(value_t::discarded);
+                    }
+
+                    return result;
+                }
+
+                case (lexer::token_type::literal_null):
+                {
+                    get_token();
+                    result.m_type = value_t::null;
+                    break;
+                }
+
+                case (lexer::token_type::value_string):
+                {
+                    const auto s = m_lexer.get_string();
+                    get_token();
+                    result = basic_json(s);
+                    break;
+                }
+
+                case (lexer::token_type::literal_true):
+                {
+                    get_token();
+                    result.m_type = value_t::boolean;
+                    result.m_value = true;
+                    break;
+                }
+
+                case (lexer::token_type::literal_false):
+                {
+                    get_token();
+                    result.m_type = value_t::boolean;
+                    result.m_value = false;
+                    break;
+                }
+
+                case (lexer::token_type::value_number):
+                {
+                    auto float_val = m_lexer.get_number();
+
+                    // NAN is returned if token could not be translated
+                    // completely
+                    if (std::isnan(float_val))
+                    {
+                        throw std::invalid_argument(std::string("parse error - ") +
+                                                    m_lexer.get_token() + " is not a number");
+                    }
+
+                    get_token();
+
+                    // check if conversion loses precision
+                    const auto int_val = static_cast<number_integer_t>(float_val);
+                    if (approx(float_val, static_cast<long double>(int_val)))
+                    {
+                        // we basic_json not lose precision -> return int
+                        result.m_type = value_t::number_integer;
+                        result.m_value = int_val;
+                    }
+                    else
+                    {
+                        // we would lose precision -> returnfloat
+                        result.m_type = value_t::number_float;
+                        result.m_value = static_cast<number_float_t>(float_val);
+                    }
+                    break;
+                }
+
+                default:
+                {
+                    // the last token was unexpected
+                    unexpect(last_token);
+                }
+            }
+
+            if (keep and callback and not callback(depth, parse_event_t::value, result))
+            {
+                result = basic_json(value_t::discarded);
+            }
+            return result;
+        }
+
+        /// get next token from lexer
+        typename lexer::token_type get_token()
+        {
+            last_token = m_lexer.scan();
+            return last_token;
+        }
+
+        void expect(typename lexer::token_type t) const
+        {
+            if (t != last_token)
+            {
+                std::string error_msg = "parse error - unexpected \'";
+                error_msg += m_lexer.get_token();
+                error_msg += "\' (" + lexer::token_type_name(last_token);
+                error_msg += "); expected " + lexer::token_type_name(t);
+                throw std::invalid_argument(error_msg);
+            }
+        }
+
+        void unexpect(typename lexer::token_type t) const
+        {
+            if (t == last_token)
+            {
+                std::string error_msg = "parse error - unexpected \'";
+                error_msg += m_lexer.get_token();
+                error_msg += "\' (";
+                error_msg += lexer::token_type_name(last_token) + ")";
+                throw std::invalid_argument(error_msg);
+            }
+        }
+
+      private:
+        /// current level of recursion
+        int depth = 0;
+        /// callback function
+        parser_callback_t callback;
+        /// the type of the last read token
+        typename lexer::token_type last_token = lexer::token_type::uninitialized;
+        /// the lexer
+        lexer m_lexer;
+    };
+};
+
+
+/////////////
+// presets //
+/////////////
+
+/*!
+@brief default JSON class
+
+This type is the default specialization of the @ref basic_json class which uses
+the standard template types.
+*/
+using json = basic_json<>;
+}
+
+
+/////////////////////////
+// nonmember functions //
+/////////////////////////
+
+// specialization of std::swap, and std::hash
+namespace std
+{
+/*!
+@brief exchanges the values of two JSON objects
+*/
+template <>
+inline void swap(nlohmann::json& j1,
+                 nlohmann::json& j2) noexcept(
+                     is_nothrow_move_constructible<nlohmann::json>::value and
+                     is_nothrow_move_assignable<nlohmann::json>::value
+                 )
+{
+    j1.swap(j2);
+}
+
+/// hash value for JSON objects
+template <>
+struct hash<nlohmann::json>
+{
+    /// return a hash value for a JSON object
+    std::size_t operator()(const nlohmann::json& j) const
+    {
+        // a naive hashing via the string representation
+        const auto& h = hash<nlohmann::json::string_t>();
+        return h(j.dump());
+    }
+};
+}
+
+/*!
+@brief user-defined string literal for JSON values
+
+This operator implements a user-defined string literal for JSON objects. It can
+be used by adding \p "_json" to a string literal and returns a JSON object if
+no parse error occurred.
+
+@param[in] s  a string representation of a JSON object
+@return a JSON object
+*/
+inline nlohmann::json operator "" _json(const char* s, std::size_t)
+{
+    return nlohmann::json::parse(reinterpret_cast<nlohmann::json::string_t::value_type*>
+                                 (const_cast<char*>(s)));
+}
+
+#endif
diff --git a/tools/pbindexdump/src/main.cpp b/tools/pbindexdump/src/main.cpp

new file mode 100644 (file)

index 0000000..a6aefc6
--- /dev/null
+++ b/tools/pbindexdump/src/main.cpp
@@ -0,0 +1,146 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "../common/OptionParser.h"
+#include "PbIndexDump.h"
+#include "PbIndexDumpVersion.h"
+#include "Settings.h"
+#include <cassert>
+#include <iostream>
+using namespace std;
+
+static
+pbindexdump::Settings fromCommandLine(optparse::OptionParser& parser,
+                                      int argc,
+                                      char* argv[])
+{
+    const optparse::Values options = parser.parse_args(argc, argv);
+    pbindexdump::Settings settings;
+
+    // input
+    const vector<string> positionalArgs = parser.args();
+    const size_t numPositionalArgs = positionalArgs.size();
+    if (numPositionalArgs == 0)
+        settings.inputPbiFilename_ = "-"; // stdin
+    else if (numPositionalArgs == 1)
+        settings.inputPbiFilename_ = parser.args().front();
+    else {
+        assert(numPositionalArgs > 1);
+        settings.errors_.push_back("pbindexdump does not support more than one input file per run");
+    }
+
+    // output format
+    if (options.is_set("format"))
+        settings.format_ = options["format"];
+
+    // JSON options
+    if (settings.format_ == "json") {
+        if (options.is_set("json_indent_level"))
+            settings.jsonIndentLevel_ = options.get("json_indent_level");
+        if (options.is_set("json_raw"))
+            settings.jsonRaw_ = options.get("json_raw");
+    } else {
+        if (options.is_set("json_indent_level") ||
+            options.is_set("json_raw"))
+        {
+            settings.errors_.push_back("JSON formatting options not valid on non-JSON output");
+        }
+    }
+
+    return settings;
+}
+
+int main(int argc, char* argv[])
+{
+    // setup help & options
+    optparse::OptionParser parser;
+    parser.description("pbindexdump prints a human-readable view of PBI data to stdout.");
+    parser.prog("pbindexdump");
+    parser.usage("pbindexdump [options] [input]");
+    parser.version(pbindexdump::Version);
+    parser.add_version_option(true);
+    parser.add_help_option(true);
+
+    auto ioGroup = optparse::OptionGroup(parser, "Input/Output");
+    ioGroup.add_option("")
+           .dest("input")
+           .metavar("input")
+           .help("Input PBI file. If not provided, stdin will be used as input.");
+    ioGroup.add_option("--format")
+           .dest("format")
+           .metavar("STRING")
+           .help("Output format, one of:\n"
+                 "    json, cpp\n\n"
+                 "json: pretty-printed JSON [default]\n\n"
+                 "cpp: copy/paste-able C++ code that can be used to construct the"
+                 " equivalent PacBio::BAM::PbiRawData object");
+    parser.add_option_group(ioGroup);
+
+    auto jsonGroup = optparse::OptionGroup(parser, "JSON Formatting");
+    jsonGroup.add_option("--json-indent-level")
+             .dest("json_indent_level")
+             .metavar("INT")
+             .help("JSON indent level [4]");
+    jsonGroup.add_option("--json-raw")
+             .dest("json_raw")
+             .action("store_true")
+             .help("Prints fields in a manner that more closely reflects the PBI"
+                   " file format - presenting data as per-field columns, not"
+                   " per-record objects.");
+    parser.add_option_group(jsonGroup);
+
+    // parse command line for settings
+    const pbindexdump::Settings settings = fromCommandLine(parser, argc, argv);
+    if (!settings.errors_.empty()) {
+        cerr << endl;
+        for (const auto e : settings.errors_)
+            cerr << "ERROR: " << e << endl;
+        cerr << endl;
+        parser.print_help();
+        return EXIT_FAILURE;
+    }
+
+    // run tool
+    try {
+        pbindexdump::PbIndexDump::Run(settings);
+        return EXIT_SUCCESS;
+    }
+    catch (std::exception& e) {
+        cerr << "ERROR: " << e.what() << endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/tools/pbmerge/CMakeLists.txt b/tools/pbmerge/CMakeLists.txt

new file mode 100644 (file)

index 0000000..fa9a906
--- /dev/null
+++ b/tools/pbmerge/CMakeLists.txt
@@ -0,0 +1,72 @@
+
+set(PbmergeSrcDir ${PacBioBAM_ToolsDir}/pbmerge/src)
+
+# create version header
+set(PbMerge_VERSION ${PacBioBAM_VERSION})
+configure_file(
+    ${PbmergeSrcDir}/PbMergeVersion.h.in ${GeneratedDir}/PbMergeVersion.h @ONLY
+)
+
+# list source files
+set(PBMERGE_SOURCES
+    ${ToolsCommonDir}/BamFileMerger.h
+    ${ToolsCommonDir}/OptionParser.cpp
+    ${PbmergeSrcDir}/main.cpp
+)
+
+# build pbmerge executable
+include(PbbamTool)
+create_pbbam_tool(
+    TARGET  pbmerge
+    SOURCES ${PBMERGE_SOURCES}
+)
+
+# cram tests
+if (PacBioBAM_build_tests)
+    if(PacBioBAM_auto_validate)
+        # skip for now til we clean up merge tests under autovalidate, too
+    else()
+
+        configure_file(
+            ${PacBioBAM_CramTestsDir}/pbmerge_pacbio_ordering.t.in
+            ${GeneratedDir}/pbmerge_pacbio_ordering.t
+            @ONLY
+        )
+ 
+        configure_file(
+            ${PacBioBAM_CramTestsDir}/pbmerge_aligned_ordering.t.in
+            ${GeneratedDir}/pbmerge_aligned_ordering.t
+            @ONLY
+        )
+
+        configure_file(
+            ${PacBioBAM_CramTestsDir}/pbmerge_mixed_ordering.t.in
+            ${GeneratedDir}/pbmerge_mixed_ordering.t
+            @ONLY
+        )
+
+        configure_file(
+            ${PacBioBAM_CramTestsDir}/pbmerge_dataset.t.in
+            ${GeneratedDir}/pbmerge_dataset.t
+            @ONLY
+        )
+
+        configure_file(
+            ${PacBioBAM_CramTestsDir}/pbmerge_fofn.t.in
+            ${GeneratedDir}/pbmerge_fofn.t
+            @ONLY
+        )
+
+        add_test(
+            NAME pbmerge_CramTests
+            WORKING_DIRECTORY ${PacBioBAM_TestsDir}/scripts
+            COMMAND "python" cram.py
+                ${GeneratedDir}/pbmerge_pacbio_ordering.t
+                ${GeneratedDir}/pbmerge_aligned_ordering.t
+                ${GeneratedDir}/pbmerge_mixed_ordering.t
+                ${GeneratedDir}/pbmerge_dataset.t
+                ${GeneratedDir}/pbmerge_fofn.t
+        )
+
+    endif()
+endif()
diff --git a/tools/pbmerge/src/PbMergeVersion.h.in b/tools/pbmerge/src/PbMergeVersion.h.in

new file mode 100644 (file)

index 0000000..2bda4f0
--- /dev/null
+++ b/tools/pbmerge/src/PbMergeVersion.h.in
@@ -0,0 +1,49 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#ifndef PBMERGEVERSION_H
+#define PBMERGEVERSION_H
+
+#include <string>
+
+namespace pbmerge {
+
+const std::string Version = std::string("@PbMerge_VERSION@");
+
+} // namespace pbmerge
+
+#endif // PBMERGEVERSION_H
diff --git a/tools/pbmerge/src/main.cpp b/tools/pbmerge/src/main.cpp

new file mode 100644 (file)

index 0000000..3056dc1
--- /dev/null
+++ b/tools/pbmerge/src/main.cpp
@@ -0,0 +1,174 @@
+// Copyright (c) 2015, Pacific Biosciences of California, Inc.
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted (subject to the limitations in the
+// disclaimer below) provided that the following conditions are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//
+//  * Neither the name of Pacific Biosciences nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+// GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+// BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+// USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+// OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+
+// Author: Derek Barnett
+
+#include "../common/OptionParser.h"
+#include "../common/BamFileMerger.h"
+#include "PbMergeVersion.h"
+#include <cassert>
+#include <iostream>
+using namespace std;
+
+namespace pbmerge {
+
+class Settings
+{
+public:
+    static Settings FromCommandLine(optparse::OptionParser& parser,
+                                    int argc, char* argv[])
+    {
+        pbmerge::Settings settings;
+        const optparse::Values options = parser.parse_args(argc, argv);
+
+        // input
+        const vector<string> positionalArgs = parser.args();
+        if (positionalArgs.empty())
+            settings.errors_.push_back("at least input one file must be specified");
+        else
+            settings.inputFilenames_ = positionalArgs;
+
+        // output
+        if (options.is_set("output"))
+            settings.outputFilename_ = options["output"];
+        else
+            settings.outputFilename_ = "-"; // stdout
+
+        // PBI?
+        if (settings.outputFilename_ == "-")
+            settings.createPbi_ = false; // always skip PBI if writing to stdout
+        else {
+            if (options.is_set("no_pbi"))
+                settings.createPbi_ = !options.get("no_pbi"); // user-disabled
+            else
+                settings.createPbi_ = true; // not specified, go ahead and generate by default
+        }
+
+        return settings;
+    }
+
+public:
+    std::vector<std::string> inputFilenames_;
+    std::string outputFilename_;
+    bool createPbi_;
+    std::vector<std::string> errors_;
+
+private:
+    Settings(void) { }
+};
+
+} // namespace pbmerge
+
+int main(int argc, char* argv[])
+{
+    // setup help & options
+    optparse::OptionParser parser;
+    parser.description("pbmerge merges PacBio BAM files. If the input is DataSetXML, "
+                       "any filters will be applied. If no output filename is specified, "
+                       "new BAM will be written to stdout."
+                       );
+    parser.prog("pbmerge");
+    parser.usage("pbmerge [options] [-o <out.bam>] <INPUT>");
+    parser.version(pbmerge::Version);
+    parser.add_version_option(true);
+    parser.add_help_option(true);
+
+    auto ioGroup = optparse::OptionGroup(parser, "Input/Output");
+    ioGroup.add_option("-o")
+           .dest("output")
+           .metavar("output")
+           .help("Output BAM filename. ");
+    ioGroup.add_option("--no-pbi")
+            .dest("no_pbi")
+            .action("store_true")
+            .help("Set this option to skip PBI index file creation. PBI creation is "
+                  "automatically skipped if no output filename is provided."
+                  );
+    ioGroup.add_option("")
+           .dest("input")
+           .metavar("INPUT")
+           .help("Input may be one of:\n"
+                 "    DataSetXML, list of BAM files, or FOFN\n\n"
+                 "    fofn: pbmerge -o merged.bam bams.fofn\n\n"
+                 "    bams: pbmerge -o merged.bam 1.bam 2.bam 3.bam\n\n"
+                 "    xml:  pbmerge -o merged.bam foo.subreadset.xml\n\n"
+                 );
+    parser.add_option_group(ioGroup);
+
+    // parse command line for settings
+    const pbmerge::Settings settings = pbmerge::Settings::FromCommandLine(parser, argc, argv);
+    if (!settings.errors_.empty()) {
+        cerr << endl;
+        for (const auto e : settings.errors_)
+            cerr << "ERROR: " << e << endl;
+        cerr << endl;
+        parser.print_help();
+        return EXIT_FAILURE;
+    }
+
+    // run tool
+    try {
+        // setup our @PG entry to add to header
+        PacBio::BAM::ProgramInfo mergeProgram;
+        mergeProgram.Id(string("pbmerge-")+pbmerge::Version)
+                    .Name("pbmerge")
+                    .Version(pbmerge::Version);
+
+        PacBio::BAM::DataSet dataset;
+        if (settings.inputFilenames_.size() == 1)
+            dataset = PacBio::BAM::DataSet(settings.inputFilenames_.front());
+        else
+            dataset = PacBio::BAM::DataSet(settings.inputFilenames_);
+
+
+        PacBio::BAM::common::BamFileMerger::Merge(dataset,
+                                                  settings.outputFilename_,
+                                                  mergeProgram,
+                                                  settings.createPbi_);
+
+
+//        PacBio::BAM::common::BamFileMerger merger(dataset,
+//                                                  settings.outputFilename_,
+//                                                  mergeProgram,
+//                                                  settings.createPbi_);
+//        merger.Merge();
+
+        return EXIT_SUCCESS;
+    }
+    catch (std::exception& e) {
+        cerr << "ERROR: " << e.what() << endl;
+        return EXIT_FAILURE;
+    }
+}
author	Afif Elghraoui <afif@debian.org>
	Sun, 22 Jan 2017 07:48:21 +0000 (07:48 +0000)
committer	Afif Elghraoui <afif@debian.org>
	Sun, 22 Jan 2017 07:48:21 +0000 (07:48 +0000)