Import pbbam_0.19.0+dfsg.orig.tar.xz

author Andreas Tille <tille@debian.org>

Wed, 10 Oct 2018 10:45:02 +0000 (11:45 +0100)

committer Andreas Tille <tille@debian.org>

Wed, 10 Oct 2018 10:45:02 +0000 (11:45 +0100)
author Andreas Tille <tille@debian.org>
Wed, 10 Oct 2018 10:45:02 +0000 (11:45 +0100)
committer Andreas Tille <tille@debian.org>
Wed, 10 Oct 2018 10:45:02 +0000 (11:45 +0100)
diff --git a/.clang-format b/.clang-format

new file mode 100644 (file)

index 0000000..1519f35
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,13 @@
+BasedOnStyle:  Google
+BreakBeforeBraces: Mozilla
+
+AllowShortLoopsOnASingleLine: false
+AccessModifierOffset: -4
+BreakConstructorInitializersBeforeComma: true
+ColumnLimit: 100
+IndentWidth: 4
+PointerAlignment: Left
+TabWidth: 4
+
+ReflowComments: false  # protect ASCII art in comments
+KeepEmptyLinesAtTheStartOfBlocks: true
diff --git a/.gitignore b/.gitignore

new file mode 100644 (file)

index 0000000..da3b0c2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,19 @@
+*.o
+*.pico
+*.so
+*.a
+*.dylib
+*.pyc
+*~
+CMakeLists.txt.user
+bin/
+build/
+docs/Doxyfile
+lib/
+tests/bin/test_pbbam
+tests/data/test_group_query/group.fofn
+tests/src/TestData.h
+
+# Meson WrapDB stuff
+subprojects/packagecache/
+subprojects/googletest*
diff --git a/.travis.yml b/.travis.yml

new file mode 100644 (file)

index 0000000..33b861e
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,61 @@
+language: cpp
+compiler:
+  - gcc
+
+before_install:
+
+  # Travis's default installs of gcc, boost, & cmake currently lag behind the minimums we need.
+  # So we need to set them up manually.
+  #
+  #  - gcc 4.8 (current default on Travis is 4.7, which is no good for C++11 work)
+  #  - boost 1.55
+  #  - cmake 3.x
+
+  # add external repos
+  - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test           # gcc
+  - sudo add-apt-repository -y ppa:boost-latest/ppa                  # boost
+  - sudo add-apt-repository -y ppa:george-edison55/precise-backports # cmake
+
+  # remove existing cmake install
+  - sudo apt-get remove -qq cmake cmake-data
+  - sudo apt-get autoremove -qq
+
+  # update apt
+  - sudo apt-get update -y -qq
+
+  # install
+  - sudo apt-get install -y -qq g++-4.8 boost1.55 cmake-data cmake
+
+  # make sure we're using new gcc tools
+  - sudo update-alternatives --install /usr/bin/g++  g++  /usr/bin/g++-4.8  90
+  - sudo update-alternatives --install /usr/bin/gcc  gcc  /usr/bin/gcc-4.8  90
+  - sudo update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-4.8 90
+
+  # prep zlib
+  - sudo apt-get install -y -qq zlib1g-dev
+
+  # prep GoogleTest
+  - sudo apt-get install -y -qq libgtest-dev
+
+before_script:
+  # run cmake (out-of-source build in ./build, pointing at the distro's GoogleTest sources)
+  - mkdir build
+  - cd build
+  - cmake .. -DGTEST_SRC_DIR=/usr/src/gtest -DCMAKE_BUILD_TYPE=Debug
+
+script:
+  # build & test
+  - make -j 3
+  - make test
+
+branches:
+  only:
+    - master
+
+notifications:
+  email:  # recipients must be nested under 'email' for Travis to honor them
+    recipients:
+      - dbarnett@pacb.com
+    on_success: change
+    on_failure: always
+
diff --git a/CHANGELOG.md b/CHANGELOG.md

new file mode 100644 (file)

index 0000000..50a0aad
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,516 @@
+# PacBio::BAM - change log\r
+\r
+All notable changes to this project will be documented in this file.\r
+This project adheres to [Semantic Versioning](http://semver.org/). \r
+\r
+**NOTE:** The current series (0.y.z) is under initial development. Anything may\r
+change at any time. The public API should not be considered stable yet. Once we\r
+lock down a version 1.0.0, this will define a reference point & compatibility\r
+guarantees will be maintained within each major version series.\r
+\r
+## Active\r
+\r
+## [0.19.0] - 2018-09-11\r
+\r
+### Added\r
+ - TranscriptAlignmentSet to XML support\r
+\r
+## [0.17.0] - 2018-03-18\r
+\r
+### Added\r
+- CompressionLevel/NumThreads parameter implementation to PbiBuilder.\r
+- Dataset ctor to PbiFileQuery.\r
+- TranscriptSet to XML support.\r
+- Auto-enabled "permissive CIGAR mode" for pbbamify tool.\r
+- IndexedBamWriter, for more efficient writing of BAM & PBI simultaneously.\r
+\r
+## [0.16.0] - 2018-01-17\r
+\r
+### Removed\r
+- Removed the PbiIndex class and its "lookup data"-related helpers. These were \r
+never as useful as initially intended. PbiRawData and its related classes are the \r
+recommended interface for working with PBI index data.\r
+\r
+## [0.15.0] - 2018-01-12\r
+\r
+### Added\r
+- Support for long CIGARs (>64K operations).\r
+\r
+## [0.14.0] - 2017-12-12\r
+\r
+### Added\r
+- Support for newer style QNAMEs. Recent versions of htslib (1.4+) have started\r
+adding extra null terminators to make the subsequent CIGAR section 32-bit aligned.\r
+\r
+### Changed\r
+- Requirements for htslib version used. Must now be htslib v1.4+.\r
+\r
+## [0.13.2] - 2017-09-25\r
+\r
+### Added\r
+- Backward compatibility for C++11 (std::make_unique which is 11/14 agnostic).\r
+\r
+## [0.13.1] - 2017-09-25\r
+\r
+### Added\r
+- Support for "pe" tag in stitched, virtual reads. \r
+\r
+## [0.13.0] - 2017-09-25\r
+\r
+### Changed\r
+- Ran clang-tidy (modernize) over codebase to clean up legacy coding styles.\r
+\r
+## [0.12.2] - 2017-09-22\r
+\r
+### Added \r
+- HasPulseExclusion() to BamRecord (& derived types). \r
+\r
+## [0.12.1] - 2017-09-21\r
+\r
+### Added\r
+- Pulse exclusion base feature to read group.\r
+\r
+## [0.12.0] - 2017-09-19\r
+\r
+### Added\r
+- NumReads() for PBI filter-based queries. This allows fetching of the number\r
+of reads that pass the filter, without needing to iterate over the entire \r
+file(s).\r
+\r
+## [0.11.0] - 2017-09-15\r
+\r
+### Added\r
+- Support for internal tag: pulse exclusion reason ("pe"). New methods on \r
+BamRecord, and new enum PulseExclusionReason.\r
+\r
+### Changed\r
+- Default PacBioBAM format version now 3.0.5\r
+\r
+## [0.10.2] - 2017-09-14\r
+\r
+### Changed\r
+- Explicitly trim all whitespace from FASTA input.\r
+\r
+## [0.10.1] - 2017-09-11\r
+\r
+### Changed\r
+- Frames, add mutex to avoid race condition in InitIpdDownsampling(void)\r
+\r
+## [0.10.0] - 2017-09-08\r
+\r
+### Changed\r
+- PbiBuilder backend for generating PBI index files "on-the-fly" along with\r
+writing BAM files. The previous implementation's memory usage scaled linearly \r
+with the number of reads, sometimes reaching huge numbers (several gigs or more).\r
+The new implementation's memory usage remains constant for any number of reads, \r
+without any runtime hit on files/architectures tested. \r
+\r
+### Removed\r
+- PbiBuilder::Result(). Returned an intermediate snapshot of the index under\r
+construction. This method isn't usable with the new PbiBuilder backend and was \r
+really only useful for initial debugging/testing. It is no longer used in the \r
+test framework and is unlikely to be used by client code either. Dropping this \r
+method from the API, and thus bumping the version number. \r
+\r
+## [0.9.0] - 2017-08-07\r
+\r
+### Removed\r
+- Bundled htslib. Now using 'stock' htslib (v1.3.1+).\r
+- Built-in SWIG wrappers. \r
+\r
+## [0.8.0] - 2017-07-24\r
+\r
+### Added\r
+- Default DataSet 'Version' attribute if none already present (currently 4.0.0)\r
+- Added whitelist support for filtering ZMWs via DataSetXML.\r
+- Added iterable query over FASTA files & ReferenceSet datasets.\r
+- Added DataSet::AllFiles to access primary resources AND their child files (indices,\r
+scraps, etc).\r
+\r
+### Fixed\r
+- Bug in the build system preventing clean rebuilds.\r
+\r
+### Removed\r
+- Dropped the bundled, PacBio-forked version of htslib. Now using stock htslib (v1.3.1+). \r
+\r
+## [0.7.4] - 2016-11-18\r
+\r
+### Changed\r
+- Compatibility for merging BAM files no longer requires exact match of PacBioBAM\r
+version number (header @HD:pb tag). As long as both files meet the minimum \r
+supported version number, the merge is allowed.\r
+\r
+## [0.7.3] - 2016-11-11\r
+\r
+### Added\r
+- Support for S/P2-C2 chemistry and forthcoming 4.0 basecaller\r
+\r
+## [0.7.2] - 2016-11-10\r
+\r
+### Removed\r
+- SAM header version equality check for merging BAM files. PacBioBAM version \r
+number carries more meaning for PacBio data and thus will be the basis of \r
+ensuring compatible merging.\r
+\r
+## [0.7.1] - 2016-11-09\r
+\r
+### Added\r
+- (Unindexed) FASTA reader & FastaSequence data structure.\r
+- Missing unit tests for internal BAM tag access.\r
+- Chemistry data for basecaller v3.3.\r
+- Missing parsers for filtering barcode quality ("bq"), barcode forward ("bcf"), \r
+and barcode reverse ("bcr") from DataSetXML.\r
+- Integrated htslib into project.\r
+\r
+### Fixed\r
+- Reverse complement on padding base.\r
+\r
+## [0.7.0] - 2016-09-26 \r
+\r
+### Added\r
+- Clipping for CCS records\r
+\r
+### Fixed\r
+- Cached position data leaking across records while iterating.\r
+- Rolled back default pulse behavior in internal BAM API, to be backward-\r
+compatible with existing client code (for now at least). v0.6.0 introduced\r
+returning basecalled positions ONLY by default, rather than return ALL \r
+pulses. \r
+- Fixed crash when attempting to read from empty BAM/PBI files using the \r
+PbiFilter-enabled APIs.\r
+\r
+## [0.6.0] - 2016-09-13\r
+\r
+### Added\r
+- BamWriter writes to a BAM file with the target name plus a ".tmp" suffix. On\r
+successful completion (i.e. normal BamWriter destruction, not triggered by a\r
+thrown exception) the file is renamed to the actual requested filename.\r
+- PBI file creation follows the same temporary naming convention.\r
+- Support for barcode pair (forward, reverse) in DataSetXML filter.\r
+- Validation API & 'auto-validate' compile-time switch. \r
+- Added support for a batched QNAME whitelist filter in DataSet XML. Uses (new) \r
+Property name 'qname_file', with the value being the filepath containing the \r
+whitelist.\r
+- Exposed MD5 hashing to API.\r
+- Ability to remove base features from a ReadGroupInfo object.\r
+- Can construct an aggregate PbiRawData index object from a DataSet: essentially\r
+concatenates all PBI data within the dataset.\r
+- New SamWriter class to create SAM-formatted output of PacBio BAM data.\r
+- Extended APIs for accessing "internal BAM" data, including PulseBehavior\r
+switch for selecting between all pulses & basecalls only. \r
+\r
+### Fixed\r
+- Improper 'clip to reference' product for BamRecord in some cases.\r
+- Improper behavior in tag accessors (e.g. BamRecord::IPD()) on reverse strand-\r
+aligned reads (bug 31339).\r
+- Improper basecaller version parsing in ReadGroupInfo.\r
+\r
+### Changed\r
+- RecordType::POLYMERASE renamed to RecordType::ZMW to reflect changes in\r
+PacBio BAM spec v3.0.4\r
+- Refactored the 'virtual' reader classes - to match the new nomenclature,\r
+and to combine the virtual reader & composite readers behind a shared \r
+interface. The old class names still exist, as typedefs to the new ones, \r
+and the interfaces are completely source-compatible - so as not to break \r
+existing code. However, the old classes should be considered deprecated and \r
+the new ones preferred. Below is the mapping of old -> new:\r
+\r
+   VirtualPolymeraseBamRecord        ->  VirtualZmwBamRecord\r
+   VirtualPolymeraseReader           ->  ZmwReadStitcher\r
+   VirtualPolymeraseCompositeReader  ->  ZmwReadStitcher\r
+   ZmwWhitelistVirtualReader         ->  WhitelistedZmwReadStitcher\r
+\r
+\r
+## [0.5.0] - 2016-02-22\r
+\r
+### Added\r
+- Platform model tag added to read group as RG::PM\r
+- New scrap zmw type sz\r
+- pbmerge accepts DataSetXML as input - using top-level resource BAMs as input,\r
+applying filters, and generating a merged BAM. Also added FOFN support, instead\r
+of listing out BAMs as command line args.\r
+- PbiLocalContextFilter to allow filtering on subread local context.\r
+- PbiBuilder: multithreading & zlib compression-level tuning for PBI output\r
+\r
+### Fixed\r
+- Fixed mishandling of relative BAM filenames in the filename constructor for\r
+DataSet (e.g. DataSet ds("../data.bam")).\r
+\r
+## [0.4.5] - 2016-01-14\r
+\r
+### Changed\r
+- PbiFilterQuery (and any other PBI-backed query, e.g. ZmwQuery ) now throws if\r
+PBI file(s) missing instead of returning empty result.\r
+- GenomicIntervalQuery now throws if BAI file(s) missing instead of returning\r
+empty result.\r
+- BamFile will throw if file is truncated (e.g. missing the EOF block). Disable\r
+by defining PBBAM_NO_CHECK_EOF .\r
+\r
+## [0.4.4] - 2016-01-07\r
+\r
+### Added\r
+- bam2sam command line utility. The primary benefit is removing the dependency\r
+on samtools during tests, but also provides users a functioning BAM -> SAM\r
+converter in the absence of samtools.\r
+- pbmerge command line utility. Allows merging N BAM files into one, optionally\r
+creating the PBI file alongside.\r
+- Added BamRecord::Pkmean2 & Pkmid2, 2D equivalent of Pkmean/Pkmid, for internal\r
+BAMs.\r
+\r
+### Removed \r
+- samtools dependency\r
+\r
+## [0.4.3] - 2015-12-22\r
+\r
+### Added\r
+- Compile using ccache by default, if available. Can be manually disabled using\r
+-DPacBioBAM_use_ccache=OFF with cmake.\r
+- pbindexdump: command-line utility that converts PBI file data into human-\r
+readable formats. (JSON by default).\r
+\r
+### Changed\r
+- CMake option PacBioBAM_build_pbindex is being deprecated. Use\r
+PacBioBAM_build_tools instead.\r
+\r
+## [0.4.2] - 2015-12-22\r
+\r
+### Changed\r
+- BamFile::PacBioIndexExists & StandardIndexExists no longer check timestamps.\r
+Copying/moving files around can yield timestamps that are not helpful (no longer\r
+guaranteed that the .pbi will be "newer" than the .bam, even though no content\r
+changed). Added methods (e.g. bool BamFile::PacBioIndexIsNewer()) to do that\r
+lookup if needed, but it is no longer done automatically.\r
+\r
+## [0.4.1] - 2015-12-18\r
+\r
+### Added\r
+- BamRecord::HasNumPasses\r
+\r
+### Changed\r
+- VirtualPolymeraseBamRecord::VirtualRegionsTable(type) returns an empty vector\r
+of regions if none are associated with the requested type, instead of throwing.\r
+\r
+## [0.4.0] - 2015-12-15\r
+\r
+### Changed\r
+- Redesigned PbiFilter interface and backend. Previous implementation did not\r
+scale well as intermediate results were far too unwieldy. This redesign provides\r
+speedups of orders of magnitude in many cases.\r
+\r
+## [0.3.2] - 2015-12-10\r
+\r
+### Added \r
+- Support for ReadGroupInfo sequencing chemistry data.\r
+InvalidSequencingChemistryException thrown if an unsupported combination is\r
+encountered.\r
+- VirtualPolymeraseCompositeReader - for re-stitching records, across multiple\r
+resources (e.g. from DataSetXML). Reader respects DataSet filter criteria.\r
+\r
+## [0.3.1] - 2015-10-30\r
+\r
+### Added\r
+- ZmwWhitelistVirtualReader: similar to VirtualPolymeraseReader but restricts\r
+iteration to a whitelist of ZMW hole numbers, leveraging PBI index data for\r
+random-access.\r
+\r
+### Fixed\r
+- Fixed error in PBI construction, in which entire file sections (e.g.\r
+BarcodeData or MappedData) were being dropped when any one record lacked data.\r
+Correct behavior is to allow file section omission if all records lack that\r
+data type.\r
+\r
+## [0.3.0] - 2015-10-29\r
+\r
+### Fixed\r
+- Improper reporting of current offset from multi-threaded BamWriter. This had\r
+the effect of creating broken PBIs that were written alongside the BAM. Added a\r
+flush step, which incurs a performance hit, but restores correctness.\r
+\r
+## [0.2.4] - 2015-10-26\r
+\r
+### Fixed\r
+- Empty PbiFilter now returns all records, instead of filtering away all records.\r
+\r
+## [0.2.3] - 2015-10-26\r
+\r
+### Added/Fixed\r
+- Syncing DataSetXML across APIs. Primary changes include output of Version\r
+attribute ("3.0.1") on appropriate elements, as well as resolution of namespace\r
+issues.\r
+\r
+## [0.2.2] - 2015-10-22\r
+\r
+### Added\r
+- Added BAI bin calculation to BamWriter::Write, to ensure maximal compatibility\r
+with downstream tools (e.g. 'samtools index'). A new BinCalculationMode enum\r
+flag in BamWriter constructor controls whether this behavior is enabled [default]\r
+or not.\r
+\r
+## [0.2.1] - 2015-10-19\r
+\r
+### Added\r
+- Exposed the following classes to public API:\r
+  - BamReader\r
+  - BaiIndexedBamReader\r
+  - PbiIndexedBamReader\r
+  - GenomicIntervalCompositeBamReader\r
+  - PbiFilterCompositeBamReader\r
+\r
+## [0.2.0] - 2015-10-09\r
+\r
+### Changed\r
+- BAM spec v3.0.1 compliance. Previous (beta) versions of the BAM spec are not\r
+supported and will cause an exception to be thrown if encountered.\r
+- PBI lookup interface & backend, see PbiIndex.h & PbiLookupData.h for details.\r
+\r
+### Added \r
+- BamFile::PacBioIndexExists() & BamFile::StandardIndexExists() - query the\r
+existence of index files without auto-building them if they are missing, as in\r
+BamFile::Ensure*IndexExists().\r
+- GenomicInterval now accepts an htslib/samtools-style REGION string in the\r
+constructor: GenomicInterval("chr1:1000-2000"). Please note though, that pbbam\r
+uses 0-based coordinates throughout, whereas samtools expects 1-based. The above\r
+string is equivalent to "chr1:1001-2000" in samtools.\r
+- Built-in PBI filters. See PbiFilter.h & PbiFilterTypes.h for built-in filters\r
+and constructing composite filters. These can be used in conjunction with the\r
+new PbiFilterQuery, which takes a generic PbiFilter and applies that to a\r
+DataSet for iteration.\r
+- New built-in queries: BarcodeQuery, ReadAccuracyQuery, SubreadLengthQuery.\r
+These leverage the new filter API to construct a PbiFilter and apply to a\r
+DataSet.\r
+- Built-in BamRecord comparators that are STL-compatible. See Compare.h for full\r
+list. This allows for statements like the following, which sorts records by ZMW\r
+number:\r
+``` c++\r
+    vector<BamRecord> data;\r
+    std::sort(data.begin(), data.end(), Compare::Zmw());\r
+```\r
+- "exciseSoftClips" option to BamRecord::CigarData()\r
+\r
+## [0.1.0] - 2015-07-17\r
+\r
+### Changed\r
+- BAM spec v3.0b7 compliance\r
+ - Removal of 'M' as allowed CIGAR operation. Attempt to use such a CIGAR op\r
+ will throw an exception.\r
+ - Addition of IPD/PulseWidth codec version info in header\r
+  \r
+### Added\r
+- Auto-generation of UTC timestamp for DataSet objects\r
+- PbiBuilder - allows generation of PBI index data alongside generation or\r
+modification of BAM record data. This obviates the need to wait for a completed\r
+BAM, then go through the zlib decompression, etc.\r
+- Added DataSet::FromXml(string xml) to create DataSets from "raw" XML string,\r
+rather than building up using DataSet API or loading from existing file.\r
+- "pbindex" command line tool to generate ".pbi" files from BAM data. The\r
+executable is built by default, but can be disabled using the cmake option\r
+"-DPacBioBAM_build_pbindex=OFF".\r
+  \r
+### Fixed\r
+- PBI construction failing on CCS reads\r
+\r
+## [0.0.8] - 2015-07-02\r
+\r
+### Changed\r
+- Build system refactoring.\r
+\r
+## [0.0.7] - 2015-07-02\r
+\r
+### Added\r
+- PBI index lookup API. Not so much intended for client use directly, but will\r
+enable construction of higher-level semantic queries: grouping by, filtering,\r
+etc.\r
+- DataSet & PBI-aware queries (e.g. ZmwGroupQuery). More PBI-enabled queries to\r
+follow.\r
+- More flexibility in tag access. Samtools has a habit of performing a\r
+"shrink-to-fit" when it handles integer-valued tag data. Thus we cannot\r
+**guarantee** the binary type that our API will have to process. Safe\r
+conversions are allowed on integer-like data only. Under- or overflows in\r
+casting will trigger an exception. All other tag data types must be asked for\r
+explicitly, or else an exception will be raised, as before.\r
+- BamHeader::DeepCopy - allows creation of editable header data, without\r
+overwriting all shared instances\r
+\r
+### Fixed\r
+- XSD compliance for DataSet APIs.\r
+\r
+### Changed\r
+- The functionality provided by ZmwQuery (group by hole number), is now\r
+available using the ZmwGroupQuery object. The new ZmwQuery returns a single-\r
+record iterator (a la EntireFileQuery), but limited to a whitelist of requested\r
+hole numbers.\r
+\r
+### Removed\r
+- XSD non-compliant classes (e.g. ExternalDataReference)\r
+\r
+## [0.0.6] - 2015-06-07\r
+\r
+### Added\r
+\r
+- Accessor methods for pulse bam support:\r
+ - LabelQV()\r
+ - AltLabelQV()\r
+ - LabelTag()\r
+ - AltLabelTag()\r
+ - Pkmean()\r
+ - Pkmid()\r
+ - PrePulseFrames() only RC, no clipping\r
+ - PulseCallWidth() only RC, no clipping\r
+ - PulseCall() case-sensitive RC, no clipping\r
+ - IPDRaw() to avoid up and downscaling for stitching\r
+- BamRecord::ParseTagName and BamRecord::ParseTagString to convert a two \r
+  character tag string to a TagName enum and back. Allows a switch over tags.\r
+- VirtualPolymeraseReader to create VirtualPolymeraseBamRecord from a \r
+  subreads|hqregion+scraps.bam\r
+- VirtualRegion represents annotations of the polymerase reads, for adapters, \r
+  barcodes, lqregions, and hqregions.\r
+- ReadGroupInfo operator== \r
+\r
+### Fixed\r
+\r
+- Reimplemented QueryStart(int), QueryEnd(int), UpdateName(void), \r
+  ReadGroup(ReadGroupInfo&), ReadGroupId(std::string&);\r
+\r
+## [0.0.5] - 2015-05-29\r
+\r
+### Added\r
+\r
+- DataSet support. This includes XML I/O, basic dataset query/manipulation, and\r
+multi-BAM-file queries. New classes are located in <pbbam/dataset/>. DataSet-\r
+capable queries currently reside in the PacBio::BAM::staging namespace. These\r
+will be ported over to the main namespace once the support is stabilized and\r
+works seamlessly with either a single BamFile or DataSet object as input. (bug\r
+25941)\r
+- PBI support. This includes read/write raw data & building from a BamFile. The\r
+lookup API for random-access queries is under development, but the raw data is\r
+available - for creating PBI files & generating summary statistics. (bug 26025)\r
+- C# SWIG bindings, alongside existing Python and R wrappers.\r
+- LocalContextFlags support in BamRecord (bug 26623)\r
+\r
+### Fixed\r
+\r
+- BamRecord[Impl] map quality now initialized with 255 (missing) value, instead\r
+of 0. (bug 26228)\r
+- ReadGroupId calculation. (bug 25940)\r
+  \r
+## [0.0.4] - 2015-04-22\r
+\r
+### Added\r
+\r
+- This changelog. Hope it helps.\r
+- Hook to set verbosity of underlying htslib warnings.\r
+- Grouped queries. (bug 26361)\r
+\r
+### Changed\r
+\r
+- Now using exceptions instead of return codes, output parameters, etc.\r
+- Removed "messy" shared_ptrs across interface (see especially BamHeader). These\r
+are now taken care of within the API, not exposed to client code.\r
+\r
+### Removed\r
+\r
+- BamReader \r
+\r
+### Fixed\r
+\r
+- ASCII tag output. (bug 26381)\r
diff --git a/CMakeLists.txt b/CMakeLists.txt

new file mode 100644 (file)

index 0000000..61ae3ac
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,68 @@
+########################################################################
+# CMake build script for PacBioBAM library.
+########################################################################
+
+cmake_minimum_required(VERSION 3.0)  # must precede project() so policies are in effect for it
+cmake_policy(SET CMP0048 NEW)  # lets us set version in project()
+project(PacBioBAM VERSION 0.19.0 LANGUAGES CXX C)
+
+# project name & version (version components are populated by project(... VERSION ...) above)
+set(PacBioBAM_NAME pbbam)
+set(PacBioBAM_VERSION
+  "${PacBioBAM_VERSION_MAJOR}.${PacBioBAM_VERSION_MINOR}.${PacBioBAM_VERSION_PATCH}"
+)
+
+# list build-time options
+option(PacBioBAM_build_docs    "Build PacBioBAM's API documentation."                   ON)
+option(PacBioBAM_build_tests   "Build PacBioBAM's unit tests."                          ON)
+option(PacBioBAM_build_shared  "Build PacBioBAM as shared library as well."             OFF)
+option(PacBioBAM_build_tools   "Build PacBioBAM command line utilities (e.g. pbindex)"  ON)
+option(PacBioBAM_use_modbuild  "Build PacBioBAM using Modular Build System."            OFF)
+option(PacBioBAM_use_ccache    "Build PacBioBAM using ccache, if available."            ON)
+option(PacBioBAM_auto_validate "Build PacBioBAM with auto-validation enabled."          OFF)
+
+if(PacBioBAM_build_tests)
+    enable_testing()
+endif()
+
+set(PacBioBAM_permissive_cigar  OFF)
+STRING(TOLOWER "${PBBAM_PERMISSIVE_CIGAR}" PBBAM_PERMISSIVE_CIGAR_LOWER)  # NOTE(review): not referenced again in this file; may be used by included modules - confirm
+if (DEFINED PBBAM_PERMISSIVE_CIGAR)  # NOTE(review): true for ANY definition, even -DPBBAM_PERMISSIVE_CIGAR=OFF - confirm intended
+    set(PacBioBAM_permissive_cigar  ON)
+    add_definitions(-DPBBAM_PERMISSIVE_CIGAR)
+endif()
+
+# project paths (all source locations are relative to the directory holding this file)
+set(PacBioBAM_RootDir       ${CMAKE_CURRENT_LIST_DIR})
+set(PacBioBAM_DocsDir       ${PacBioBAM_RootDir}/docs)
+set(PacBioBAM_IncludeDir    ${PacBioBAM_RootDir}/include)
+set(PacBioBAM_SourceDir     ${PacBioBAM_RootDir}/src)
+set(PacBioBAM_TestsDir      ${PacBioBAM_RootDir}/tests)
+set(PacBioBAM_ThirdPartyDir ${PacBioBAM_RootDir}/third-party)
+set(PacBioBAM_ToolsDir      ${PacBioBAM_RootDir}/tools)
+
+if(NOT PacBioBAM_OutputDir)
+    set(PacBioBAM_OutputDir ${CMAKE_CURRENT_BINARY_DIR})
+endif()
+set(PacBioBAM_BinDir    ${PacBioBAM_OutputDir}/bin)
+set(PacBioBAM_LibDir    ${PacBioBAM_OutputDir}/lib)
+
+set(GeneratedDir ${CMAKE_BINARY_DIR}/generated)
+set(GeneratedTestDataDir ${GeneratedDir}/data)
+file(MAKE_DIRECTORY ${PacBioBAM_BinDir})
+file(MAKE_DIRECTORY ${PacBioBAM_LibDir})
+file(MAKE_DIRECTORY ${GeneratedDir})
+file(MAKE_DIRECTORY ${GeneratedTestDataDir})
+
+# project configuration (keep this order)
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake ${CMAKE_MODULE_PATH})
+include(pbbam-ccache)
+include(pbbam-compilerflags)
+include(pbbam-libtype)
+include(pbbam-dependencies)
+
+# project components (keep this order)
+add_subdirectory(src)
+add_subdirectory(tools)
+add_subdirectory(docs)
+add_subdirectory(tests)
diff --git a/INSTALL.md b/INSTALL.md

new file mode 100644 (file)

index 0000000..86dddda
--- /dev/null
+++ b/INSTALL.md
@@ -0,0 +1,3 @@
+# PacBio::BAM - building & integrating\r
+\r
+Detailed build instructions can be found [here](http://pbbam.readthedocs.org/en/latest/getting_started.html).\r
diff --git a/LICENSE.txt b/LICENSE.txt

new file mode 100644 (file)

index 0000000..fc6affb
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,34 @@
+Copyright (c) 2014-2018, Pacific Biosciences of California, Inc.
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted (subject to the limitations in the
+disclaimer below) provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following
+   disclaimer in the documentation and/or other materials provided
+   with the distribution.
+
+ * Neither the name of Pacific Biosciences nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
diff --git a/README.md b/README.md

new file mode 100644 (file)

index 0000000..c9db996
--- /dev/null
+++ b/README.md
@@ -0,0 +1,62 @@
+# pbbam
+
+[![Build Status](https://travis-ci.org/PacificBiosciences/pbbam.svg?branch=master)](https://travis-ci.org/PacificBiosciences/pbbam) [![Documentation Status](https://readthedocs.org/projects/pbbam/badge/?version=latest)](http://pbbam.readthedocs.org/en/latest/?badge=latest)
+
+As of the 3.0 release of SMRTanalysis, PacBio is embracing the industry standard BAM
+format for (both aligned and unaligned) basecall data files. We have also formulated
+a BAM companion file format (bam.pbi) enabling fast access to a richer set of per-read
+information as well as compatibility for software built around the legacy cmp.h5 format.
+
+The **pbbam** software package provides components to create, query, & edit PacBio BAM
+files and associated indices. These components include a core C++ library, bindings for
+additional languages, and command-line utilities.
+
+### Note:
+
+This library is **not** intended to be used as a general-purpose BAM utility - all input & output BAMs must adhere to the [PacBio BAM format specification](https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/BAM.rst). Non-PacBio BAMs will cause exceptions to be thrown.
+
+##  Documentation
+
+  - [Documentation Home](http://pbbam.readthedocs.org/en/latest/index.html)
+    - [Getting Started](http://pbbam.readthedocs.org/en/latest/getting_started.html)
+    - [C++ API Reference](http://pbbam.readthedocs.org/en/latest/api_reference.html)
+
+  - [Changelog](https://github.com/PacificBiosciences/pbbam/blob/master/CHANGELOG.md)
+
+## FAQ
+
+### [Help! I am getting "unsupported sequencing chemistry combination"!](#chemistry-bundle)
+
+**pbbam** validates all BAM files, and as part of this validation, it checks whether the
+`BindingKit` and `SequencingKit` variables in every ReadGroup of the provided BAM file are
+known. As part of ongoing chemistry developments, we might need to introduce new part numbers
+to identify novel reagents and/or SMRT Cells. You are unlikely to encounter such issues
+when using SMRT Link, as it has an integrated auto-updater that will periodically check and
+install new chemistries automatically. All PacBio tools being used without a proper SMRT Link
+installation will require manual intervention to download new chemistries:
+
+  ```sh
+  cd <some persistent dir>
+  export SMRT_CHEMISTRY_BUNDLE_DIR="${PWD}"
+
+  wget https://raw.githubusercontent.com/PacificBiosciences/pbcore/develop/pbcore/chemistry/resources/mapping.xml -O chemistry.xml
+  ```
+
+This will cause **pbbam** to try to load the out-of-band `chemistry.xml` from
+`SMRT_CHEMISTRY_BUNDLE_DIR` and should allow you to use somewhat older software
+with somewhat newer BAMs. **Note:** this only allows **pbbam**'s internal validation
+to pass, this will not automatically make other chemistry-dependent software work
+with newer chemistries. For instance, Arrow's backend ([Unanimity](https://github.com/PacificBiosciences/unanimity))
+is parametrized on chemistry too, and it will fail should a completely new chemistry
+be introduced. See Unanimity's FAQ on how to employ `SMRT_CHEMISTRY_BUNDLE_DIR`
+to load models for new chemistries.
+
+
+## License
+
+ - [PacBio open source license](https://github.com/PacificBiosciences/pbbam/blob/master/LICENSE.txt)
+
+DISCLAIMER
+----------
+THIS WEBSITE AND CONTENT AND ALL SITE-RELATED SERVICES, INCLUDING ANY DATA, ARE PROVIDED "AS IS," WITH ALL FAULTS, WITH NO REPRESENTATIONS OR WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, SATISFACTORY QUALITY, NON-INFRINGEMENT OR FITNESS FOR A PARTICULAR PURPOSE. YOU ASSUME TOTAL RESPONSIBILITY AND RISK FOR YOUR USE OF THIS SITE, ALL SITE-RELATED SERVICES, AND ANY THIRD PARTY WEBSITES OR APPLICATIONS. NO ORAL OR WRITTEN INFORMATION OR ADVICE SHALL CREATE A WARRANTY OF ANY KIND. ANY REFERENCES TO SPECIFIC PRODUCTS OR SERVICES ON THE WEBSITES DO NOT CONSTITUTE OR IMPLY A RECOMMENDATION OR ENDORSEMENT BY PACIFIC BIOSCIENCES.
+
diff --git a/bamboo_build.sh b/bamboo_build.sh

new file mode 100644 (file)

index 0000000..2f6c06e
--- /dev/null
+++ b/bamboo_build.sh
@@ -0,0 +1,125 @@
+#!/usr/bin/env bash
+set -vex
+
+################
+# DEPENDENCIES #
+################
+
+## Load modules
+type module >& /dev/null || . /mnt/software/Modules/current/init/bash
+
+module purge
+
+module load meson
+module load ninja
+
+module load zlib
+module load htslib
+module load samtools
+
+module load boost
+
+module load cram
+
+
+BOOST_ROOT="${BOOST_ROOT%/include}"
+# unset these variables to have meson discover all
+# boost-dependent variables from BOOST_ROOT alone
+unset BOOST_INCLUDEDIR
+unset BOOST_LIBRARYDIR
+
+export CC="ccache gcc"
+export CXX="ccache g++"
+export CCACHE_BASEDIR="${PWD}"
+
+if [[ $USER == bamboo ]]; then
+  export CCACHE_DIR=/mnt/secondary/Share/tmp/bamboo.${bamboo_shortPlanKey}.ccachedir
+  export CCACHE_TEMPDIR=/scratch/bamboo.ccache_tempdir
+fi
+
+case "${bamboo_planRepository_branchName}" in
+  develop|master)
+    export PREFIX_ARG="/mnt/software/p/pbbam/${bamboo_planRepository_branchName}"
+    export BUILD_NUMBER="${bamboo_globalBuildNumber:-0}"
+    ;;
+  *)
+    export BUILD_NUMBER="0"
+    ;;
+esac
+
+# in order to make shared libraries consumable
+# by conda and other package managers
+export LDFLAGS="-static-libstdc++ -static-libgcc"
+
+# i : unity build
+for i in "on" "off"; do
+  for j in "system-gcc" "gcc/8.1.0" "gcc"; do
+    # 1. load either current MOBS GCC or RHEL-default GCC
+    if [[ ${j} == system-gcc ]]; then
+      module load gtest/gcc48
+    else
+      module load ${j} gtest
+    fi
+    module load ccache
+
+    export CURRENT_BUILD_DIR="build_unity=${i^^}_gcc=${j/\//_}"
+    export ENABLED_TESTS="true"
+    export ENABLED_UNITY_BUILD="${i}"
+
+    bash scripts/ci/build.sh
+    bash scripts/ci/test.sh
+
+    module unload ccache gtest
+    [[ ${j} != system-gcc ]] && module unload gcc
+  done
+done
+
+# create symlink so Bamboo can find the xunit output
+ln -s "${CURRENT_BUILD_DIR}" build
+
+if [[ -z ${PREFIX_ARG+x} ]]; then
+  echo "Not installing anything (branch: ${bamboo_planRepository_branchName}), exiting."
+  exit 0
+fi
+
+bash scripts/ci/install.sh
+
+if [[ ${BUILD_NUMBER} == 0 ]]; then
+  echo "Build number is 0, hence not creating artifact"
+  exit 0
+fi
+
+echo "## Creating artifact"
+# install into staging dir with --prefix /usr/local
+# in order to sidestep all the artifact policy
+rm -rf staging
+meson configure -Dprefix=/usr/local -Dtests=false "${CURRENT_BUILD_DIR}"
+DESTDIR="${PWD}/staging" ninja -C "${CURRENT_BUILD_DIR}" -v install
+
+if [[ ${BUILD_NUMBER} = 0 ]]; then
+  exit 0
+elif [[ $bamboo_planRepository_branchName == master ]]; then
+  VERSION="$(${CURRENT_BUILD_DIR}/tools/bam2sam --version)".${BUILD_NUMBER}
+  NEXUS_REPO=maven-releases
+elif [[ $bamboo_planRepository_branchName == develop ]]; then
+  VERSION="$(${CURRENT_BUILD_DIR}/tools/bam2sam --version)".SNAPSHOT${BUILD_NUMBER}
+  NEXUS_REPO=maven-snapshots
+  rm -rf /mnt/secondary/builds/unsupported/pbbam.previous
+  if [[ -e /mnt/secondary/builds/unsupported/pbbam ]]; then
+    mv /mnt/secondary/builds/unsupported/pbbam \
+       /mnt/secondary/builds/unsupported/pbbam.previous
+  fi
+  DESTDIR="/mnt/secondary/builds/unsupported/pbbam/" ninja -C "${CURRENT_BUILD_DIR}" -v install
+else
+  exit 0
+fi
+NEXUS_VERSION="$(${CURRENT_BUILD_DIR}/tools/bam2sam --version)".${BUILD_NUMBER}
+
+( cd staging && tar zcf ../pbbam-${VERSION}-x86_64.tgz . )
+md5sum  pbbam-${VERSION}-x86_64.tgz | awk -e '{print $1}' >| pbbam-${VERSION}-x86_64.tgz.md5
+sha1sum pbbam-${VERSION}-x86_64.tgz | awk -e '{print $1}' >| pbbam-${VERSION}-x86_64.tgz.sha1
+
+NEXUS_URL=http://ossnexus.pacificbiosciences.com/repository/${NEXUS_REPO}/pacbio/sat/pbbam/pbbam/${NEXUS_VERSION}
+curl -vn --upload-file pbbam-${VERSION}-x86_64.tgz      ${NEXUS_URL}/gcc-6.4.0/pbbam-${VERSION}-x86_64.tgz
+curl -vn --upload-file pbbam-${VERSION}-x86_64.tgz.md5  ${NEXUS_URL}/gcc-6.4.0/pbbam-${VERSION}-x86_64.tgz.md5
+curl -vn --upload-file pbbam-${VERSION}-x86_64.tgz.sha1 ${NEXUS_URL}/gcc-6.4.0/pbbam-${VERSION}-x86_64.tgz.sha1
diff --git a/bamboo_build_itg.sh b/bamboo_build_itg.sh

new file mode 100755 (executable)

index 0000000..6e27e06
--- /dev/null
+++ b/bamboo_build_itg.sh
@@ -0,0 +1,80 @@
+#!/bin/bash -vex
+type module >& /dev/null || . /mnt/software/Modules/current/init/bash
+
+module load cmake
+module load ccache
+export CCACHE_DIR="/mnt/secondary/Share/tmp/bamboo.${bamboo_shortPlanKey}.ccache"
+
+HTSLIB_VERSION=$(/bin/ls -d src/htslib-*|sed -e 's/.*htslib-//'|sort -V|tail -1)
+PBBAM_VERSION=$(grep 'PacBioBAM VERSION ' src/pbbam/CMakeLists.txt|sed -e 's/.*VERSION //'|awk '{print $1}')
+# project(PacBioBAM VERSION 0.14.0 LANGUAGES CXX C)
+BUILD_NUMBER=0
+if [ -n "$bamboo_planRepository_branchName" ]; then
+  BUILD_NUMBER=${bamboo_globalBuildNumber:-0}
+fi
+
+rm -f *.tgz || true
+rm -rf pbbam* || true
+rm -rf prefix && mkdir -p prefix
+PREFIX=$(readlink -f prefix)
+cd src/htslib-${HTSLIB_VERSION}
+export CCACHE_BASEDIR=$PWD
+make distclean
+CC=gcc CFLAGS='-fPIC -O' bash ./configure --prefix=$PWD/../../prefix --disable-bz2 --disable-lzma --disable-libcurl
+VERBOSE=1 make install
+rm -rf $PWD/../../prefix/lib/pkgconfig
+
+cd -
+cd src/pbbam
+export CCACHE_BASEDIR=$PWD
+rm -rf build && mkdir -p build
+cd build
+curl -sL -O http://nexus/repository/maven-thirdparty/libquadmath/4.8.5-11/libquadmath-devel-4.8.5-11.el7.x86_64.rpm
+rpm2cpio libquadmath-devel-4.8.5-11.el7.x86_64.rpm | cpio -vid
+CXXFLAGS="-fPIC -I$PWD/usr/lib/gcc/x86_64-redhat-linux/4.8.5/include" \
+CFLAGS="-fPIC" \
+cmake \
+  -DCMAKE_CXX_COMPILER="g++" \
+  -DCMAKE_C_COMPILER="gcc" \
+  -DPacBioBAM_build_shared=ON \
+  -DPacBioBAM_build_docs=OFF \
+  -DPacBioBAM_build_tests=OFF \
+  -DHTSLIB_INCLUDE_DIRS=$PREFIX/include \
+  -DHTSLIB_LIBRARIES=$PREFIX/lib/libhts.a \
+  -DBoost_INCLUDE_DIRS=/mnt/software/b/boost/1.60/include \
+  -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+  -DCMAKE_SKIP_BUILD_RPATH=FALSE ..
+VERBOSE=1 make -j
+tar c bin lib | tar xv -C ../../../prefix/
+cd ..
+tar c include/pbbam | tar xv -C ../../prefix/
+cd ../..
+
+if [ ! -n "$bamboo_planRepository_branchName" ]; then
+  SNAPSHOT="_branch_"
+fi
+if [ "$bamboo_planRepository_branchName" = "develop" ]; then
+  SNAPSHOT="SNAPSHOT"
+elif [ "$bamboo_planRepository_branchName" = "master" ]; then
+  SNAPSHOT=""
+else
+  SNAPSHOT="_branch_"
+fi
+
+rsync -ax --delete prefix/ pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}/
+
+tar zcf pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}
+sha1sum pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz | awk -e '{print $1}' >| pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.sha1
+md5sum  pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz | awk -e '{print $1}' >| pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.md5
+
+if [ "$bamboo_planRepository_branchName" = "develop" ]; then
+  NEXUS_URL=http://ossnexus.pacificbiosciences.com/repository/maven-snapshots/pacbio/itg/pbbam/${PBBAM_VERSION}.${BUILD_NUMBER}
+elif [ "$bamboo_planRepository_branchName" = "master" ]; then
+  NEXUS_URL=http://ossnexus.pacificbiosciences.com/repository/maven-releases/pacbio/itg/pbbam/${PBBAM_VERSION}.${BUILD_NUMBER}
+else
+  echo "[INFO] pbbam-${PBBAM_VERSION}.SNAPSHOT${BUILD_NUMBER}-x86_64.tgz if the branch was develop"
+  exit
+fi
+curl -L -fvn --upload-file pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.md5  $NEXUS_URL/pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.md5
+curl -L -fvn --upload-file pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.sha1 $NEXUS_URL/pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.sha1
+curl -L -fvn --upload-file pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz      $NEXUS_URL/pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz
diff --git a/bamboo_build_pa.sh b/bamboo_build_pa.sh

new file mode 100755 (executable)

index 0000000..0ee2484
--- /dev/null
+++ b/bamboo_build_pa.sh
@@ -0,0 +1,144 @@
+#!/bin/bash -vex
+type module >& /dev/null || . /mnt/software/Modules/current/init/bash
+
+module use /mnt/software/modulefiles
+module use /pbi/dept/primary/modulefiles
+
+module load cmake
+module load ccache
+export CCACHE_DIR="/mnt/secondary/Share/tmp/bamboo.${bamboo_shortPlanKey}.ccache"
+# module load composer_xe/2017.4.196
+
+HTSLIB_VERSION=$(/bin/ls -d src/htslib-*|sed -e 's/.*htslib-//'|sort -V|tail -1)
+PBBAM_VERSION=$(grep 'PacBioBAM VERSION ' src/pbbam/CMakeLists.txt|sed -e 's/.*VERSION //'|awk '{print $1}')
+# project(PacBioBAM VERSION 0.14.0 LANGUAGES CXX C)
+BUILD_NUMBER=0
+if [ -n "$bamboo_planRepository_branchName" ]; then
+  BUILD_NUMBER=${bamboo_globalBuildNumber:-0}
+fi
+
+rm -f *.tgz || true
+rm -rf pbbam* || true
+# rm -rf prefix && mkdir -p prefix
+# cd src/htslib-${HTSLIB_VERSION}
+# export CCACHE_BASEDIR=$PWD
+# CC=icc CXX=icpc \
+# CFLAGS='-fPIC -Os' bash ./configure --prefix=$PWD/../../prefix
+# VERBOSE=1 make CC='ccache icc' install
+# rm -rf $PWD/../../prefix/lib/pkgconfig
+# 
+# cd -
+# cd src/pbbam
+# export CCACHE_BASEDIR=$PWD
+# rm -rf build && mkdir -p build
+# cd build
+# cmake \
+#   -DCMAKE_CXX_COMPILER="ccache" \
+#   -DCMAKE_CXX_COMPILER_ARG1="icpc -fPIC" \
+#   -DCMAKE_C_COMPILER="ccache" \
+#   -DCMAKE_C_COMPILER_ARG1="icc -fPIC" \
+#   -DPacBioBAM_build_shared=OFF \
+#   -DPacBioBAM_build_docs=OFF \
+#   -DPacBioBAM_build_tests=OFF \
+#   -DHTSLIB_INCLUDE_DIRS=$PWD/../../../prefix/include \
+#   -DHTSLIB_LIBRARIES=$PWD/../../../prefix/lib/libhts.a \
+#   -DBoost_INCLUDE_DIRS=/mnt/software/b/boost/1.60/include \
+#   -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+#   -DCMAKE_SKIP_BUILD_RPATH=FALSE ..
+# VERBOSE=1 make -j
+# tar c bin lib | tar xv -C ../../../prefix/
+# cd ..
+# tar c include/pbbam | tar xv -C ../../prefix/
+# cd ../..
+# 
+# if [ ! -n "$bamboo_planRepository_branchName" ]; then
+#   SNAPSHOT="_branch_"
+# fi
+# if [ "$bamboo_planRepository_branchName" = "develop" ]; then
+#   SNAPSHOT="SNAPSHOT"
+# elif [ "$bamboo_planRepository_branchName" = "master" ]; then
+#   SNAPSHOT=""
+# else
+#   SNAPSHOT="_branch_"
+# fi
+# 
+# rsync -ax --delete prefix/ pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}/
+# 
+# tar zcf pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}
+# sha1sum pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz | awk -e '{print $1}' >| pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.sha1
+# md5sum  pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz | awk -e '{print $1}' >| pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.md5
+# if [ "$bamboo_planRepository_branchName" = "develop" -o "$bamboo_planRepository_branchName" = "master" ]; then
+#   NEXUS_URL=http://ossnexus.pacificbiosciences.com/repository/maven-snapshots/pacbio/seq/pa/pbbam/${PBBAM_VERSION}.${BUILD_NUMBER}
+#   curl -L -fvn --upload-file pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.md5  $NEXUS_URL/pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.md5
+#   curl -L -fvn --upload-file pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.sha1 $NEXUS_URL/pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.sha1
+#   curl -L -fvn --upload-file pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz      $NEXUS_URL/pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz
+# else
+#   echo "[INFO] pbbam-${PBBAM_VERSION}.SNAPSHOT${BUILD_NUMBER}-x86_64.tgz if the branch is develop"
+# fi
+
+# GCC BUILD
+
+# module unload composer_xe/2017.4.196
+# module load gcc/4.9.2
+rm -rf prefix && mkdir -p prefix
+cd src/htslib-${HTSLIB_VERSION}
+export CCACHE_BASEDIR=$PWD
+make distclean
+CC=gcc CFLAGS='-fPIC -O' bash ./configure --prefix=$PWD/../../prefix --disable-bz2 --disable-lzma --disable-libcurl
+VERBOSE=1 make install
+rm -rf $PWD/../../prefix/lib/pkgconfig
+
+cd -
+cd src/pbbam
+export CCACHE_BASEDIR=$PWD
+rm -rf build && mkdir -p build
+cd build
+curl -sL -O http://nexus/repository/maven-thirdparty/libquadmath/4.8.5-11/libquadmath-devel-4.8.5-11.el7.x86_64.rpm
+rpm2cpio libquadmath-devel-4.8.5-11.el7.x86_64.rpm | cpio -vid
+CXXFLAGS="-fPIC -I$PWD/usr/lib/gcc/x86_64-redhat-linux/4.8.5/include" \
+CFLAGS="-fPIC" \
+cmake \
+  -DCMAKE_CXX_COMPILER="g++" \
+  -DCMAKE_C_COMPILER="gcc" \
+  -DPacBioBAM_build_shared=OFF \
+  -DPacBioBAM_build_docs=OFF \
+  -DPacBioBAM_build_tests=OFF \
+  -DHTSLIB_INCLUDE_DIRS=$PWD/../../../prefix/include \
+  -DHTSLIB_LIBRARIES=$PWD/../../../prefix/lib/libhts.a \
+  -DBoost_INCLUDE_DIRS=/mnt/software/b/boost/1.60/include \
+  -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+  -DCMAKE_SKIP_BUILD_RPATH=FALSE ..
+VERBOSE=1 make -j
+tar c bin lib | tar xv -C ../../../prefix/
+cd ..
+tar c include/pbbam | tar xv -C ../../prefix/
+cd ../..
+
+if [ ! -n "$bamboo_planRepository_branchName" ]; then
+  SNAPSHOT="_branch_"
+fi
+if [ "$bamboo_planRepository_branchName" = "develop" ]; then
+  SNAPSHOT="SNAPSHOT"
+elif [ "$bamboo_planRepository_branchName" = "master" ]; then
+  SNAPSHOT=""
+else
+  SNAPSHOT="_branch_"
+fi
+
+rsync -ax --delete prefix/ pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}/
+
+tar zcf pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}
+sha1sum pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz | awk -e '{print $1}' >| pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.sha1
+md5sum  pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz | awk -e '{print $1}' >| pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.md5
+if [ "$bamboo_planRepository_branchName" = "develop" ]; then
+  NEXUS_URL=http://ossnexus.pacificbiosciences.com/repository/maven-snapshots/pacbio/seq/pa/pbbam/${PBBAM_VERSION}.${BUILD_NUMBER}
+elif [ "$bamboo_planRepository_branchName" = "master" ]; then
+  NEXUS_URL=http://ossnexus.pacificbiosciences.com/repository/maven-releases/pacbio/seq/pa/pbbam/${PBBAM_VERSION}.${BUILD_NUMBER}
+else
+  echo "[INFO] pbbam-${PBBAM_VERSION}.SNAPSHOT${BUILD_NUMBER}-x86_64.tgz if the branch was develop"
+  echo "[INFO] pbbam-${PBBAM_VERSION}.${BUILD_NUMBER}-x86_64.tgz if the branch was master"
+  exit
+fi
+curl -L -fvn --upload-file pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.md5  $NEXUS_URL/pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.md5
+curl -L -fvn --upload-file pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.sha1 $NEXUS_URL/pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz.sha1
+curl -L -fvn --upload-file pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz      $NEXUS_URL/pbbam-${PBBAM_VERSION}.${SNAPSHOT}${BUILD_NUMBER}-x86_64.tgz
diff --git a/bamboo_coverage.sh b/bamboo_coverage.sh

new file mode 100644 (file)

index 0000000..1e40169
--- /dev/null
+++ b/bamboo_coverage.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+set -xve
+
+source /mnt/software/Modules/current/init/bash
+module load gcc meson ccache ninja zlib htslib samtools cram boost gtest gcov
+
+echo "#####################"
+echo "# BUILD & RUN TESTS #"
+echo "#####################"
+
+rm -rf build
+mkdir build 
+cd build
+
+meson \
+  --werror \
+  --backend ninja \
+  --buildtype debug \
+  --default-library shared \
+  --libdir lib \
+  --wrap-mode nofallback \
+  --prefix "${PREFIX_ARG:-/usr/local}" \
+  -Db_coverage=true \
+  ..
+
+ninja test
+
+echo "################"
+echo "# COVERAGE     #"
+echo "################"
+
+find . -type f -iname '*.o' | xargs gcov -acbrfu {} \; >/dev/null && \
+mkdir coverage && pushd coverage && mv ../*.gcov . && \
+sed -i -e 's@Source:@Source:../@' *.gcov && \
+sed -i -e 's@Graph:@Graph:../@' *.gcov && \
+sed -i -e 's@Data:@Data:../@' *.gcov && \
+rm pugixml*
+
diff --git a/cmake/FindHTSlib.cmake b/cmake/FindHTSlib.cmake

new file mode 100644 (file)

index 0000000..72cce35
--- /dev/null
+++ b/cmake/FindHTSlib.cmake
@@ -0,0 +1,11 @@
+# Find HTSlib
+find_package(PkgConfig REQUIRED)
+pkg_check_modules(HTSlib REQUIRED htslib)
+
+# because CMake is trying to be extra clever,
+# it will not properly load libraries with
+# absolute paths in *_LIBRARIES
+set(HTSlib_LIBRARIES "${HTSlib_LDFLAGS}")
+
+message(STATUS "   HTSlib include dirs: ${HTSlib_INCLUDE_DIRS}")
+message(STATUS "   HTSlib libraries: ${HTSlib_LIBRARIES}")
diff --git a/cmake/PbbamTool.cmake b/cmake/PbbamTool.cmake

new file mode 100644 (file)

index 0000000..daed917
--- /dev/null
+++ b/cmake/PbbamTool.cmake
@@ -0,0 +1,23 @@
+include(CMakeParseArguments)
+
+function(create_pbbam_tool)
+
+    # parse args
+    set(oneValueArgs TARGET)
+    set(multiValueArgs SOURCES)
+    cmake_parse_arguments(create_pbbam_tool "" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    # create executable
+    include_directories(
+        ${ToolsCommonDir}           # shared tool code
+        ${GeneratedDir}             # generated version headers
+        ${PacBioBAM_INCLUDE_DIRS}   # pbbam/htslib includes
+    )
+    add_executable(${create_pbbam_tool_TARGET} ${create_pbbam_tool_SOURCES})
+    set_target_properties(
+        ${create_pbbam_tool_TARGET} PROPERTIES
+        RUNTIME_OUTPUT_DIRECTORY ${PacBioBAM_BinDir}
+    )
+    target_link_libraries(${create_pbbam_tool_TARGET} pbbam)
+
+endfunction(create_pbbam_tool)
diff --git a/cmake/pbbam-ccache.cmake b/cmake/pbbam-ccache.cmake

new file mode 100644 (file)

index 0000000..21b8ac5
--- /dev/null
+++ b/cmake/pbbam-ccache.cmake
@@ -0,0 +1,8 @@
+
+if(PacBioBAM_use_ccache)
+    find_program(CCACHE_FOUND ccache)
+    if(CCACHE_FOUND)
+        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
+        set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK    ccache)
+    endif()
+endif()
diff --git a/cmake/pbbam-compilerflags.cmake b/cmake/pbbam-compilerflags.cmake

new file mode 100644 (file)

index 0000000..3dd1b60
--- /dev/null
+++ b/cmake/pbbam-compilerflags.cmake
@@ -0,0 +1,44 @@
+
+include(CheckCXXCompilerFlag)
+
+# C++14 enabling; pre-3.1 CMake needs the flag by hand ("c++1y" = C++14 spelling old GCC/Clang accept)
+if (CMAKE_VERSION VERSION_LESS "3.1")
+    if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++1y -stdlib=libc++")    # clang
+    else()
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++1y")        # gcc
+    endif()
+else() # 3.1+
+    set(CMAKE_CXX_STANDARD          14)
+    set(CMAKE_CXX_STANDARD_REQUIRED ON)
+endif()
+
+# shared CXX flags for src & tests
+if (MSVC)
+    set(PacBioBAM_CXX_FLAGS "/Wall")
+else()
+    set(PacBioBAM_CXX_FLAGS "-Wall")
+endif()
+
+# NOTE: -Wno-unused-local-typedefs used to quash clang warnings w/ Boost
+check_cxx_compiler_flag("-Wno-unused-local-typedefs" HAS_NO_UNUSED_LOCAL_TYPEDEFS)
+if(HAS_NO_UNUSED_LOCAL_TYPEDEFS)
+    set(PacBioBAM_CXX_FLAGS "${PacBioBAM_CXX_FLAGS} -Wno-unused-local-typedefs")
+endif()
+
+check_cxx_compiler_flag("-Wno-sign-compare" HAS_NO_SIGN_COMPARE)
+if(HAS_NO_SIGN_COMPARE)
+    set(PacBioBAM_CXX_FLAGS "${PacBioBAM_CXX_FLAGS} -Wno-sign-compare")
+endif()
+
+# Turn on windows-style filepath resolution.
+# We need to add this #define early (not just in the C# SWIG wrapper)
+if(WIN32)
+    add_definitions(-DPBBAM_WIN_FILEPATHS)
+endif()
+
+# For now, keep @rpath out of install names on OS X, as it causes SWIG
+# tests to fail.
+if(APPLE)
+    set(CMAKE_MACOSX_RPATH OFF)
+endif()
diff --git a/cmake/pbbam-dependencies.cmake b/cmake/pbbam-dependencies.cmake

new file mode 100644 (file)

index 0000000..8da899d
--- /dev/null
+++ b/cmake/pbbam-dependencies.cmake
@@ -0,0 +1,32 @@
+
+# pthreads
+find_package(Threads REQUIRED)
+
+# boost
+if(NOT Boost_INCLUDE_DIRS)
+    find_package(Boost REQUIRED)
+endif()
+
+# Winsock for htslib on Windows
+if(WIN32)
+    set(SOCKET_LIBRARIES "ws2_32")
+endif()
+
+# zlib
+if (NOT ZLIB_INCLUDE_DIRS OR NOT ZLIB_LIBRARIES)
+    find_package(PkgConfig REQUIRED)
+    pkg_check_modules(ZLIB zlib)
+    set(ZLIB_LIBRARIES ${ZLIB_LDFLAGS})
+else()
+    set(ZLIB_LDFLAGS ${ZLIB_LIBRARIES})
+endif()
+
+# htslib
+if(NOT HTSLIB_INCLUDE_DIRS OR NOT HTSLIB_LIBRARIES)
+    find_package(HTSlib)
+    set(hts_INCLUDE_DIRS ${HTSlib_INCLUDE_DIRS})
+    set(hts_LIBRARIES    ${HTSlib_LIBRARIES})
+else()    
+    set(hts_INCLUDE_DIRS ${HTSLIB_INCLUDE_DIRS})
+    set(hts_LIBRARIES    ${HTSLIB_LIBRARIES})
+endif()
diff --git a/cmake/pbbam-libtype.cmake b/cmake/pbbam-libtype.cmake

new file mode 100644 (file)

index 0000000..827cbe4
--- /dev/null
+++ b/cmake/pbbam-libtype.cmake
@@ -0,0 +1,21 @@
+
+# determine if we need a shared lib
+if(PacBioBAM_build_shared)
+    set(BUILD_SHARED_LIBS ON)
+    set(htslib_build_shared ON CACHE BOOL "force htslibConfig to export proper library name")
+    set(PB_LIB_MODE SHARED)
+    set(PB_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX})
+else()
+    set(BUILD_SHARED_LIBS OFF)
+    set(PB_LIB_MODE STATIC)
+    set(PB_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
+endif()
+
+if(WIN32)
+    # Limit the number of DLLs we will have to bundle (each variable extends its own value)
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libgcc -static-libstdc++")
+    set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -static-libgcc -static-libstdc++")
+endif()
+
+
+
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt

new file mode 100644 (file)

index 0000000..ff044b9
--- /dev/null
+++ b/docs/CMakeLists.txt
@@ -0,0 +1,11 @@
+find_package(Doxygen)
+
+if(DOXYGEN_FOUND)
+    configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in ${PacBioBAM_DocsDir}/Doxyfile @ONLY )
+    add_custom_target(doc
+        ${DOXYGEN_EXECUTABLE} ${PacBioBAM_DocsDir}/Doxyfile
+        WORKING_DIRECTORY ${PacBioBAM_DocsDir}
+        COMMENT "Generating API documentation with Doxygen"
+        VERBATIM
+    )
+endif()
diff --git a/docs/Doxyfile.in b/docs/Doxyfile.in

new file mode 100644 (file)

index 0000000..90f6f63
--- /dev/null
+++ b/docs/Doxyfile.in
@@ -0,0 +1,1602 @@
+# Doxyfile 1.6.3
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+#       TAG = value [value, ...]
+# For lists, items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file 
+# that follow. The default is UTF-8 which is also the encoding used for all 
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the 
+# iconv built into libc) for the transcoding. See 
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded 
+# by quotes) that should identify the project.
+
+PROJECT_NAME           = @PacBioBAM_NAME@
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. 
+# This could be handy for archiving the generated documentation or 
+# if some version control system is used.
+
+PROJECT_NUMBER         = @PacBioBAM_VERSION@
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) 
+# base path where the generated documentation will be put. 
+# If a relative path is entered, it will be relative to the location 
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = @PacBioBAM_DocsDir@
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 
+# 4096 sub-directories (in 2 levels) under the output directory of each output 
+# format and will distribute the generated files over these directories. 
+# Enabling this option can be useful when feeding doxygen a huge amount of 
+# source files, where putting all generated files in the same directory would 
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all 
+# documentation generated by doxygen is written. Doxygen will use this 
+# information to generate all constant output in the proper language. 
+# The default language is English, other supported languages are: 
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, 
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, 
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English 
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, 
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak, 
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will 
+# include brief member descriptions after the members that are listed in 
+# the file and class documentation (similar to JavaDoc). 
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend 
+# the brief description of a member or function before the detailed description. 
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the 
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator 
+# that is used to form the text in various listings. Each string 
+# in this list, if found as the leading text of the brief description, will be 
+# stripped from the text and the result after processing the whole list, is 
+# used as the annotated text. Otherwise, the brief description is used as-is. 
+# If left blank, the following values are used ("$name" is automatically 
+# replaced with the name of the entity): "The $name class" "The $name widget" 
+# "The $name file" "is" "provides" "specifies" "contains" 
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       = "The $name class" \
+                         "The $name widget" \
+                         "The $name file" \
+                         is \
+                         provides \
+                         specifies \
+                         contains \
+                         represents \
+                         a \
+                         an \
+                         the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then 
+# Doxygen will generate a detailed section even if there is only a brief 
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all 
+# inherited members of a class in the documentation of that class as if those 
+# members were ordinary class members. Constructors, destructors and assignment 
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full 
+# path before files name in the file list and in the header files. If set 
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag 
+# can be used to strip a user-defined part of the path. Stripping is 
+# only done if one of the specified strings matches the left-hand part of 
+# the path. The tag can be used to show relative paths in the file list. 
+# If left blank the directory from which doxygen is run is used as the 
+# path to strip.
+
+STRIP_FROM_PATH        = 
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of 
+# the path mentioned in the documentation of a class, which tells 
+# the reader which header file to include in order to use a class. 
+# If left blank only the name of the header file containing the class 
+# definition is used. Otherwise one should specify the include paths that 
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    = @PacBioBAM_IncludeDir@
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter 
+# (but less readable) file names. This can be useful if your file system 
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen 
+# will interpret the first line (until the first dot) of a JavaDoc-style 
+# comment as the brief description. If set to NO, the JavaDoc 
+# comments will behave just like regular Qt-style comments 
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will 
+# interpret the first line (until the first dot) of a Qt-style 
+# comment as the brief description. If set to NO, the comments 
+# will behave just like regular Qt-style comments (thus requiring 
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen 
+# treat a multi-line C++ special comment block (i.e. a block of //! or /// 
+# comments) as a brief description. This used to be the default behaviour. 
+# The new default is to treat a multi-line C++ comment block as a detailed 
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented 
+# member inherits the documentation from any documented member that it 
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce 
+# a new page for each member. If set to NO, the documentation of a member will 
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. 
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 1
+
+# This tag can be used to specify a number of aliases that acts 
+# as commands in the documentation. An alias has the form "name=value". 
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to 
+# put the command \sideeffect (or @sideeffect) in the documentation, which 
+# will result in a user-defined paragraph with heading "Side Effects:". 
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                =
+
+#samSpecURL=http://samtools.sourceforge.net/SAM1.pdf
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C 
+# sources only. Doxygen will then generate output that is more tailored for C. 
+# For instance, some of the names that are used will be different. The list 
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java 
+# sources only. Doxygen will then generate output that is more tailored for 
+# Java. For instance, namespaces will be presented as packages, qualified 
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran 
+# sources only. Doxygen will then generate output that is more tailored for 
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL 
+# sources. Doxygen will then generate output that is tailored for 
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it parses. 
+# With this tag you can assign which parser to use for a given extension. 
+# Doxygen has a built-in mapping, but you can override or extend it using this tag. 
+# The format is ext=language, where ext is a file extension, and language is one of 
+# the parsers supported by doxygen: IDL, Java, Javascript, C#, C, C++, D, PHP, 
+# Objective-C, Python, Fortran, VHDL, C, C++. For instance to make doxygen treat 
+# .inc files as Fortran files (default is PHP), and .f files as C (default is Fortran), 
+# use: inc=Fortran f=C. Note that for custom extensions you also need to set
+# FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING      = 
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want 
+# to include (a tag file for) the STL sources as input, then you should 
+# set this tag to YES in order to let doxygen match functions declarations and 
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs. 
+# func(std::string) {}). This also makes the inheritance and collaboration 
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = YES
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to 
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. 
+# Doxygen will parse them like normal C++ but will assume all classes use public 
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter 
+# and setter methods for a property. Setting this option to YES (the default) 
+# will make doxygen replace the get and set methods by a property in the 
+# documentation. This will only work if the methods are indeed getting or 
+# setting a simple type. If this is not the case, or you want to show the 
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC 
+# tag is set to YES, then doxygen will reuse the documentation of the first 
+# member in the group (if any) for the other members of the group. By default 
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of 
+# the same type (for instance a group of public functions) to be put as a 
+# subgroup of that type (e.g. under the Public Functions section). Set it to 
+# NO to prevent subgrouping. Alternatively, this can be done per class using 
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum 
+# is documented as struct, union, or enum with the name of the typedef. So 
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct 
+# with name TypeT. When disabled the typedef will appear as a member of a file, 
+# namespace, or class. And the struct will be named TypeS. This can typically 
+# be useful for C code in case the coding convention dictates that all compound 
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to 
+# determine which symbols to keep in memory and which to flush to disk. 
+# When the cache is full, less often used symbols will be written to disk. 
+# For small to medium size projects (<1000 input files) the default value is 
+# probably good enough. For larger projects a too small cache size can cause 
+# doxygen to be busy swapping symbols to and from disk most of the time 
+# causing a significant performance penalty. 
+# If the system has enough physical memory increasing the cache will improve the 
+# performance by keeping more symbols in memory. Note that the value works on 
+# a logarithmic scale so increasing the size by one will roughly double the 
+# memory usage. The cache size is given by this formula: 
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, 
+# corresponding to a cache size of 2^16 = 65536 symbols
+
+SYMBOL_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in 
+# documentation are documented, even if no documentation was available. 
+# Private class members and static file members will be hidden unless 
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class 
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file 
+# will be included in the documentation.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) 
+# defined locally in source files will be included in the documentation. 
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = NO
+
+# This flag is only useful for Objective-C code. When set to YES local 
+# methods, which are defined in the implementation section but not in 
+# the interface are included in the documentation. 
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be 
+# extracted and appear in the documentation as a namespace called 
+# 'anonymous_namespace{file}', where file will be replaced with the base 
+# name of the file that contains the anonymous namespace. By default 
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all 
+# undocumented members of documented classes, files or namespaces. 
+# If set to NO (the default) these members will be included in the 
+# various overviews, but no documentation section is generated. 
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all 
+# undocumented classes that are normally visible in the class hierarchy. 
+# If set to NO (the default) these classes will be included in the various 
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all 
+# friend (class|struct|union) declarations. 
+# If set to NO (the default) these declarations will be included in the 
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any 
+# documentation blocks found inside the body of a function. 
+# If set to NO (the default) these blocks will be appended to the 
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation 
+# that is typed after a \internal command is included. If the tag is set 
+# to NO (the default) then the documentation will be excluded. 
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate 
+# file names in lower-case letters. If set to YES upper-case letters are also 
+# allowed. This is useful if you have classes or files whose names only differ 
+# in case and if your file system supports case sensitive file names. Windows 
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = NO
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen 
+# will show members with their full class and namespace scopes in the 
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen 
+# will put a list of the files that are included by a file in the documentation 
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen 
+# will list include files with double quotes in the documentation 
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] 
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen 
+# will sort the (detailed) documentation of file and class members 
+# alphabetically by member name. If set to NO the members will appear in 
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the 
+# brief documentation of file, namespace and class members alphabetically 
+# by member name. If set to NO (the default) the members will appear in 
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the 
+# hierarchy of group names into alphabetical order. If set to NO (the default) 
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be 
+# sorted by fully-qualified names, including namespaces. If set to 
+# NO (the default), the class list will be sorted only by class name, 
+# not including the namespace part. 
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. 
+# Note: This option applies only to the class list, not to the 
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or 
+# disable (NO) the todo list. This list is created by putting \todo 
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or 
+# disable (NO) the test list. This list is created by putting \test 
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or 
+# disable (NO) the bug list. This list is created by putting \bug 
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or 
+# disable (NO) the deprecated list. This list is created by putting 
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional 
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       = 
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines 
+# the initial value of a variable or define consists of for it to appear in 
+# the documentation. If the initializer consists of more lines than specified 
+# here it will be hidden. Use a value of 0 to hide initializers completely. 
+# The appearance of the initializer of individual variables and defines in the 
+# documentation can be controlled using \showinitializer or \hideinitializer 
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated 
+# at the bottom of the documentation of classes and structs. If set to YES the 
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# If the sources in your project are distributed over multiple directories 
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy 
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES       = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. 
+# This will remove the Files entry from the Quick Index and from the 
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the 
+# Namespaces page.  This will remove the Namespaces entry from the Quick Index 
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that 
+# doxygen should invoke to get the current version for each file (typically from 
+# the version control system). Doxygen will invoke the program by executing (via 
+# popen()) the command <command> <input-file>, where <command> is the value of 
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file 
+# provided by doxygen. Whatever the program writes to standard output 
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    = 
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed by 
+# doxygen. The layout file controls the global structure of the generated output files 
+# in an output format independent way. To create the layout file that represents 
+# doxygen's defaults, run doxygen with the -l option. You can optionally specify a 
+# file name after the option, if omitted DoxygenLayout.xml will be used as the name 
+# of the layout file.
+
+LAYOUT_FILE            = 
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated 
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are 
+# generated by doxygen. Possible values are YES and NO. If left blank 
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings 
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will 
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for 
+# potential errors in the documentation, such as not documenting some 
+# parameters in a documented function, or documenting parameters that 
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for 
+# functions that are documented, but have no documentation for their parameters 
+# or return value. If set to NO (the default) doxygen will only warn about 
+# wrong or incomplete parameter documentation, but not about the absence of 
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that 
+# doxygen can produce. The string should contain the $file, $line, and $text 
+# tags, which will be replaced by the file and line number from which the 
+# warning originated and the warning text. Optionally the format may contain 
+# $version, which will be replaced by the version of the file (if it could 
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning 
+# and error messages should be written. If left blank the output is written 
+# to stderr.
+
+WARN_LOGFILE           = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain 
+# documented source files. You may enter file names like "myfile.cpp" or 
+# directories like "/usr/src/myproject". Separate the files or directories 
+# with spaces.
+
+INPUT                  = @PacBioBAM_IncludeDir@
+
+# This tag can be used to specify the character encoding of the source files 
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is 
+# also the default input encoding. Doxygen uses libiconv (or the iconv built 
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for 
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the 
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
+# and *.h) to filter out the source-files in the directories. If left 
+# blank the following patterns are tested: 
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx 
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+
+FILE_PATTERNS          = *.c \
+                         *.cc \
+                         *.cxx \
+                         *.cpp \
+                         *.c++ \
+                         *.d \
+                         *.java \
+                         *.ii \
+                         *.ixx \
+                         *.ipp \
+                         *.i++ \
+                         *.inl \
+                         *.h \
+                         *.hh \
+                         *.hxx \
+                         *.hpp \
+                         *.h++ \
+                         *.idl \
+                         *.odl \
+                         *.cs \
+                         *.php \
+                         *.php3 \
+                         *.inc \
+                         *.m \
+                         *.mm \
+                         *.dox \
+                         *.py \
+                         *.f90 \
+                         *.f \
+                         *.vhd \
+                         *.vhdl
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories 
+# should be searched for input files as well. Possible values are YES and NO. 
+# If left blank NO is used.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should 
+# be excluded from the INPUT source files. This way you can easily exclude a 
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE                = @PacBioBAM_IncludeDir@/pbbam/internal 
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or 
+# directories that are symbolic links (a Unix filesystem feature) are excluded 
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the 
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude 
+# certain files from those directories. Note that the wildcards are matched 
+# against the file with absolute path, so to exclude all test directories 
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       = 
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names 
+# (namespaces, classes, functions, etc.) that should be excluded from the 
+# output. The symbol name can be a fully qualified name, a word, or if the 
+# wildcard * is used, a substring. Examples: ANamespace, AClass, 
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        = pugi, PacBio::BAM::internal
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or 
+# directories that contain example code fragments that are included (see 
+# the \include command).
+
+EXAMPLE_PATH           = examples 
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the 
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
+# and *.h) to filter out the source-files in the directories. If left 
+# blank all files are included.
+
+EXAMPLE_PATTERNS       = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be 
+# searched for input files to be used with the \include or \dontinclude 
+# commands irrespective of the value of the RECURSIVE tag. 
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or 
+# directories that contain images that are included in the documentation (see 
+# the \image command).
+
+IMAGE_PATH             = 
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should 
+# invoke to filter for each input file. Doxygen will invoke the filter program 
+# by executing (via popen()) the command <filter> <input-file>, where <filter> 
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an 
+# input file. Doxygen will then use the output that the filter program writes 
+# to standard output.  If FILTER_PATTERNS is specified, this tag will be 
+# ignored.
+
+INPUT_FILTER           = 
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern 
+# basis.  Doxygen will compare the file name with each pattern and apply the 
+# filter if there is a match.  The filters are a list of the form: 
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further 
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER 
+# is applied to all files.
+
+FILTER_PATTERNS        = 
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using 
+# INPUT_FILTER) will be used to filter the input files when producing source 
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will 
+# be generated. Documented entities will be cross-referenced with these sources. 
+# Note: To get rid of all source code in the generated output, make sure also 
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body 
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct 
+# doxygen to hide any special comment blocks from generated source code 
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES 
+# then for each documented function all documented 
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES 
+# then for each documented function all documented entities 
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) 
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from 
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will 
+# link to the source code.  Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code 
+# will point to the HTML generated by the htags(1) tool instead of doxygen 
+# built-in source browser. The htags tool is part of GNU's global source 
+# tagging system (see http://www.gnu.org/software/global/global.html). You 
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen 
+# will generate a verbatim copy of the header file for each class for 
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index 
+# of all compounds will be generated. Enable this if the project 
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then 
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns 
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all 
+# classes will be put under the same header in the alphabetical index. 
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that 
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will 
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for 
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank 
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for 
+# each generated HTML page. If it is left blank doxygen will generate a 
+# standard header.
+
+HTML_HEADER            = 
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for 
+# each generated HTML page. If it is left blank doxygen will generate a 
+# standard footer.
+
+HTML_FOOTER            = 
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading 
+# style sheet that is used by each HTML page. It can be used to 
+# fine-tune the look of the HTML output. If the tag is left blank doxygen 
+# will generate a default style sheet. Note that doxygen will try to copy 
+# the style sheet file to the HTML output directory, so don't put your own 
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        = 
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML 
+# page will contain the date and time when the page was generated. Setting 
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP         = YES
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, 
+# files or namespaces will be aligned in HTML using tables. If set to 
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML 
+# documentation will contain sections that can be hidden and shown after the 
+# page has loaded. For this to work a browser that supports 
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox, 
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS  = YES
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files 
+# will be generated that can be used as input for Apple's Xcode 3 
+# integrated development environment, introduced with OSX 10.5 (Leopard). 
+# To create a documentation set, doxygen will generate a Makefile in the 
+# HTML output directory. Running make will produce the docset in that 
+# directory and running "make install" will install the docset in 
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find 
+# it at startup. 
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more information.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the 
+# feed. A documentation feed provides an umbrella under which multiple 
+# documentation sets from a single provider (such as a company or product suite) 
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that 
+# should uniquely identify the documentation set bundle. This should be a 
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen 
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files 
+# will be generated that can be used as input for tools like the 
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) 
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can 
+# be used to specify the file name of the resulting .chm file. You 
+# can add a path in front of the file if the result should not be 
+# written to the html output directory.
+
+CHM_FILE               = 
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can 
+# be used to specify the location (absolute path including file name) of 
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run 
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           = 
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag 
+# controls if a separate .chi index file is generated (YES) or that 
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING 
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file 
+# content.
+
+CHM_INDEX_ENCODING     = 
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag 
+# controls whether a binary table of contents is generated (YES) or a 
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members 
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and QHP_VIRTUAL_FOLDER 
+# are set, an additional index file will be generated that can be used as input for 
+# Qt's qhelpgenerator to generate a Qt Compressed Help (.qch) of the generated 
+# HTML documentation.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can 
+# be used to specify the file name of the resulting .qch file. 
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE               = 
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating 
+# Qt Help Project output. For more information please see 
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating 
+# Qt Help Project output. For more information please see 
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to add. 
+# For more information please see 
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME   = 
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS  = 
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this project's 
+# filter section matches. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS  = 
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can 
+# be used to specify the location of Qt's qhelpgenerator. 
+# If non-empty doxygen will try to run qhelpgenerator on the generated 
+# .qhp file.
+
+QHG_LOCATION           = 
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files  
+# will be generated, which together with the HTML files, form an Eclipse help  
+# plugin. To install this plugin and make it available under the help contents 
+# menu in Eclipse, the contents of the directory containing the HTML and XML 
+# files needs to be copied into the plugins directory of eclipse. The name of 
+# the directory within the plugins directory should be the same as 
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin 
+# the directory name containing the HTML and XML files should also have 
+# this name.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at 
+# top of each HTML page. The value NO (the default) enables the index and 
+# the value YES disables it.
+
+DISABLE_INDEX          = NO
+
+# This tag can be used to set the number of enum values (range [1..20]) 
+# that doxygen will group on one line in the generated HTML documentation.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index 
+# structure should be generated to display hierarchical information. 
+# If the tag value is set to YES, a side panel will be generated 
+# containing a tree-like index structure (just like the one that 
+# is generated for HTML Help). For this to work a browser that supports 
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). 
+# Windows users are probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW      = NO
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, 
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES       = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be 
+# used to set the initial width (in pixels) of the frame in which the tree 
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+# Use this tag to change the font size of Latex formulas included 
+# as images in the HTML documentation. The default is 10. Note that 
+# when you change the font size after a successful doxygen run you need 
+# to manually remove any form_*.png images from the HTML output directory 
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE       = 10
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript 
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should 
+# typically be disabled. For large projects the javascript based search engine 
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index 
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to set up
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will 
+# generate Latex output.
+
+GENERATE_LATEX         = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be 
+# invoked. If left blank `latex' will be used as the default command name. 
+# Note that when enabling USE_PDFLATEX this option is only used for 
+# generating bitmaps for formulas in the HTML output, but not in the 
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to 
+# generate index for LaTeX. If left blank `makeindex' will be used as the 
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact 
+# LaTeX documents. This may be useful for small projects and may help to 
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used 
+# by the printer. Possible values are: a4, a4wide, letter, legal and 
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         = 
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for 
+# the generated latex document. The header should contain everything until 
+# the first chapter. If it is left blank doxygen will generate a 
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           = 
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated 
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will 
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of 
+# plain latex in the generated Makefile. Set this option to YES to get a 
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep 
+# running if errors occur, instead of asking the user for help. 
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not 
+# include the index chapters (such as File Index, Compound Index, etc.) 
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE      = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output.
+# The RTF output is optimized for Word 97 and may not look very pretty with 
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact 
+# RTF documents. This may be useful for small projects and may help to 
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated 
+# will contain hyperlink fields. The RTF file will 
+# contain links (just like the HTML output) instead of page references. 
+# This makes the output suitable for online browsing using WORD or other 
+# programs which support those fields. 
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's 
+# config file, i.e. a series of assignments. You only have to provide 
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    = 
+
+# Set optional variables used in the generation of an rtf document. 
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will 
+# generate man pages
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to 
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output, 
+# then it will generate one additional man file for each entity 
+# documented in the real man page(s). These additional files 
+# only source the real man page, but without them the man command 
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will 
+# generate an XML file that captures the structure of 
+# the code including all documentation.
+
+GENERATE_XML           = YES
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. 
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema, 
+# which can be used by a validating XML parser to check the 
+# syntax of the XML files.
+
+XML_SCHEMA             = 
+
+# The XML_DTD tag can be used to specify an XML DTD, 
+# which can be used by a validating XML parser to check the 
+# syntax of the XML files.
+
+XML_DTD                = 
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will 
+# dump the program listings (including syntax highlighting 
+# and cross-referencing information) to the XML output. Note that 
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will 
+# generate an AutoGen Definitions (see autogen.sf.net) file 
+# that captures the structure of the code including all 
+# documentation. Note that this feature is still experimental 
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will 
+# generate a Perl module file that captures the structure of 
+# the code including all documentation. Note that this 
+# feature is still experimental and incomplete at the 
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate 
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able 
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be 
+# nicely formatted so it can be parsed by a human reader.  This is useful 
+# if you want to understand what is going on.  On the other hand, if this 
+# tag is set to NO the size of the Perl module output will be much smaller 
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file 
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. 
+# This is useful so different doxyrules.make files included by the same 
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX = 
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will 
+# evaluate all C-preprocessor directives found in the sources and include 
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro 
+# names in the source code. If set to NO (the default) only conditional 
+# compilation will be performed. Macro expansion can be done in a controlled 
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES 
+# then the macro expansion is limited to the macros specified with the 
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that 
+# contain include files that are not input files but should be processed by 
+# the preprocessor.
+
+INCLUDE_PATH           = 
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard 
+# patterns (like *.h and *.hpp) to filter out the header-files in the 
+# directories. If left blank, the patterns specified with FILE_PATTERNS will 
+# be used.
+
+INCLUDE_FILE_PATTERNS  = 
+
+# The PREDEFINED tag can be used to specify one or more macro names that 
+# are defined before the preprocessor is started (similar to the -D option of 
+# gcc). The argument of the tag is a list of macros of the form: name 
+# or name=definition (no spaces). If the definition and the = are 
+# omitted =1 is assumed. To prevent a macro definition from being 
+# undefined via #undef or recursively expanded use the := operator 
+# instead of the = operator.
+
+PREDEFINED             = 
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then 
+# this tag can be used to specify a list of macro names that should be expanded. 
+# The macro definition that is found in the sources will be used. 
+# Use the PREDEFINED tag if you want to use a different macro definition.
+
+EXPAND_AS_DEFINED      = 
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then 
+# doxygen's preprocessor will remove all function-like macros that are alone 
+# on a line, have an all uppercase name, and do not end with a semicolon. Such 
+# function macros are typically used for boiler-plate code, and will confuse 
+# the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles. 
+# Optionally an initial location of the external documentation 
+# can be added for each tagfile. The format of a tag file without 
+# this location is as follows: 
+#   TAGFILES = file1 file2 ... 
+# Adding location for the tag files is done as follows: 
+#   TAGFILES = file1=loc1 "file2 = loc2" ... 
+# where "loc1" and "loc2" can be relative or absolute paths or 
+# URLs. If a location is present for each tag, the installdox tool 
+# does not have to be run to correct the links. 
+# Note that each tag file must have a unique name 
+# (where the name does NOT include the path) 
+# If a tag file is not located in the directory in which doxygen 
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES               = 
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create 
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       = 
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed 
+# in the class index. If set to NO only the inherited external classes 
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed 
+# in the modules index. If set to NO, only the current project's groups will 
+# be listed.
+
+EXTERNAL_GROUPS        = NO
+
+# The PERL_PATH should be the absolute path and name of the perl script 
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will 
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that 
+# this option is superseded by the HAVE_DOT option below. This is only a 
+# fallback. It is recommended to install and use dot, since it yields more 
+# powerful graphs.
+
+CLASS_DIAGRAMS         = NO
+
+# You can define message sequence charts within doxygen comments using the \msc 
+# command. Doxygen will then run the mscgen tool (see 
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the 
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where 
+# the mscgen tool resides. If left empty the tool is assumed to be found in the 
+# default search path.
+
+MSCGEN_PATH            = 
+
+# If set to YES, the inheritance and collaboration graphs will hide 
+# inheritance and usage relations if the target is undocumented 
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is 
+# available from the path. This tool is part of Graphviz, a graph visualization 
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section 
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = NO
+
+# By default doxygen will write a font called FreeSans.ttf to the output 
+# directory and reference it in all dot files that doxygen generates. This 
+# font does not include all possible unicode characters however, so when you need 
+# these (or just want a differently looking font) you can specify the font name 
+# using DOT_FONTNAME. You need to make sure dot is able to find the font,
+# which can be done by putting it in a standard location or by setting the 
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory 
+# containing the font.
+
+DOT_FONTNAME           = FreeSans
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. 
+# The default size is 10pt.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the output directory to look for the 
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a 
+# different font using DOT_FONTNAME you can set the path where dot 
+# can find it using this tag.
+
+DOT_FONTPATH           = 
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for each documented class showing the direct and 
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for each documented class showing the direct and 
+# indirect implementation dependencies (inheritance, containment, and 
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen 
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and 
+# collaboration diagrams in a style similar to the OMG's Unified Modeling 
+# Language.
+
+UML_LOOK               = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the 
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT 
+# tags are set to YES then doxygen will generate a graph for each documented 
+# file showing the direct and indirect include dependencies of the file with 
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and 
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each 
+# documented header file showing the documented files that directly or 
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then 
+# doxygen will generate a call dependency graph for every global function 
+# or class method. Note that enabling this option will significantly increase 
+# the time of a run. So in most cases it will be better to enable call graphs 
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then 
+# doxygen will generate a caller dependency graph for every global function 
+# or class method. Note that enabling this option will significantly increase 
+# the time of a run. So in most cases it will be better to enable caller 
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen 
+# will draw a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES 
+# then doxygen will show the dependencies a directory has on other directories 
+# in a graphical way. The dependency relations are determined by the #include 
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images 
+# generated by dot. Possible values are png, jpg, or gif.
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT       = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be 
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               = 
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that 
+# contain dot files that are included in the documentation (see the 
+# \dotfile command).
+
+DOTFILE_DIRS           = 
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of 
+# nodes that will be shown in the graph. If the number of nodes in a graph 
+# becomes larger than this value, doxygen will truncate the graph, which is 
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than 
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note 
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the 
+# graphs generated by dot. A depth value of 3 means that only nodes reachable 
+# from the root by following a path via at most 3 edges will be shown. Nodes 
+# that lie further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large 
+# code bases. Also note that the size of a graph can be further restricted by 
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent 
+# background. This is disabled by default, because dot on Windows does not 
+# seem to support this out of the box. Warning: Depending on the platform used, 
+# enabling this option may lead to badly anti-aliased labels on the edges of 
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This 
+# makes dot run faster, but since only newer versions of dot (>1.8.10) 
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will 
+# generate a legend page explaining the meaning of the various boxes and 
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will 
+# remove the intermediate dot files that are used to generate 
+# the various graphs.
+
+DOT_CLEANUP            = YES
diff --git a/docs/Makefile b/docs/Makefile

new file mode 100644 (file)

index 0000000..14e0fb1
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,168 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+#   SPHINXOPTS  - extra options appended to every sphinx-build invocation
+#   SPHINXBUILD - the sphinx-build executable to run
+#   PAPER       - a4 or letter; selects PAPEROPT_a4 / PAPEROPT_letter below
+#   BUILDDIR    - root directory for all generated output
+#   SOURCEDIR   - directory passed to sphinx-build as the documentation source
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = build
+SOURCEDIR        = source
+
+# Internal variables.
+# $(PAPEROPT_$(PAPER)) expands to one of the two PAPEROPT_* values, or to
+# nothing when PAPER is empty or names no defined PAPEROPT_* variable.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) $(SOURCEDIR) 
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) $(SOURCEDIR)
+
+# Command-style targets that do not correspond to a file of the same name.
+# Listing them here keeps a stray file called e.g. "info" or "texinfo" from
+# making the target look up to date.
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man texinfo info gettext changes linkcheck doctest basefig
+
+# Print a one-line summary of each documentation target.
+help:
+       @echo "Please use \`make <target>' where <target> is one of"
+       @echo "  html       to make standalone HTML files"
+       @echo "  dirhtml    to make HTML files named index.html in directories"
+       @echo "  singlehtml to make a single large HTML file"
+       @echo "  pickle     to make pickle files"
+       @echo "  json       to make JSON files"
+       @echo "  htmlhelp   to make HTML files and a HTML help project"
+       @echo "  qthelp     to make HTML files and a qthelp project"
+       @echo "  devhelp    to make HTML files and a Devhelp project"
+       @echo "  epub       to make an epub"
+       @echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+       @echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+       @echo "  text       to make text files"
+       @echo "  man        to make manual pages"
+       @echo "  texinfo    to make Texinfo files"
+       @echo "  info       to make Texinfo files and run them through makeinfo"
+       @echo "  gettext    to make PO message catalogs"
+       @echo "  changes    to make an overview of all changed/added/deprecated items"
+       @echo "  linkcheck  to check all external links for integrity"
+       @echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+# Delete everything under $(BUILDDIR). The leading '-' tells make to carry on
+# even if rm reports an error. Files written into $(SOURCEDIR) are not touched.
+clean:
+       -rm -rf $(BUILDDIR)/*
+
+# Build standalone HTML into $(BUILDDIR)/html. The dependency graphs
+# (basefig) and MANY_CLUSTER.png (via the %.png pattern rule) are
+# regenerated first.
+html: basefig MANY_CLUSTER.png
+       $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+       @echo
+       @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+# Run the Sphinx "dirhtml" builder into $(BUILDDIR)/dirhtml.
+dirhtml:
+       $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+       @echo
+       @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+# Run the Sphinx "singlehtml" builder into $(BUILDDIR)/singlehtml.
+singlehtml:
+       $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+       @echo
+       @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+# Run the Sphinx "pickle" builder into $(BUILDDIR)/pickle.
+pickle:
+       $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+       @echo
+       @echo "Build finished; now you can process the pickle files."
+
+# Run the Sphinx "json" builder into $(BUILDDIR)/json.
+json:
+       $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+       @echo
+       @echo "Build finished; now you can process the JSON files."
+
+# Run the Sphinx "htmlhelp" builder into $(BUILDDIR)/htmlhelp; the resulting
+# .hhp project is meant to be compiled separately with HTML Help Workshop.
+htmlhelp:
+       $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+       @echo
+       @echo "Build finished; now you can run HTML Help Workshop with the" \
+             ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+# Run the Sphinx "qthelp" builder into $(BUILDDIR)/qthelp and print the
+# follow-up commands. The inner quotes around qcollectiongenerator are
+# escaped so the shell prints them instead of consuming them.
+# NOTE(review): the "pbtoolkits" file names look inherited from another
+# project -- confirm they match the project name Sphinx actually emits.
+qthelp:
+       $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+       @echo
+       @echo "Build finished; now you can run \"qcollectiongenerator\" with the" \
+             ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+       @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pbtoolkits.qhcp"
+       @echo "To view the help file:"
+       @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pbtoolkits.qhc"
+
+# Run the Sphinx "devhelp" builder into $(BUILDDIR)/devhelp and print the
+# commands for installing the result ($$HOME is escaped so the shell, not
+# make, expands it).
+# NOTE(review): the "pbtoolkits" directory name looks inherited from another
+# project -- confirm it is the intended Devhelp book name.
+devhelp:
+       $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+       @echo
+       @echo "Build finished."
+       @echo "To view the help file:"
+       @echo "# mkdir -p $$HOME/.local/share/devhelp/pbtoolkits"
+       @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pbtoolkits"
+       @echo "# devhelp"
+
+# Run the Sphinx "epub" builder into $(BUILDDIR)/epub.
+epub:
+       $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+       @echo
+       @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+# Run the Sphinx "latex" builder into $(BUILDDIR)/latex. This only emits the
+# LaTeX sources; see latexpdf for the variant that also runs pdflatex.
+latex:
+       $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+       @echo
+       @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+       @echo "Run \`make' in that directory to run these through (pdf)latex" \
+             "(use \`make latexpdf' here to do that automatically)."
+
+# Run the Sphinx "latex" builder, then invoke the all-pdf target of the
+# Makefile in $(BUILDDIR)/latex through a sub-make.
+latexpdf:
+       $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+       @echo "Running LaTeX files through pdflatex..."
+       $(MAKE) -C $(BUILDDIR)/latex all-pdf
+       @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+# Run the Sphinx "text" builder into $(BUILDDIR)/text.
+text:
+       $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+       @echo
+       @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+# Run the Sphinx "man" builder into $(BUILDDIR)/man.
+man:
+       $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+       @echo
+       @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+# Run the Sphinx "texinfo" builder into $(BUILDDIR)/texinfo. This only emits
+# the Texinfo sources; see info for the variant that also runs makeinfo.
+texinfo:
+       $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+       @echo
+       @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+       @echo "Run \`make' in that directory to run these through makeinfo" \
+             "(use \`make info' here to do that automatically)."
+
+# Run the Sphinx "texinfo" builder, then invoke the info target of the
+# Makefile in $(BUILDDIR)/texinfo. The sub-make goes through $(MAKE) (as
+# latexpdf does) so command-line flags and the jobserver are passed along.
+info:
+       $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+       @echo "Running Texinfo files through makeinfo..."
+       $(MAKE) -C $(BUILDDIR)/texinfo info
+       @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+# Run the Sphinx "gettext" builder into $(BUILDDIR)/locale. Uses
+# I18NSPHINXOPTS, which omits the shared "-d $(BUILDDIR)/doctrees" option.
+gettext:
+       $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+       @echo
+       @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+# Run the Sphinx "changes" builder into $(BUILDDIR)/changes.
+changes:
+       $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+       @echo
+       @echo "The overview file is in $(BUILDDIR)/changes."
+
+# Run the Sphinx "linkcheck" builder; its report goes to
+# $(BUILDDIR)/linkcheck/output.txt.
+linkcheck:
+       $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+       @echo
+       @echo "Link check complete; look for any errors in the above output " \
+             "or in $(BUILDDIR)/linkcheck/output.txt."
+
+# Run the Sphinx "doctest" builder; its report goes to
+# $(BUILDDIR)/doctest/output.txt.
+doctest:
+       $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+       @echo "Testing of doctests in the sources finished, look at the " \
+             "results in $(BUILDDIR)/doctest/output.txt."
+
+# Render the dependency graphs inside $(SOURCEDIR):
+#   1. dependencies.dot        -> dependencies.png
+#   2. dependencies.dot minus the lines matching '"pbsmrtpipe" ->' or
+#      '> "pbcore"', with "All" replaced by "Sparse"
+#                              -> sparse_dependencies.dot
+#   3. sparse_dependencies.dot -> sparse_dependencies.png
+# The full graph is written to dependencies.png explicitly: "$@" is the
+# target name "basefig", which would put the PNG in an extension-less file.
+basefig:
+       dot -Tpng $(SOURCEDIR)/dependencies.dot > $(SOURCEDIR)/dependencies.png
+       grep -v "\"pbsmrtpipe\" ->" $(SOURCEDIR)/dependencies.dot  \
+               | grep -v "> \"pbcore\"" \
+               | sed 's/All/Sparse/' > $(SOURCEDIR)/sparse_dependencies.dot
+       dot -Tpng $(SOURCEDIR)/sparse_dependencies.dot \
+               > $(SOURCEDIR)/sparse_dependencies.png
+
+# Pattern rule: build $(SOURCEDIR)/<stem>.png from sparse_dependencies.dot by
+# dropping every line that matches the stem ($*) or contains a literal '?',
+# retitling "Sparse dependencies" to "Module bundles", and piping the result
+# through dot.
+# NOTE(review): the image is written under $(SOURCEDIR) rather than at the
+# target path, so make never sees the target as up to date -- confirm that
+# rebuilding on every request is intended.
+%.png: basefig
+       grep -v $* $(SOURCEDIR)/sparse_dependencies.dot | \
+       grep -v \? | sed 's/Sparse dependencies/Module bundles/' | \
+       dot -Tpng > $(SOURCEDIR)/$@
+
diff --git a/docs/examples/code/BarcodeQuery.txt b/docs/examples/code/BarcodeQuery.txt

new file mode 100644 (file)

index 0000000..3fe8fce
--- /dev/null
+++ b/docs/examples/code/BarcodeQuery.txt
@@ -0,0 +1,17 @@
+// using C++11 range-based for loop
+BarcodeQuery query(42, dataset);
+for (const BamRecord& r : query) {
+    assert(r.HasBarcodes());
+    assert(r.BarcodeForward() == 42 || r.BarcodeReverse() == 42);
+}
+
+// OR
+
+// using iterators directly
+BarcodeQuery query(42, dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    assert(iter->HasBarcodes());
+    assert(iter->BarcodeForward() == 42 || iter->BarcodeReverse() == 42);
+}
diff --git a/docs/examples/code/Compare.txt b/docs/examples/code/Compare.txt

new file mode 100644 (file)

index 0000000..deecd8d
--- /dev/null
+++ b/docs/examples/code/Compare.txt
@@ -0,0 +1,3 @@
+// sort on increasing ZMW hole number
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::Zmw());
diff --git a/docs/examples/code/Compare_AlignedEnd.txt b/docs/examples/code/Compare_AlignedEnd.txt

new file mode 100644 (file)

index 0000000..d34ed67
--- /dev/null
+++ b/docs/examples/code/Compare_AlignedEnd.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::AlignedEnd());
diff --git a/docs/examples/code/Compare_AlignedStart.txt b/docs/examples/code/Compare_AlignedStart.txt

new file mode 100644 (file)

index 0000000..68de3e2
--- /dev/null
+++ b/docs/examples/code/Compare_AlignedStart.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::AlignedStart());
diff --git a/docs/examples/code/Compare_AlignedStrand.txt b/docs/examples/code/Compare_AlignedStrand.txt

new file mode 100644 (file)

index 0000000..6c22cdc
--- /dev/null
+++ b/docs/examples/code/Compare_AlignedStrand.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::AlignedStrand());
diff --git a/docs/examples/code/Compare_BarcodeForward.txt b/docs/examples/code/Compare_BarcodeForward.txt

new file mode 100644 (file)

index 0000000..1967341
--- /dev/null
+++ b/docs/examples/code/Compare_BarcodeForward.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::BarcodeForward());
diff --git a/docs/examples/code/Compare_BarcodeQuality.txt b/docs/examples/code/Compare_BarcodeQuality.txt

new file mode 100644 (file)

index 0000000..144f483
--- /dev/null
+++ b/docs/examples/code/Compare_BarcodeQuality.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::BarcodeQuality());
diff --git a/docs/examples/code/Compare_BarcodeReverse.txt b/docs/examples/code/Compare_BarcodeReverse.txt

new file mode 100644 (file)

index 0000000..9d3b245
--- /dev/null
+++ b/docs/examples/code/Compare_BarcodeReverse.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::BarcodeReverse());
diff --git a/docs/examples/code/Compare_FullName.txt b/docs/examples/code/Compare_FullName.txt

new file mode 100644 (file)

index 0000000..4b392b9
--- /dev/null
+++ b/docs/examples/code/Compare_FullName.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::FullName());
diff --git a/docs/examples/code/Compare_LocalContextFlag.txt b/docs/examples/code/Compare_LocalContextFlag.txt

new file mode 100644 (file)

index 0000000..aeab944
--- /dev/null
+++ b/docs/examples/code/Compare_LocalContextFlag.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::LocalContextFlag());
diff --git a/docs/examples/code/Compare_MapQuality.txt b/docs/examples/code/Compare_MapQuality.txt

new file mode 100644 (file)

index 0000000..fe22821
--- /dev/null
+++ b/docs/examples/code/Compare_MapQuality.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::MapQuality());
diff --git a/docs/examples/code/Compare_MovieName.txt b/docs/examples/code/Compare_MovieName.txt

new file mode 100644 (file)

index 0000000..cddcb64
--- /dev/null
+++ b/docs/examples/code/Compare_MovieName.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::MovieName());
diff --git a/docs/examples/code/Compare_NumDeletedBases.txt b/docs/examples/code/Compare_NumDeletedBases.txt

new file mode 100644 (file)

index 0000000..aa6dd4b
--- /dev/null
+++ b/docs/examples/code/Compare_NumDeletedBases.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::NumDeletedBases());
diff --git a/docs/examples/code/Compare_NumInsertedBases.txt b/docs/examples/code/Compare_NumInsertedBases.txt

new file mode 100644 (file)

index 0000000..917d87f
--- /dev/null
+++ b/docs/examples/code/Compare_NumInsertedBases.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::NumInsertedBases());
diff --git a/docs/examples/code/Compare_NumMatches.txt b/docs/examples/code/Compare_NumMatches.txt

new file mode 100644 (file)

index 0000000..47e3081
--- /dev/null
+++ b/docs/examples/code/Compare_NumMatches.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::NumMatches());
diff --git a/docs/examples/code/Compare_NumMismatches.txt b/docs/examples/code/Compare_NumMismatches.txt

new file mode 100644 (file)

index 0000000..12affb1
--- /dev/null
+++ b/docs/examples/code/Compare_NumMismatches.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::NumMismatches());
diff --git a/docs/examples/code/Compare_QueryEnd.txt b/docs/examples/code/Compare_QueryEnd.txt

new file mode 100644 (file)

index 0000000..d664d28
--- /dev/null
+++ b/docs/examples/code/Compare_QueryEnd.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::QueryEnd());
diff --git a/docs/examples/code/Compare_QueryStart.txt b/docs/examples/code/Compare_QueryStart.txt

new file mode 100644 (file)

index 0000000..12f6244
--- /dev/null
+++ b/docs/examples/code/Compare_QueryStart.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::QueryStart());
diff --git a/docs/examples/code/Compare_ReadAccuracy.txt b/docs/examples/code/Compare_ReadAccuracy.txt

new file mode 100644 (file)

index 0000000..9454309
--- /dev/null
+++ b/docs/examples/code/Compare_ReadAccuracy.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReadAccuracy());
diff --git a/docs/examples/code/Compare_ReadGroupId.txt b/docs/examples/code/Compare_ReadGroupId.txt

new file mode 100644 (file)

index 0000000..dab3497
--- /dev/null
+++ b/docs/examples/code/Compare_ReadGroupId.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReadGroupId());
diff --git a/docs/examples/code/Compare_ReadGroupNumericId.txt b/docs/examples/code/Compare_ReadGroupNumericId.txt

new file mode 100644 (file)

index 0000000..5ad8f9d
--- /dev/null
+++ b/docs/examples/code/Compare_ReadGroupNumericId.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReadGroupNumericId());
diff --git a/docs/examples/code/Compare_ReferenceEnd.txt b/docs/examples/code/Compare_ReferenceEnd.txt

new file mode 100644 (file)

index 0000000..ed42d05
--- /dev/null
+++ b/docs/examples/code/Compare_ReferenceEnd.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReferenceEnd());
diff --git a/docs/examples/code/Compare_ReferenceId.txt b/docs/examples/code/Compare_ReferenceId.txt

new file mode 100644 (file)

index 0000000..5628427
--- /dev/null
+++ b/docs/examples/code/Compare_ReferenceId.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReferenceId());
diff --git a/docs/examples/code/Compare_ReferenceName.txt b/docs/examples/code/Compare_ReferenceName.txt

new file mode 100644 (file)

index 0000000..1f76e7e
--- /dev/null
+++ b/docs/examples/code/Compare_ReferenceName.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReferenceName());
diff --git a/docs/examples/code/Compare_ReferenceStart.txt b/docs/examples/code/Compare_ReferenceStart.txt

new file mode 100644 (file)

index 0000000..0ccaf36
--- /dev/null
+++ b/docs/examples/code/Compare_ReferenceStart.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::ReferenceStart());
diff --git a/docs/examples/code/Compare_TypeFromOperator.txt b/docs/examples/code/Compare_TypeFromOperator.txt

new file mode 100644 (file)

index 0000000..afb0848
--- /dev/null
+++ b/docs/examples/code/Compare_TypeFromOperator.txt
@@ -0,0 +1,2 @@
+Compare::Type type = Compare::TypeFromOperator("!=");
+assert(type == Compare::NOT_EQUAL);
diff --git a/docs/examples/code/Compare_TypeToName.txt b/docs/examples/code/Compare_TypeToName.txt

new file mode 100644 (file)

index 0000000..c44e1cb
--- /dev/null
+++ b/docs/examples/code/Compare_TypeToName.txt
@@ -0,0 +1,2 @@
+string name = Compare::TypeToName(Compare::LESS_THAN);
+assert(name == "Compare::LESS_THAN");
diff --git a/docs/examples/code/Compare_Zmw.txt b/docs/examples/code/Compare_Zmw.txt

new file mode 100644 (file)

index 0000000..b02c426
--- /dev/null
+++ b/docs/examples/code/Compare_Zmw.txt
@@ -0,0 +1,2 @@
+std::vector<BamRecord> records;
+std::sort(records.begin(), records.end(), Compare::Zmw());
diff --git a/docs/examples/code/EntireFileQuery.txt b/docs/examples/code/EntireFileQuery.txt

new file mode 100644 (file)

index 0000000..d3fcc2c
--- /dev/null
+++ b/docs/examples/code/EntireFileQuery.txt
@@ -0,0 +1,15 @@
+// using C++11 range-based for loop
+EntireFileQuery query(dataset);
+for (const BamRecord& record : query) {
+    // ... do stuff ...
+}
+
+// OR
+
+// using iterators
+EntireFileQuery query(dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    // ... do stuff ...
+}  
diff --git a/docs/examples/code/EntireFileQuery_BamFilename.txt b/docs/examples/code/EntireFileQuery_BamFilename.txt

new file mode 100644 (file)

index 0000000..484db61
--- /dev/null
+++ b/docs/examples/code/EntireFileQuery_BamFilename.txt
@@ -0,0 +1,4 @@
+EntireFileQuery query("foo.bam");
+for (const BamRecord& record : query) {
+    // do stuff
+}
diff --git a/docs/examples/code/EntireFileQuery_NonConst.txt b/docs/examples/code/EntireFileQuery_NonConst.txt

new file mode 100644 (file)

index 0000000..a0a092e
--- /dev/null
+++ b/docs/examples/code/EntireFileQuery_NonConst.txt
@@ -0,0 +1,4 @@
+EntireFileQuery query("foo.bam");
+for (BamRecord& record : query) {
+    // ok to modify 'record' here
+} 
diff --git a/docs/examples/code/GenomicIntervalQuery.txt b/docs/examples/code/GenomicIntervalQuery.txt

new file mode 100644 (file)

index 0000000..651f254
--- /dev/null
+++ b/docs/examples/code/GenomicIntervalQuery.txt
@@ -0,0 +1,16 @@
+// using C++11 range-based for loop
+GenomicIntervalQuery query(GenomicInterval("chr1:1000-2000"), dataset);
+for (const BamRecord& record : query) {
+    // ... do stuff ...
+}
+
+// OR
+
+// using iterators directly
+GenomicIntervalQuery query(GenomicInterval("chr1:1000-2000"), dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    // ... do stuff ...
+}
+
diff --git a/docs/examples/code/GenomicIntervalQuery_Reuse.txt b/docs/examples/code/GenomicIntervalQuery_Reuse.txt

new file mode 100644 (file)

index 0000000..339ae95
--- /dev/null
+++ b/docs/examples/code/GenomicIntervalQuery_Reuse.txt
@@ -0,0 +1,8 @@
+DataSet ds("data.xml");
+GenomicIntervalQuery query(GenomicInterval(), ds);
+for (const GenomicInterval& interval : intervals) {
+    query.Interval(interval);
+    for (const BamRecord& record : query) {
+        // do stuff
+    }
+}
+\ No newline at end of file
diff --git a/docs/examples/code/PbiAlignedEndFilter.txt b/docs/examples/code/PbiAlignedEndFilter.txt

new file mode 100644 (file)

index 0000000..bac1a46
--- /dev/null
+++ b/docs/examples/code/PbiAlignedEndFilter.txt
@@ -0,0 +1,4 @@
+PbiFilterQuery query(PbiAlignedEndFilter{3000, Compare::GREATER_THAN});
+for (const BamRecord& record : query) {
+    assert(record.AlignedEnd() > 3000);
+}
diff --git a/docs/examples/code/PbiAlignedLengthFilter.txt b/docs/examples/code/PbiAlignedLengthFilter.txt

new file mode 100644 (file)

index 0000000..38dc3ff
--- /dev/null
+++ b/docs/examples/code/PbiAlignedLengthFilter.txt
@@ -0,0 +1,4 @@
+PbiFilterQuery query(PbiAlignedLengthFilter{1000, Compare::GREATER_THAN});
+for (const BamRecord& record : query) {
+    assert((record.AlignedEnd() - record.AlignedStart()) > 1000);
+}
diff --git a/docs/examples/code/PbiAlignedStartFilter.txt b/docs/examples/code/PbiAlignedStartFilter.txt

new file mode 100644 (file)

index 0000000..b78bb2c
--- /dev/null
+++ b/docs/examples/code/PbiAlignedStartFilter.txt
@@ -0,0 +1,4 @@
+PbiFilterQuery query(PbiAlignedStartFilter{3000, Compare::GREATER_THAN});
+for (const BamRecord& record : query) {
+    assert(record.AlignedStart() > 3000);
+}
diff --git a/docs/examples/code/PbiAlignedStrandFilter.txt b/docs/examples/code/PbiAlignedStrandFilter.txt

new file mode 100644 (file)

index 0000000..9f9a885
--- /dev/null
+++ b/docs/examples/code/PbiAlignedStrandFilter.txt
@@ -0,0 +1,5 @@
+PbiFilterQuery query(PbiAlignedStrandFilter{Strand::FORWARD});
+for (const BamRecord& record : query) {
+    assert(record.AlignedStrand() == Strand::FORWARD);
+}
+
diff --git a/docs/examples/code/PbiBarcodeFilter.txt b/docs/examples/code/PbiBarcodeFilter.txt

new file mode 100644 (file)

index 0000000..c7ce5cb
--- /dev/null
+++ b/docs/examples/code/PbiBarcodeFilter.txt
@@ -0,0 +1,17 @@
+// single value
+PbiFilter filter{ PbiBarcodeFilter{17} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    const auto barcodes = record.Barcodes();
+    assert(barcodes.first == 17 || barcodes.second == 17);
+}
+
+// whitelist
+vector<int16_t> whitelist = { 50, 100 };
+PbiFilter filter{ PbiBarcodeFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    const auto barcodes = record.Barcodes();
+    assert(barcodes.first == 50  || barcodes.second == 50 ||
+           barcodes.first == 100 || barcodes.second == 100);
+}
diff --git a/docs/examples/code/PbiBarcodeForwardFilter.txt b/docs/examples/code/PbiBarcodeForwardFilter.txt

new file mode 100644 (file)

index 0000000..a6c12fd
--- /dev/null
+++ b/docs/examples/code/PbiBarcodeForwardFilter.txt
@@ -0,0 +1,15 @@
+// single value
+PbiFilter filter{ PbiBarcodeForwardFilter{50} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeForward() == 50);
+}
+
+// whitelist
+vector<int16_t> whitelist = { 50, 100 };
+PbiFilter filter{ PbiBarcodeForwardFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeForward() == 50 || record.BarcodeForward() == 100);
+}
+
diff --git a/docs/examples/code/PbiBarcodeQualityFilter.txt b/docs/examples/code/PbiBarcodeQualityFilter.txt

new file mode 100644 (file)

index 0000000..34311d0
--- /dev/null
+++ b/docs/examples/code/PbiBarcodeQualityFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiBarcodeQualityFilter{42, Compare::GREATER_THAN_EQUAL} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeQuality() >= 42);
+}
diff --git a/docs/examples/code/PbiBarcodeReverseFilter.txt b/docs/examples/code/PbiBarcodeReverseFilter.txt

new file mode 100644 (file)

index 0000000..24134f8
--- /dev/null
+++ b/docs/examples/code/PbiBarcodeReverseFilter.txt
@@ -0,0 +1,15 @@
+// single value
+PbiFilter filter{ PbiBarcodeReverseFilter{50} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeReverse() == 50);
+}
+
+// whitelist
+vector<int16_t> whitelist = { 50, 100 };
+PbiFilter filter{ PbiBarcodeReverseFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeReverse() == 50 || record.BarcodeReverse() == 100);
+}
+
diff --git a/docs/examples/code/PbiBarcodesFilter.txt b/docs/examples/code/PbiBarcodesFilter.txt

new file mode 100644 (file)

index 0000000..a655c57
--- /dev/null
+++ b/docs/examples/code/PbiBarcodesFilter.txt
@@ -0,0 +1,6 @@
+PbiFilter filter{ PbiBarcodesFilter{17, 18} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.BarcodeForward() == 17 && 
+           record.BarcodeReverse() == 18);
+}
diff --git a/docs/examples/code/PbiBuilder_WithReader.txt b/docs/examples/code/PbiBuilder_WithReader.txt

new file mode 100644 (file)

index 0000000..e2748c2
--- /dev/null
+++ b/docs/examples/code/PbiBuilder_WithReader.txt
@@ -0,0 +1,30 @@
+// To simply create a PBI file from BAM, the following is the easiest method:
+//
+#include <pbbam/BamFile.h>
+#include <pbbam/PbiFile.h>
+
+BamFile bamFile("data.bam");
+PbiFile::CreateFrom(bamFile);
+
+
+// However if you need to perform additional operations while reading the BAM file, 
+// you can do something like the following:
+//
+{
+    BamFile bamFile("data.bam");
+    PbiBuilder builder(bamFile.PacBioIndexFilename(), 
+                       bamFile.Header().Sequences().size());
+    BamReader reader(bamFile);
+    BamRecord b;
+    int64_t offset = reader.VirtualTell(); // first record's vOffset
+    while (reader.GetNext(b)) {
+
+        // store PBI record entry & get next record's vOffset
+        builder.AddRecord(b, offset);
+        offset = reader.VirtualTell();
+   
+        // ... additional stuff as needed ...
+    }
+
+} // <-- PBI data will only be written here, as PbiBuilder goes out of scope
+
diff --git a/docs/examples/code/PbiBuilder_WithWriter.txt b/docs/examples/code/PbiBuilder_WithWriter.txt

new file mode 100644 (file)

index 0000000..0c7d6d1
--- /dev/null
+++ b/docs/examples/code/PbiBuilder_WithWriter.txt
@@ -0,0 +1,12 @@
+BamWriter writer(...);
+PbiBuilder pbiBuilder(...);
+int64_t vOffset;
+BamRecord record;
+while (...) {
+
+    // ... populate record data ...
+
+    // write record to BAM and add PBI entry
+    writer.Write(record, &vOffset);
+    pbiBuilder.AddRecord(record, vOffset);
+}
diff --git a/docs/examples/code/PbiFilterQuery.txt b/docs/examples/code/PbiFilterQuery.txt

new file mode 100644 (file)

index 0000000..4914eab
--- /dev/null
+++ b/docs/examples/code/PbiFilterQuery.txt
@@ -0,0 +1,22 @@
+// setup filter
+PbiFilter filter;
+filter.Add(PbiZmwFilter(42));
+filter.Add(PbiReadAccuracyFilter(0.9, Compare::GREATER_THAN_EQUAL));
+
+// using C++11 range-based for loop
+PbiFilterQuery query(filter, dataset);
+for (const BamRecord& r : query) {
+    assert(r.HoleNumber() == 42);
+    assert(r.ReadAccuracy() >= 0.9);
+}
+
+// OR
+
+// using iterators directly
+PbiFilterQuery query(filter, dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    assert(iter->HoleNumber() == 42);
+    assert(iter->ReadAccuracy() >= 0.9);
+} 
diff --git a/docs/examples/code/PbiFilter_Composition.txt b/docs/examples/code/PbiFilter_Composition.txt

new file mode 100644 (file)

index 0000000..22cc6ff
--- /dev/null
+++ b/docs/examples/code/PbiFilter_Composition.txt
@@ -0,0 +1,8 @@
+// (f1 && f2) || f3
+
+PbiFilter f1;
+PbiFilter f2;
+PbiFilter intersect_f1_f2 = PbiFilter::Intersection(f1, f2);
+
+PbiFilter f3;
+PbiFilter final = PbiFilter::Union(intersect_f1_f2, f3);
diff --git a/docs/examples/code/PbiFilter_CustomFilter.txt b/docs/examples/code/PbiFilter_CustomFilter.txt

new file mode 100644 (file)

index 0000000..f9cdd21
--- /dev/null
+++ b/docs/examples/code/PbiFilter_CustomFilter.txt
@@ -0,0 +1,22 @@
+struct MyCustomFilter
+{
+    bool Accepts(const PbiRawData& index, const size_t row) const
+    {
+        // Look up data for record at the provided row. Do any calculations
+        // necessary, then return whether that record passes your
+        // filter criteria.
+
+        return true;
+    }
+};
+
+// use in composite filters
+PbiFilter f;
+f.Add(PbiMovieNameFilter("foo"));
+f.Add(MyCustomFilter());
+
+// pass directly to PbiFilterQuery
+PbiFilterQuery query(MyCustomFilter(), "foo.bam");
+for (const BamRecord& record : query) {
+    // ... do stuff ...
+}
diff --git a/docs/examples/code/PbiFilter_Interface.txt b/docs/examples/code/PbiFilter_Interface.txt

new file mode 100644 (file)

index 0000000..0fea900
--- /dev/null
+++ b/docs/examples/code/PbiFilter_Interface.txt
@@ -0,0 +1 @@
+bool Accepts(const PbiRawData& index, const size_t row) const;
diff --git a/docs/examples/code/PbiIdentityFilter.txt b/docs/examples/code/PbiIdentityFilter.txt

new file mode 100644 (file)

index 0000000..6fcb8d0
--- /dev/null
+++ b/docs/examples/code/PbiIdentityFilter.txt
@@ -0,0 +1,6 @@
+// single value
+PbiFilter filter{ PbiIdentityFilter{ 0.5, Compare::GREATER_THAN_EQUAL } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    // ... at least 50% of record was aligned ...
+}
diff --git a/docs/examples/code/PbiLocalContextFilter.txt b/docs/examples/code/PbiLocalContextFilter.txt

new file mode 100644 (file)

index 0000000..0aaa3eb
--- /dev/null
+++ b/docs/examples/code/PbiLocalContextFilter.txt
@@ -0,0 +1,22 @@
+
+// --------------------
+// has adapter_before
+// --------------------
+
+PbiFilter filter{ PbiLocalContextFilter{LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    const bool hasAdapterBefore = (record.LocalContextFlags() & LocalContextFlags::ADAPTER_BEFORE) != 0;
+    assert(hasAdapterBefore);
+}
+
+// ----------------------------------
+// has any adapters, barcodes, etc.
+// ----------------------------------
+
+PbiFilter filter{ PbiLocalContextFilter{LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    const bool hasContext = (record.LocalContextFlags() != LocalContextFlags::NO_LOCAL_CONTEXT);
+    assert(hasContext);
+}
diff --git a/docs/examples/code/PbiMapQualityFilter.txt b/docs/examples/code/PbiMapQualityFilter.txt

new file mode 100644 (file)

index 0000000..67fb5dc
--- /dev/null
+++ b/docs/examples/code/PbiMapQualityFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiMapQualityFilter{75, Compare::GREATER_THAN_EQUAL} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.MapQuality() >= 75);
+} 
diff --git a/docs/examples/code/PbiMovieNameFilter.txt b/docs/examples/code/PbiMovieNameFilter.txt

new file mode 100644 (file)

index 0000000..dd124e2
--- /dev/null
+++ b/docs/examples/code/PbiMovieNameFilter.txt
@@ -0,0 +1,14 @@
+// single value
+PbiFilter filter{ PbiMovieNameFilter{ "foo" } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.MovieName() == "foo");
+}
+
+// whitelist
+vector<string> whitelist = { "foo", "bar" };
+PbiFilter filter{ PbiMovieNameFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.MovieName() == "foo" || record.MovieName() == "bar");
+}
diff --git a/docs/examples/code/PbiNumDeletedBasesFilter.txt b/docs/examples/code/PbiNumDeletedBasesFilter.txt

new file mode 100644 (file)

index 0000000..e1e3d1f
--- /dev/null
+++ b/docs/examples/code/PbiNumDeletedBasesFilter.txt
@@ -0,0 +1,6 @@
+PbiFilter filter{ PbiNumDeletedBasesFilter{50, Compare::LESS_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.NumDeletedBases() < 50);
+}
+
diff --git a/docs/examples/code/PbiNumInsertedBasesFilter.txt b/docs/examples/code/PbiNumInsertedBasesFilter.txt

new file mode 100644 (file)

index 0000000..ab385e4
--- /dev/null
+++ b/docs/examples/code/PbiNumInsertedBasesFilter.txt
@@ -0,0 +1,6 @@
+PbiFilter filter{ PbiNumInsertedBasesFilter{50, Compare::LESS_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.NumInsertedBases() < 50);
+}
+
diff --git a/docs/examples/code/PbiNumMatchesFilter.txt b/docs/examples/code/PbiNumMatchesFilter.txt

new file mode 100644 (file)

index 0000000..4e1b97d
--- /dev/null
+++ b/docs/examples/code/PbiNumMatchesFilter.txt
@@ -0,0 +1,6 @@
+PbiFilter filter{ PbiNumMatchesFilter{2000, Compare::GREATER_THAN_EQUAL} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.NumMatches() >= 2000);
+}
+
diff --git a/docs/examples/code/PbiNumMismatchesFilter.txt b/docs/examples/code/PbiNumMismatchesFilter.txt

new file mode 100644 (file)

index 0000000..690e4a1
--- /dev/null
+++ b/docs/examples/code/PbiNumMismatchesFilter.txt
@@ -0,0 +1,6 @@
+PbiFilter filter{ PbiNumMismatchesFilter{500, Compare::LESS_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.NumMismatches() < 500);
+}
+
diff --git a/docs/examples/code/PbiQueryEndFilter.txt b/docs/examples/code/PbiQueryEndFilter.txt

new file mode 100644 (file)

index 0000000..f85166b
--- /dev/null
+++ b/docs/examples/code/PbiQueryEndFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiQueryEndFilter{3000, Compare::GREATER_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.QueryEnd() > 3000);
+} 
diff --git a/docs/examples/code/PbiQueryLengthFilter.txt b/docs/examples/code/PbiQueryLengthFilter.txt

new file mode 100644 (file)

index 0000000..123412a
--- /dev/null
+++ b/docs/examples/code/PbiQueryLengthFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiQueryLengthFilter{2000, Compare::GREATER_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert( (record.QueryEnd() - record.QueryStart()) > 2000 );
+}
diff --git a/docs/examples/code/PbiQueryNameFilter.txt b/docs/examples/code/PbiQueryNameFilter.txt

new file mode 100644 (file)

index 0000000..f1e51c7
--- /dev/null
+++ b/docs/examples/code/PbiQueryNameFilter.txt
@@ -0,0 +1,15 @@
+// single value
+PbiFilter filter{ PbiQueryNameFilter{ "movie_1/42/100_200" } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.FullName() == "movie_1/42/100_200");
+}
+
+// whitelist
+vector<string> whitelist = { "movie_1/42/100_200", "movie_3/24/300_500" };
+PbiFilter filter{ PbiQueryNameFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.FullName() == "movie_1/42/100_200" || 
+           record.FullName() == "movie_3/24/300_500");
+}
diff --git a/docs/examples/code/PbiQueryStartFilter.txt b/docs/examples/code/PbiQueryStartFilter.txt

new file mode 100644 (file)

index 0000000..56353df
--- /dev/null
+++ b/docs/examples/code/PbiQueryStartFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiQueryStartFilter{3000, Compare::GREATER_THAN} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.QueryStart() > 3000);
+} 
diff --git a/docs/examples/code/PbiReadAccuracyFilter.txt b/docs/examples/code/PbiReadAccuracyFilter.txt

new file mode 100644 (file)

index 0000000..dd2df32
--- /dev/null
+++ b/docs/examples/code/PbiReadAccuracyFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiReadAccuracyFilter{0.8, Compare::GREATER_THAN_EQUAL} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadAccuracy() >= 0.8);
+}
diff --git a/docs/examples/code/PbiReadGroupFilter.txt b/docs/examples/code/PbiReadGroupFilter.txt

new file mode 100644 (file)

index 0000000..9af096d
--- /dev/null
+++ b/docs/examples/code/PbiReadGroupFilter.txt
@@ -0,0 +1,64 @@
+// -------------------------
+// numeric ID
+// -------------------------
+
+// single value
+PbiFilter filter{ PbiReadGroupFilter{ 2458765 } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroupNumericId() == 2458765);
+}
+
+// whitelist
+vector<int32_t> whitelist = { 2458765, -32143 };
+PbiFilter filter{ PbiReadGroupFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroupNumericId() == 2458765 ||
+           record.ReadGroupNumericId() == -32143);
+}
+
+// -------------------------
+// printable ID
+// -------------------------
+
+// single value 
+PbiFilter filter{ PbiReadGroupFilter{ "12B33F00" } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroupId() == "12B33F00");
+}
+
+// whitelist
+vector<string> whitelist = { "12B33F00", "123ABC77" };
+PbiFilter filter{ PbiReadGroupFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroupId() == "12B33F00" ||
+           record.ReadGroupId() == "123ABC77");
+}
+
+
+// -------------------------
+// read group 
+// -------------------------
+
+BamFile file("foo.bam");
+BamHeader header = file.Header();
+assert(header.ReadGroups().size() > 1);
+
+// single value 
+PbiFilter filter{ PbiReadGroupFilter{ header.ReadGroups()[0] } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroup() == header.ReadGroups()[0]);
+}
+
+// whitelist
+vector<ReadGroupInfo> whitelist = { header.ReadGroups()[0], header.ReadGroups()[1] };
+PbiFilter filter{ PbiReadGroupFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReadGroup() == header.ReadGroups()[0] ||
+           record.ReadGroup() == header.ReadGroups()[1]);
+}
diff --git a/docs/examples/code/PbiReferenceEndFilter.txt b/docs/examples/code/PbiReferenceEndFilter.txt

new file mode 100644 (file)

index 0000000..ce005c6
--- /dev/null
+++ b/docs/examples/code/PbiReferenceEndFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiReferenceEndFilter{ 2000 } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceEnd() == 2000);
+}
diff --git a/docs/examples/code/PbiReferenceIdFilter.txt b/docs/examples/code/PbiReferenceIdFilter.txt

new file mode 100644 (file)

index 0000000..d963d28
--- /dev/null
+++ b/docs/examples/code/PbiReferenceIdFilter.txt
@@ -0,0 +1,16 @@
+// single value
+PbiFilter filter{ PbiReferenceIdFilter{ 4 } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceId() == 4);
+}
+
+// whitelist
+vector<int32_t> whitelist = { 0, 1 };
+PbiFilter filter{ PbiReferenceIdFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceId() == 0 || 
+           record.ReferenceId() == 1);
+}
+
diff --git a/docs/examples/code/PbiReferenceNameFilter.txt b/docs/examples/code/PbiReferenceNameFilter.txt

new file mode 100644 (file)

index 0000000..c86b14a
--- /dev/null
+++ b/docs/examples/code/PbiReferenceNameFilter.txt
@@ -0,0 +1,15 @@
+// single value
+PbiFilter filter{ PbiReferenceNameFilter{ "chr1" } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceName() == "chr1");
+}
+
+// whitelist
+vector<string> whitelist = { "chr1", "chr5" };
+PbiFilter filter{ PbiReferenceNameFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceName() == "chr1" ||
+           record.ReferenceName() == "chr5");
+}
diff --git a/docs/examples/code/PbiReferenceStartFilter.txt b/docs/examples/code/PbiReferenceStartFilter.txt

new file mode 100644 (file)

index 0000000..d3ffdbb
--- /dev/null
+++ b/docs/examples/code/PbiReferenceStartFilter.txt
@@ -0,0 +1,5 @@
+PbiFilter filter{ PbiReferenceStartFilter{ 2000 } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.ReferenceStart() == 2000);
+}
diff --git a/docs/examples/code/PbiZmwFilter.txt b/docs/examples/code/PbiZmwFilter.txt

new file mode 100644 (file)

index 0000000..c63a804
--- /dev/null
+++ b/docs/examples/code/PbiZmwFilter.txt
@@ -0,0 +1,16 @@
+// single value
+PbiFilter filter{ PbiZmwFilter{ 4000 } };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.HoleNumber() == 4000);
+}
+
+// whitelist
+vector<int32_t> whitelist = { 4000, 8000 };
+PbiFilter filter{ PbiZmwFilter{whitelist} };
+PbiFilterQuery query(filter);
+for (const BamRecord& record : query) {
+    assert(record.HoleNumber() == 4000 || 
+           record.HoleNumber() == 8000);
+}
+
diff --git a/docs/examples/code/ReadAccuracyQuery.txt b/docs/examples/code/ReadAccuracyQuery.txt

new file mode 100644 (file)

index 0000000..5b0404f
--- /dev/null
+++ b/docs/examples/code/ReadAccuracyQuery.txt
@@ -0,0 +1,15 @@
+// using C++11 range-based for loop
+ReadAccuracyQuery query(0.9, Compare::GREATER_THAN_EQUAL, dataset);
+for (const BamRecord& r : query) {
+    assert(r.ReadAccuracy() >= 0.9);
+}
+
+// OR
+
+// using iterators directly
+ReadAccuracyQuery query(0.9, Compare::GREATER_THAN_EQUAL, dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    assert(iter->ReadAccuracy() >= 0.9);
+} 
diff --git a/docs/examples/code/SubreadLengthQuery.txt b/docs/examples/code/SubreadLengthQuery.txt

new file mode 100644 (file)

index 0000000..466a1d9
--- /dev/null
+++ b/docs/examples/code/SubreadLengthQuery.txt
@@ -0,0 +1,15 @@
+// using C++11 range-based for loop
+SubreadLengthQuery query(500, Compare::GREATER_THAN_EQUAL, dataset);
+for (const BamRecord& r : query) {
+    assert((r.QueryEnd() - r.QueryStart()) >= 500);  
+}
+
+// OR
+
+// using iterators directly
+SubreadLengthQuery query(500, Compare::GREATER_THAN_EQUAL, dataset);
+auto iter = query.cbegin();
+auto end  = query.cend();
+for (; iter != end; ++iter) {
+    assert((iter->QueryEnd() - iter->QueryStart()) >= 500);
+} 
diff --git a/docs/examples/code/Tag_AsciiCtor.txt b/docs/examples/code/Tag_AsciiCtor.txt

new file mode 100644 (file)

index 0000000..057d22f
--- /dev/null
+++ b/docs/examples/code/Tag_AsciiCtor.txt
@@ -0,0 +1,10 @@
+// One-step construction
+// 
+// This is useful in situations that require a const Tag.
+//
+const auto t = Tag('A', TagModifier::ASCII_CHAR);
+
+// or two-step construction
+auto t = Tag('A');
+t.Modifier(TagModifier::ASCII_CHAR);
+
diff --git a/docs/examples/code/WhitelistedZmwReadStitcher.txt b/docs/examples/code/WhitelistedZmwReadStitcher.txt

new file mode 100644 (file)

index 0000000..a94c27b
--- /dev/null
+++ b/docs/examples/code/WhitelistedZmwReadStitcher.txt
@@ -0,0 +1,6 @@
+vector<int32_t> zmws = { ... };
+WhitelistedZmwReadStitcher reader(zmws, "primary.bam", "scraps.bam");
+while(reader.HasNext()) {
+    auto virtualRecord = reader.Next();
+    // ... do stuff ...
+}
diff --git a/docs/examples/code/ZmwGroupQuery.txt b/docs/examples/code/ZmwGroupQuery.txt

new file mode 100644 (file)

index 0000000..1d728ac
--- /dev/null
+++ b/docs/examples/code/ZmwGroupQuery.txt
@@ -0,0 +1,23 @@
+bool allHoleNumbersEqual(const vector<BamRecord>& group) 
+{
+    if (group.empty()) 
+        return true;
+    const auto firstHoleNumber = group[0].HoleNumber();
+    for (size_t i = 1; i < group.size(); ++i) {
+       if (group[i].HoleNumber() != firstHoleNumber)
+           return false;
+    }
+    return true;
+}
+
+vector<int32_t> whitelist = { 50, 100 };
+ZmwGroupQuery query(whitelist, dataset);
+for(const vector<BamRecord>& group : query) {
+
+    assert(allHoleNumbersEqual(group));
+
+    for (const BamRecord& record : group) {
+        assert(record.HoleNumber() == 50 ||
+               record.HoleNumber() == 100);
+    }
+}
diff --git a/docs/examples/code/ZmwQuery.txt b/docs/examples/code/ZmwQuery.txt

new file mode 100644 (file)

index 0000000..59c22c4
--- /dev/null
+++ b/docs/examples/code/ZmwQuery.txt
@@ -0,0 +1,6 @@
+vector<int32_t> whitelist = { 50, 100 };
+ZmwQuery query(whitelist, dataset);
+for (const BamRecord& record : query) {
+    assert(record.HoleNumber() == 50 ||
+           record.HoleNumber() == 100);
+}
diff --git a/docs/examples/plaintext/AlignmentPrinterOutput.txt b/docs/examples/plaintext/AlignmentPrinterOutput.txt

new file mode 100644 (file)

index 0000000..21d948b
--- /dev/null
+++ b/docs/examples/plaintext/AlignmentPrinterOutput.txt
@@ -0,0 +1,13 @@
+Read        : singleInsertion2
+Reference   : lambda_NEB3011
+
+Read-length : 49
+Concordance : 0.96
+
+5210 : GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGG : 5249
+       |||||||| ||||||||||||||||||| |||||||||||
+   0 : GGCTGCAG-GTACAGCGGTCAGGAGGCCAATTGATGCCGG :   39
+
+5249 : ACTGGCTGAT : 5259
+       ||||||||||
+  39 : ACTGGCTGAT :   49
diff --git a/docs/examples/plaintext/PbiFilter_DataSetXmlFilters.txt b/docs/examples/plaintext/PbiFilter_DataSetXmlFilters.txt

new file mode 100644 (file)

index 0000000..5b5e8c2
--- /dev/null
+++ b/docs/examples/plaintext/PbiFilter_DataSetXmlFilters.txt
@@ -0,0 +1,14 @@
+<Filters>
+  <Filter>
+    <Properties>
+      <Property />  # A
+      <Property />  # B
+    </Properties>
+  </Filter>
+  <Filter>
+    <Properties>
+      <Property />  # C
+      <Property />  # D
+    </Properties> 
+  </Filter>
+</Filters>
diff --git a/docs/meson.build b/docs/meson.build

new file mode 100644 (file)

index 0000000..cffad5c
--- /dev/null
+++ b/docs/meson.build
@@ -0,0 +1,24 @@
+#################
+# documentation #
+#################
+
+doxygen = find_program('doxygen', required : true)
+
+pbbam_doxygen_config = configuration_data()
+pbbam_doxygen_config.set('PacBioBAM_NAME', meson.project_name())
+pbbam_doxygen_config.set('PacBioBAM_VERSION', meson.project_version())
+pbbam_doxygen_config.set('PacBioBAM_DocsDir', '.')
+pbbam_doxygen_config.set('PacBioBAM_IncludeDir', join_paths([meson.current_source_dir(), '../include']))
+
+doxyfile = configure_file(
+  input : 'Doxyfile.in',
+  output : 'Doxyfile',
+  configuration : pbbam_doxygen_config,
+  install : false)
+
+custom_target('docs',
+  input : doxyfile,
+  output : 'docs',
+  command : [doxygen, doxyfile],
+  build_by_default : true,
+  install : false)
diff --git a/docs/source/api/Accuracy.rst b/docs/source/api/Accuracy.rst

new file mode 100644 (file)

index 0000000..f88b722
--- /dev/null
+++ b/docs/source/api/Accuracy.rst
@@ -0,0 +1,11 @@
+Accuracy
+========
+
+.. code-block:: cpp
+
+   #include <pbbam/Accuracy.h>
+
+.. doxygenclass:: PacBio::BAM::Accuracy
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/AlignmentPrinter.rst b/docs/source/api/AlignmentPrinter.rst

new file mode 100644 (file)

index 0000000..ef0b191
--- /dev/null
+++ b/docs/source/api/AlignmentPrinter.rst
@@ -0,0 +1,11 @@
+AlignmentPrinter
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/AlignmentPrinter.h>
+
+.. doxygenclass:: PacBio::BAM::AlignmentPrinter 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/AlignmentSet.rst b/docs/source/api/AlignmentSet.rst

new file mode 100644 (file)

index 0000000..1817962
--- /dev/null
+++ b/docs/source/api/AlignmentSet.rst
@@ -0,0 +1,11 @@
+AlignmentSet
+============
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::AlignmentSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BaiIndexedBamReader.rst b/docs/source/api/BaiIndexedBamReader.rst

new file mode 100644 (file)

index 0000000..aab136f
--- /dev/null
+++ b/docs/source/api/BaiIndexedBamReader.rst
@@ -0,0 +1,11 @@
+BaiIndexedBamReader
+===================
+
+.. code-block:: cpp
+
+   #include <pbbam/BaiIndexedBamReader.h>
+
+.. doxygenclass:: PacBio::BAM::BaiIndexedBamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamFile.rst b/docs/source/api/BamFile.rst

new file mode 100644 (file)

index 0000000..c7e48fb
--- /dev/null
+++ b/docs/source/api/BamFile.rst
@@ -0,0 +1,11 @@
+BamFile
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/BamFile.h>
+
+.. doxygenclass:: PacBio::BAM::BamFile
+   :members:
+   :protected-members:
+   :undoc-members:
diff --git a/docs/source/api/BamHeader.rst b/docs/source/api/BamHeader.rst

new file mode 100644 (file)

index 0000000..6cf06af
--- /dev/null
+++ b/docs/source/api/BamHeader.rst
@@ -0,0 +1,11 @@
+BamHeader
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/BamHeader.h>
+
+.. doxygenclass:: PacBio::BAM::BamHeader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamReader.rst b/docs/source/api/BamReader.rst

new file mode 100644 (file)

index 0000000..e0b6f3c
--- /dev/null
+++ b/docs/source/api/BamReader.rst
@@ -0,0 +1,11 @@
+BamReader
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/BamReader.h>
+
+.. doxygenclass:: PacBio::BAM::BamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamRecord.rst b/docs/source/api/BamRecord.rst

new file mode 100644 (file)

index 0000000..a749775
--- /dev/null
+++ b/docs/source/api/BamRecord.rst
@@ -0,0 +1,17 @@
+BamRecord
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/BamRecord.h>
+
+.. doxygenenum:: PacBio::BAM::ClipType
+
+.. doxygenenum:: PacBio::BAM::RecordType
+
+.. doxygenenum:: PacBio::BAM::FrameEncodingType
+
+.. doxygenclass:: PacBio::BAM::BamRecord
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamRecordBuilder.rst b/docs/source/api/BamRecordBuilder.rst

new file mode 100644 (file)

index 0000000..ce477b4
--- /dev/null
+++ b/docs/source/api/BamRecordBuilder.rst
@@ -0,0 +1,11 @@
+BamRecordBuilder
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/BamRecordBuilder.h>
+
+.. doxygenclass:: PacBio::BAM::BamRecordBuilder
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamRecordImpl.rst b/docs/source/api/BamRecordImpl.rst

new file mode 100644 (file)

index 0000000..92b6759
--- /dev/null
+++ b/docs/source/api/BamRecordImpl.rst
@@ -0,0 +1,11 @@
+BamRecordImpl
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/BamRecordImpl.h>
+
+.. doxygenclass:: PacBio::BAM::BamRecordImpl
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamRecordView.rst b/docs/source/api/BamRecordView.rst

new file mode 100644 (file)

index 0000000..2bc8fc4
--- /dev/null
+++ b/docs/source/api/BamRecordView.rst
@@ -0,0 +1,11 @@
+BamRecordView
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/BamRecord.h>
+
+.. doxygenclass:: PacBio::BAM::BamRecordView
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamTagCodec.rst b/docs/source/api/BamTagCodec.rst

new file mode 100644 (file)

index 0000000..9307421
--- /dev/null
+++ b/docs/source/api/BamTagCodec.rst
@@ -0,0 +1,11 @@
+BamTagCodec
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/BamTagCodec.h>
+
+.. doxygenclass:: PacBio::BAM::BamTagCodec
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BamWriter.rst b/docs/source/api/BamWriter.rst

new file mode 100644 (file)

index 0000000..2e2951b
--- /dev/null
+++ b/docs/source/api/BamWriter.rst
@@ -0,0 +1,11 @@
+BamWriter
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/BamWriter.h>
+
+.. doxygenclass:: PacBio::BAM::BamWriter
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BarcodeQuery.rst b/docs/source/api/BarcodeQuery.rst

new file mode 100644 (file)

index 0000000..5836059
--- /dev/null
+++ b/docs/source/api/BarcodeQuery.rst
@@ -0,0 +1,11 @@
+BarcodeQuery
+============
+
+.. code-block:: cpp
+
+   #include <pbbam/BarcodeQuery.h>
+
+.. doxygenclass:: PacBio::BAM::BarcodeQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/BarcodeSet.rst b/docs/source/api/BarcodeSet.rst

new file mode 100644 (file)

index 0000000..a7ee056
--- /dev/null
+++ b/docs/source/api/BarcodeSet.rst
@@ -0,0 +1,11 @@
+BarcodeSet
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::BarcodeSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Cigar.rst b/docs/source/api/Cigar.rst

new file mode 100644 (file)

index 0000000..cea30d5
--- /dev/null
+++ b/docs/source/api/Cigar.rst
@@ -0,0 +1,11 @@
+Cigar
+=====
+
+.. code-block:: cpp
+
+   #include <pbbam/Cigar.h>
+
+.. doxygenclass:: PacBio::BAM::Cigar
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/CigarOperation.rst b/docs/source/api/CigarOperation.rst

new file mode 100644 (file)

index 0000000..856400a
--- /dev/null
+++ b/docs/source/api/CigarOperation.rst
@@ -0,0 +1,13 @@
+CigarOperation
+==============
+
+.. code-block:: cpp
+
+   #include <pbbam/CigarOperation.h>
+   
+.. doxygenenum:: PacBio::BAM::CigarOperationType   
+
+.. doxygenclass:: PacBio::BAM::CigarOperation
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Compare.rst b/docs/source/api/Compare.rst

new file mode 100644 (file)

index 0000000..bb28a7e
--- /dev/null
+++ b/docs/source/api/Compare.rst
@@ -0,0 +1,8 @@
+Compare
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/Compare.h>
+
+.. doxygenfile:: Compare.h
+\ No newline at end of file
diff --git a/docs/source/api/Config.rst b/docs/source/api/Config.rst

new file mode 100644 (file)

index 0000000..c4be9e4
--- /dev/null
+++ b/docs/source/api/Config.rst
@@ -0,0 +1,8 @@
+Config
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/Config.h>
+
+.. doxygenfile:: Config.h
+\ No newline at end of file
diff --git a/docs/source/api/ConsensusAlignmentSet.rst b/docs/source/api/ConsensusAlignmentSet.rst

new file mode 100644 (file)

index 0000000..bc5a7e5
--- /dev/null
+++ b/docs/source/api/ConsensusAlignmentSet.rst
@@ -0,0 +1,11 @@
+ConsensusAlignmentSet
+=====================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ConsensusAlignmentSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ConsensusReadSet.rst b/docs/source/api/ConsensusReadSet.rst

new file mode 100644 (file)

index 0000000..846698d
--- /dev/null
+++ b/docs/source/api/ConsensusReadSet.rst
@@ -0,0 +1,11 @@
+ConsensusReadSet
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ConsensusReadSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ContigSet.rst b/docs/source/api/ContigSet.rst

new file mode 100644 (file)

index 0000000..96bb20b
--- /dev/null
+++ b/docs/source/api/ContigSet.rst
@@ -0,0 +1,11 @@
+ContigSet
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ContigSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/DataSet.rst b/docs/source/api/DataSet.rst

new file mode 100644 (file)

index 0000000..8b3f0db
--- /dev/null
+++ b/docs/source/api/DataSet.rst
@@ -0,0 +1,11 @@
+DataSet
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSet.h>
+
+.. doxygenclass:: PacBio::BAM::DataSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/DataSetBase.rst b/docs/source/api/DataSetBase.rst

new file mode 100644 (file)

index 0000000..f23fbb5
--- /dev/null
+++ b/docs/source/api/DataSetBase.rst
@@ -0,0 +1,11 @@
+DataSetBase
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::DataSetBase
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/DataSetMetadata.rst b/docs/source/api/DataSetMetadata.rst

new file mode 100644 (file)

index 0000000..eea260d
--- /dev/null
+++ b/docs/source/api/DataSetMetadata.rst
@@ -0,0 +1,11 @@
+DataSetMetadata
+===============
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::DataSetMetadata
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/EntireFileQuery.rst b/docs/source/api/EntireFileQuery.rst

new file mode 100644 (file)

index 0000000..4e7b86b
--- /dev/null
+++ b/docs/source/api/EntireFileQuery.rst
@@ -0,0 +1,11 @@
+EntireFileQuery
+===============
+
+.. code-block:: cpp
+
+   #include <pbbam/EntireFileQuery.h>
+
+.. doxygenclass:: PacBio::BAM::EntireFileQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ExtensionElement.rst b/docs/source/api/ExtensionElement.rst

new file mode 100644 (file)

index 0000000..980303e
--- /dev/null
+++ b/docs/source/api/ExtensionElement.rst
@@ -0,0 +1,11 @@
+ExtensionElement
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ExtensionElement
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Extensions.rst b/docs/source/api/Extensions.rst

new file mode 100644 (file)

index 0000000..6704807
--- /dev/null
+++ b/docs/source/api/Extensions.rst
@@ -0,0 +1,11 @@
+Extensions
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::Extensions
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ExternalResource.rst b/docs/source/api/ExternalResource.rst

new file mode 100644 (file)

index 0000000..03ab0d3
--- /dev/null
+++ b/docs/source/api/ExternalResource.rst
@@ -0,0 +1,11 @@
+ExternalResource
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ExternalResource
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ExternalResources.rst b/docs/source/api/ExternalResources.rst

new file mode 100644 (file)

index 0000000..bd72ea4
--- /dev/null
+++ b/docs/source/api/ExternalResources.rst
@@ -0,0 +1,11 @@
+ExternalResources
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ExternalResources
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/FileIndex.rst b/docs/source/api/FileIndex.rst

new file mode 100644 (file)

index 0000000..c117214
--- /dev/null
+++ b/docs/source/api/FileIndex.rst
@@ -0,0 +1,11 @@
+FileIndex
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::FileIndex
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/FileIndices.rst b/docs/source/api/FileIndices.rst

new file mode 100644 (file)

index 0000000..b25720c
--- /dev/null
+++ b/docs/source/api/FileIndices.rst
@@ -0,0 +1,11 @@
+FileIndices
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::FileIndices
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Filter.rst b/docs/source/api/Filter.rst

new file mode 100644 (file)

index 0000000..6faa8aa
--- /dev/null
+++ b/docs/source/api/Filter.rst
@@ -0,0 +1,11 @@
+Filter
+======
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::Filter
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Filters.rst b/docs/source/api/Filters.rst

new file mode 100644 (file)

index 0000000..7ea1620
--- /dev/null
+++ b/docs/source/api/Filters.rst
@@ -0,0 +1,11 @@
+Filters
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::Filters
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Frames.rst b/docs/source/api/Frames.rst

new file mode 100644 (file)

index 0000000..cf260f2
--- /dev/null
+++ b/docs/source/api/Frames.rst
@@ -0,0 +1,11 @@
+Frames
+======
+
+.. code-block:: cpp
+
+   #include <pbbam/Frames.h>
+
+.. doxygenclass:: PacBio::BAM::Frames
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/GenomicInterval.rst b/docs/source/api/GenomicInterval.rst

new file mode 100644 (file)

index 0000000..811b83a
--- /dev/null
+++ b/docs/source/api/GenomicInterval.rst
@@ -0,0 +1,11 @@
+GenomicInterval
+===============
+
+.. code-block:: cpp
+
+   #include <pbbam/GenomicInterval.h>
+
+.. doxygenclass:: PacBio::BAM::GenomicInterval
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/GenomicIntervalCompositeBamReader.rst b/docs/source/api/GenomicIntervalCompositeBamReader.rst

new file mode 100644 (file)

index 0000000..f658621
--- /dev/null
+++ b/docs/source/api/GenomicIntervalCompositeBamReader.rst
@@ -0,0 +1,11 @@
+GenomicIntervalCompositeBamReader
+=================================
+
+.. code-block:: cpp
+
+   #include <pbbam/CompositeBamReader.h>
+
+.. doxygenclass:: PacBio::BAM::GenomicIntervalCompositeBamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/GenomicIntervalQuery.rst b/docs/source/api/GenomicIntervalQuery.rst

new file mode 100644 (file)

index 0000000..7bae558
--- /dev/null
+++ b/docs/source/api/GenomicIntervalQuery.rst
@@ -0,0 +1,11 @@
+GenomicIntervalQuery
+====================
+
+.. code-block:: cpp
+
+   #include <pbbam/GenomicIntervalQuery.h>
+
+.. doxygenclass:: PacBio::BAM::GenomicIntervalQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/HdfSubreadSet.rst b/docs/source/api/HdfSubreadSet.rst

new file mode 100644 (file)

index 0000000..88bf008
--- /dev/null
+++ b/docs/source/api/HdfSubreadSet.rst
@@ -0,0 +1,11 @@
+HdfSubreadSet
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::HdfSubreadSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/IndexResultBlock.rst b/docs/source/api/IndexResultBlock.rst

new file mode 100644 (file)

index 0000000..fac804a
--- /dev/null
+++ b/docs/source/api/IndexResultBlock.rst
@@ -0,0 +1,17 @@
+IndexResultBlock
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiBasicTypes.h>
+
+.. doxygenstruct:: PacBio::BAM::IndexResultBlock
+   :members:
+   :protected-members:
+   :undoc-members:
+   
+.. doxygentypedef:: PacBio::BAM::IndexResultBlocks
+
+.. doxygentypedef:: PacBio::BAM::IndexList
+   
+.. doxygentypedef:: PacBio::BAM::IndexRange
+\ No newline at end of file
diff --git a/docs/source/api/IndexedFastaReader.rst b/docs/source/api/IndexedFastaReader.rst

new file mode 100644 (file)

index 0000000..7c46064
--- /dev/null
+++ b/docs/source/api/IndexedFastaReader.rst
@@ -0,0 +1,11 @@
+IndexedFastaReader
+==================
+
+.. code-block:: cpp
+
+   #include <pbbam/IndexedFastaReader.h>
+
+.. doxygenclass:: PacBio::BAM::IndexedFastaReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Interval.rst b/docs/source/api/Interval.rst

new file mode 100644 (file)

index 0000000..f506a19
--- /dev/null
+++ b/docs/source/api/Interval.rst
@@ -0,0 +1,11 @@
+Interval
+========
+
+.. code-block:: cpp
+
+   #include <pbbam/Interval.h>
+
+.. doxygenclass:: PacBio::BAM::Interval
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/InvalidSequencingChemistryException.rst b/docs/source/api/InvalidSequencingChemistryException.rst

new file mode 100644 (file)

index 0000000..d521ecc
--- /dev/null
+++ b/docs/source/api/InvalidSequencingChemistryException.rst
@@ -0,0 +1,11 @@
+InvalidSequencingChemistryException
+===================================
+
+.. code-block:: cpp
+
+   #include <pbbam/exception/InvalidSequencingChemistryException.h>
+
+.. doxygenclass:: PacBio::BAM::InvalidSequencingChemistryException
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/LocalContextFlags.rst b/docs/source/api/LocalContextFlags.rst

new file mode 100644 (file)

index 0000000..8cd63be
--- /dev/null
+++ b/docs/source/api/LocalContextFlags.rst
@@ -0,0 +1,8 @@
+LocalContextFlags
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/LocalContextFlags.h>
+
+.. doxygenenum:: PacBio::BAM::LocalContextFlags
diff --git a/docs/source/api/NamespaceInfo.rst b/docs/source/api/NamespaceInfo.rst

new file mode 100644 (file)

index 0000000..c7613ec
--- /dev/null
+++ b/docs/source/api/NamespaceInfo.rst
@@ -0,0 +1,11 @@
+NamespaceInfo
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetXsd.h>
+
+.. doxygenclass:: PacBio::BAM::NamespaceInfo
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/NamespaceRegistry.rst b/docs/source/api/NamespaceRegistry.rst

new file mode 100644 (file)

index 0000000..2f8f9a7
--- /dev/null
+++ b/docs/source/api/NamespaceRegistry.rst
@@ -0,0 +1,11 @@
+NamespaceRegistry
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetXsd.h>
+
+.. doxygenclass:: PacBio::BAM::NamespaceRegistry
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Orientation.rst b/docs/source/api/Orientation.rst

new file mode 100644 (file)

index 0000000..e9bbc42
--- /dev/null
+++ b/docs/source/api/Orientation.rst
@@ -0,0 +1,8 @@
+Orientation
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/Orientation.h>
+
+.. doxygenenum:: PacBio::BAM::Orientation
diff --git a/docs/source/api/ParentTool.rst b/docs/source/api/ParentTool.rst

new file mode 100644 (file)

index 0000000..e2ffa1b
--- /dev/null
+++ b/docs/source/api/ParentTool.rst
@@ -0,0 +1,11 @@
+ParentTool
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ParentTool
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiBuilder.rst b/docs/source/api/PbiBuilder.rst

new file mode 100644 (file)

index 0000000..d795d0f
--- /dev/null
+++ b/docs/source/api/PbiBuilder.rst
@@ -0,0 +1,11 @@
+PbiBuilder
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiBuilder.h>
+
+.. doxygenclass:: PacBio::BAM::PbiBuilder
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiFile.rst b/docs/source/api/PbiFile.rst

new file mode 100644 (file)

index 0000000..5a8b85a
--- /dev/null
+++ b/docs/source/api/PbiFile.rst
@@ -0,0 +1,14 @@
+PbiFile
+=======
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiFile.h>
+
+.. doxygenenum:: PacBio::BAM::PbiFile::Section
+
+.. doxygentypedef:: PacBio::BAM::PbiFile::Sections
+
+.. doxygenenum:: PacBio::BAM::PbiFile::VersionEnum
+
+.. doxygenfunction:: PacBio::BAM::PbiFile::CreateFrom
diff --git a/docs/source/api/PbiFilter.rst b/docs/source/api/PbiFilter.rst

new file mode 100644 (file)

index 0000000..261498b
--- /dev/null
+++ b/docs/source/api/PbiFilter.rst
@@ -0,0 +1,11 @@
+PbiFilter
+=========
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiFilter.h>
+
+.. doxygenclass:: PacBio::BAM::PbiFilter
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiFilterCompositeBamReader.rst b/docs/source/api/PbiFilterCompositeBamReader.rst

new file mode 100644 (file)

index 0000000..7a69df3
--- /dev/null
+++ b/docs/source/api/PbiFilterCompositeBamReader.rst
@@ -0,0 +1,11 @@
+PbiFilterCompositeBamReader
+===========================
+
+.. code-block:: cpp
+
+   #include <pbbam/CompositeBamReader.h>
+
+.. doxygenclass:: PacBio::BAM::PbiFilterCompositeBamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiFilterQuery.rst b/docs/source/api/PbiFilterQuery.rst

new file mode 100644 (file)

index 0000000..75bbc12
--- /dev/null
+++ b/docs/source/api/PbiFilterQuery.rst
@@ -0,0 +1,11 @@
+PbiFilterQuery
+==============
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiFilterQuery.h>
+
+.. doxygenclass:: PacBio::BAM::PbiFilterQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiFilterTypes.rst b/docs/source/api/PbiFilterTypes.rst

new file mode 100644 (file)

index 0000000..052389b
--- /dev/null
+++ b/docs/source/api/PbiFilterTypes.rst
@@ -0,0 +1,8 @@
+PbiFilterTypes
+==============
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiFilterTypes.h>
+
+.. doxygenfile:: PbiFilterTypes.h
+\ No newline at end of file
diff --git a/docs/source/api/PbiIndexedBamReader.rst b/docs/source/api/PbiIndexedBamReader.rst

new file mode 100644 (file)

index 0000000..5450c8a
--- /dev/null
+++ b/docs/source/api/PbiIndexedBamReader.rst
@@ -0,0 +1,11 @@
+PbiIndexedBamReader
+===================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiIndexedBamReader.h>
+
+.. doxygenclass:: PacBio::BAM::PbiIndexedBamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiRawBarcodeData.rst b/docs/source/api/PbiRawBarcodeData.rst

new file mode 100644 (file)

index 0000000..c72ebfb
--- /dev/null
+++ b/docs/source/api/PbiRawBarcodeData.rst
@@ -0,0 +1,11 @@
+PbiRawBarcodeData
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiRawBarcodeData
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiRawBasicData.rst b/docs/source/api/PbiRawBasicData.rst

new file mode 100644 (file)

index 0000000..2282387
--- /dev/null
+++ b/docs/source/api/PbiRawBasicData.rst
@@ -0,0 +1,11 @@
+PbiRawBasicData
+===============
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiRawBasicData
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiRawData.rst b/docs/source/api/PbiRawData.rst

new file mode 100644 (file)

index 0000000..1a974e8
--- /dev/null
+++ b/docs/source/api/PbiRawData.rst
@@ -0,0 +1,11 @@
+PbiRawData
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiRawData
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiRawMappedData.rst b/docs/source/api/PbiRawMappedData.rst

new file mode 100644 (file)

index 0000000..42e1de1
--- /dev/null
+++ b/docs/source/api/PbiRawMappedData.rst
@@ -0,0 +1,11 @@
+PbiRawMappedData
+================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiRawMappedData
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiRawReferenceData.rst b/docs/source/api/PbiRawReferenceData.rst

new file mode 100644 (file)

index 0000000..460cde4
--- /dev/null
+++ b/docs/source/api/PbiRawReferenceData.rst
@@ -0,0 +1,11 @@
+PbiRawReferenceData
+===================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiRawReferenceData
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/PbiReferenceEntry.rst b/docs/source/api/PbiReferenceEntry.rst

new file mode 100644 (file)

index 0000000..472e586
--- /dev/null
+++ b/docs/source/api/PbiReferenceEntry.rst
@@ -0,0 +1,11 @@
+PbiReferenceEntry
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/PbiRawData.h>
+
+.. doxygenclass:: PacBio::BAM::PbiReferenceEntry
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Position.rst b/docs/source/api/Position.rst

new file mode 100644 (file)

index 0000000..3c945f2
--- /dev/null
+++ b/docs/source/api/Position.rst
@@ -0,0 +1,10 @@
+Position
+========
+
+.. code-block:: cpp
+
+   #include <pbbam/Position.h>
+
+.. doxygentypedef:: PacBio::BAM::Position
+
+.. doxygenvariable:: PacBio::BAM::UnmappedPosition
+\ No newline at end of file
diff --git a/docs/source/api/ProgramInfo.rst b/docs/source/api/ProgramInfo.rst

new file mode 100644 (file)

index 0000000..b58c93a
--- /dev/null
+++ b/docs/source/api/ProgramInfo.rst
@@ -0,0 +1,11 @@
+ProgramInfo
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/ProgramInfo.h>
+
+.. doxygenclass:: PacBio::BAM::ProgramInfo
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/QNameQuery.rst b/docs/source/api/QNameQuery.rst

new file mode 100644 (file)

index 0000000..b549436
--- /dev/null
+++ b/docs/source/api/QNameQuery.rst
@@ -0,0 +1,11 @@
+QNameQuery
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/QNameQuery.h>
+
+.. doxygenclass:: PacBio::BAM::QNameQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/QualityValue.rst b/docs/source/api/QualityValue.rst

new file mode 100644 (file)

index 0000000..3520c5a
--- /dev/null
+++ b/docs/source/api/QualityValue.rst
@@ -0,0 +1,11 @@
+QualityValue
+============
+
+.. code-block:: cpp
+
+   #include <pbbam/QualityValue.h>
+
+.. doxygenclass:: PacBio::BAM::QualityValue
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/QualityValues.rst b/docs/source/api/QualityValues.rst

new file mode 100644 (file)

index 0000000..8f6dfa5
--- /dev/null
+++ b/docs/source/api/QualityValues.rst
@@ -0,0 +1,11 @@
+QualityValues
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/QualityValues.h>
+
+.. doxygenclass:: PacBio::BAM::QualityValues
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ReadAccuracyQuery.rst b/docs/source/api/ReadAccuracyQuery.rst

new file mode 100644 (file)

index 0000000..abfd1e6
--- /dev/null
+++ b/docs/source/api/ReadAccuracyQuery.rst
@@ -0,0 +1,11 @@
+ReadAccuracyQuery
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/ReadAccuracyQuery.h>
+
+.. doxygenclass:: PacBio::BAM::ReadAccuracyQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ReadGroupInfo.rst b/docs/source/api/ReadGroupInfo.rst

new file mode 100644 (file)

index 0000000..7fb4f69
--- /dev/null
+++ b/docs/source/api/ReadGroupInfo.rst
@@ -0,0 +1,21 @@
+ReadGroupInfo
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/ReadGroupInfo.h>
+
+.. doxygenenum:: PacBio::BAM::BaseFeature
+
+.. doxygenenum:: PacBio::BAM::FrameCodec
+
+.. doxygenenum:: PacBio::BAM::BarcodeModeType
+
+.. doxygenenum:: PacBio::BAM::BarcodeQualityType
+
+.. doxygenclass:: PacBio::BAM::ReadGroupInfo
+   :members:
+   :protected-members:
+   :undoc-members:
+   
+.. doxygenfunction:: PacBio::BAM::MakeReadGroupId
+\ No newline at end of file
diff --git a/docs/source/api/ReferenceSet.rst b/docs/source/api/ReferenceSet.rst

new file mode 100644 (file)

index 0000000..22e4703
--- /dev/null
+++ b/docs/source/api/ReferenceSet.rst
@@ -0,0 +1,11 @@
+ReferenceSet
+============
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::ReferenceSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/SamTagCodec.rst b/docs/source/api/SamTagCodec.rst

new file mode 100644 (file)

index 0000000..4f8d65d
--- /dev/null
+++ b/docs/source/api/SamTagCodec.rst
@@ -0,0 +1,11 @@
+SamTagCodec
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/SamTagCodec.h>
+
+.. doxygenclass:: PacBio::BAM::SamTagCodec
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/SequenceInfo.rst b/docs/source/api/SequenceInfo.rst

new file mode 100644 (file)

index 0000000..393d5bb
--- /dev/null
+++ b/docs/source/api/SequenceInfo.rst
@@ -0,0 +1,11 @@
+SequenceInfo
+============
+
+.. code-block:: cpp
+
+   #include <pbbam/SequenceInfo.h>
+
+.. doxygenclass:: PacBio::BAM::SequenceInfo
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/SequentialCompositeBamReader.rst b/docs/source/api/SequentialCompositeBamReader.rst

new file mode 100644 (file)

index 0000000..31ed3b1
--- /dev/null
+++ b/docs/source/api/SequentialCompositeBamReader.rst
@@ -0,0 +1,11 @@
+SequentialCompositeBamReader
+============================
+
+.. code-block:: cpp
+
+   #include <pbbam/CompositeBamReader.h>
+
+.. doxygenclass:: PacBio::BAM::SequentialCompositeBamReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Strand.rst b/docs/source/api/Strand.rst

new file mode 100644 (file)

index 0000000..4978f72
--- /dev/null
+++ b/docs/source/api/Strand.rst
@@ -0,0 +1,8 @@
+Strand
+======
+
+.. code-block:: cpp
+
+   #include <pbbam/Strand.h>
+
+.. doxygenenum:: PacBio::BAM::Strand 
diff --git a/docs/source/api/SubDataSets.rst b/docs/source/api/SubDataSets.rst

new file mode 100644 (file)

index 0000000..d179065
--- /dev/null
+++ b/docs/source/api/SubDataSets.rst
@@ -0,0 +1,11 @@
+SubDataSets
+===========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::SubDataSets
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/SubreadLengthQuery.rst b/docs/source/api/SubreadLengthQuery.rst

new file mode 100644 (file)

index 0000000..23000b3
--- /dev/null
+++ b/docs/source/api/SubreadLengthQuery.rst
@@ -0,0 +1,11 @@
+SubreadLengthQuery
+==================
+
+.. code-block:: cpp
+
+   #include <pbbam/SubreadLengthQuery.h>
+
+.. doxygenclass:: PacBio::BAM::SubreadLengthQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/SubreadSet.rst b/docs/source/api/SubreadSet.rst

new file mode 100644 (file)

index 0000000..bfc3c13
--- /dev/null
+++ b/docs/source/api/SubreadSet.rst
@@ -0,0 +1,11 @@
+SubreadSet
+==========
+
+.. code-block:: cpp
+
+   #include <pbbam/DataSetTypes.h>
+
+.. doxygenclass:: PacBio::BAM::SubreadSet 
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/Tag.rst b/docs/source/api/Tag.rst

new file mode 100644 (file)

index 0000000..50b85c7
--- /dev/null
+++ b/docs/source/api/Tag.rst
@@ -0,0 +1,15 @@
+Tag
+===
+
+.. code-block:: cpp
+
+   #include <pbbam/Tag.h>
+
+.. doxygenenum:: PacBio::BAM::TagDataType
+
+.. doxygenenum:: PacBio::BAM::TagModifier
+
+.. doxygenclass:: PacBio::BAM::Tag
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/TagCollection.rst b/docs/source/api/TagCollection.rst

new file mode 100644 (file)

index 0000000..1314b13
--- /dev/null
+++ b/docs/source/api/TagCollection.rst
@@ -0,0 +1,11 @@
+TagCollection
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/TagCollection.h>
+
+.. doxygenclass:: PacBio::BAM::TagCollection
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/VirtualPolymeraseBamRecord.rst b/docs/source/api/VirtualPolymeraseBamRecord.rst

new file mode 100644 (file)

index 0000000..06d5531
--- /dev/null
+++ b/docs/source/api/VirtualPolymeraseBamRecord.rst
@@ -0,0 +1,11 @@
+VirtualPolymeraseBamRecord
+==========================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualPolymeraseBamRecord.h>
+
+.. doxygenclass:: PacBio::BAM::VirtualPolymeraseBamRecord
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/VirtualPolymeraseCompositeReader.rst b/docs/source/api/VirtualPolymeraseCompositeReader.rst

new file mode 100644 (file)

index 0000000..e6cab4e
--- /dev/null
+++ b/docs/source/api/VirtualPolymeraseCompositeReader.rst
@@ -0,0 +1,11 @@
+VirtualPolymeraseCompositeReader
+================================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualPolymeraseCompositeReader.h>
+
+.. doxygenclass:: PacBio::BAM::VirtualPolymeraseCompositeReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/VirtualPolymeraseReader.rst b/docs/source/api/VirtualPolymeraseReader.rst

new file mode 100644 (file)

index 0000000..14a46e8
--- /dev/null
+++ b/docs/source/api/VirtualPolymeraseReader.rst
@@ -0,0 +1,11 @@
+VirtualPolymeraseReader
+=======================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualPolymeraseReader.h>
+
+.. doxygenclass:: PacBio::BAM::VirtualPolymeraseReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/VirtualRegion.rst b/docs/source/api/VirtualRegion.rst

new file mode 100644 (file)

index 0000000..7a09846
--- /dev/null
+++ b/docs/source/api/VirtualRegion.rst
@@ -0,0 +1,11 @@
+VirtualRegion
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualRegion.h>
+
+.. doxygenclass:: PacBio::BAM::VirtualRegion
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/VirtualRegionType.rst b/docs/source/api/VirtualRegionType.rst

new file mode 100644 (file)

index 0000000..4279200
--- /dev/null
+++ b/docs/source/api/VirtualRegionType.rst
@@ -0,0 +1,8 @@
+VirtualRegionType
+=================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualRegionType.h>
+
+.. doxygenenum:: PacBio::BAM::VirtualRegionType
diff --git a/docs/source/api/VirtualRegionTypeMap.rst b/docs/source/api/VirtualRegionTypeMap.rst

new file mode 100644 (file)

index 0000000..eebe637
--- /dev/null
+++ b/docs/source/api/VirtualRegionTypeMap.rst
@@ -0,0 +1,11 @@
+VirtualRegionTypeMap
+====================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/VirtualRegionTypeMap.h>
+
+.. doxygenclass:: PacBio::BAM::VirtualRegionTypeMap
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ZmwGroupQuery.rst b/docs/source/api/ZmwGroupQuery.rst

new file mode 100644 (file)

index 0000000..01fc18a
--- /dev/null
+++ b/docs/source/api/ZmwGroupQuery.rst
@@ -0,0 +1,11 @@
+ZmwGroupQuery
+=============
+
+.. code-block:: cpp
+
+   #include <pbbam/ZmwGroupQuery.h>
+
+.. doxygenclass:: PacBio::BAM::ZmwGroupQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ZmwQuery.rst b/docs/source/api/ZmwQuery.rst

new file mode 100644 (file)

index 0000000..375fcb0
--- /dev/null
+++ b/docs/source/api/ZmwQuery.rst
@@ -0,0 +1,11 @@
+ZmwQuery
+========
+
+.. code-block:: cpp
+
+   #include <pbbam/ZmwQuery.h>
+
+.. doxygenclass:: PacBio::BAM::ZmwQuery
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api/ZmwWhitelistVirtualReader.rst b/docs/source/api/ZmwWhitelistVirtualReader.rst

new file mode 100644 (file)

index 0000000..95d2d1a
--- /dev/null
+++ b/docs/source/api/ZmwWhitelistVirtualReader.rst
@@ -0,0 +1,11 @@
+ZmwWhitelistVirtualReader
+=========================
+
+.. code-block:: cpp
+
+   #include <pbbam/virtual/ZmwWhitelistVirtualReader.h>
+
+.. doxygenclass:: PacBio::BAM::ZmwWhitelistVirtualReader
+   :members:
+   :protected-members:
+   :undoc-members:
+\ No newline at end of file
diff --git a/docs/source/api_reference.rst b/docs/source/api_reference.rst

new file mode 100644 (file)

index 0000000..354c0de
--- /dev/null
+++ b/docs/source/api_reference.rst
@@ -0,0 +1,12 @@
+.. _api_reference:
+
+C++ API Reference
+=================
+
+Watch this space for more recipes & how-tos. 
+
+.. toctree::
+   :maxdepth: 1
+   :glob:
+
+   api/*
diff --git a/docs/source/commandline_utilities.rst b/docs/source/commandline_utilities.rst

new file mode 100644 (file)

index 0000000..7f1bdaf
--- /dev/null
+++ b/docs/source/commandline_utilities.rst
@@ -0,0 +1,15 @@
+.. _command_line:
+
+Command Line Utilities
+======================
+
+In addition to the main library and wrappers, pbbam also provides a few basic
+utilities for working with PacBio BAM files and their indices (".pbi" files).
+
+.. toctree::
+   :maxdepth: 1
+
+   tools/bam2sam
+   tools/pbindex
+   tools/pbindexdump
+   tools/pbmerge
diff --git a/docs/source/conf.py b/docs/source/conf.py

new file mode 100755 (executable)

index 0000000..14922c3
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,332 @@
+# -*- coding: utf-8 -*-
+#
+# pbbam documentation build configuration file, created by
+# sphinx-quickstart on Fri Dec  4 10:08:52 2015.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+import shlex
+import re
+import subprocess
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# get RTD to run doxygen first, per http://breathe.readthedocs.org/en/latest/readthedocs.html
+# but... we generate our actual Doxyfile via CMake in a normal build,
+# so we need to create one here, subbing actual values
+read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
+if read_the_docs_build:
+
+    # fetch directory info
+    this_dir = os.path.abspath(os.getcwd())
+    docs_dir = os.path.abspath(os.path.join(this_dir, '..'))
+    root_dir = os.path.abspath(os.path.join(docs_dir, '..'))
+    include_dir = os.path.abspath(os.path.join(root_dir, 'include'))
+
+    # get project version
+    version = ''
+    with open(os.path.abspath(os.path.join(root_dir, 'CMakeLists.txt')), 'r') as cmakeFile:
+        for line in cmakeFile:
+            if line.startswith('project'):
+                version = re.search(r'VERSION\s*([\d.]+)', line).group(1)
+                break
+
+    # read Doxyfile.in, replace markers with real values, and write Doxyfile
+    inDoxyfile = open(os.path.abspath(os.path.join(docs_dir, 'Doxyfile.in')), 'r')
+    configIn   = inDoxyfile.read()
+    configOut  = re.sub('@PacBioBAM_NAME@',       'pbbam', \
+                 re.sub('@PacBioBAM_VERSION@',    version, \
+                 re.sub('@PacBioBAM_DocsDir@',    docs_dir, \
+                 re.sub('@PacBioBAM_IncludeDir@', include_dir, configIn)))) 
+    outDoxyfile = open(os.path.abspath(os.path.join(docs_dir, 'Doxyfile')), 'w')
+    #print(configOut, outDoxyfile)
+    print >>outDoxyfile, configOut
+    outDoxyfile.close()
+
+    # now run Doxygen
+    subprocess.call('cd ..; doxygen', shell=True)
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['breathe']
+#extensions = [
+#    'sphinx.ext.autodoc',
+ #   'sphinx.ext.coverage',
+ #   'breathe',
+#]
+
+# Set up Breathe extension variables
+breathe_projects = { 'pbbam' : os.path.join(os.getcwd(), '..', 'xml') + os.path.sep }
+breathe_default_project = 'pbbam'
+breathe_default_members = ('members', 'undoc-members')
+breathe_implementation_filename_extensions = [ '.cpp', '.inl' ]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffixes as a list of strings:
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'pbbam'
+copyright = u'2015, Derek Barnett'
+author = u'Derek Barnett'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '0.17.0'
+# The full version, including alpha/beta/rc tags.
+release = '0.17.0'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = []
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'pacbio-theme'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+html_theme_path = ['.']
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Language to be used for generating the HTML full-text search index.
+# Sphinx supports the following languages:
+#   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
+#   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
+#html_search_language = 'en'
+
+# A dictionary with options for the search language support, empty by default.
+# Now only 'ja' uses this config value
+#html_search_options = {'type': 'default'}
+
+# The name of a javascript file (relative to the configuration directory) that
+# implements a search results scorer. If empty, the default will be used.
+#html_search_scorer = 'scorer.js'
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'pbbamdoc'
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+
+# Latex figure (float) alignment
+#'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+  (master_doc, 'pbbam.tex', u'pbbam Documentation',
+   u'Derek Barnett', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'pbbam', u'pbbam Documentation',
+     [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+  (master_doc, 'pbbam', u'pbbam Documentation',
+   author, 'pbbam', 'One line description of project.',
+   'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst

new file mode 100644 (file)

index 0000000..349e9fd
--- /dev/null
+++ b/docs/source/getting_started.rst
@@ -0,0 +1,192 @@
+
+.. _getting_started:
+
+Getting Started
+===============
+
+.. _getting_started-requirements:
+
+Requirements
+------------
+
+These components will almost certainly already be on your system. 
+ 
+* `gcc`_ (4.8+) OR `clang`_ (v3.1+)
+* pthreads
+* zlib
+
+Double-check your compiler version, to be sure it is compatible.
+
+.. code-block:: console
+
+   $ g++ -v    
+   $ clang -v  
+
+Additional requirements:
+
+* `Boost`_ (1.55+)
+* `CMake`_ (3.0+)
+* `Google Test`_
+* `htslib`_ (1.4+)
+
+For additional languages:
+
+* `SWIG`_ (3.0.5+)
+
+For building API documentation locally:
+
+* `Doxygen`_
+
+For maximal convenience, install htslib and Google Test in the same parent directory where you plan to install pbbam.
+
+.. _Boost: http://www.boost.org/
+.. _clang: http://clang.llvm.org/
+.. _CMake: https://cmake.org/
+.. _Meson: http://mesonbuild.com/
+.. _Ninja: https://ninja-build.org/ (only required when using Meson, optional for CMake)
+.. _Doxygen: http://www.stack.nl/~dimitri/doxygen/
+.. _gcc: https://gcc.gnu.org/
+.. _Google Test: https://github.com/google/googletest
+.. _htslib: https://github.com/samtools/htslib.git 
+.. _SWIG: http://www.swig.org/
+
+.. _getting_started-build:
+
+Clone & Build
+-------------
+
+.. note::
+
+   The following steps are for building the C++ library and command-line utilities. 
+   If you are integrating pbbam into a C#, Python, or R project, take a look at the 
+   instructions for :ref:`additional languages <swig_bindings>`.
+
+The basic steps for obtaining pbbam and building it from source are as follows:
+
+Build and install htslib, per the project's instructions (or on OSX "brew install htslib").
+
+Clone
+^^^^^
+
+You should first clone the repository:
+
+.. code-block:: console
+
+   $ git clone https://github.com/PacificBiosciences/pbbam.git
+   $ cd pbbam
+
+Building with CMake
+^^^^^^^^^^^^^^^^^^^
+
+When building with CMake, create a separate build directory:
+
+.. code-block:: console
+
+   $ mkdir build
+   $ cd build
+   $ cmake ..
+   $ make -j 4    # compiles using 4 threads
+
+Output:
+
+  * Library   : <pbbam_root>/lib
+  * Headers   : <pbbam_root>/include
+  * Utilities : <pbbam_root>/bin
+ 
+You may need to set a few options on the cmake command, to point to dependencies' install locations. 
+Common installation-related options include:
+
+  * GTEST_SRC_DIR
+  
+Add these using the '-D' argument, like this:
+
+.. code-block:: console
+
+   $ cmake .. -DGTEST_SRC_DIR="path/to/googletest"
+ 
+To run the test suite, run:
+
+.. code-block:: console
+
+   $ make test
+
+To build a local copy of the (Doxygen-style) API documentation, run:
+
+.. code-block:: console
+
+   $ make doc
+   
+And then open <pbbam_root>/docs/html/index.html in your favorite browser.
+
+.. _getting_started-integrate:
+
+Building with Meson
+^^^^^^^^^^^^^^^^^^^
+
+Building with Meson is generally faster and more versatile. Meson strictly requires building out of source:
+
+.. code-block:: console
+
+   $ mkdir build
+   $ cd build
+   $ meson --prefix /my/install/prefix -Dtests=true ..
+   $ ninja
+
+where ninja will by default utilize a number of threads for compilation equal to the number of logical
+cores on your system. Here ``-Dtests=true`` enables pulling in dependencies for testing. In
+order to run the test suite, run:
+
+.. code-block:: console
+
+   $ ninja test
+
+If you wish to install pbbam, run:
+
+.. code-block:: console
+
+   $ ninja install
+
+and ninja will install pbbam to ``/my/install/prefix``.
+
+Integrate
+---------
+
+CMake-based projects
+````````````````````
+
+For CMake-based projects that will "ship with" or otherwise live alongside pbbam, you can 
+use the approach described here.
+
+Before defining your library or executable, add the following:
+
+.. code-block:: cmake
+
+   add_subdirectory(<path/to/pbbam> external/build/pbbam)
+
+When it's time to run "make" this will ensure that pbbam will be built, inside your own project's 
+build directory. After this point in the CMakeLists.txt file(s), a few variables will be available 
+that can be used to setup your include paths and library linking targets:
+
+.. code-block:: cmake
+
+   include_directories( 
+       ${PacBioBAM_INCLUDE_DIRS} 
+       # other includes that your project needs
+   )
+
+   add_executable(foo)
+   
+   target_link_libraries(foo 
+       ${PacBioBAM_LIBRARIES}
+       # other libs that your project needs
+   )
+
+Non-CMake projects
+``````````````````
+
+If you're using something other than CMake for your project's build system, then you need to point 
+it to pbbam's include directory & library, as well as those of its dependencies (primarily htslib).
+
+If you built and installed pbbam using Meson, pkg-config files will be available to be consumed by
+projects wishing to utilize pbbam. Autoconf, CMake, Waf, SCons and Meson all have means to determine
+dependency information from pkg-config files.
diff --git a/docs/source/index.rst b/docs/source/index.rst

new file mode 100644 (file)

index 0000000..426c3c5
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,33 @@
+.. pbbam documentation master file, created by
+   sphinx-quickstart on Fri Dec  4 10:08:52 2015.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+.. _home:
+
+pbbam documentation
+===================
+
+As of the 3.0 release of SMRTanalysis, PacBio is embracing the industry standard BAM 
+format for (both aligned and unaligned) basecall data files. We have also formulated 
+a BAM companion file format (bam.pbi) enabling fast access to a richer set of per-read 
+information as well as compatibility for software built around the legacy cmp.h5 format.
+
+The **pbbam** software package provides components to create, query, & edit PacBio BAM
+files and associated indices. These components include a core C++ library, bindings for 
+additional languages, and command-line utilities.
+
+.. toctree::
+   :maxdepth: 1
+
+   getting_started
+   api_reference
+   swig_bindings
+   commandline_utilities
+
+
+Search:
+
+* :ref:`genindex`
+* :ref:`search`
+
diff --git a/docs/source/pacbio-theme/static/headerGradient.jpg b/docs/source/pacbio-theme/static/headerGradient.jpg

new file mode 100644 (file)

index 0000000..883f147

Binary files /dev/null and b/docs/source/pacbio-theme/static/headerGradient.jpg differ
diff --git a/docs/source/pacbio-theme/static/pacbio.css b/docs/source/pacbio-theme/static/pacbio.css

new file mode 100644 (file)

index 0000000..b4ab87f
--- /dev/null
+++ b/docs/source/pacbio-theme/static/pacbio.css
@@ -0,0 +1,238 @@
+/**
+ * Sphinx stylesheet -- default theme
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+ 
+@import url("basic.css");
+ 
+/* -- page layout ----------------------------------------------------------- */
+ 
+body {
+    font-family: Arial, sans-serif;
+    font-size: 100%;
+    background-color: #555;
+    color: #555;
+    margin: 0;
+    padding: 0;
+    min-width: 500px;
+    max-width: 956px;
+    margin: 0 auto;
+}
+
+div.documentwrapper {
+    float: left;
+    width: 100%;
+}
+
+div.bodywrapper {
+    margin: 0 0 0 230px;
+}
+
+hr{
+    border: 1px solid #B1B4B6;
+    
+}
+ 
+div.document {
+    background-color: #eee;
+}
+ 
+div.body {
+    background-color: #ffffff;
+    color: #3E4349;
+    padding: 30px 30px 30px 30px;
+    font-size: 0.8em;
+}
+ 
+div.footer {
+    color: #555;
+       background-color: #fff;
+    padding: 13px 0;
+    text-align: center;
+    font-size: 75%;
+
+}
+div.footer a {
+    color: #444;
+    text-decoration: underline;
+}
+ 
+div.related {
+    background: #fff url(headerGradient.jpg);
+    line-height: 80px;
+    color: #fff;
+    font-size: 0.80em;
+    height: 79px;
+    z-index: -1;
+}
+
+div.related ul {
+    background: url(pacbioLogo.png) 10px no-repeat;
+    padding: 0 0 0 200px;
+}
+ 
+div.related a {
+    color: #E2F3CC;
+}
+ 
+div.sphinxsidebar {
+    font-size: 0.75em;
+    line-height: 1.5em;
+}
+
+div.sphinxsidebarwrapper{
+    padding: 20px 0;
+}
+ 
+div.sphinxsidebar h3,
+div.sphinxsidebar h4 {
+    font-family: Arial, sans-serif;
+    color: #222;
+    font-size: 1.2em;
+    font-weight: bold;
+    margin: 0;
+    padding: 5px 10px 0 10px;
+}
+
+div.sphinxsidebar h4{
+    font-size: 1.1em;
+}
+ 
+div.sphinxsidebar h3 a {
+    color: #444;
+}
+ 
+ 
+div.sphinxsidebar p {
+    color: #888;
+    padding: 0px 20px;
+       margin-top: 5px;
+}
+ 
+div.sphinxsidebar p.topless {
+}
+ 
+div.sphinxsidebar ul {
+    margin: 5px 20px 10px 20px;
+    padding: 0;
+    color: #000;
+}
+ 
+div.sphinxsidebar a {
+    color: #444;
+}
+ 
+div.sphinxsidebar input {
+    border: 1px solid #ccc;
+    font-family: sans-serif;
+    font-size: 1em;
+}
+
+div.sphinxsidebar input[type=text]{
+    margin-left: 20px;
+}
+ 
+/* -- body styles ----------------------------------------------------------- */
+ 
+a {
+    color: #005B81;
+    text-decoration: none;
+}
+ 
+a:hover {
+    color: #E32E00;
+    text-decoration: underline;
+}
+ 
+div.body h1,
+div.body h2,
+div.body h3,
+div.body h4,
+div.body h5,
+div.body h6 {
+    font-family: Arial, sans-serif;
+    font-weight: bold;
+    color: #264868;
+    margin: 30px 0px 10px 0px;
+    padding: 5px 0 5px 0px;
+}
+ 
+div.body h1 { border-top: 20px solid white; margin-top: 0; font-size: 180%; font-weight: normal; }
+div.body h2 { font-size: 125%; }
+div.body h3 { font-size: 110%; }
+div.body h4 { font-size: 100%; }
+div.body h5 { font-size: 100%; }
+div.body h6 { font-size: 100%; }
+ 
+a.headerlink {
+    color: #c60f0f;
+    font-size: 0.8em;
+    padding: 0 4px 0 4px;
+    text-decoration: none;
+}
+ 
+a.headerlink:hover {
+    background-color: #c60f0f;
+    color: white;
+}
+ 
+div.body p, div.body dd, div.body li {
+    line-height: 1.5em;
+    font-size: 1em;
+}
+ 
+div.admonition p.admonition-title + p {
+    display: inline;
+}
+
+div.highlight{
+    background-color: white;
+}
+
+div.note {
+    background-color: #eee;
+    border: 1px solid #ccc;
+}
+ 
+div.seealso {
+    background-color: #ffc;
+    border: 1px solid #ff6;
+}
+ 
+div.topic {
+    background-color: #eee;
+}
+ 
+div.warning {
+    background-color: #ffe4e4;
+    border: 1px solid #f66;
+}
+ 
+p.admonition-title {
+    display: inline;
+}
+ 
+p.admonition-title:after {
+    content: ":";
+}
+ 
+pre {
+    padding: 10px;
+    background-color: White;
+    color: #222;
+    line-height: 1.2em;
+    border: 1px solid #C6C9CB;
+    font-size: 1.2em;
+    margin: 1.5em 0 1.5em 0;
+    -webkit-box-shadow: 1px 1px 1px #d8d8d8;
+    -moz-box-shadow: 1px 1px 1px #d8d8d8;
+}
+ 
+tt {
+    background-color: #ecf0f3;
+    color: #222;
+    padding: 1px 2px;
+    font-size: 1.2em;
+    font-family: monospace;
+}
+
diff --git a/docs/source/pacbio-theme/static/pacbioLogo.png b/docs/source/pacbio-theme/static/pacbioLogo.png

new file mode 100644 (file)

index 0000000..b2e4887

Binary files /dev/null and b/docs/source/pacbio-theme/static/pacbioLogo.png differ
diff --git a/docs/source/pacbio-theme/static/pygments.css b/docs/source/pacbio-theme/static/pygments.css

new file mode 100644 (file)

index 0000000..4588cde
--- /dev/null
+++ b/docs/source/pacbio-theme/static/pygments.css
@@ -0,0 +1,55 @@
+.c { color: #999988; font-style: italic } /* Comment */
+.k { font-weight: bold } /* Keyword */
+.o { font-weight: bold } /* Operator */
+.cm { color: #999988; font-style: italic } /* Comment.Multiline */
+.cp { color: #999999; font-weight: bold } /* Comment.preproc */
+.c1 { color: #999988; font-style: italic } /* Comment.Single */
+.gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */
+.ge { font-style: italic } /* Generic.Emph */
+.gr { color: #aa0000 } /* Generic.Error */
+.gh { color: #999999 } /* Generic.Heading */
+.gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */
+.go { color: #111 } /* Generic.Output */
+.gp { color: #555555 } /* Generic.Prompt */
+.gs { font-weight: bold } /* Generic.Strong */
+.gu { color: #aaaaaa } /* Generic.Subheading */
+.gt { color: #aa0000 } /* Generic.Traceback */
+.kc { font-weight: bold } /* Keyword.Constant */
+.kd { font-weight: bold } /* Keyword.Declaration */
+.kp { font-weight: bold } /* Keyword.Pseudo */
+.kr { font-weight: bold } /* Keyword.Reserved */
+.kt { color: #445588; font-weight: bold } /* Keyword.Type */
+.m { color: #009999 } /* Literal.Number */
+.s { color: #bb8844 } /* Literal.String */
+.na { color: #008080 } /* Name.Attribute */
+.nb { color: #999999 } /* Name.Builtin */
+.nc { color: #445588; font-weight: bold } /* Name.Class */
+.no { color: #ff99ff } /* Name.Constant */
+.ni { color: #800080 } /* Name.Entity */
+.ne { color: #990000; font-weight: bold } /* Name.Exception */
+.nf { color: #990000; font-weight: bold } /* Name.Function */
+.nn { color: #555555 } /* Name.Namespace */
+.nt { color: #000080 } /* Name.Tag */
+.nv { color: purple } /* Name.Variable */
+.ow { font-weight: bold } /* Operator.Word */
+.mf { color: #009999 } /* Literal.Number.Float */
+.mh { color: #009999 } /* Literal.Number.Hex */
+.mi { color: #009999 } /* Literal.Number.Integer */
+.mo { color: #009999 } /* Literal.Number.Oct */
+.sb { color: #bb8844 } /* Literal.String.Backtick */
+.sc { color: #bb8844 } /* Literal.String.Char */
+.sd { color: #bb8844 } /* Literal.String.Doc */
+.s2 { color: #bb8844 } /* Literal.String.Double */
+.se { color: #bb8844 } /* Literal.String.Escape */
+.sh { color: #bb8844 } /* Literal.String.Heredoc */
+.si { color: #bb8844 } /* Literal.String.Interpol */
+.sx { color: #bb8844 } /* Literal.String.Other */
+.sr { color: #808000 } /* Literal.String.Regex */
+.s1 { color: #bb8844 } /* Literal.String.Single */
+.ss { color: #bb8844 } /* Literal.String.Symbol */
+.bp { color: #999999 } /* Name.Builtin.Pseudo */
+.vc { color: #ff99ff } /* Name.Variable.Class */
+.vg { color: #ff99ff } /* Name.Variable.Global */
+.vi { color: #ff99ff } /* Name.Variable.Instance */
+.il { color: #009999 } /* Literal.Number.Integer.Long */
+
diff --git a/docs/source/pacbio-theme/theme.conf b/docs/source/pacbio-theme/theme.conf

new file mode 100644 (file)

index 0000000..dd24a1a
--- /dev/null
+++ b/docs/source/pacbio-theme/theme.conf
@@ -0,0 +1,4 @@
+[theme]
+inherit = default 
+stylesheet = pacbio.css
+pygments_style = tango
diff --git a/docs/source/requirements.txt b/docs/source/requirements.txt

new file mode 100644 (file)

index 0000000..cd6467e
--- /dev/null
+++ b/docs/source/requirements.txt
@@ -0,0 +1 @@
+breathe
diff --git a/docs/source/swig_bindings.rst b/docs/source/swig_bindings.rst

new file mode 100644 (file)

index 0000000..e9dc33a
--- /dev/null
+++ b/docs/source/swig_bindings.rst
@@ -0,0 +1,257 @@
+.. _swig_bindings:
+
+Additional Languages
+====================
+
+pbbam uses SWIG to generate bindings for other languages. Currently this includes support for C#, Python, and R.
+
+These bindings are disabled by default. See the entry below for your target language to configure pbbam & integrate
+the bindings into your project.
+
+.. _swig_bindings-csharp:
+
+C#
+------
+
+Building
+````````
+
+To build the support for C#, you need to tell CMake to enable it before building:
+
+.. code-block:: console
+
+   $ cmake .. -DPacBioBAM_wrap_csharp=ON
+   $ make
+
+The 'make' step will build relevant libraries/wrappers, and then run a simple program using them, 
+as a quick sanity-check. 
+
+After building, the libraries and wrappers can be found under the pbbam/lib/csharp directory. 
+
+API Example
+```````````
+
+.. code-block:: c#
+
+   using PacBio.BAM;
+
+   namespace TestStuff
+   {
+       public class TestPbbam
+       {
+           public static void TestZmwQuery()
+           {
+               var d = new DataSet("foo.bam");
+               var q = new ZmwQuery(new IntList {1, 2, 3}, d);
+               var q2 = new ZmwQuery(new IntList { 14743 }, d);
+               if (0 != q.Count() || 4 != q2.Count())
+               {
+                   throw new Exception("ZmwQuery not working");
+               }
+               Console.WriteLine("TestZmwQuery - OK!");
+           }
+       }
+   }
+
+.. _swig_bindings-python:
+
+Python
+------
+
+Building
+````````
+
+To build the support for Python, you need to tell CMake to enable it:
+
+.. code-block:: console
+
+   $ cmake .. -DPacBioBAM_wrap_python=ON
+   $ make
+
+The 'make' step will build relevant libraries/wrappers, and then run a simple program using them, 
+as a quick sanity-check. 
+
+After building, the libraries and wrappers can be found in the pbbam/lib/python directory. 
+'make test' will also run some Python-side unit tests.
+
+To use the PacBioBam module, you can set your PYTHONPATH before invoking your script:
+
+.. code-block:: console
+
+   $ PYTHONPATH="path/to/pbbam/lib/python" python myScript.py
+
+Or otherwise configure your environment to find the PacBioBam module. 
+
+API Example
+```````````
+
+.. code-block:: python
+
+   import PacBioBam
+   
+   try:
+       file = PacBioBam.BamFile('foo.bam')
+       writer = PacBioBam.BamWriter('new.bam', file.Header())
+       dataset = PacBioBam.DataSet(file)
+       entireFile = PacBioBam.EntireFileQuery(dataset)
+       for record in PacBioBam.Iterate(entireFile):
+           writer.Write(record)
+   except RuntimeError:
+       pass  # found error
+   
+Python-Specific Notes
+`````````````````````
+   
+Iteration
+.........
+
+Iteration over dataset queries in Python will likely need to use the PacBioBam.Iterate() method. Thus
+file iteration loops will look something like the following:
+
+.. code-block:: python
+       
+   entireFile = PacBioBam.EntireFileQuery("input.bam")
+   for record in PacBioBam.Iterate(entireFile):
+       foo.bar(record)
+
+Exception Handling
+..................
+   
+Exceptions are used widely by the C++ library. To handle them from Python, you can use try blocks, looking for
+any RuntimeError:
+
+.. code-block:: python
+
+   try:
+       file = PacBioBam.BamFile("does_not_exist.bam")
+   except RuntimeError: 
+       print("caught expected error")
+   
+.. _swig_bindings-r:
+
+R
+------
+
+Building
+````````
+
+To build the support for R, you need to tell CMake to enable it:
+
+.. code-block:: console
+
+   $ cmake .. -DPacBioBAM_wrap_r=ON
+   $ make
+   
+The 'make' step will build relevant libraries/wrappers, and then run a simple program using them, 
+as a quick sanity-check. 
+
+After building, the libraries and wrappers can be found in the pbbam/lib/R directory. 
+'make test' will also run some R-side unit tests.
+
+To use the PacBioBam module in your script, nothing should be needed up front - simply invoke 'R' as normal. 
+You'll do the dynamic load of the R module near the beginning of your script:
+
+.. code-block:: r
+
+   # load pbbam R library
+   lib_path <- "path/to/pbbam/lib/R"
+   pbbam_libname <- paste(lib_path, "PacBioBam",   sep="/")
+   pbbam_wrapper <- paste(lib_path, "PacBioBam.R", sep="/")
+   dyn.load(paste(pbbam_libname, .Platform$dynlib.ext, sep=""))
+   source(pbbam_wrapper)
+   cacheMetaData(1) 
+
+
+API Example
+```````````
+
+.. code-block:: r
+
+   # load pbbam R library
+   lib_path <- "path/to/pbbam/lib/R"
+   pbbam_libname <- paste(lib_path, "PacBioBam",   sep="/")
+   pbbam_wrapper <- paste(lib_path, "PacBioBam.R", sep="/")
+   dyn.load(paste(pbbam_libname, .Platform$dynlib.ext, sep=""))
+   source(pbbam_wrapper)
+   cacheMetaData(1)    
+  
+   # sample method
+   copyFileAndFetchRecordNames <-function(inputFn, outputFn) {
+       
+       result <- tryCatch(
+       {
+           file   <- BamFile(inputFn)
+           writer <- BamWriter(outputFn, file$Header())
+           ds     <- DataSet(file)
+           names_in <- list()
+           entireFile <- EntireFileQuery(ds)
+           iter <- entireFile$begin()
+           end  <- entireFile$end()
+                       
+           while ( iter$'__ne__'(end) ) {
+               record <- iter$value()
+                
+               names_in <- c(names_in, record$FullName())
+               writer$Write(record)
+               iter$incr()
+            }
+            writer$TryFlush()
+            return(names_in)
+        },
+        error = function(e) {
+            # handle error 
+            return(list())
+        })
+        return(result)
+   }
+
+R-Specific Notes
+````````````````
+
+Iteration
+.........
+
+To compare iterators, you'll need to explicitly use the '__eq__' or '__ne__' methods. Thus, iterating over
+a data query will look something like this:
+
+.. code-block:: r
+
+   iter <- query$begin()
+   end  <- query$end()
+   while ( iter$'__ne__'(end) ) {
+       record <- iter$value()
+       # do stuff with record
+       iter$incr()  # advance the iterator, otherwise the loop never terminates
+   }
+   
+operator[]
+..........  
+   
+In C++, operator[] can be used in some classes to directly access elements in a sequence, e.g. a Cigar string:
+
+.. code-block:: cpp
+
+   CigarOperation op = cigar[0]; 
+   
+For the R wrapper, if you want to do the same sort of thing, you'll need to use the '__getitem__' method. 
+Please note that these are **0-based** indices, not 1-based as in much of R. 
+
+.. code-block:: r
+
+   op <- cigar$'__getitem__'(0) 
+   
+Exception Handling
+..................
+
+Exceptions are used widely by the C++ library. To handle them from R, you can use the 'tryCatch' block, listening for 
+'error' type exceptions.
+
+ .. code-block:: r
+ 
+    result <- tryCatch(
+    {
+        f <- BamFile("does_not_exist.bam") # this statement will throw
+    },
+    error = function(e) {
+        print(paste("caught expected error: ",e))
+    })
diff --git a/docs/source/tools/bam2sam.rst b/docs/source/tools/bam2sam.rst

new file mode 100644 (file)

index 0000000..4577686
--- /dev/null
+++ b/docs/source/tools/bam2sam.rst
@@ -0,0 +1,21 @@
+.. _bam2sam:
+
+bam2sam
+=======
+
+::
+
+  Usage: bam2sam [options] [input]
+
+  bam2sam converts a BAM file to SAM. It is essentially a stripped-down 'samtools
+  view', mostly useful for testing/debugging without requiring samtools. Input BAM
+  file is read from a file or stdin, and SAM output is written to stdout.
+
+  Options:
+    -h, --help            show this help message and exit
+    --version             show program's version number and exit
+
+  Options:
+    input               Input BAM file. If not provided, stdin will be used as input.
+    --no-header         Omit header from output.
+    --header-only       Print only the header (no records).
diff --git a/docs/source/tools/pbindex.rst b/docs/source/tools/pbindex.rst

new file mode 100644 (file)

index 0000000..e7c491f
--- /dev/null
+++ b/docs/source/tools/pbindex.rst
@@ -0,0 +1,18 @@
+.. _pbindex:
+
+pbindex
+=======
+
+::
+
+  Usage: pbindex <input>
+
+  pbindex creates an index file that enables random-access to PacBio-specific data
+  in BAM files. Generated index filename will be the same as input BAM plus .pbi suffix.
+
+  Options:
+    -h, --help            show this help message and exit
+    --version             show program's version number and exit
+
+  Input/Output:
+    input                 Input BAM file
diff --git a/docs/source/tools/pbindexdump.rst b/docs/source/tools/pbindexdump.rst

new file mode 100644 (file)

index 0000000..6829064
--- /dev/null
+++ b/docs/source/tools/pbindexdump.rst
@@ -0,0 +1,233 @@
+.. _pbindexdump:
+
+pbindexdump
+===========
+
+::
+
+  Usage: pbindexdump [options] [input]
+
+  pbindexdump prints a human-readable view of PBI data to stdout.
+
+  Options:
+    -h, --help            show this help message and exit
+    --version             show program's version number and exit
+
+  Input/Output:
+    input               Input PBI file. If not provided, stdin will be used as input.
+    --format=STRING     Output format, one of:
+                            json, cpp
+
+                        json: pretty-printed JSON [default]
+
+                        cpp: copy/paste-able C++ code that can be used to
+                        construct the equivalent PacBio::BAM::PbiRawData object
+
+  JSON Formatting:
+    --json-indent-level=INT
+                        JSON indent level [4]
+    --json-raw          Prints fields in a manner that more closely reflects the
+                        PBI file format - presenting data as per-field columns,
+                        not per-record objects.
+
+JSON Output Schemas
+-------------------
+
+Normal JSON:
+
+.. code-block:: JSON
+
+    {
+      "type": "object",
+      "properties": {
+        "fileSections": {
+          "type": "array",
+          "items": { "type": "string" }
+        },
+        "numReads": { "type": "integer" },
+        "reads": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "aEnd": { "type": "integer" },
+              "aStart": { "type": "integer" },
+              "bcForward": { "type": "integer" },
+              "bcQuality": { "type": "integer" },
+              "bcReverse": { "type": "integer" },
+              "contextFlag": { "type": "integer" },
+              "fileOffset": { "type": "integer" },
+              "holeNumber": { "type": "integer" },
+              "mapQuality": { "type": "integer" },
+              "nM": { "type": "integer" },
+              "nMM": { "type": "integer" },
+              "qEnd": { "type": "integer" },
+              "qStart": { "type": "integer" },
+              "readQuality": { "type": "number" },
+              "reverseStrand": { "type": "integer" },
+              "rgId": { "type": "integer" },
+              "tEnd": { "type": "integer" },
+              "tId": { "type": "integer" },
+              "tStart": { "type": "integer" }
+            },
+            "required": [
+              "contextFlag",
+              "fileOffset",
+              "holeNumber",
+              "qEnd",
+              "qStart",
+              "readQuality",
+              "rgId"
+            ]
+          }
+        },
+        "references": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "beginRow": { "type": "integer" },
+              "endRow": { "type": "integer" },
+              "tId": { "type": "integer" }
+            },
+            "required" : [ "beginRow", "endRow","tId" ]
+          }
+        },
+        "version": { "type": "string" }
+      },
+      "required": [
+        "fileSections",
+        "numReads",
+        "reads",
+        "version"
+      ]
+    }
+
+"Raw" JSON:
+
+.. code-block:: JSON
+
+    {
+      "type": "object",
+      "properties": {
+        "barcodeData" : {
+          "type" : "object",
+          "properties": {
+            "bcForward" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "bcQuality" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "bcReverse" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            }
+          }
+        },
+        "basicData" : {
+          "type" : "object",
+          "properties": {
+            "contextFlag" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "fileOffset" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "holeNumber" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "qEnd" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "qStart" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "readQuality" : {
+              "type": "array",
+              "items" : { "type": "number" }
+            },
+            "rgId" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            }
+          }
+        },
+        "fileSections": {
+          "type": "array",
+          "items": { "type": "string" }
+        },
+        "mappedData" : {
+          "type" : "object",
+          "properties": {
+            "aEnd" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "aStart" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "mapQuality" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "nM" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "nMM" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "readQuality" : {
+              "type": "array",
+              "items" : { "type": "number" }
+            },
+            "reverseStrand" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "tEnd" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "tId" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            },
+            "tStart" : {
+              "type": "array",
+              "items" : { "type": "integer" }
+            }
+          }
+        },
+        "numReads": { "type": "integer" },
+        "references": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "beginRow": { "type": "integer" },
+              "endRow": { "type": "integer" },
+              "tId": { "type": "integer" }
+            },
+            "required" : [ "beginRow", "endRow","tId" ]
+          }
+        },
+        "version" : { "type": "string" }
+      },
+      "required": [
+        "fileSections",
+        "numReads",
+        "basicData",
+        "version"
+      ]
+    }
diff --git a/docs/source/tools/pbmerge.rst b/docs/source/tools/pbmerge.rst

new file mode 100644 (file)

index 0000000..937ec56
--- /dev/null
+++ b/docs/source/tools/pbmerge.rst
@@ -0,0 +1,30 @@
+.. _pbmerge:
+
+pbmerge
+=======
+
+::
+
+  Usage: pbmerge [options] [-o <out.bam>] <INPUT>
+
+  pbmerge merges PacBio BAM files. If the input is DataSetXML, any filters will be
+  applied. If no output filename is specified, new BAM will be written to stdout.
+
+  Options:
+  -h, --help            show this help message and exit
+  --version             show program's version number and exit
+
+  Input/Output:
+    -o output           Output BAM filename.
+    --no-pbi            Set this option to skip PBI index file creation. PBI
+                        creation is automatically skipped if no output filename
+                        is provided.
+    INPUT               Input may be one of:
+                            DataSetXML, list of BAM files, or FOFN
+
+                            fofn: pbmerge -o merged.bam bams.fofn
+
+                            bams: pbmerge -o merged.bam 1.bam 2.bam 3.bam
+
+                            xml:  pbmerge -o merged.bam foo.subreadset.xml
+
diff --git a/docs/specs/pbbam.rst b/docs/specs/pbbam.rst

new file mode 100644 (file)

index 0000000..6842371
--- /dev/null
+++ b/docs/specs/pbbam.rst
@@ -0,0 +1,631 @@
+=================================================================
+**pbbam Software Design & Functional Specification**
+=================================================================
+| *Version 0.1*
+| *Pacific Biosciences Engineering Group*
+| *Jan 29, 2016*
+
+1. Revision History
+===================
+
++-------------+---------------+--------------------+---------------------------+
+| **Date**    | **Revision**  | **Author(s)**      | **Comments**              |
++=============+===============+====================+===========================+
+| 01-29-2016  | 0.1           | Derek Barnett      | Initial draft created     |
+|             |               |                    |                           |
++-------------+---------------+--------------------+---------------------------+
+
+2. Introduction
+===============
+
+2.1. Document Specification Identifier
+--------------------------------------
+
++-----------------------------------+------------------------------------------+
+| **Document Specification Prefix** | **Description**                          |
++===================================+==========================================+
+| FS\_SA\_PBBAM\_                   | Functional spec for pbbam                |
++-----------------------------------+------------------------------------------+
+
+2.2. Purpose
+------------
+
+This document is intended to describe the requirements and interface of the pbbam
+library, which provides functionality for creating, querying, and editing PacBio
+BAM files and associated file formats.
+
+2.3. Scope of Document
+----------------------
+
+This document covers the expected usage of the pbbam library, as well as any
+desired or required performance characteristics with respect to quality or speed.
+
+This document does not provide installation instructions or API documentation.
+
+2.4. Glossary of Terms
+----------------------
+
+The table below specifies only terms specific to this document, and skips
+acronyms/terms that are specified in `Pacific Biosciences Software Glossary`_.
+
+.. _Pacific Biosciences Software Glossary: http://smrtanalysis-docs/pb_sw_glossary.html
+
++------------------+-----------------------------------------------------------+
+| **Acronym/Term** | **Description**                                           |
++==================+===========================================================+
+| API              | Application Programming Interface - a set of routines,    |
+|                  | protocols, and tools for building software applications.  |
+|                  | In this document, this will consist of one or more        |
+|                  | cooperating libraries that specify data structures,       |
+|                  | methods, etc. for use within a target programming         |
+|                  | language.                                                 |
++------------------+-----------------------------------------------------------+
+| Client           | An application that uses the library.                     |
++------------------+-----------------------------------------------------------+
+| I/O              | Input/output of data.                                     |
++------------------+-----------------------------------------------------------+
+
+2.5. References
+---------------
+
++-------------+------------------------------+--------------------------------------+
+| **Ref No.** | **Document Name, Link**      | **Description**                      |
++=============+==============================+======================================+
+| (1)         | `BAM format`_                | General SAM/BAM specification        |
++-------------+------------------------------+--------------------------------------+
+| (2)         | `PacBio BAM`_                | PacBio BAM specification             |
++-------------+------------------------------+--------------------------------------+
+| (3)         | `PacBio BAM index`_          | PacBio BAM index specification       |
++-------------+------------------------------+--------------------------------------+
+| (4)         | `DataSet XML`_               | PacBio DataSet XML specification     |
++-------------+------------------------------+--------------------------------------+
+| (5)         | `Software Style Guide`_      | PacBio coding standards              |
++-------------+------------------------------+--------------------------------------+
+| (6)         | `SMRT Analysis`_             | General SMRT Analysis infrastructure |
++-------------+------------------------------+--------------------------------------+
+
+.. _BAM format: https://samtools.github.io/hts-specs/SAMv1.pdf
+.. _PacBio BAM: http://pacbiofileformats.readthedocs.org/en/3.0/BAM.html
+.. _PacBio BAM index: http://pacbiofileformats.readthedocs.org/en/3.0/PacBioBamIndex.html
+.. _DataSet XML: https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/DataSet.rst
+.. _Software Style Guide: http://smrtanalysis-docs/_downloads/PBISoftwareStyleGuide.doc
+.. _SMRT Analysis: http://smrtanalysis-docs/smrt_docs.html
+
+3. Software Overview
+====================
+
+3.1. Product Description
+------------------------
+
+As of the 3.0 release of SMRT Analysis, PacBio is embracing the industry standard
+`BAM format`_ (1) for (both aligned and unaligned) basecall data files. We have
+also formulated a BAM companion file format (.bam.pbi) enabling fast access to a
+richer set of per-read information as well as compatibility for software built
+around the legacy cmp.h5 format.
+
+The pbbam library provides components to create, query, & transform PacBio BAM
+data: sequence files and their associated indices. This includes a core C++
+library as well as bindings for additional programming languages.
+
+3.2. Product Functional Capabilities
+------------------------------------
+
+The library must be able to read and write BAM files that conform to the
+`PacBio BAM`_ specification (2). BAM records must be editable e.g. adding
+alignment information. Random access must be supported, whether by genomic
+region or by filtering record features. To this end, the library will be able to
+read, write, and create associated index files - both the standard BAM index
+(.bai) and the `PacBio BAM index`_ (.pbi) (3). In addition to working with
+individual files, datasets of related BAM files will be supported. These are
+described in a `DataSet XML`_ document. (4)
+
+3.3. User Characteristics
+-------------------------
+
++---------------------+--------------------------------------------------------+
+| **User Class/Role** | **User Knowledge and Skill Levels**                    |
++=====================+========================================================+
+| Developer           | Competence in one or more programming languages        |
+|                     | supported (C++, R, Python, C#). No knowledge of        |
+|                     | molecular biology wet lab techniques required.         |
++---------------------+--------------------------------------------------------+
+
+3.4. User Operations and Practices
+----------------------------------
+
+Developer users will interact with the software by incorporating the library
+into a client application.
+
+3.5. Operating Environment
+--------------------------
+
+The software is intended to be run in a Linux or OSX environment, with ideally 4
+or more cores.
+
+3.6. Design and Implementation Constraints
+------------------------------------------
+
+Currently there are no constraints outside the operating environment and speed
+requirements. In particular, as the library will be used for writing the BAM
+files coming off a Sequel instrument, it should be able to keep pace.
+
+3.7. Assumptions and Dependencies
+---------------------------------
+
+Input routines for the library will expect to receive files that conform to the
+`PacBio BAM`_ (2) or `DataSet XML`_ (4) specifications.
+
+The pbbam library depends on Boost, zlib, and htslib libraries.
+
+3.8. Other Software
+-------------------
+
+Output PacBio BAMs will be compatible with the `PacBio BAM`_ specification (2)
+and thus compatible with the general `BAM format`_ specification (1). This
+ensures that a wide variety of downstream tools can interact with data files.
+
+The software uses `CMake`_ as its build system.
+
+The core C++ API relies on the following 3rd party components:
+
+* `zlib`_
+* `htslib`_
+* `Boost`_ (header-only modules)
+
+Wrapper APIs for additional languages (Python, R, C#) are generated by `SWIG`_.
+
+API documentation is generated via `Doxygen`_.
+
+.. _CMake: https://cmake.org/
+.. _zlib: http://www.zlib.net/
+.. _htslib: https://github.com/samtools/htslib
+.. _Boost: http://www.boost.org/
+.. _SWIG: http://www.swig.org/
+.. _Doxygen: http://www.stack.nl/~dimitri/doxygen/
+
+4. External Interfaces
+======================
+
+4.1. User Interfaces
+--------------------
+
+N/A
+
+4.2. Software Interfaces
+------------------------
+
+pbbam will require the following software:
+
+* `htslib`_ & `zlib`_ - provides low-level handling of compressed BAM data
+* `Boost`_ - provides utility classes
+
+Incoming data from upstream components will be compliant with
+PacBio BAM format - see `PacBio BAM`_ specification (2) for more detail.
+
+4.3. Hardware Interfaces
+------------------------
+
+N/A
+
+4.4. Communications Interfaces
+------------------------------
+
+N/A
+
+5. Functional Requirements
+==========================
+
+5.1. Query BAM data by genomic region
+-----------------------------------------
+
+5.1.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall allow client applications to query data, limited to some genomic
+region of interest.
+
+5.1.2. Inputs
+~~~~~~~~~~~~~
+
+* BAM file(s) or DataSet XML
+* a standard index (.bai) for each source BAM file
+* genomic interval (e.g. "chr1:1000-2000")
+
+5.1.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Obtain an `htslib`_ "iterator" object for a given file and region. This will be
+wrapped by pbbam to hide the low-level nature of this type, as well as handling
+memory lifetime.
+
+5.1.4. Outputs
+~~~~~~~~~~~~~~
+
+Iterator providing access to individual BAM records from the input data sources,
+which are aligned to the requested genomic interval.
+
+For example:
+
+.. code:: c++
+
+    GenomicIntervalQuery query(interval, dataset);
+    for (const BamRecord& record : query) {
+        // ... do stuff ...
+    }
+
+
+5.1.5. Regulatory Compliance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+5.2. Query BAM data by filter criteria
+-----------------------------------------
+
+5.2.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall allow client applications to query data, limited to some filter
+criteria (e.g. only reads from ZMW hole number 200 with a read quality of >0.5).
+
+5.2.2. Inputs
+~~~~~~~~~~~~~
+
+* BAM file(s) or DataSet XML
+* a `PacBio BAM index`_ (.pbi) for each source BAM file
+* filters supported by data contained in the PBI
+
+5.2.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Query PBI file(s) for records that match the provided filter criteria. Merge
+contiguous runs of records into record blocks, to minimize seeks. Advancing the
+iterator either reads the next read from the current block or seeks to the next
+block and fetches the next record.
+
+5.2.4. Outputs
+~~~~~~~~~~~~~~
+
+Iterator providing access to individual BAM records from the input data sources,
+which satisfy the requested filter criteria.
+
+For example:
+
+.. code:: c++
+
+    PbiFilterQuery query(filter, dataset);
+    for (const BamRecord& record : query) {
+        // ... do stuff ...
+    }
+
+5.2.5. Regulatory Compliance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+5.3. Write PacBio BAM data
+------------------------------------------
+
+5.3.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall be able to write `PacBio BAM`_ files conforming to the specification.
+
+5.3.2. Inputs
+~~~~~~~~~~~~~
+
+* filename
+* header information
+* BAM records
+
+5.3.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Create file handle for the provided filename, output initial header information.
+As records are passed in, write to file. Upon completion, flush any buffers and
+close file handle.
+
+Multithreading, provided by `htslib`_, will be utilized where possible to speed
+up the compression process - often the main bottleneck of BAM throughput.
+
+5.3.4. Outputs
+~~~~~~~~~~~~~~
+
+BAM file conforming to the `PacBio BAM`_ specification.
+
+5.3.5. Regulatory Compliance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+5.4. Create PacBio BAM index file
+------------------------------------------
+
+5.4.1. Description
+~~~~~~~~~~~~~~~~~~
+
+Much of PacBio BAM data processing relies on the presence of a `PacBio BAM index`_
+file. pbbam shall be able to generate this file type for a `PacBio BAM`_ file.
+
+5.4.2. Inputs
+~~~~~~~~~~~~~
+
+`PacBio BAM`_ file
+
+5.4.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Read through the input BAM records, storing the values relevant to a PBI index.
+At end of file, write the index contents to a file and close.
+
+5.4.4. Outputs
+~~~~~~~~~~~~~~
+
+`PacBio BAM index`_ file
+
+5.4.5. Regulatory Compliance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+6. Non-Functional Requirements
+==============================
+
+6.1. Performance Requirements
+-----------------------------
+
+Since pbbam will be used to write all BAM files coming off a Sequel device, the
+library must keep pace with data generation requirements.
+
+**TODO: come back to this - hard numbers?**
+
+6.2. Safety Requirements
+------------------------
+
+N/A
+
+6.3. Security Requirements
+--------------------------
+
+N/A
+
+6.4. Quality Attributes
+-----------------------
+
+6.4.1. Availability
+~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+6.4.2. Integrity
+~~~~~~~~~~~~~~~~
+
+Inputs and outputs shall adhere to the PacBio BAM or DataSet XML specifications.
+Files that do not meet this requirement will raise exceptions and will not be
+accepted.
+
+6.4.3. Interoperability
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Inputs and outputs shall adhere to the PacBio BAM or DataSet XML specifications.
+
+6.4.4. Reliability
+~~~~~~~~~~~~~~~~~~
+
+The developed software shall meet the overall product reliability requirements.
+
+6.4.5. Robustness
+~~~~~~~~~~~~~~~~~
+
+pbbam will raise exceptions upon encountering failure cases, allowing client
+applications to recover or report the error to a UI.
+
+6.4.6. Usability
+~~~~~~~~~~~~~~~~
+
+pbbam shall have comprehensive API documentation, available both on- and offline.
+Further documentation will be provided for installation, API usage tips, etc.
+
+Raised exceptions shall carry as much information as possible so that client
+applications can respond with appropriate actions or display useful messages.
+
+6.4.7. Maintainability
+~~~~~~~~~~~~~~~~~~~~~~
+
+The source code of the software covered in this functional specification shall
+adhere to the PacBio `Software Style Guide`_ (5) work instruction, to guarantee
+high quality of code that facilitates maintainability.
+
+6.4.8. Customizability
+~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+6.5. Business Rules
+-------------------
+
+N/A
+
+6.6. Installation and Upgrade
+-----------------------------
+
+Installation and Upgrade of this software will be handled as part of the SMRT
+Analysis subsystem. See `SMRT Analysis`_ (6) specifications for more detail.
+
+Additionally, the library may be built independently, either from internal
+version control (Perforce) or from the public-facing GitHub repository. In
+either case, `CMake`_ is used to drive the build process.
+
+6.7. Administration
+-------------------
+
+N/A
+
+6.8. User Documentation
+-----------------------
+
+pbbam shall have comprehensive API documentation, available both on- and offline.
+Further documentation will be provided for installation, API usage tips, etc.
+
+The "offline" API documentation may be built directly from the source code, using
+`Doxygen`_. Online documentation will be generated via a continuous integration
+server, thus ensuring it is always pointing to the current codebase.
+
+7. High Level Design
+====================
+
+7.1. Top Level Context
+----------------------
+
+The pbbam library is intended to be linked in with client applications,
+providing programmatic access to data files.
+
+7.2. Use Cases
+--------------
+
+Primary use cases for pbbam include:
+
+* BAM file creation
+* BAM file query - iterable access to various subsets of data
+
+8. Detailed Design
+==================
+
+8.1. Structural Representation
+------------------------------
+
+ *image(s) here*
+
+8.2. Behavioral Representation
+------------------------------
+
+This section provides behavioral (dynamic) representation of how the
+elements of the system realize the required use cases.
+
+Describe how the significant subsystems and classes interact with each
+other to realize the architecturally significant use cases.
+
+Provide a link to a file containing Sequence Diagram or Activity Diagram, when applicable.
+The link may be provided with use of 'image' directive.
+
+Sequence Diagram shows one use case scenario, executed by class model,
+with sequence of operations over period of time (time increased from top
+to bottom). It shows interactions between objects, but does not show
+relationships between them.
+
+Activity Diagram is a visual representation of the sequential flow and
+control logic of a set of related activities or actions. It is a type of
+flowchart, frequently called Swim Lane Diagram, because activities of
+each entity are presented within its swim lane.
+
+Note: You may use http://wsd tool to auto-generate a sequence diagram from
+a descriptive text file, save the diagram to the wsd site, get link to the image,
+and add this link to the document with use of 'image' directive.
+
+8.3. Information Storage
+------------------------
+
+pbbam software requires no persistent storage outside of availability of input
+and output during analysis.
+
+8.4. Technology Overview
+------------------------
+
+pbbam is implemented in C++11 and should perform as designed on any UNIX-like
+operating system (Linux distributions, Apple OSX, etc.).
+
+8.5. SOUP Components
+--------------------
+
+pbbam utilizes CMake for its build system. The C++ library uses the following
+3rd-party software components: Boost, htslib, & zlib. Wrappers for additional
+languages are generated using SWIG.
+
+8.6. Deployment and Configuration
+---------------------------------
+
+Please refer to `SMRT Analysis`_ (6) documentation
+
+9. Automated Tests
+==================
+
+9.1. Unit Testing
+-----------------
+
+The library shall have unit tests for all classes & components.
+
+9.2. Performance Testing
+------------------------
+
+Unit tests may evaluate performance requirements as desired.
+
+9.3. Regression Testing
+-----------------------
+
+As its role is primarily in data I/O, pbbam has no "scientific quality/validity"
+metrics that would indicate a regression. Instead, passing its unit tests and
+end-to-end tests will indicate that a regression has not been introduced.
+
+These tests will be run after each check-in and nightly.
+
+10. Requirements Traceability Matrices
+======================================
+
+This section provides traces from requirements specified in PRD/DIR documents to the
+requirements covered in this functional specification, and from these
+functional requirements to corresponding Test Cases/Procedures.
+
+10.1. HPQC Functional Specifications
+------------------------------------
+
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+
+| **PBI_ID**  | **Name**                  | **Description**                                   | **Comment** | **Metric** | **Owner** | **PRD/DIR Path**                          |
++=============+===========================+===================================================+=============+============+===========+===========================================+
+| 5.1         | Query BAM data by         | pbbam shall allow client applications to query    |             |            | dbarnett  |                                           |
+|             | genomic region            | data, limited to some genomic region of interest. |             |            |           |                                           |
+|             |                           |                                                   |             |            |           |                                           |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+
+| 5.2         | Query BAM data by         | pbbam shall allow client applications to query    |             |            | dbarnett  |                                           |
+|             | filter criteria           | data, limited to some filter criteria (e.g. only  |             |            |           |                                           |
+|             |                           | reads from ZMW hole number 200 with a read        |             |            |           |                                           |
+|             |                           | quality of >0.5).                                 |             |            |           |                                           |
+|             |                           |                                                   |             |            |           |                                           |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+
+| 5.3         | Write PacBio BAM data     | pbbam shall be able to write files conforming to  |             |            | dbarnett  |                                           |
+|             |                           | the `PacBio BAM`_ specification.                  |             |            |           |                                           |
+|             |                           |                                                   |             |            |           |                                           |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+
+| 5.4         | Create PacBio BAM index   | Much of PacBio BAM data processing relies on the  |             |            | dbarnett  |                                           |
+|             | file                      | presence of a `PacBio BAM index`_ file. pbbam     |             |            |           |                                           |
+|             |                           | shall be able to generate this file type for a    |             |            |           |                                           |
+|             |                           | `PacBio BAM`_ file.                               |             |            |           |                                           |
+|             |                           |                                                   |             |            |           |                                           |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+-------------------------------------------+
+
+10.2. Automated Tests Coverage
+------------------------------
+
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| **FS Item** | **FS Item Title**         | **Use Case Description**                           | **Test Case Name/ID**                                            |
++=============+===========================+====================================================+==================================================================+
+| 5.1         | Query BAM data by         | pbbam shall allow client applications to query     | TODO                                                             |
+|             | genomic region            | data, limited to some genomic region of interest.  |                                                                  |
+|             |                           |                                                    |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.2         | Query BAM data by         | pbbam shall allow client applications to query     | TODO                                                             |
+|             | filter criteria           | data, limited to some filter criteria (e.g. only   |                                                                  |
+|             |                           | reads from ZMW hole number 200 with a read         |                                                                  |
+|             |                           | quality of >0.5).                                  |                                                                  |
+|             |                           |                                                    |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.3         | Write PacBio BAM data     | pbbam shall be able to write files conforming to   | TODO                                                             |
+|             |                           | the `PacBio BAM`_ specification.                   |                                                                  |
+|             |                           |                                                    |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.4         | Create PacBio BAM index   | Much of PacBio BAM data processing relies on the   | TODO                                                             |
+|             | file                      | presence of a `PacBio BAM index`_ file. pbbam      |                                                                  |
+|             |                           | shall be able to generate this file type for a     |                                                                  |
+|             |                           | `PacBio BAM`_ file.                                |                                                                  |
+|             |                           |                                                    |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+
diff --git a/docs/specs/pbbam_structure.png b/docs/specs/pbbam_structure.png

new file mode 100755 (executable)

index 0000000..40f50cf

Binary files /dev/null and b/docs/specs/pbbam_structure.png differ
diff --git a/docs/specs/pbbam_updated_release3_2.rst b/docs/specs/pbbam_updated_release3_2.rst

new file mode 100755 (executable)

index 0000000..72d9b76
--- /dev/null
+++ b/docs/specs/pbbam_updated_release3_2.rst
@@ -0,0 +1,618 @@
+=============================================================
+**Pbbam Core API Software Design & Functional Specification**
+=============================================================
+| *Version 0.2*
+| *Pacific Biosciences Engineering Group*
+| *Oct 17, 2016*
+
+1. Revision History
+===================
+
++-------------+---------------+--------------------+---------------------------------+
+| **Date**    | **Revision**  | **Author(s)**      | **Comments**                    |
++=============+===============+====================+=================================+
+| 01-29-2016  | 0.1           | Derek Barnett      | Initial draft created           |
+|             |               |                    |                                 |
++-------------+---------------+--------------------+---------------------------------+
+| 10-17-2016  | 0.2           | Derek Barnett      | Added behavioral representation |
+|             |               |                    | and structural representation   |
+|             |               |                    | diagram                         |
++-------------+---------------+--------------------+---------------------------------+
+
+2. Introduction
+===============
+
+2.1. Document Specification Identifier
+--------------------------------------
+
++-----------------------------------+------------------------------------------+
+| **Document Specification Prefix** | **Description**                          |
++===================================+==========================================+
+| FS\_SA\_PBBAM\_                   | Functional spec for pbbam                |
++-----------------------------------+------------------------------------------+
+
+2.2. Purpose
+------------
+
+This document is intended to describe the requirements and interface of the pbbam
+library, which provides functionality for creating, querying, and editing PacBio
+BAM files and associated file formats.
+
+2.3. Scope of Document
+----------------------
+
+This document covers the expected usage of the pbbam library, as well as any
+desired or required performance characteristics with respect to quality or speed.
+
+This document does not provide installation instructions or API documentation.
+
+2.4. Glossary of Terms
+----------------------
+
+The table below specifies only terms specific to this document, and skips
+acronyms/terms that are specified in `Pacific Biosciences Software Glossary`_.
+
+.. _Pacific Biosciences Software Glossary: http://smrtanalysis-docs/pb_sw_glossary.html
+
++------------------+-----------------------------------------------------------+
+| **Acronym/Term** | **Description**                                           |
++==================+===========================================================+
+| API              | Application Programming Interface - a set of routines,    |
+|                  | protocols, and tools for building software applications.  |
+|                  | In this document, this will consist of one or more        |
+|                  | cooperating libraries that specify data structures,       |
+|                  | methods, etc. for use within a target programming         |
+|                  | language.                                                 |
++------------------+-----------------------------------------------------------+
+| Client           | An application that uses the library.                     |
++------------------+-----------------------------------------------------------+
+| I/O              | Input/output of data.                                     |
++------------------+-----------------------------------------------------------+
+
+2.5. References
+---------------
+
++-------------+------------------------------+--------------------------------------+
+| **Ref No.** | **Document Name, Link**      | **Description**                      |
++=============+==============================+======================================+
+| (1)         | `BAM format`_                | General SAM/BAM specification        |
++-------------+------------------------------+--------------------------------------+
+| (2)         | `PacBio BAM`_                | PacBio BAM specification             |
++-------------+------------------------------+--------------------------------------+
+| (3)         | `PacBio BAM index`_          | PacBio BAM index specification       |
++-------------+------------------------------+--------------------------------------+
+| (4)         | `DataSet XML`_               | PacBio DataSet XML specification     |
++-------------+------------------------------+--------------------------------------+
+| (5)         | `Software Style Guide`_      | PacBio coding standards              |
++-------------+------------------------------+--------------------------------------+
+| (6)         | `SMRT Analysis`_             | General SMRT Analysis infrastructure |
++-------------+------------------------------+--------------------------------------+
+
+.. _BAM format: https://samtools.github.io/hts-specs/SAMv1.pdf
+.. _PacBio BAM: http://pacbiofileformats.readthedocs.org/en/3.0/BAM.html
+.. _PacBio BAM index: http://pacbiofileformats.readthedocs.org/en/3.0/PacBioBamIndex.html
+.. _DataSet XML: https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/DataSet.rst
+.. _Software Style Guide: http://smrtanalysis-docs/_downloads/PBISoftwareStyleGuide.doc
+.. _SMRT Analysis: http://smrtanalysis-docs/smrt_docs.html
+
+3. Software Overview
+====================
+
+3.1. Software Module Description
+--------------------------------
+
+As of the 3.0 release of SMRT Analysis, PacBio is embracing the industry standard
+`BAM format`_ (1) for (both aligned and unaligned) basecall data files. We have
+also formulated a BAM companion file format (.bam.pbi) enabling fast access to a
+richer set of per-read information as well as compatibility for software built
+around the legacy cmp.h5 format.
+
+The pbbam library provides components to create, query, & transform PacBio BAM
+data: sequence files and their associated indices. This includes a core C++
+library as well as bindings for additional programming languages.
+
+3.2. Software Module Functional Capabilities
+--------------------------------------------
+
+The library must be able to read and write BAM files that conform to the
+`PacBio BAM`_ specification (2). BAM records must be editable e.g. adding
+alignment information. Random access must be supported, whether by genomic
+region or by filtering record features. To this end, the library will be able to
+read, write, and create associated index files - both the standard BAM index
+(.bai) and the `PacBio BAM index`_ (.pbi) (3). In addition to working with
+individual files, datasets of related BAM files will be supported. These are
+described in a `DataSet XML`_ document. (4)
+
+3.3. User Characteristics
+-------------------------
+
++---------------------+--------------------------------------------------------+
+| **User Class/Role** | **User Knowledge and Skill Levels**                    |
++=====================+========================================================+
+| Developer           | Competence in one or more programming languages        |
+|                     | supported (C++, R, Python, C#). No knowledge of        |
+|                     | molecular biology wet lab techniques required.         |
++---------------------+--------------------------------------------------------+
+
+3.4. User Operations and Practices
+----------------------------------
+
+Developer users will interact with the software by incorporating the library
+into a client application.
+
+3.5. Operating Environment
+--------------------------
+
+The software is intended to be run in a Linux or OSX environment, with ideally 4
+or more cores.
+
+3.6. General Constraints
+------------------------
+
+Currently there are no constraints outside the operating environment and speed
+requirements. In particular, as the library will be used for writing the BAM
+files coming off a Sequel instrument, it should be able to keep pace.
+
+3.7. Assumptions and Dependencies
+---------------------------------
+
+Input routines for the library will expect to receive files that conform to the
+`PacBio BAM`_ (2) or `DataSet XML`_ (4) specifications.
+
+The pbbam library depends on Boost, zlib, and htslib libraries.
+
+3.8. Other Software
+-------------------
+
+Output PacBio BAMs will be compatible with the `PacBio BAM`_ specification (2)
+and thus compatible with the general `BAM format`_ specification (1). This
+ensures that a wide variety of downstream tools can interact with data files.
+
+The software uses `CMake`_ as its build system.
+
+The core C++ API relies on the following 3rd party components:
+
+* `zlib`_
+* `htslib`_
+* `Boost`_ (header-only modules)
+
+Wrapper APIs for additional languages (Python, R, C#) are generated by `SWIG`_.
+
+API documentation is generated via `Doxygen`_.
+
+.. _CMake: https://cmake.org/
+.. _zlib: http://www.zlib.net/
+.. _htslib: https://github.com/samtools/htslib
+.. _Boost: http://www.boost.org/
+.. _SWIG: http://www.swig.org/
+.. _Doxygen: http://www.stack.nl/~dimitri/doxygen/
+
+4. External Interfaces
+======================
+
+4.1. User Interfaces
+--------------------
+
+N/A
+
+4.2. Software Interfaces
+------------------------
+
+pbbam will require the following software:
+
+* `htslib`_ & `zlib`_ - provides low-level handling of compressed BAM data
+* `Boost`_ - provides utility classes
+
+Incoming data from upstream components will be compliant with
+PacBio BAM format - see `PacBio BAM`_ specification (2) for more detail.
+
+4.3. Hardware Interfaces
+------------------------
+
+N/A
+
+4.4. Communications Interfaces
+------------------------------
+
+N/A
+
+5. Functional Requirements
+==========================
+
+5.1. Query BAM data by genomic region
+-------------------------------------
+
+5.1.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall allow client applications to query data, limited to some genomic
+region of interest.
+
+5.1.2. Inputs
+~~~~~~~~~~~~~
+
+* BAM file(s) or DataSet XML
+* a standard index (.bai) for each source BAM file
+* genomic interval (e.g. "chr1:1000-2000")
+
+5.1.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Obtain an `htslib`_ "iterator" object for a given file and region. This will be
+wrapped by pbbam to hide the low-level nature of this type, as well as handling
+memory lifetime.
+
+5.1.4. Outputs
+~~~~~~~~~~~~~~
+
+Iterator providing access to individual BAM records from the input data sources,
+which are aligned to the requested genomic interval.
+
+For example:
+
+.. code:: c++
+
+    GenomicIntervalQuery query(interval, dataset);
+    for (const BamRecord& record : query) {
+        // ... use record data ...
+    }
+
+
+5.2. Query BAM data by filter criteria
+--------------------------------------
+
+5.2.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall allow client applications to query data, limited to some filter
+criteria (e.g. only reads from ZMW hole number 200 with a read quality of >0.5).
+
+5.2.2. Inputs
+~~~~~~~~~~~~~
+
+* BAM file(s) or DataSet XML
+* a `PacBio BAM index`_ (.pbi) for each source BAM file
+* filters supported by data contained in the PBI
+
+5.2.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Query PBI file(s) for records that match the provided filter criteria. Merge
+contiguous runs of records into record blocks, to minimize seeks. Advancing the
+iterator either reads the next read from the current block or seeks to the next
+block and fetches the next record.
+
+5.2.4. Outputs
+~~~~~~~~~~~~~~
+
+Iterator providing access to individual BAM records from the input data sources,
+which satisfy the requested filter criteria.
+
+For example:
+
+.. code:: c++
+
+    PbiFilterQuery query(filter, dataset);
+    for (const BamRecord& record : query) {
+        // ... do stuff ...
+    }
+
+
+5.3. Write PacBio BAM data
+--------------------------
+
+5.3.1. Description
+~~~~~~~~~~~~~~~~~~
+
+pbbam shall be able to write `PacBio BAM`_ files conforming to the specification.
+
+5.3.2. Inputs
+~~~~~~~~~~~~~
+
+* filename
+* header information
+* BAM records
+
+5.3.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Create file handle for the provided filename, output initial header information.
+As records are passed in, write to file. Upon completion, flush any buffers and
+close file handle.
+
+Multithreading, provided by `htslib`_, will be utilized where possible to speed
+up the compression process - often the main bottleneck of BAM throughput.
+
+5.3.4. Outputs
+~~~~~~~~~~~~~~
+
+BAM file conforming to the `PacBio BAM`_ specification.
+
+5.4. Create PacBio BAM index file
+---------------------------------
+
+5.4.1. Description
+~~~~~~~~~~~~~~~~~~
+
+Much of PacBio BAM data processing relies on the presence of a `PacBio BAM index`_
+file. pbbam shall be able to generate this file type for a `PacBio BAM`_ file.
+
+5.4.2. Inputs
+~~~~~~~~~~~~~
+
+`PacBio BAM`_ file
+
+5.4.3. Processing
+~~~~~~~~~~~~~~~~~
+
+Read through the input BAM records, storing the values relevant to a PBI index.
+At end of file, write the index contents to a file and close.
+
+5.4.4. Outputs
+~~~~~~~~~~~~~~
+
+`PacBio BAM index`_ file
+
+6. Non-Functional Requirements
+==============================
+
+6.1. Performance Requirements
+-----------------------------
+
+Since pbbam will be used to write all BAM files coming off a Sequel instrument, the
+library must keep pace with data generation requirements.
+
+6.2. Safety Requirements
+------------------------
+
+N/A
+
+6.3. Security Requirements
+--------------------------
+
+N/A
+
+6.4. Quality Attributes
+-----------------------
+
+6.4.1. Availability
+~~~~~~~~~~~~~~~~~~~
+
+The developed software shall meet the overall product availability requirements.
+
+6.4.2. Data Integrity
+~~~~~~~~~~~~~~~~~~~~~
+
+Inputs and outputs shall adhere to the PacBio BAM or DataSet XML specifications.
+Files that do not meet this requirement will raise exceptions and will not be
+accepted.
+
+6.4.3. Interoperability
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Inputs and outputs shall adhere to the PacBio BAM or DataSet XML specifications.
+
+6.4.4. Reliability
+~~~~~~~~~~~~~~~~~~
+
+The developed software shall meet the overall product reliability requirements.
+
+6.4.5. Robustness
+~~~~~~~~~~~~~~~~~
+
+pbbam will raise exceptions upon encountering failure cases, allowing client
+applications to recover or report the error to a UI.
+
+6.4.6. Usability
+~~~~~~~~~~~~~~~~
+
+pbbam shall have comprehensive API documentation, available both on- and offline.
+Further documentation will be provided for installation, API usage tips, etc.
+
+Raised exceptions shall carry as much information as possible so that client
+applications can respond with appropriate actions or display useful messages.
+
+6.4.7. Maintainability
+~~~~~~~~~~~~~~~~~~~~~~
+
+The source code of the software covered in this functional specification shall
+adhere to the PacBio `Software Style Guide`_ (5) work instruction, to guarantee
+high quality of code that facilitates maintainability.
+
+6.4.8. Customizability
+~~~~~~~~~~~~~~~~~~~~~~
+
+N/A
+
+6.4.9. Compatibility
+~~~~~~~~~~~~~~~~~~~~
+
+pbbam shall support backward compatibility of the API and BAM format versions
+in order not to break existing clients.
+
+6.5. Business Rules
+-------------------
+
+N/A
+
+6.6. Compliance Requirements
+----------------------------
+
+N/A
+
+6.7. Alarms and Error Handling
+------------------------------
+
+Raised exceptions shall carry as much information as possible so that client
+applications can respond with appropriate actions or display useful messages.
+
+6.8. Persistence Requirements
+-----------------------------
+
+pbbam software requires no persistent storage outside of availability of input
+and output during analysis.
+
+6.9. Installation and Upgrade
+-----------------------------
+
+Installation and Upgrade of this software will be handled as part of the SMRT
+Analysis subsystem. See `SMRT Analysis`_ (6) specifications for more detail.
+
+Additionally, the library may be built independently, either from internal
+version control (Perforce) or from the public-facing Github repository. In
+either case, `CMake`_ is used to drive the build process.
+
+6.10. Administration and Maintenance
+------------------------------------
+
+N/A
+
+6.11. User Documentation
+------------------------
+
+pbbam shall have comprehensive API documentation, available both on- and offline.
+Further documentation will be provided for installation, API usage tips, etc.
+
+The "offline" API documentation may be built directly from the source code, using
+`Doxygen`_. Online documentation will be generated via a continuous integration
+server, thus ensuring it is always pointing to the current codebase.
+
+7. High Level Design
+====================
+
+7.1. Top Level Context
+----------------------
+
+The pbbam library is intended to be linked in with client applications,
+providing programmatic access to data files.
+
+7.2. Use Cases
+--------------
+
+Primary use cases for pbbam include:
+
+* BAM file creation
+* BAM file query - iterable access to various subsets of data
+
+8. Detailed Design
+==================
+
+8.1. Structural Representation
+------------------------------
+
+.. image:: ./pbbam_structure.png
+
+8.2. Behavioral Representation
+------------------------------
+
+The typical access pattern involves a client query against BAM data, optionally
+described in DataSet XML. The query may involve some number of filter criteria.
+
+pbbam queries the associated index files (``*.pbi``) to pre-determine which records
+pass filtering criteria and where they reside on disk. The client code is given
+an iterable object, such that each iteration of the main access loop returns a
+valid BAM record for analysis, modification, etc.
+
+8.3. Information Storage
+------------------------
+
+pbbam software requires no persistent storage outside of availability of input
+and output during analysis.
+
+8.4. Technology Overview
+------------------------
+
+pbbam is implemented in C++11 and should perform as designed on any UNIX-like
+operating system (Linux distributions, Apple OSX, etc.).
+
+8.5. SOUP Components
+--------------------
+
+pbbam utilizes CMake for its build system. The C++ library uses the following
+3rd-party software components: `Boost`_, `htslib`_, & `zlib`_. Wrappers for additional
+languages are generated using SWIG.
+
+8.6. Deployment and Configuration
+---------------------------------
+
+Please refer to `SMRT Analysis`_ (6) documentation.
+
+9. Automated Tests
+==================
+
+9.1. Unit Testing
+-----------------
+
+The library shall have unit tests for all classes & components.
+
+9.2. Performance Testing
+------------------------
+
+Unit tests may evaluate performance requirements as desired.
+
+9.3. Regression Testing
+-----------------------
+
+As its role is primarily in data I/O, pbbam has no "scientific quality/validity"
+metrics that would indicate a regression. Instead, passing its unit tests and
+end-to-end tests will indicate that a regression has not been introduced.
+
+These tests will be run after each check-in and nightly.
+
+10. Requirements Traceability Matrices
+======================================
+
+This section provides traces from requirements specified in PRD/DIR documents to the
+requirements covered in this functional specification, and from these
+functional requirements to corresponding Test Cases/Procedures.
+
+10.1. HPQC Functional Specifications
+------------------------------------
+
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+
+| **PBI_ID**  | **Name**                  | **Description**                                   | **Comment** | **Metric** | **Owner** | **PRD/DIR Path**                                 |
++=============+===========================+===================================================+=============+============+===========+==================================================+
+| 5.1         | Query BAM data by         | pbbam shall allow client applications to query    |             | Yes        | dbarnett  | \\DIR\\Functionality\\Software\\Common\\APIs\\   |
+|             | genomic region            | data, limited to some genomic region of interest. |             |            |           | Software shall provide an API to allow 3rd       |
+|             |                           |                                                   |             |            |           | party software to extract all run information    |
+|             |                           |                                                   |             |            |           | including summary reports and locations          |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+
+| 5.2         | Query BAM data by         | pbbam shall allow client applications to query    |             | Yes        | dbarnett  | \\DIR\\Functionality\\Software\\Common\\APIs\\   |
+|             | filter criteria           | data, limited to some filter criteria (e.g. only  |             |            |           | Software shall provide an API to allow 3rd       |
+|             |                           | reads from ZMW hole number 200 with a read        |             |            |           | party software to extract all run information    |
+|             |                           | quality of >0.5).                                 |             |            |           | including summary reports and locations          |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+
+| 5.3         | Write PacBio BAM data     | pbbam shall be able to write files conforming to  |             | Yes        | dbarnett  | \\DIR\\Functionality\\Software\\PostProcessing\\ |
+|             |                           | the `PacBio BAM`_ specification.                  |             |            |           | Software shall provide base files including      |
+|             |                           |                                                   |             |            |           | kinetic information in industry standard format  |
+|             |                           |                                                   |             |            |           | such as SAM/BAM using current specifications     |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+
+| 5.4         | Create PacBio BAM index   | Much of PacBio BAM data processing relies on the  |             | Yes        | dbarnett  | \\DIR\\Functionality\\Software\\PostProcessing\\ |
+|             | file                      | presence of a `PacBio BAM index`_ file. pbbam     |             |            |           | Software shall provide base files including      |
+|             |                           | shall be able to generate this file type for a    |             |            |           | kinetic information in industry standard format  |
+|             |                           | `PacBio BAM`_ file.                               |             |            |           | such as SAM/BAM using current specifications     |
++-------------+---------------------------+---------------------------------------------------+-------------+------------+-----------+--------------------------------------------------+
+
+10.2. Automated Tests Coverage
+------------------------------
+
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| **FS Item** | **FS Item Title**         | **Use Case Description**                           | **Test Case Name/ID**                                            |
++=============+===========================+====================================================+==================================================================+
+| 5.1         | Query BAM data by         | pbbam shall allow client applications to query     | See section 9.1. Unit Testing.                                   |
+|             | genomic region            | data, limited to some genomic region of interest.  |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.2         | Query BAM data by         | pbbam shall allow client applications to query     | See section 9.1. Unit Testing.                                   |
+|             | filter criteria           | data, limited to some filter criteria (e.g. only   |                                                                  |
+|             |                           | reads from ZMW hole number 200 with a read         |                                                                  |
+|             |                           | quality of >0.5).                                  |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.3         | Write PacBio BAM data     | pbbam shall be able to write files conforming to   | See section 9.1. Unit Testing.                                   |
+|             |                           | the `PacBio BAM`_ specification.                   |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+| 5.4         | Create PacBio BAM index   | Much of PacBio BAM data processing relies on the   | See section 9.1. Unit Testing.                                   |
+|             | file                      | presence of a `PacBio BAM index`_ file. pbbam      |                                                                  |
+|             |                           | shall be able to generate this file type for a     |                                                                  |
+|             |                           | `PacBio BAM`_ file.                                |                                                                  |
++-------------+---------------------------+----------------------------------------------------+------------------------------------------------------------------+
+
diff --git a/include/meson.build b/include/meson.build

new file mode 100644 (file)

index 0000000..33b33f0
--- /dev/null
+++ b/include/meson.build
@@ -0,0 +1,169 @@
+###########
+# headers #
+###########
+
+if not meson.is_subproject()
+  install_headers(
+    files([
+      'pbbam/Accuracy.h',
+      'pbbam/AlignmentPrinter.h',
+      'pbbam/BaiIndexedBamReader.h',
+      'pbbam/BamFile.h',
+      'pbbam/BamHeader.h',
+      'pbbam/BamReader.h',
+      'pbbam/BamRecord.h',
+      'pbbam/BamRecordBuilder.h',
+      'pbbam/BamRecordImpl.h',
+      'pbbam/BamRecordTag.h',
+      'pbbam/BamRecordView.h',
+      'pbbam/BamTagCodec.h',
+      'pbbam/BamWriter.h',
+      'pbbam/BarcodeQuery.h',
+      'pbbam/Cigar.h',
+      'pbbam/CigarOperation.h',
+      'pbbam/ClipType.h',
+      'pbbam/Compare.h',
+      'pbbam/CompositeBamReader.h',
+      'pbbam/CompositeFastaReader.h',
+      'pbbam/Config.h',
+      'pbbam/DataSet.h',
+      'pbbam/DataSetTypes.h',
+      'pbbam/DataSetXsd.h',
+      'pbbam/EntireFileQuery.h',
+      'pbbam/FastaReader.h',
+      'pbbam/FastaSequence.h',
+      'pbbam/FastaSequenceQuery.h',
+      'pbbam/FastqReader.h',
+      'pbbam/FastqSequence.h',
+      'pbbam/FrameEncodingType.h',
+      'pbbam/Frames.h',
+      'pbbam/GenomicInterval.h',
+      'pbbam/GenomicIntervalQuery.h',
+      'pbbam/IndexedBamWriter.h',
+      'pbbam/IndexedFastaReader.h',
+      'pbbam/Interval.h',
+      'pbbam/IRecordWriter.h',
+      'pbbam/LocalContextFlags.h',
+      'pbbam/MakeUnique.h',
+      'pbbam/MD5.h',
+      'pbbam/MoveAppend.h',
+      'pbbam/Orientation.h',
+      'pbbam/PbiBasicTypes.h',
+      'pbbam/PbiBuilder.h',
+      'pbbam/PbiFile.h',
+      'pbbam/PbiFilter.h',
+      'pbbam/PbiFilterQuery.h',
+      'pbbam/PbiFilterTypes.h',
+      'pbbam/PbiIndexedBamReader.h',
+      'pbbam/PbiRawData.h',
+      'pbbam/Position.h',
+      'pbbam/ProgramInfo.h',
+      'pbbam/PulseBehavior.h',
+      'pbbam/PulseExclusionReason.h',
+      'pbbam/QNameQuery.h',
+      'pbbam/QualityValue.h',
+      'pbbam/QualityValues.h',
+      'pbbam/ReadAccuracyQuery.h',
+      'pbbam/ReadGroupInfo.h',
+      'pbbam/RecordType.h',
+      'pbbam/SamTagCodec.h',
+      'pbbam/SamWriter.h',
+      'pbbam/SequenceInfo.h',
+      'pbbam/Strand.h',
+      'pbbam/StringUtilities.h',
+      'pbbam/SubreadLengthQuery.h',
+      'pbbam/Tag.h',
+      'pbbam/TagCollection.h',
+      'pbbam/Unused.h',
+      'pbbam/Validator.h',
+      'pbbam/ZmwGroupQuery.h',
+      'pbbam/ZmwQuery.h',
+      'pbbam/ZmwType.h',
+      'pbbam/ZmwTypeMap.h']),
+    subdir : 'pbbam')
+
+  install_headers(
+    files([
+      'pbbam/exception/BundleChemistryMappingException.h',
+      'pbbam/exception/InvalidSequencingChemistryException.h',
+      'pbbam/exception/ValidationException.h']),
+    subdir : 'pbbam/exception')
+
+  install_headers(
+    files([
+      'pbbam/internal/Accuracy.inl',
+      'pbbam/internal/BamHeader.inl',
+      'pbbam/internal/BamRecord.inl',
+      'pbbam/internal/BamRecordBuilder.inl',
+      'pbbam/internal/BamRecordImpl.inl',
+      'pbbam/internal/BamRecordView.inl',
+      'pbbam/internal/Cigar.inl',
+      'pbbam/internal/CigarOperation.inl',
+      'pbbam/internal/Compare.inl',
+      'pbbam/internal/CompositeBamReader.inl',
+      'pbbam/internal/CompositeFastaReader.inl',
+      'pbbam/internal/DataSet.inl',
+      'pbbam/internal/DataSetBaseTypes.h',
+      'pbbam/internal/DataSetBaseTypes.inl',
+      'pbbam/internal/DataSetElement.h',
+      'pbbam/internal/DataSetElement.inl',
+      'pbbam/internal/DataSetListElement.h',
+      'pbbam/internal/DataSetListElement.inl',
+      'pbbam/internal/DataSetTypes.inl',
+      'pbbam/internal/FastaSequence.inl',
+      'pbbam/internal/FastqSequence.inl',
+      'pbbam/internal/Frames.inl',
+      'pbbam/internal/GenomicInterval.inl',
+      'pbbam/internal/Interval.inl',
+      'pbbam/internal/PbiBasicTypes.inl',
+      'pbbam/internal/PbiFilter.inl',
+      'pbbam/internal/PbiFilterTypes.inl',
+      'pbbam/internal/PbiRawData.inl',
+      'pbbam/internal/ProgramInfo.inl',
+      'pbbam/internal/QualityValue.inl',
+      'pbbam/internal/QualityValues.inl',
+      'pbbam/internal/QueryBase.h',
+      'pbbam/internal/QueryBase.inl',
+      'pbbam/internal/ReadGroupInfo.inl',
+      'pbbam/internal/SequenceInfo.inl',
+      'pbbam/internal/Tag.inl',
+      'pbbam/internal/Validator.inl']),
+    subdir : 'pbbam/internal')
+
+  install_headers(
+    files([
+      'pbbam/vcf/VcfVariant.h',
+      'pbbam/vcf/VcfFile.h',
+      'pbbam/vcf/VcfFormat.h',
+      'pbbam/vcf/VcfHeader.h',
+      'pbbam/vcf/VcfHeaderTypes.h',
+      'pbbam/vcf/VcfReader.h',
+      'pbbam/vcf/VcfSort.h',
+      'pbbam/vcf/VcfQuery.h',
+      'pbbam/vcf/VcfWriter.h']),
+    subdir : 'pbbam/vcf')
+
+  install_headers(
+    files([
+      'pbbam/vcf/internal/VcfVariant.inl',
+      'pbbam/vcf/internal/VcfFile.inl',
+      'pbbam/vcf/internal/VcfHeader.inl',
+      'pbbam/vcf/internal/VcfHeaderTypes.inl']),
+    subdir : 'pbbam/vcf/internal')
+
+  install_headers(
+    files([
+      'pbbam/virtual/VirtualPolymeraseBamRecord.h',
+      'pbbam/virtual/VirtualPolymeraseCompositeReader.h',
+      'pbbam/virtual/VirtualPolymeraseReader.h',
+      'pbbam/virtual/VirtualRegion.h',
+      'pbbam/virtual/VirtualRegionType.h',
+      'pbbam/virtual/VirtualRegionTypeMap.h',
+      'pbbam/virtual/VirtualZmwBamRecord.h',
+      'pbbam/virtual/WhitelistedZmwReadStitcher.h',
+      'pbbam/virtual/ZmwReadStitcher.h',
+      'pbbam/virtual/ZmwWhitelistVirtualReader.h']),
+    subdir : 'pbbam/virtual')
+endif
+
+pbbam_include_directories = include_directories('.')
diff --git a/include/pbbam/Accuracy.h b/include/pbbam/Accuracy.h

new file mode 100644 (file)

index 0000000..39c0267
--- /dev/null
+++ b/include/pbbam/Accuracy.h
@@ -0,0 +1,58 @@
+// File Description
+/// \file Accuracy.h
+/// \brief Defines the Accuracy class.
+//
+// Author: Derek Barnett
+
+#ifndef ACCURACY_H
+#define ACCURACY_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The Accuracy class represents the expected accuracy of a BamRecord.
+///
+/// Values are clamped to fall within [0,1].
+///
+class PBBAM_EXPORT Accuracy
+{
+public:
+    static const float MIN;  ///< Minimum valid accuracy value [0.0]
+    static const float MAX;  ///< Maximum valid accuracy value [1.0]
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// Constructs an Accuracy object from a floating-point number.
+    ///
+    /// \note This is not an \b explicit ctor, to make it as easy as
+    ///       possible to use in numeric operations. We really just want
+    ///       to make sure that the acceptable range is respected.
+    ///
+    Accuracy(float accuracy);
+
+    Accuracy(const Accuracy&) = default;
+    Accuracy(Accuracy&&) = default;
+    Accuracy& operator=(const Accuracy&) = default;
+    Accuracy& operator=(Accuracy&&) = default;
+    ~Accuracy() = default;
+
+    /// \}
+
+public:
+    /// \returns Accuracy as float primitive
+    operator float() const;
+
+private:
+    float accuracy_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/Accuracy.inl"
+
+#endif  // ACCURACY_H
diff --git a/include/pbbam/AlignmentPrinter.h b/include/pbbam/AlignmentPrinter.h

new file mode 100644 (file)

index 0000000..f974597
--- /dev/null
+++ b/include/pbbam/AlignmentPrinter.h
@@ -0,0 +1,76 @@
+// File Description
+/// \file AlignmentPrinter.h
+/// \brief Defines the AlignmentPrinter class.
+//
+// Author: Armin Töpfer
+
+#ifndef ALIGNMENTPRINTER_H
+#define ALIGNMENTPRINTER_H
+
+#include <memory>
+#include <string>
+
+#include "pbbam/BamRecord.h"
+#include "pbbam/IndexedFastaReader.h"
+#include "pbbam/Orientation.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamRecord;
+
+/// \brief The AlignmentPrinter class "pretty-prints" an alignment with respect
+///        to its associated reference sequence.
+///
+/// Example output:
+/// \verbinclude plaintext/AlignmentPrinterOutput.txt
+///
+class AlignmentPrinter
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// Constructs the alignment printer with an associated FASTA file reader.
+    ///
+    /// \param[in] ifr FASTA reader
+    ///
+    /// \throws std::runtime_error if FASTA file cannot be opened for reading.
+    ///
+    AlignmentPrinter(const IndexedFastaReader& ifr);
+
+    AlignmentPrinter() = delete;
+    AlignmentPrinter(const AlignmentPrinter&) = delete;
+    AlignmentPrinter(AlignmentPrinter&&) = default;
+    AlignmentPrinter& operator=(const AlignmentPrinter&) = delete;
+    AlignmentPrinter& operator=(AlignmentPrinter&&) = default;
+    ~AlignmentPrinter() = default;
+
+    /// \}
+
+public:
+    /// \name Printing
+    /// \{
+
+    /// Pretty-prints an aligned BamRecord to std::string.
+    ///
+    /// \note The current implementation includes ANSI escape sequences for
+    ///       coloring terminal output. Future versions of this method will
+    ///       likely make this optional.
+    ///
+    /// \returns formatted string containing the alignment and summary
+    ///          information
+    ///
+    std::string Print(const BamRecord& record,
+                      const Orientation orientation = Orientation::GENOMIC);
+
+    /// \}
+
+private:
+    const std::unique_ptr<IndexedFastaReader> ifr_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ALIGNMENTPRINTER_H
diff --git a/include/pbbam/BaiIndexedBamReader.h b/include/pbbam/BaiIndexedBamReader.h

new file mode 100644 (file)

index 0000000..e6b9320
--- /dev/null
+++ b/include/pbbam/BaiIndexedBamReader.h
@@ -0,0 +1,84 @@
+// File Description
+/// \file BaiIndexedBamReader.h
+/// \brief Defines the BaiIndexedBamReader class.
+//
+// Author: Derek Barnett
+
+#ifndef BAIINDEXEDBAMREADER_H
+#define BAIINDEXEDBAMREADER_H
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamReader.h"
+#include "pbbam/GenomicInterval.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+struct BaiIndexedBamReaderPrivate;
+}
+
+/// \brief The BaiIndexedBamReader class provides read-only iteration over %BAM
+///        records, bounded by a particular genomic interval.
+///
+/// The SAM/BAM standard index (*.bai) is used to allow random-access operations.
+///
+class PBBAM_EXPORT BaiIndexedBamReader : public BamReader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Constructs %BAM reader, bounded by a genomic interval.
+    ///
+    /// All reads that overlap the interval will be available.
+    ///
+    /// \param[in] interval iteration will be bounded by this GenomicInterval.
+    /// \param[in] filename input %BAM filename
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.bai) fails to open
+    ///         for reading, or if the interval is invalid
+    ///
+    BaiIndexedBamReader(const GenomicInterval& interval, std::string filename);
+
+    /// \brief Constructs %BAM reader, bounded by a genomic interval.
+    ///
+    /// All reads that overlap the interval will be available.
+    ///
+    /// \param[in] interval iteration will be bounded by this GenomicInterval.
+    /// \param[in] bamFile input BamFile object
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.bai) fails to open
+    ///         for reading, or if the interval is invalid
+    ///
+    BaiIndexedBamReader(const GenomicInterval& interval, BamFile bamFile);
+
+    /// \}
+
+public:
+    /// \name Random-Access
+    /// \{
+
+    /// \returns the current GenomicInterval in use by this reader
+    const GenomicInterval& Interval() const;
+
+    /// \brief Sets a new genomic interval on the reader.
+    ///
+    /// \param[in] interval
+    /// \returns reference to this reader
+    ///
+    BaiIndexedBamReader& Interval(const GenomicInterval& interval);
+
+    /// \}
+
+protected:
+    int ReadRawData(BGZF* bgzf, bam1_t* b) override;
+
+private:
+    std::unique_ptr<internal::BaiIndexedBamReaderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAIINDEXEDBAMREADER_H
diff --git a/include/pbbam/BamFile.h b/include/pbbam/BamFile.h

new file mode 100644 (file)

index 0000000..2426d49
--- /dev/null
+++ b/include/pbbam/BamFile.h
@@ -0,0 +1,185 @@
+// File Description
+/// \file BamFile.h
+/// \brief Defines the BamFile class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMFILE_H
+#define BAMFILE_H
+
+#include <cstdint>
+#include <string>
+
+#include "pbbam/BamHeader.h"
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+class BamFilePrivate;
+}
+
+/// \brief The BamFile class represents a %BAM file.
+///
+/// It provides access to header metadata and methods for finding/creating
+/// associated index files.
+///
+class PBBAM_EXPORT BamFile
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a BamFile object on the provided \p filename &
+    ///        loads header information.
+    ///
+    /// \param[in] filename %BAM filename
+    /// \throws std::exception on failure to open %BAM file for reading
+    ///
+    BamFile(std::string filename);
+
+    BamFile(const BamFile& other);
+    BamFile(BamFile&& other);
+    BamFile& operator=(const BamFile& other);
+    BamFile& operator=(BamFile&& other);
+    ~BamFile();
+
+    /// \}
+
+public:
+    /// \name Index & Filename Methods
+    /// \{
+
+    /// \brief Creates a ".pbi" file for this %BAM file.
+    ///
+    /// \note Existing index file will be overwritten. Use
+    ///       EnsurePacBioIndexExists() if this is not desired.
+    ///
+    /// \throws if PBI file could not be properly created and/or
+    ///         written to disk
+    ///
+    void CreatePacBioIndex() const;
+
+    /// \brief Creates a ".bai" file for this %BAM file.
+    ///
+    /// \note Existing index file will be overwritten. Use
+    ///       EnsureStandardIndexExists() if this is not desired.
+    ///
+    /// \throws if BAI file could not be properly created (e.g. this
+    ///         %BAM is not coordinate-sorted) or could not be written to disk
+    ///
+    void CreateStandardIndex() const;
+
+    /// \brief Creates a ".pbi" file if one does not exist or is older than its
+    ///        %BAM file.
+    ///
+    /// Equivalent to:
+    /// \code{.cpp}
+    ///    if (!file.PacBioIndexExists())
+    ///        file.CreatePacBioIndex();
+    /// \endcode
+    ///
+    /// \note As of v0.4.02+, no timestamp check is performed. Previously, existence
+    ///       was checked together with an additional timestamp check.
+    ///
+    /// \throws if PBI file could not be properly created and/or
+    ///         written to disk
+    ///
+    void EnsurePacBioIndexExists() const;
+
+    /// \brief Creates a ".bai" file if one does not exist or is older than its
+    ///        %BAM file.
+    ///
+    /// Equivalent to:
+    /// \code{.cpp}
+    ///    if (!file.StandardIndexExists())
+    ///        file.CreateStandardIndex();
+    /// \endcode
+    ///
+    /// \note As of v0.4.2, no timestamp check is performed.
+    ///
+    /// \throws if BAI file could not be properly created (e.g. this
+    ///         %BAM is not coordinate-sorted) or could not be written to disk
+    ///
+    void EnsureStandardIndexExists() const;
+
+    /// \returns %BAM filename
+    const std::string& Filename() const;
+
+    /// \returns true if %BAM file has EOF marker (empty BGZF block). Streamed
+    ///          input (filename: "-")
+    bool HasEOF() const;
+
+    /// \returns true if ".pbi" exists and is newer than this %BAM file.
+    bool PacBioIndexExists() const;
+
+    /// \returns filename of %PacBio index file (".pbi")
+    /// \note No guarantee is made on the existence of this file.
+    ///       This method simply returns the expected filename.
+    std::string PacBioIndexFilename() const;
+
+    /// \returns true if ".pbi" has a more recent timestamp than this file
+    bool PacBioIndexIsNewer() const;
+
+    /// \returns true if ".bai" exists
+    bool StandardIndexExists() const;
+
+    /// \returns filename of standard index file (".bai")
+    /// \note The file is not guaranteed to exist; this is simply the expected filename.
+    std::string StandardIndexFilename() const;
+
+    /// \returns true if ".bai" has a more recent timestamp than this file
+    bool StandardIndexIsNewer() const;
+
+    /// \}
+
+public:
+    /// \name File Header Data
+    /// \{
+
+    /// \returns true if header metadata has this reference name
+    bool HasReference(const std::string& name) const;
+
+    /// \returns const reference to BamHeader containing the file's metadata
+    const BamHeader& Header() const;
+
+    /// \returns true if file is a %PacBio %BAM file (i.e. has non-empty version
+    ///          associated with header "pb" tag)
+    bool IsPacBioBAM() const;
+
+    /// \returns ID for reference \p name (can be used for e.g.
+    ///          GenomicIntervalQuery), or -1 if not found
+    int ReferenceId(const std::string& name) const;
+
+    /// \return name of reference matching \p id, empty string if not found
+    std::string ReferenceName(const int id) const;
+
+    /// \returns length of requested reference \p name. 0 if not found
+    uint32_t ReferenceLength(const std::string& name) const;
+
+    /// \returns length of requested reference \p id. 0 if not found
+    uint32_t ReferenceLength(const int id) const;
+
+    /// \}
+
+public:
+    /// \name Additional Attributes
+    /// \{
+
+    /// \returns virtual offset of first alignment. Intended mostly for internal
+    ///          use. Note that this is a BGZF \b virtual offset, not a
+    ///          'normal' file position.
+    ///
+    int64_t FirstAlignmentOffset() const;
+
+    /// \}
+
+private:
+    std::unique_ptr<internal::BamFilePrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMFILE_H
diff --git a/include/pbbam/BamHeader.h b/include/pbbam/BamHeader.h

new file mode 100644 (file)

index 0000000..b987a25
--- /dev/null
+++ b/include/pbbam/BamHeader.h
@@ -0,0 +1,393 @@
+// File Description
+/// \file BamHeader.h
+/// \brief Defines the BamHeader class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMHEADER_H
+#define BAMHEADER_H
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "pbbam/Config.h"
+#include "pbbam/ProgramInfo.h"
+#include "pbbam/ReadGroupInfo.h"
+#include "pbbam/SequenceInfo.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+class BamHeaderPrivate;
+}
+
+/// \brief The BamHeader class represents the header section of the %BAM file.
+///
+/// It provides metadata about the file including file version, reference
+/// sequences, read groups, comments, etc.
+///
+/// A BamHeader may be fetched from a BamFile to view an existing file's
+/// metadata. Or one may be created/edited for use with writing to a new file
+/// (via BamWriter).
+///
+/// \note A particular BamHeader is likely to be re-used in lots of places
+///       throughout the library, for read-only purposes. For this reason, even
+///       though a BamHeader may be returned by value, it is essentially a thin
+///       wrapper for a shared-pointer to the actual data. This means, though,
+///       that if you need to edit an existing BamHeader for use with a
+///       BamWriter, please consider using BamHeader::DeepCopy. Otherwise any
+///       modifications will affect all BamHeaders that are sharing its
+///       underlying data.
+///
+class PBBAM_EXPORT BamHeader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    ///
+    /// \brief Creates a BamHeader from SAM-formatted text
+    /// \param samHeaderText
+    ///
+    BamHeader(const std::string& samHeaderText);
+
+    BamHeader();
+    BamHeader(const BamHeader&) = default;
+    BamHeader(BamHeader&&) = default;
+    BamHeader& operator=(const BamHeader&) = default;
+    BamHeader& operator=(BamHeader&&) = default;
+    ~BamHeader() = default;
+
+    /// \brief Detaches underlying data from the shared-pointer, returning an
+    ///        independent copy of the header contents.
+    ///
+    /// This ensures that any modifications to the newly returned BamHeader do
+    /// not affect other BamHeader objects that were sharing its underlying data.
+    ///
+    BamHeader DeepCopy() const;
+
+    /// \}
+
+public:
+    /// \name Operators
+    /// \{
+
+    /// \brief Merges another header with this one.
+    ///
+    /// Headers must be compatible for merging. This means that their Version,
+    /// SortOrder, PacBioBamVersion (and in the case of aligned BAM data,
+    /// Sequences) must all match. If not, an exception will be thrown.
+    ///
+    /// \param[in] other  header to merge with this one
+    /// \returns reference to this header
+    ///
+    /// \throws std::runtime_error if the headers are not compatible
+    ///
+    BamHeader& operator+=(const BamHeader& other);
+
+    /// \brief Creates a new, merged header.
+    ///
+    /// Headers must be compatible for merging. This means that their Version,
+    /// SortOrder, PacBioBamVersion (and in the case of aligned BAM data,
+    /// Sequences) must all match. If not, an exception will be thrown.
+    ///
+    /// Both original headers (this header and \p other) will not be modified.
+    ///
+    /// \param[in] other  header to merge with this one
+    /// \returns merged header
+    ///
+    /// \throws std::runtime_error if the headers are not compatible
+    ///
+    BamHeader operator+(const BamHeader& other) const;
+
+    /// \}
+
+public:
+    /// \name General Attributes
+    /// \{
+
+    /// \returns the %PacBio %BAM version number (\@HD:pb)
+    ///
+    /// \note This is different from the SAM/BAM version number
+    /// \sa BamHeader::Version.
+    ///
+    std::string PacBioBamVersion() const;
+
+    /// \returns the sort order used
+    ///
+    /// Valid values: "unknown", "unsorted", "queryname", or "coordinate"
+    ///
+    std::string SortOrder() const;
+
+    /// \returns the SAM/BAM version number (\@HD:VN)
+    ///
+    /// \note This is different from the %PacBio %BAM version number
+    /// \sa BamHeader::PacBioBamVersion
+    ///
+    std::string Version() const;
+
+    /// \}
+
+public:
+    /// \name Read Groups
+    /// \{
+
+    /// \returns true if the header contains a read group with \p id (\@RG:ID)
+    bool HasReadGroup(const std::string& id) const;
+
+    /// \returns a ReadGroupInfo object representing the read group matching
+    ///          \p id (\@RG:ID)
+    /// \throws std::runtime_error if \p id is unknown
+    ///
+    ReadGroupInfo ReadGroup(const std::string& id) const;
+
+    /// \returns vector of read group IDs listed in this header
+    std::vector<std::string> ReadGroupIds() const;
+
+    /// \returns vector of ReadGroupInfo objects, representing all read groups
+    ///          listed in this header
+    ///
+    std::vector<ReadGroupInfo> ReadGroups() const;
+
+    /// \}
+
+public:
+    /// \name Sequences
+    /// \{
+
+    /// \returns true if header contains a sequence with \p name (\@SQ:SN)
+    bool HasSequence(const std::string& name) const;
+
+    /// \returns number of sequences (\@SQ entries) stored in this header
+    size_t NumSequences() const;
+
+    /// \returns numeric ID for sequence matching \p name (\@SQ:SN)
+    ///
+    /// This is the numeric ID used elsewhere throughout the API.
+    ///
+    /// \throws std::runtime_error if \p name is unknown
+    /// \sa BamReader::ReferenceId, PbiReferenceIdFilter,
+    ///     PbiRawMappedData::tId_
+    ///
+    int32_t SequenceId(const std::string& name) const;
+
+    /// \returns the length of the sequence (\@SQ:LN, e.g. chromosome length) at
+    ///          index \p id
+    ///
+    /// \sa SequenceInfo::Length, BamHeader::SequenceId
+    ///
+    std::string SequenceLength(const int32_t id) const;
+
+    /// \returns the name of the sequence (\@SQ:SN) at index \p id
+    ///
+    /// \sa SequenceInfo::Name, BamHeader::SequenceId
+    ///
+    std::string SequenceName(const int32_t id) const;
+
+    /// \returns vector of sequence names (\@SQ:SN) stored in this header
+    ///
+    /// Position in the vector is equivalent to SequenceId.
+    ///
+    std::vector<std::string> SequenceNames() const;
+
+    /// \returns SequenceInfo object at index \p id
+    ///
+    /// \throws std::out_of_range if \p id is an invalid or unknown index
+    /// \sa BamHeader::SequenceId
+    ///
+    SequenceInfo Sequence(const int32_t id) const;
+
+    /// \returns SequenceInfo for the sequence matching \p name
+    SequenceInfo Sequence(const std::string& name) const;
+
+    /// \returns vector of SequenceInfo objects representing the sequences
+    ///          (\@SQ entries) stored in this header
+    ///
+    std::vector<SequenceInfo> Sequences() const;
+
+    /// \}
+
+public:
+    /// \name Programs
+    /// \{
+
+    /// \returns true if this header contains a program entry with ID (\@PG:ID)
+    ///          matching \p id
+    ///
+    bool HasProgram(const std::string& id) const;
+
+    /// \returns ProgramInfo object for the program entry matching \p id
+    /// \throws std::runtime_error if \p id is unknown
+    ///
+    ProgramInfo Program(const std::string& id) const;
+
+    /// \returns vector of program IDs (\@PG:ID)
+    std::vector<std::string> ProgramIds() const;
+
+    /// \returns vector of ProgramInfo objects representing program entries
+    ///          (\@PG) stored in this header
+    ///
+    std::vector<ProgramInfo> Programs() const;
+
+    /// \}
+
+public:
+    /// \name Comments
+    /// \{
+
+    /// \returns vector of comment (\@CO) strings
+    std::vector<std::string> Comments() const;
+
+    /// \}
+
+public:
+    /// \name Conversion Methods
+    /// \{
+
+    /// \returns SAM-header-formatted string representing this header's data
+    std::string ToSam() const;
+
+    /// \}
+
+public:
+    /// \name General Attributes
+    /// \{
+
+    /// \brief Sets this header's PacBioBAM version number (\@HD:pb).
+    ///
+    /// \returns reference to this object
+    /// \throws std::runtime_error if version number cannot be parsed or
+    ///         is less than the minimum version allowed.
+    ///
+    BamHeader& PacBioBamVersion(const std::string& version);
+
+    /// \brief Sets this header's sort order label (\@HD:SO).
+    ///
+    /// Valid values: "unknown", "unsorted", "queryname", or "coordinate"
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& SortOrder(std::string order);
+
+    /// \brief Sets this header's SAM/BAM version number (\@HD:VN).
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& Version(std::string version);
+
+    /// \}
+
+public:
+    /// \name Read Groups
+    /// \{
+
+    /// \brief Appends a read group entry (\@RG) to this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& AddReadGroup(ReadGroupInfo readGroup);
+
+    /// \brief Removes all read group entries from this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& ClearReadGroups();
+
+    /// \brief Replaces this header's list of read group entries with those in
+    ///        \p readGroups.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& ReadGroups(std::vector<ReadGroupInfo> readGroups);
+
+    /// \}
+
+public:
+    /// \name Sequences
+    /// \{
+
+    /// \brief Appends a sequence entry (\@SQ) to this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& AddSequence(SequenceInfo sequence);
+
+    /// \brief Removes all sequence entries from this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& ClearSequences();
+
+    /// \brief Replaces this header's list of sequence entries with those in
+    ///       \p sequences.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& Sequences(std::vector<SequenceInfo> sequences);
+
+    /// \}
+
+public:
+    /// \name Programs
+    /// \{
+
+    /// \brief Appends a program entry (\@PG) to this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& AddProgram(ProgramInfo pg);
+
+    /// \brief Removes all program entries from this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& ClearPrograms();
+
+    /// \brief Replaces this header's list of program entries with those in
+    ///        \p programs.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& Programs(std::vector<ProgramInfo> programs);
+
+    /// \}
+
+public:
+    /// \name Comments
+    /// \{
+
+    /// \brief Appends a comment (\@CO) to this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& AddComment(std::string comment);
+
+    /// \brief Removes all comments from this header.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& ClearComments();
+
+    /// \brief Replaces this header's list of comments with those in \p comments.
+    ///
+    /// \returns reference to this object
+    ///
+    BamHeader& Comments(std::vector<std::string> comments);
+
+    /// \}
+
+private:
+    std::shared_ptr<internal::BamHeaderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/BamHeader.inl"
+
+#endif  // BAMHEADER_H
diff --git a/include/pbbam/BamReader.h b/include/pbbam/BamReader.h

new file mode 100644 (file)

index 0000000..fe1494a
--- /dev/null
+++ b/include/pbbam/BamReader.h
@@ -0,0 +1,153 @@
+// File Description
+/// \file BamReader.h
+/// \brief Defines the BamReader class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMREADER_H
+#define BAMREADER_H
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include <htslib/sam.h>
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/Config.h"
+#include "pbbam/GenomicInterval.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+struct BamReaderPrivate;
+}
+
+/// \brief The BamReader class provides basic read-access to a %BAM file.
+///
+/// The base-class implementation provides a sequential read-through of BAM
+/// records. Derived classes may implement other access schemes (e.g. genomic
+/// region, PBI-enabled record filtering).
+///
+class PBBAM_EXPORT BamReader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Opens BAM file for reading.
+    ///
+    /// \param[in] fn %BAM filename
+    /// \throws std::runtime_error if failed to open
+    ///
+    explicit BamReader(std::string fn);
+
+    /// \brief Opens BAM file for reading.
+    ///
+    /// \param[in] bamFile BamFile object
+    /// \throws std::runtime_error if failed to open
+    ///
+    explicit BamReader(BamFile bamFile);
+
+    virtual ~BamReader();
+
+    /// \}
+
+public:
+    /// \name BAM File Attributes
+    /// \{
+
+    /// \returns the underlying BamFile
+    const BamFile& File() const;
+
+    /// \returns %BAM filename
+    const std::string& Filename() const;
+
+    /// \returns BamHeader object from %BAM header contents
+    const BamHeader& Header() const;
+
+    /// \}
+
+public:
+    /// \name BAM File I/O
+    /// \{
+
+    /// \brief Fetches the "next" %BAM record.
+    ///
+    /// Default implementation will read records until EOF. Derived readers may
+    /// use additional criteria to decide which record is "next" and when
+    /// reading is done.
+    ///
+    /// \param[out] record  next BamRecord object. Should not be used if method
+    ///                     returns false.
+    ///
+    /// \returns true if record was read successfully. Returns false if EOF (or
+    ///          end of iterator in derived readers). False is not an error,
+    ///          it indicates "end of data".
+    ///
+    /// \throws std::runtime_error if failed to read from file (e.g. possible
+    ///         truncated or corrupted file).
+    ///
+    bool GetNext(BamRecord& record);
+
+    /// \brief Seeks to virtual offset in %BAM.
+    ///
+    /// \note This is \b NOT a normal file offset, but the virtual offset used
+    ///       in %BAM indexing.
+    ///
+    /// \throws std::runtime_error if failed to seek
+    ///
+    void VirtualSeek(int64_t virtualOffset);
+
+    /// \returns current (virtual) file position.
+    ///
+    /// \note This is \b NOT a normal file offset, but the virtual offset used
+    ///       in %BAM indexing.
+    ///
+    int64_t VirtualTell() const;
+
+    /// \}
+
+protected:
+    /// \name BAM File I/O
+    /// \{
+
+    /// \brief Helper method for access to underlying BGZF stream pointer.
+    ///
+    /// Useful for derived readers' contact points with htslib methods.
+    ///
+    /// \returns BGZF stream pointer
+    ///
+    BGZF* Bgzf() const;
+
+    /// \brief Performs the actual raw read of the next record from the BAM
+    ///        file.
+    ///
+    /// Default implementation will read records, sequentially, until EOF.
+    /// Derived readers may use additional criteria to decide which record is
+    ///  "next" and when reading is done.
+    ///
+    /// Return value should be equivalent to htslib's bam_read1():
+    ///     >= 0 : normal
+    ///       -1 : EOF (not an error)
+    ///     < -1 : error
+    ///
+    /// \param[in]  bgzf BGZF stream pointer
+    /// \param[out] b    %BAM record pointer
+    /// \returns integer status code, see description
+    ///
+    virtual int ReadRawData(BGZF* bgzf, bam1_t* b);
+
+    /// \}
+
+private:
+    std::unique_ptr<internal::BamReaderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMREADER_H
diff --git a/include/pbbam/BamRecord.h b/include/pbbam/BamRecord.h

new file mode 100644 (file)

index 0000000..9ca0d69
--- /dev/null
+++ b/include/pbbam/BamRecord.h
@@ -0,0 +1,1238 @@
+// File Description
+/// \file BamRecord.h
+/// \brief Defines the BamRecord class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORD_H
+#define BAMRECORD_H
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "pbbam/Accuracy.h"
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecordImpl.h"
+#include "pbbam/ClipType.h"
+#include "pbbam/FrameEncodingType.h"
+#include "pbbam/Frames.h"
+#include "pbbam/LocalContextFlags.h"
+#include "pbbam/Orientation.h"
+#include "pbbam/PulseBehavior.h"
+#include "pbbam/PulseExclusionReason.h"
+#include "pbbam/QualityValues.h"
+#include "pbbam/ReadGroupInfo.h"
+#include "pbbam/RecordType.h"
+#include "pbbam/Strand.h"
+#include "pbbam/ZmwType.h"
+#include "pbbam/virtual/VirtualRegionType.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+
+class BamRecordMemory;
+class Pulse2BaseCache;
+
+}  // namespace internal
+
+/// \brief The BamRecord class represents a %PacBio %BAM record.
+///
+/// %PacBio %BAM records are extensions of normal SAM/BAM records. Thus in
+/// addition to normal fields like bases, qualities, mapping coordinates, etc.,
+/// tags are used extensively to annotate records with additional
+/// PacBio-specific data.
+///
+/// Mapping and clipping APIs are provided as well to ensure that such
+/// operations "trickle down" to all data fields properly.
+///
+/// \sa https://samtools.github.io/hts-specs/SAMv1.pdf
+///     for more information on standard %BAM data, and
+///     https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/BAM.rst
+///     for more information on %PacBio %BAM fields.
+///
+class PBBAM_EXPORT BamRecord
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    BamRecord();
+    BamRecord(BamHeader header);
+    BamRecord(BamRecordImpl impl);
+    BamRecord(const BamRecord& other);
+    BamRecord(BamRecord&& other);
+    BamRecord& operator=(const BamRecord& other);
+    BamRecord& operator=(BamRecord&& other);
+    virtual ~BamRecord();
+
+    /// \}
+
+public:
+    /// \name General Data
+    /// \{
+
+    /// \returns this record's full name
+    /// \sa BamRecordImpl::Name
+    ///
+    std::string FullName() const;
+
+    /// \returns shared pointer to this record's associated BamHeader
+    BamHeader Header() const;
+
+    /// \returns ZMW hole number
+    /// \throws if missing zm tag & record name does not contain hole number
+    ///
+    int32_t HoleNumber() const;
+
+    /// \returns this record's LocalContextFlags
+    PacBio::BAM::LocalContextFlags LocalContextFlags() const;
+
+    /// \returns this record's movie name
+    std::string MovieName() const;
+
+    /// \returns "number of complete passes of the insert"
+    int32_t NumPasses() const;
+
+    /// \returns the record's query end position, or Sequence().length() if not
+    ///          stored
+    /// \note QueryEnd is in polymerase read coordinates, NOT genomic
+    ///       coordinates.
+    ///
+    Position QueryEnd() const;
+
+    /// \returns the record's query start position, or 0 if not stored
+    ///
+    /// \note QueryStart is in polymerase read coordinates, NOT genomic
+    ///       coordinates.
+    ///
+    Position QueryStart() const;
+
+    /// \returns this record's expected read accuracy [0, 1000]
+    Accuracy ReadAccuracy() const;
+
+    /// \returns ReadGroupInfo object for this record
+    ReadGroupInfo ReadGroup() const;
+
+    /// \returns string ID of this record's read group
+    /// \sa ReadGroupInfo::Id
+    ///
+    std::string ReadGroupId() const;
+
+    /// \returns integer value for this record's read group ID
+    int32_t ReadGroupNumericId() const;
+
+    /// \returns this scrap record's scrap region type
+    VirtualRegionType ScrapRegionType() const;
+
+    /// \returns this scrap record's scrap ZMW type
+    ZmwType ScrapZmwType() const;
+
+    /// \returns this record's average signal-to-noise for each of A, C, G,
+    ///          and T
+    ///
+    std::vector<float> SignalToNoise() const;
+
+    /// \returns this record's type
+    /// \sa RecordType
+    RecordType Type() const;
+
+    /// \}
+
+public:
+    /// \name Mapping Data
+    /// \{
+
+    /// \returns the record's aligned end position
+    ///
+    /// \note AlignedEnd is in polymerase read coordinates, NOT genomic
+    ///       coordinates.
+    ///
+    Position AlignedEnd() const;
+
+    /// \returns the record's aligned start position
+    ///
+    /// \note AlignedStart is in polymerase read coordinates, NOT genomic
+    ///       coordinates.
+    ///
+    Position AlignedStart() const;
+
+    /// \returns the record's strand as a Strand enum value
+    Strand AlignedStrand() const;
+
+    /// \returns the record's CIGAR data as a Cigar object
+    ///
+    /// \param[in] exciseAllClips   if true, remove all clipping operations
+    ///                             (hard & soft) [default:false]
+    ///
+    Cigar CigarData(bool exciseAllClips = false) const;
+
+    /// \returns true if this record was mapped by aligner
+    bool IsMapped() const;
+
+    /// \returns this record's mapping quality. A value of 255 indicates
+    ///          "unknown"
+    ///
+    uint8_t MapQuality() const;
+
+    /// \returns the number of deleted bases (relative to reference)
+    size_t NumDeletedBases() const;
+
+    /// \returns the number of inserted bases (relative to reference)
+    size_t NumInsertedBases() const;
+
+    /// \returns the number of matching bases (sum of '=' CIGAR op lengths)
+    size_t NumMatches() const;
+
+    /// \returns a pair containing NumMatches (first) and NumMismatches
+    ///         (second)
+    ///
+    std::pair<size_t, size_t> NumMatchesAndMismatches() const;
+
+    /// \returns the number of mismatching bases (sum of 'X' CIGAR op lengths)
+    size_t NumMismatches() const;
+
+    /// \returns this record's reference ID, or -1 if unmapped.
+    ///
+    /// \note This is only a valid identifier within this %BAM file
+    ///
+    int32_t ReferenceId() const;
+
+    /// \returns this record's reference name.
+    ///
+    /// \throws an exception if unmapped record.
+    ///
+    std::string ReferenceName() const;
+
+    /// \returns the record's reference end position, or UnmappedPosition if
+    ///          unmapped
+    ///
+    /// \note ReferenceEnd is in reference coordinates, NOT polymerase read
+    ///       coordinates.
+    ///
+    Position ReferenceEnd() const;
+
+    /// \returns the record's reference start position, or UnmappedPosition if
+    ///          unmapped
+    ///
+    /// \note ReferenceStart is in reference coordinates, NOT polymerase read
+    ///       coordinates.
+    ///
+    Position ReferenceStart() const;
+
+    /// \}
+
+public:
+    /// \name Barcode Data
+    /// \{
+
+    /// \returns forward barcode id
+    ///
+    /// \throws std::runtime_error if barcode data is absent or malformed.
+    /// \sa HasBarcodes
+    ///
+    int16_t BarcodeForward() const;
+
+    /// \returns barcode call confidence (Phred-scaled posterior probability
+    ///          of correct barcode call)
+    ///
+    /// \sa HasBarcodeQuality
+    ///
+    uint8_t BarcodeQuality() const;
+
+    /// \returns reverse barcode id
+    ///
+    /// \throws std::runtime_error if barcode data is absent or malformed.
+    /// \sa HasBarcodes
+    ///
+    int16_t BarcodeReverse() const;
+
+    /// \returns the forward and reverse barcode ids
+    ///
+    /// \throws std::runtime_error if barcode data is absent or malformed.
+    /// \sa HasBarcodes
+    ///
+    std::pair<int16_t, int16_t> Barcodes() const;
+
+    /// \}
+
+public:
+    /// \name Auxiliary Data Queries
+    /// \{
+
+    /// \returns true if this record has AltLabelQV data
+    bool HasAltLabelQV() const;
+
+    /// \returns true if this record has AltLabelTag data
+    bool HasAltLabelTag() const;
+
+    /// \returns true if this record has Barcode data
+    bool HasBarcodes() const;
+
+    /// \returns true if this record has BarcodeQuality data
+    bool HasBarcodeQuality() const;
+
+    /// \returns true if this record has DeletionQV data
+    bool HasDeletionQV() const;
+
+    /// \returns true if this record has DeletionTag data
+    bool HasDeletionTag() const;
+
+    /// \returns true if this record has a HoleNumber
+    bool HasHoleNumber() const;
+
+    /// \returns true if this record has InsertionQV data
+    bool HasInsertionQV() const;
+
+    /// \returns true if this record has IPD data
+    bool HasIPD() const;
+
+    /// \returns true if this record has LabelQV data
+    bool HasLabelQV() const;
+
+    /// \returns true if this record has LocalContextFlags (absent in CCS)
+    bool HasLocalContextFlags() const;
+
+    /// \returns true if this record has MergeQV data
+    bool HasMergeQV() const;
+
+    /// \returns true if this record has NumPasses data
+    bool HasNumPasses() const;
+
+    /// \returns true if this record has Pkmean data
+    bool HasPkmean() const;
+
+    /// \returns true if this record has Pkmid data
+    bool HasPkmid() const;
+
+    /// \returns true if this record has Pkmean2 data
+    bool HasPkmean2() const;
+
+    /// \returns true if this record has Pkmid2 data
+    bool HasPkmid2() const;
+
+    /// \returns true if this record has PreBaseFrames aka IPD data
+    bool HasPreBaseFrames() const;
+
+    /// \returns true if this record has PrePulseFrames data
+    bool HasPrePulseFrames() const;
+
+    /// \returns true if this record has PulseCall data
+    bool HasPulseCall() const;
+
+    /// \returns true if this record has PulseCallWidth data
+    bool HasPulseCallWidth() const;
+
+    /// \returns true if this record has PulseExclusion data
+    bool HasPulseExclusion(void) const;
+
+    /// \returns true if this record has PulseMergeQV data
+    bool HasPulseMergeQV() const;
+
+    /// \returns true if this record has PulseWidth data
+    bool HasPulseWidth() const;
+
+    /// \returns true if this record has ReadAccuracyTag data
+    bool HasReadAccuracy() const;
+
+    /// \returns true if this record has QueryEnd data
+    bool HasQueryEnd() const;
+
+    /// \returns true if this record has QueryStart data
+    bool HasQueryStart() const;
+
+    /// \returns true if this record has ScrapRegionType data (only in SCRAP)
+    bool HasScrapRegionType() const;
+
+    /// \returns true if this record has scrap ZMW type data (only in SCRAP)
+    bool HasScrapZmwType() const;
+
+    /// \returns true if this record has signal-to-noise data (absent in
+    ///          POLYMERASE)
+    ///
+    bool HasSignalToNoise() const;
+
+    /// \returns true if this record has StartFrame data
+    bool HasStartFrame() const;
+
+    /// \returns true if this record has SubstitutionQV data
+    bool HasSubstitutionQV() const;
+
+    /// \returns true if this record has SubstitutionTag data
+    bool HasSubstitutionTag() const;
+
+    /// \}
+
+public:
+    /// \name Sequence & Tag Data
+    /// \{
+
+    /// \brief Fetches this record's AltLabelTag values ("pt" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new gap chars will be '-' and padding chars will be '*'.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    ///
+    /// \returns AltLabelTags string
+    ///
+    std::string AltLabelTag(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                            bool exciseSoftClips = false,
+                            PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's DeletionTag values ("dt" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new gap chars will be '-' and padding chars will be '*'.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns DeletionTag string
+    ///
+    std::string DeletionTag(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                            bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's DNA sequence (SEQ field).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new gap chars will be '-' and padding chars will be '*'.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns sequence string
+    ///
+    std::string Sequence(const Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                         bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's SubstitutionTag values ("st" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new gap chars will be '-' and padding chars will be '*'.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns SubstitutionTags string
+    ///
+    std::string SubstitutionTag(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                                bool exciseSoftClips = false) const;
+
+    /// \}
+
+public:
+    /// \name Quality Data
+    /// \{
+
+    /// \brief Fetches this record's AltLabelQV values ("pv" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation     Orientation of output.
+    ///
+    /// \returns AltLabelQV as QualityValues object
+    ///
+    QualityValues AltLabelQV(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                             bool exciseSoftClips = false,
+                             PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's DeletionQV values ("dq" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns DeletionQV as QualityValues object
+    ///
+    QualityValues DeletionQV(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                             bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's InsertionQV values ("iq" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns InsertionQVs as QualityValues object
+    ///
+    QualityValues InsertionQV(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                              bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's LabelQV values ("pq" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation     Orientation of output.
+    ///
+    /// \returns LabelQV as QualityValues object
+    ///
+    QualityValues LabelQV(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                          bool exciseSoftClips = false,
+                          PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's MergeQV values ("mq" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns MergeQV as QualityValues object
+    ///
+    QualityValues MergeQV(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                          bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's %BAM quality values (QUAL field).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns %BAM qualities as QualityValues object
+    ///
+    QualityValues Qualities(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                            bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's SubstitutionQV values ("sq" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new QVs will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns SubstitutionQV as QualityValues object
+    ///
+    QualityValues SubstitutionQV(Orientation orientation = Orientation::NATIVE,
+                                 bool aligned = false, bool exciseSoftClips = false) const;
+
+    /// \}
+
+public:
+    /// \name Pulse Data
+    /// \{
+
+    /// \brief Fetches this record's IPD values ("ip" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new frames will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns IPD as Frames object
+    ///
+    Frames IPD(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+               bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's IPD values ("ip" tag), but does not upscale.
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns IPD as Frames object
+    ///
+    Frames IPDRaw(Orientation orientation = Orientation::NATIVE) const;
+
+    /// \brief Fetches this record's Pkmean values ("pa" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns Pkmean as vector<float> object
+    ///
+    std::vector<float> Pkmean(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                              bool exciseSoftClips = false,
+                              PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's Pkmid values ("pm" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns Pkmid as vector<float> object
+    ///
+    std::vector<float> Pkmid(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                             bool exciseSoftClips = false,
+                             PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's Pkmean2 values ("pi" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns Pkmean2 as vector<float> object
+    ///
+    std::vector<float> Pkmean2(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                               bool exciseSoftClips = false,
+                               PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's Pkmid2 values ("ps" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns Pkmid2 as vector<float> object
+    ///
+    std::vector<float> Pkmid2(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                              bool exciseSoftClips = false,
+                              PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's PreBaseFrames aka IPD values ("ip" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new frames will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns IPD as Frames object
+    ///
+    Frames PreBaseFrames(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                         bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's PrePulseFrames values ("pd" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns PrePulseFrames as Frames object
+    ///
+    Frames PrePulseFrames(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                          bool exciseSoftClips = false,
+                          PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's PulseCall values ("pc" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns PulseCalls string
+    ///
+    std::string PulseCall(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                          bool exciseSoftClips = false,
+                          PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's PulseCallWidth values ("px" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns PulseCallWidth as Frames object
+    ///
+    Frames PulseCallWidth(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                          bool exciseSoftClips = false,
+                          PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's PulseExclusionReason values ("pe" tag).
+    ///
+    /// \returns vector of pulse exclusion reason values
+    ///
+    std::vector<PacBio::BAM::PulseExclusionReason> PulseExclusionReason(
+        Orientation orientation = Orientation::NATIVE, bool aligned = false,
+        bool exciseSoftClips = false, PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's PulseMergeQV values ("pg" tag).
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns PulseMergeQV as QualityValues object
+    ///
+    QualityValues PulseMergeQV(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                               bool exciseSoftClips = false,
+                               PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \brief Fetches this record's PulseWidth values ("pw" tag).
+    ///
+    /// \note If \p aligned is true, and gaps/padding need to be inserted, the
+    ///       new frames will have a value of 0.
+    ///
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns PulseWidths as Frames object
+    ///
+    Frames PulseWidth(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                      bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's PulseWidth values ("pw" tag), but does not
+    ///        upscale.
+    ///
+    /// \param[in] orientation     Orientation of output.
+    /// \returns PulseWidth as Frames object
+    ///
+    Frames PulseWidthRaw(Orientation orientation = Orientation::NATIVE, bool aligned = false,
+                         bool exciseSoftClips = false) const;
+
+    /// \brief Fetches this record's StartFrame values ("sf" tag).
+    ///
+    /// \param[in] orientation     Orientation of output
+    ///
+    /// \returns StartFrame as uint32_t vector
+    ///
+    std::vector<uint32_t> StartFrame(Orientation orientation = Orientation::NATIVE,
+                                     bool aligned = false, bool exciseSoftClips = false,
+                                     PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    /// \}
+
+public:
+    /// \name Low-Level Access & Operations
+    /// \{
+
+    /// \warning This method should be considered temporary and avoided as much
+    ///          as possible. Direct access to the internal object is likely to
+    ///          disappear as BamRecord interface matures.
+    ///
+    /// \returns const reference to underlying BamRecordImpl object
+    ///
+    const BamRecordImpl& Impl() const;
+
+    /// \warning This method should be considered temporary and avoided as much
+    ///          as possible. Direct access to the internal object is likely to
+    ///          disappear as BamRecord interface matures.
+    ///
+    /// \returns reference to underlying BamRecordImpl object
+    ///
+    BamRecordImpl& Impl();
+
+    /// \}
+
+public:
+    /// \name General Data
+    /// \{
+
+    /// \brief Sets this record's ZMW hole number.
+    ///
+    /// \param[in] holeNumber
+    /// \returns reference to this record
+    ///
+    BamRecord& HoleNumber(const int32_t holeNumber);
+
+    /// \brief Sets this record's local context flags
+    ///
+    /// \param[in] flags
+    /// \returns reference to this record
+    ///
+    BamRecord& LocalContextFlags(const PacBio::BAM::LocalContextFlags flags);
+
+    /// \brief Sets this record's "number of complete passes of the insert".
+    ///
+    /// \param[in] numPasses
+    /// \returns reference to this record
+    ///
+    BamRecord& NumPasses(const int32_t numPasses);
+
+    /// \brief Sets this record's query end position.
+    ///
+    /// \note Changing this will modify the name of non-CCS records.
+    ///
+    /// \param[in] pos
+    /// \returns reference to this record
+    ///
+    BamRecord& QueryEnd(const PacBio::BAM::Position pos);
+
+    /// \brief Sets this record's query start position.
+    ///
+    /// \note Changing this will modify the name of non-CCS records.
+    ///
+    /// \param[in] pos
+    /// \returns reference to this record
+    ///
+    BamRecord& QueryStart(const PacBio::BAM::Position pos);
+
+    /// \brief Sets this record's expected read accuracy [0, 1000]
+    ///
+    /// \param[in] accuracy
+    /// \returns reference to this record
+    ///
+    BamRecord& ReadAccuracy(const Accuracy& accuracy);
+
+    /// \brief Attaches this record to the provided read group, changing the
+    ///        record name & 'RG' tag.
+    ///
+    /// \param[in] rg
+    /// \returns reference to this record
+    ///
+    BamRecord& ReadGroup(const ReadGroupInfo& rg);
+
+    /// \brief Attaches this record to the provided read group, changing the
+    ///        record name & 'RG' tag.
+    ///
+    /// \param[in] id
+    /// \returns reference to this record
+    ///
+    BamRecord& ReadGroupId(const std::string& id);
+
+    /// \brief Sets this scrap record's ScrapRegionType
+    ///
+    /// \param[in] type
+    /// \returns reference to this record
+    ///
+    BamRecord& ScrapRegionType(const VirtualRegionType type);
+
+    /// \brief Sets this scrap record's ScrapRegionType
+    ///
+    /// \param[in] type character equivalent of VirtualRegionType
+    /// \returns reference to this record
+    ///
+    BamRecord& ScrapRegionType(const char type);
+
+    /// \brief Sets this scrap record's ScrapZmwType
+    ///
+    /// \param[in] type
+    /// \returns reference to this record
+    ///
+    BamRecord& ScrapZmwType(const ZmwType type);
+
+    /// \brief Sets this scrap record's ScrapZmwType
+    ///
+    /// \param[in] type character equivalent of ZmwType
+    /// \returns reference to this record
+    ///
+    BamRecord& ScrapZmwType(const char type);
+
+    /// \brief Sets this record's average signal-to-noise in each of A, C, G,
+    ///        and T
+    ///
+    /// \param[in] snr average signal-to-noise of A, C, G, and T (in this order)
+    /// \returns reference to this record
+    ///
+    BamRecord& SignalToNoise(const std::vector<float>& snr);
+
+    /// \}
+
+public:
+    /// \name Barcode Data
+    /// \{
+
+    /// \brief Sets this record's barcode IDs ('bc' tag)
+    ///
+    /// \param[in] barcodeIds
+    /// \returns reference to this record
+    ///
+    BamRecord& Barcodes(const std::pair<int16_t, int16_t>& barcodeIds);
+
+    /// \brief Sets this record's barcode quality ('bq' tag)
+    ///
+    /// \param[in] quality Phred-scaled confidence call
+    /// \returns reference to this record
+    ///
+    BamRecord& BarcodeQuality(const uint8_t quality);
+
+    /// \}
+
+public:
+    /// \name Sequence & Tag Data
+    /// \{
+
+    /// \brief Sets this record's AltLabelTag values ("pt" tag).
+    ///
+    /// \param[in] tags
+    /// \returns reference to this record
+    ///
+    BamRecord& AltLabelTag(const std::string& tags);
+
+    /// \brief Sets this record's DeletionTag values ("dt" tag).
+    ///
+    /// \param[in] tags
+    /// \returns reference to this record
+    ///
+    BamRecord& DeletionTag(const std::string& tags);
+
+    /// \brief Sets this record's SubstitutionTag values ("st" tag).
+    ///
+    /// \param[in] tags
+    /// \returns reference to this record
+    ///
+    BamRecord& SubstitutionTag(const std::string& tags);
+
+    /// \}
+
+public:
+    /// \name Quality Data
+    /// \{
+
+    /// \brief Sets this record's AltLabelQV values ("pv" tag).
+    ///
+    /// \param[in] altLabelQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& AltLabelQV(const QualityValues& altLabelQVs);
+
+    /// \brief Sets this record's DeletionQV values ("dq" tag).
+    ///
+    /// \param[in] deletionQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& DeletionQV(const QualityValues& deletionQVs);
+
+    /// \brief Sets this record's InsertionQV values ("iq" tag).
+    ///
+    /// \param[in] insertionQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& InsertionQV(const QualityValues& insertionQVs);
+
+    /// \brief Sets this record's LabelQV values ("pq" tag).
+    ///
+    /// \param[in] labelQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& LabelQV(const QualityValues& labelQVs);
+
+    /// \brief Sets this record's MergeQV values ("mq" tag).
+    ///
+    /// \param[in] mergeQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& MergeQV(const QualityValues& mergeQVs);
+
+    /// \brief Sets this record's SubstitutionQV values ("sq" tag).
+    ///
+    /// \param[in] substitutionQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& SubstitutionQV(const QualityValues& substitutionQVs);
+
+    /// \}
+
+public:
+    /// \name Pulse Data
+    /// \{
+
+    /// \brief Sets this record's IPD values ("ip" tag).
+    ///
+    /// \param[in] frames
+    /// \param[in] encoding specify how to encode the data (8-bit lossy, or
+    ///                     16-bit lossless)
+    /// \returns reference to this record
+    ///
+    BamRecord& IPD(const Frames& frames, const FrameEncodingType encoding);
+
+    /// \brief Sets this record's Pkmean values ("pm" tag).
+    ///
+    /// \param[in] photons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmean(const std::vector<float>& photons);
+
+    /// \brief Sets this record's Pkmean values ("pm" tag).
+    ///
+    /// \param[in] encodedPhotons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmean(const std::vector<uint16_t>& encodedPhotons);
+
+    /// \brief Sets this record's Pkmid values ("pa" tag).
+    ///
+    /// \param[in] photons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmid(const std::vector<float>& photons);
+
+    /// \brief Sets this record's Pkmid values ("pa" tag).
+    ///
+    /// \param[in] encodedPhotons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmid(const std::vector<uint16_t>& encodedPhotons);
+
+    /// \brief Sets this record's Pkmean2 values ("ps" tag).
+    ///
+    /// \param[in] photons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmean2(const std::vector<float>& photons);
+
+    /// \brief Sets this record's Pkmean2 values ("ps" tag).
+    ///
+    /// \param[in] encodedPhotons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmean2(const std::vector<uint16_t>& encodedPhotons);
+
+    /// \brief Sets this record's Pkmid2 values ("pi" tag).
+    ///
+    /// \param[in] photons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmid2(const std::vector<float>& photons);
+
+    /// \brief Sets this record's Pkmid2 values ("pi" tag).
+    ///
+    /// \param[in] encodedPhotons
+    /// \returns reference to this record
+    ///
+    BamRecord& Pkmid2(const std::vector<uint16_t>& encodedPhotons);
+
+    /// \brief Sets this record's PreBaseFrames aka IPD values ("ip" tag).
+    ///
+    /// \param[in] frames
+    /// \param[in] encoding specify how to encode the data (8-bit lossy, or
+    ///                     16-bit lossless)
+    /// \returns reference to this record
+    ///
+    BamRecord& PreBaseFrames(const Frames& frames, const FrameEncodingType encoding);
+
+    /// \brief Sets this record's PrePulseFrames values ("pd" tag).
+    ///
+    /// \param[in] frames
+    /// \param[in] encoding specify how to encode the data (8-bit lossy, or
+    ///                     16-bit lossless)
+    /// \returns reference to this record
+    ///
+    BamRecord& PrePulseFrames(const Frames& frames, const FrameEncodingType encoding);
+
+    /// \brief Sets this record's PulseCall values ("pc" tag).
+    ///
+    /// \param[in] tags
+    /// \returns reference to this record
+    ///
+    BamRecord& PulseCall(const std::string& tags);
+
+    /// \brief Sets this record's PulseCallWidth values ("px" tag).
+    ///
+    /// \param[in] frames
+    /// \param[in] encoding specify how to encode the data (8-bit lossy, or
+    ///                     16-bit lossless)
+    /// \returns reference to this record
+    ///
+    BamRecord& PulseCallWidth(const Frames& frames, const FrameEncodingType encoding);
+
+    /// \brief Sets this record's PulseExclusionReason values ("pe" tag).
+    ///
+    /// \param[in] reasons
+    /// \returns reference to this record
+    ///
+    BamRecord& PulseExclusionReason(const std::vector<PacBio::BAM::PulseExclusionReason>& reasons);
+
+    /// \brief Sets this record's PulseMergeQV values ("pg" tag).
+    ///
+    /// \param[in] pulseMergeQVs
+    /// \returns reference to this record
+    ///
+    BamRecord& PulseMergeQV(const QualityValues& pulseMergeQVs);
+
+    /// \brief Sets this record's PulseWidth values ("pw" tag).
+    ///
+    /// \param[in] frames
+    /// \param[in] encoding specify how to encode the data (8-bit lossy, or
+    ///                     16-bit lossless)
+    /// \returns reference to this record
+    ///
+    BamRecord& PulseWidth(const Frames& frames, const FrameEncodingType encoding);
+
+    /// \brief Sets this record's StartFrame values ("sf" tag).
+    ///
+    /// \param[in] startFrame
+    /// \returns reference to this record
+    ///
+    BamRecord& StartFrame(const std::vector<uint32_t>& startFrame);
+
+    /// \}
+
+public:
+    /// \name Low-Level Access & Operations
+    /// \{
+
+    /// \brief Resets cached aligned start/end.
+    ///
+    /// \note This method should not be needed in most client code. It exists
+    ///       primarily as a hook for internal reading loops (queries, index
+    ///       build, etc.) It's essentially a workaround and will likely be
+    ///       removed from the API.
+    ///
+    void ResetCachedPositions() const;
+
+    /// \brief Resets cached aligned start/end.
+    ///
+    /// \note This method should not be needed in most client code. It exists
+    ///       primarily as a hook for internal reading loops (queries, index
+    ///       build, etc.) It's essentially a workaround and will likely be
+    ///       removed from the API.
+    ///
+    void ResetCachedPositions();
+
+    /// \brief Updates the record's name (BamRecord::FullName) to reflect
+    ///        modifications to name components (movie name, ZMW hole number,
+    ///        etc.)
+    ///
+    void UpdateName();
+
+    /// \}
+
+public:
+    /// \name Pulse Data
+    /// \{
+
+    static const float photonFactor;
+
+    static std::vector<uint16_t> EncodePhotons(const std::vector<float>& data);
+
+    /// \}
+
+public:
+    /// \name Clipping & Mapping
+    /// \{
+
+    /// Creates a copied record from input, with clipping applied
+    static BamRecord Clipped(const BamRecord& input, const ClipType clipType,
+                             const PacBio::BAM::Position start, const PacBio::BAM::Position end);
+
+    /// Creates a copied record from input, with mapping applied
+    static BamRecord Mapped(const BamRecord& input, const int32_t referenceId,
+                            const Position refStart, const Strand strand, const Cigar& cigar,
+                            const uint8_t mappingQuality);
+
+    /// Applies clipping to this record
+    BamRecord& Clip(const ClipType clipType, const PacBio::BAM::Position start,
+                    const PacBio::BAM::Position end);
+
+    /// Creates a copied record from this one, with clipping applied
+    BamRecord Clipped(const ClipType clipType, const PacBio::BAM::Position start,
+                      const PacBio::BAM::Position end) const;
+
+    /// Applies mapping to this record
+    BamRecord& Map(const int32_t referenceId, const Position refStart, const Strand strand,
+                   const Cigar& cigar, const uint8_t mappingQuality);
+
+    /// Creates a copied record from this one, with mapping applied
+    BamRecord Mapped(const int32_t referenceId, const Position refStart, const Strand strand,
+                     const Cigar& cigar, const uint8_t mappingQuality) const;
+    /// \}
+
+private:
+    BamRecordImpl impl_;
+
+public:
+    /// public & mutable so that queries can directly set the header info,
+    /// even on a record that is const from client code's perspective
+    mutable BamHeader header_;
+
+private:
+    /// \internal
+    /// cached positions (mutable to allow lazy-calc in const methods)
+    mutable Position alignedStart_;
+    mutable Position alignedEnd_;
+
+private:
+    /// \internal
+    /// pulse to bam mapping cache
+    mutable std::unique_ptr<internal::Pulse2BaseCache> p2bCache_;
+
+public:
+    /// clips the PacBio tags to a specified length
+    void ClipTags(const size_t clipPos, const size_t clipLength);
+
+private:
+    ///\internal
+    /// clipping methods
+
+    void ClipFields(const size_t clipPos, const size_t clipLength);
+
+    BamRecord& ClipToQuery(const PacBio::BAM::Position start, const PacBio::BAM::Position end);
+    BamRecord& ClipToReference(const PacBio::BAM::Position start, const PacBio::BAM::Position end);
+    BamRecord& ClipToReferenceForward(const PacBio::BAM::Position start,
+                                      const PacBio::BAM::Position end);
+    BamRecord& ClipToReferenceReverse(const PacBio::BAM::Position start,
+                                      const PacBio::BAM::Position end);
+
+private:
+    ///\internal
+    /// raw tag data fetching
+
+    // sequence tags
+    std::string FetchBasesRaw(const BamRecordTag tag) const;
+    std::string FetchBases(const BamRecordTag tag,
+                           const Orientation orientation = Orientation::NATIVE,
+                           const bool aligned = false, const bool exciseSoftClips = false,
+                           const PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    // frame tags
+    Frames FetchFramesRaw(const BamRecordTag tag) const;
+    Frames FetchFrames(const BamRecordTag tag, const Orientation orientation = Orientation::NATIVE,
+                       const bool aligned = false, const bool exciseSoftClips = false,
+                       const PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    // pulse tags
+    std::vector<float> FetchPhotonsRaw(const BamRecordTag tag) const;
+    std::vector<float> FetchPhotons(const BamRecordTag tag,
+                                    const Orientation orientation = Orientation::NATIVE,
+                                    const bool aligned = false, const bool exciseSoftClips = false,
+                                    const PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    // QV tags
+    QualityValues FetchQualitiesRaw(const BamRecordTag tag) const;
+    QualityValues FetchQualities(const BamRecordTag tag,
+                                 const Orientation orientation = Orientation::NATIVE,
+                                 const bool aligned = false, const bool exciseSoftClips = false,
+                                 const PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    // UInt tags (e.g. start frame)
+    //
+    // TODO (DB): clean this up w.r.t FetchUInt8s
+    //
+    std::vector<uint32_t> FetchUInt32sRaw(const BamRecordTag tag) const;
+    std::vector<uint32_t> FetchUInt32s(
+        const BamRecordTag tag, const Orientation orientation = Orientation::NATIVE,
+        const bool aligned = false, const bool exciseSoftClips = false,
+        const PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+    // UInt tags (e.g. pulse exclusion)
+    //
+    // TODO (DB): clean this up w.r.t FetchUInt32s
+    //
+    std::vector<uint8_t> FetchUInt8sRaw(const BamRecordTag tag) const;
+    std::vector<uint8_t> FetchUInt8s(const BamRecordTag tag,
+                                     const Orientation orientation = Orientation::NATIVE,
+                                     const bool aligned = false, const bool exciseSoftClips = false,
+                                     const PulseBehavior pulseBehavior = PulseBehavior::ALL) const;
+
+private:
+    ///\internal
+    /// marked const to allow calling from const methods
+    /// but updates our mutable cached values
+    void CalculateAlignedPositions() const;
+    void CalculatePulse2BaseCache() const;
+
+    friend class internal::BamRecordMemory;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/BamRecord.inl"
+
+#endif  // BAMRECORD_H
diff --git a/include/pbbam/BamRecordBuilder.h b/include/pbbam/BamRecordBuilder.h

new file mode 100644 (file)

index 0000000..b703aed
--- /dev/null
+++ b/include/pbbam/BamRecordBuilder.h
@@ -0,0 +1,245 @@
+// File Description
+/// \file BamRecordBuilder.h
+/// \brief Defines the BamRecordBuilder class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORDBUILDER_H
+#define BAMRECORDBUILDER_H
+
+#include <cstdint>
+#include <string>
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The BamRecordBuilder class provides a helper utility for building
+///        BamRecords.
+///
+/// This class provides a mechanism for building up %BAM data and
+/// lazy-encoding/constructing the actual BamRecord. Currently, the methods here
+/// really only support filling in the low-level SAM/BAM-style fields, not so
+/// much the PacBio-specific fields.
+///
+class PBBAM_EXPORT BamRecordBuilder
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty %BAM record builder.
+    BamRecordBuilder();
+
+    /// \brief Creates an empty %BAM record builder, with header info to apply
+    ///        to built records.
+    ///
+    /// \param[in] header   BamHeader object
+    ///
+    explicit BamRecordBuilder(BamHeader header);
+
+    /// \brief Creates record builder with initial record data.
+    ///
+    /// \param[in] prototype    data from this record will be used to seed the
+    ///                         builder
+    ///
+    BamRecordBuilder(const BamRecord& prototype);
+
+    BamRecordBuilder(const BamRecordBuilder&) = default;
+    BamRecordBuilder(BamRecordBuilder&&) = default;
+    BamRecordBuilder& operator=(const BamRecordBuilder&) = default;
+    BamRecordBuilder& operator=(BamRecordBuilder&&) = default;
+    ~BamRecordBuilder() = default;
+
+    /// \}
+
+public:
+    /// \name Record-Building
+    /// \{
+
+    /// \brief Builds a BamRecord from current builder attributes.
+    ///
+    /// \returns newly-built BamRecord object
+    ///
+    BamRecord Build() const;
+
+    /// \brief Replaces an existing BamRecord's data with current builder
+    ///        attributes.
+    ///
+    /// \param[out] record resulting record
+    /// \returns true if successful
+    ///
+    bool BuildInPlace(BamRecord& record) const;
+
+    /// \brief Resets builder attributes to default values.
+    ///
+    void Reset();
+
+    /// \brief Resets builder attributes with \p prototype's data.
+    ///
+    /// \param[in] prototype
+    ///
+    void Reset(BamRecord prototype);
+
+    /// \}
+
+public:
+    /// \name Core Attribute Setup
+    /// \{
+
+    /// \brief Sets the record's (BAI) index bin ID.
+    ///
+    /// \param[in] bin BAI index bin ID.
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Bin(const uint32_t bin);
+
+    /// \brief Sets this record's alignment flag, using a raw integer.
+    ///
+    /// \param[in] flag raw alignment flag
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Flag(const uint32_t flag);
+
+    /// \brief Sets this record's insert size.
+    ///
+    /// \param[in] iSize insert size
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& InsertSize(const int32_t iSize);
+
+    /// \brief Sets this record's map quality.
+    ///
+    /// \param[in] mapQual mapping quality - value of 255 indicates "unknown"
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& MapQuality(const uint8_t mapQual);
+
+    /// \brief Sets this record's mate's mapped position.
+    ///
+    /// \param[in] pos mapped position. A value of -1 indicates unmapped.
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& MatePosition(const int32_t pos);
+
+    /// \brief Sets this record's mate's mapped reference ID
+    ///
+    /// \param[in] id reference ID. A value of -1 indicates unmapped.
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& MateReferenceId(const int32_t id);
+
+    /// \brief Sets this record's mapped position.
+    ///
+    /// \param[in] pos mapped position. A value of -1 indicates unmapped.
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Position(const int32_t pos);
+
+    /// \brief Sets this record's mapped reference ID
+    ///
+    /// \param[in] id reference ID. A value of -1 indicates unmapped.
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& ReferenceId(const int32_t id);
+
+    /// \}
+
+public:
+    /// \name Alignment Flag Setup
+    /// \{
+
+    /// \brief Sets whether this record is a PCR/optical duplicate
+    BamRecordBuilder& SetDuplicate(bool ok);
+
+    /// \brief Sets whether this record failed quality controls
+    BamRecordBuilder& SetFailedQC(bool ok);
+
+    /// \brief Sets whether this record is the first mate of a pair.
+    BamRecordBuilder& SetFirstMate(bool ok);
+
+    /// \brief Sets whether this record was aligned.
+    BamRecordBuilder& SetMapped(bool ok);
+
+    /// \brief Sets whether this record's mate was aligned.
+    BamRecordBuilder& SetMateMapped(bool ok);
+
+    /// \brief Sets whether this record's mate mapped to reverse strand.
+    BamRecordBuilder& SetMateReverseStrand(bool ok);
+
+    /// \brief Sets whether this record came from paired-end sequencing.
+    BamRecordBuilder& SetPaired(bool ok);
+
+    /// \brief Sets whether this record is a read's primary alignment.
+    BamRecordBuilder& SetPrimaryAlignment(bool ok);
+
+    /// \brief Sets whether this record & its mate were properly mapped, per the
+    ///        aligner.
+    ///
+    BamRecordBuilder& SetProperPair(bool ok);
+
+    /// \brief Sets whether this record mapped to reverse strand.
+    BamRecordBuilder& SetReverseStrand(bool ok);
+
+    /// \brief Sets whether this record is the second mate of a pair.
+    BamRecordBuilder& SetSecondMate(bool ok);
+
+    /// \brief Sets whether this record is a supplementary alignment.
+    BamRecordBuilder& SetSupplementaryAlignment(bool ok);
+
+    /// \}
+
+public:
+    /// \name Variable-Length Data Setup
+    /// \{
+
+    /// \brief Sets the record's CIGAR data.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Cigar(PacBio::BAM::Cigar cigar);
+
+    /// \brief Sets the record's name.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Name(std::string name);
+
+    /// \brief Sets the record's qualities.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Qualities(std::string qualities);
+
+    /// \brief Sets the record's sequence.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Sequence(std::string sequence);
+
+    /// \brief Sets the record's tags.
+    ///
+    /// \returns reference to this builder
+    ///
+    BamRecordBuilder& Tags(TagCollection tags);
+
+    /// \}
+
+private:
+    BamHeader header_;
+    bam1_core_t core_;
+    std::string name_;
+    std::string sequence_;
+    std::string qualities_;
+    PacBio::BAM::Cigar cigar_;
+    TagCollection tags_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/BamRecordBuilder.inl"
+
+#endif  // BAMRECORDBUILDER_H
diff --git a/include/pbbam/BamRecordImpl.h b/include/pbbam/BamRecordImpl.h

new file mode 100644 (file)

index 0000000..949807c
--- /dev/null
+++ b/include/pbbam/BamRecordImpl.h
@@ -0,0 +1,591 @@
+// File Description
+/// \file BamRecordImpl.h
+/// \brief Defines the BamRecordImpl class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORDIMPL_H
+#define BAMRECORDIMPL_H
+
+#include <htslib/sam.h>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include "pbbam/BamRecordTag.h"
+#include "pbbam/Cigar.h"
+#include "pbbam/Config.h"
+#include "pbbam/Position.h"
+#include "pbbam/QualityValues.h"
+#include "pbbam/TagCollection.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+class BamRecordMemory;
+}
+
+/// \brief The BamRecordImpl class holds all data necessary for creating,
+///        querying or editing a generic %BAM record.
+///
+/// For PacBio-specific extensions and convenience methods, see BamRecord.
+///
+/// \note This class is mostly an internal implementation detail and will
+///       likely be removed from the public API in the future. Please use
+///       BamRecord as much as possible.
+///
+class PBBAM_EXPORT BamRecordImpl
+{
+public:
+    // clang-format off
+    /// Bit-mask flags describing the alignment status of the record (see Flag()).
+    enum AlignmentFlag
+    {
+        PAIRED              = 0x0001,   ///< Record comes from paired-end sequencing
+        PROPER_PAIR         = 0x0002,   ///< Each mate of a pair was properly aligned ("proper" as determined by aligner)
+        UNMAPPED            = 0x0004,   ///< Record was not mapped by aligner
+        MATE_UNMAPPED       = 0x0008,   ///< Record's mate was not mapped by aligner
+        REVERSE_STRAND      = 0x0010,   ///< Record was aligned to reverse strand (Sequence() is reverse-complemented)
+        MATE_REVERSE_STRAND = 0x0020,   ///< Record's mate was aligned to reverse strand (mate's Sequence() is reverse-complemented)
+        MATE_1              = 0x0040,   ///< Record is first mate of pair
+        MATE_2              = 0x0080,   ///< Record is second mate of pair
+        SECONDARY           = 0x0100,   ///< Record is a secondary alignment
+        FAILED_QC           = 0x0200,   ///< Record failed quality controls
+        DUPLICATE           = 0x0400,   ///< Record is a PCR/optical duplicate
+        SUPPLEMENTARY       = 0x0800    ///< Record is a supplementary alignment
+    };
+    // clang-format on
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    BamRecordImpl();
+    BamRecordImpl(const BamRecordImpl& other);
+    BamRecordImpl(BamRecordImpl&& other);
+    BamRecordImpl& operator=(const BamRecordImpl& other);
+    BamRecordImpl& operator=(BamRecordImpl&& other);
+    virtual ~BamRecordImpl() = default;
+
+    /// \}
+
+public:
+    /// \name Core Data
+    /// \{
+
+    /// \returns this record's assigned (BAI) index bin ID.
+    uint32_t Bin() const;
+
+    /// \returns this record's alignment flag, in raw integer form.
+    uint32_t Flag() const;
+
+    /// \returns this record's insert size
+    int32_t InsertSize() const;
+
+    /// \returns this record's mapping quality. A value of 255 indicates "unknown"
+    uint8_t MapQuality() const;
+
+    /// \returns this record's mate's mapped position, or -1 if unmapped
+    PacBio::BAM::Position MatePosition() const;
+
+    /// \returns this record's mate's mapped reference ID, or -1 if unmapped
+    int32_t MateReferenceId() const;
+
+    /// \returns this record's mapped position, or -1 if unmapped
+    PacBio::BAM::Position Position() const;
+
+    /// \returns this record's mapped reference ID, or -1 if unmapped
+    int32_t ReferenceId() const;
+
+    /// Sets the record's (BAI) index bin ID.
+    ///
+    /// \param[in] bin BAI index bin ID.
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& Bin(uint32_t bin);
+
+    /// Sets this record's alignment flag, using a raw integer.
+    ///
+    /// \param[in] flag raw alignment flag
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& Flag(uint32_t flag);
+
+    /// Sets this record's insert size.
+    ///
+    /// \param[in] iSize insert size
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& InsertSize(int32_t iSize);
+
+    /// Sets this record's map quality.
+    ///
+    /// \param[in] mapQual mapping quality - value of 255 indicates "unknown"
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& MapQuality(uint8_t mapQual);
+
+    /// Sets this record's mate's mapped position.
+    ///
+    /// \param[in] pos mapped position. A value of -1 indicates unmapped.
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& MatePosition(PacBio::BAM::Position pos);
+
+    /// Sets this record's mate's mapped reference ID
+    ///
+    /// \param[in] id reference ID. A value of -1 indicates unmapped.
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& MateReferenceId(int32_t id);
+
+    /// Sets this record's mapped position.
+    ///
+    /// \param[in] pos mapped position. A value of -1 indicates unmapped.
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& Position(PacBio::BAM::Position pos);
+
+    /// Sets this record's mapped reference ID
+    ///
+    /// \param[in] id reference ID. A value of -1 indicates unmapped.
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& ReferenceId(int32_t id);
+
+    /// \}
+
+public:
+    /// \name Alignment Flags
+    /// \{
+
+    /// \returns true if this record is a PCR/optical duplicate
+    bool IsDuplicate() const;
+
+    /// \returns true if this record failed quality controls
+    bool IsFailedQC() const;
+
+    /// \returns true if this record is the first mate of a pair
+    bool IsFirstMate() const;
+
+    /// \returns true if this record was mapped by aligner
+    bool IsMapped() const;
+
+    /// \returns true if this record's mate was mapped by aligner
+    bool IsMateMapped() const;
+
+    /// \returns true if this record's mate was mapped to the reverse strand
+    bool IsMateReverseStrand() const;
+
+    /// \returns true if this record comes from paired-end sequencing
+    bool IsPaired() const;
+
+    /// \returns true if this record is a read's primary alignment
+    bool IsPrimaryAlignment() const;
+
+    /// \returns true if this record & its mate were properly aligned
+    bool IsProperPair() const;
+
+    /// \returns true if this record was mapped to the reverse strand
+    bool IsReverseStrand() const;
+
+    /// \returns true if this record is the second mate of a pair
+    bool IsSecondMate() const;
+
+    /// \returns true if this record is a supplementary alignment
+    bool IsSupplementaryAlignment() const;
+
+    /// Sets whether this record is a PCR/optical duplicate
+    BamRecordImpl& SetDuplicate(bool ok);
+
+    /// Sets whether this record failed quality controls
+    BamRecordImpl& SetFailedQC(bool ok);
+
+    /// Sets whether this record is the first mate of a pair.
+    BamRecordImpl& SetFirstMate(bool ok);
+
+    /// Sets whether this record was aligned.
+    BamRecordImpl& SetMapped(bool ok);
+
+    /// Sets whether this record's mate was aligned.
+    BamRecordImpl& SetMateMapped(bool ok);
+
+    /// Sets whether this record's mate mapped to reverse strand.
+    BamRecordImpl& SetMateReverseStrand(bool ok);
+
+    /// Sets whether this record came from paired-end sequencing.
+    BamRecordImpl& SetPaired(bool ok);
+
+    /// Sets whether this record is a read's primary alignment.
+    BamRecordImpl& SetPrimaryAlignment(bool ok);
+
+    /// Sets whether this record & its mate were properly mapped, per the aligner.
+    BamRecordImpl& SetProperPair(bool ok);
+
+    /// Sets whether this record mapped to reverse strand.
+    BamRecordImpl& SetReverseStrand(bool ok);
+
+    /// Sets whether this record is the second mate of a pair.
+    BamRecordImpl& SetSecondMate(bool ok);
+
+    /// Sets whether this record is a supplementary alignment.
+    BamRecordImpl& SetSupplementaryAlignment(bool ok);
+
+    /// \}
+
+public:
+    /// \name Variable-length Data (sequence, qualities, etc.)
+    /// \{
+
+    /// \returns the record's CIGAR data as a Cigar object
+    Cigar CigarData() const;
+
+    /// Sets the record's CIGAR data using a Cigar object
+    ///
+    /// \param[in] cigar PacBio::BAM::Cigar object
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& CigarData(const Cigar& cigar);
+
+    /// Sets the record's CIGAR data using a CIGAR-formatted string.
+    ///
+    /// \param[in] cigarString CIGAR-formatted string
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& CigarData(const std::string& cigarString);
+
+    // TODO: CIGAR iterator - Cigar only or here as well ??
+
+    /// \returns the record's query name
+    std::string Name() const;
+
+    /// Sets the record's "query name".
+    ///
+    /// \param[in] name new name
+    /// \returns reference to this record
+    ///
+    BamRecordImpl& Name(const std::string& name);
+
+    /// \returns the record's quality values (phred-style ASCII)
+    ///
+    /// \note Usually Qualities().size() == Sequence().size(). However, in
+    ///       some data sets, the quality values are not provided. In that
+    ///       case, this method will return an empty container.
+    ///
+    QualityValues Qualities() const;
+
+    /// \returns the record's DNA sequence.
+    std::string Sequence() const;
+
+    size_t SequenceLength() const;
+
+    /// \brief Sets the record's DNA sequence and quality values
+    ///
+    /// This is an overloaded function. Sets the DNA sequence and quality
+    /// values, using the length of \p sequence.
+    ///
+    /// \note When using this overload (and \p qualities is non-empty), the
+    ///       lengths of \p sequence and \p qualities \b must be equal.
+    ///
+    /// \todo How to handle mismatched lengths?
+    ///
+    /// \param[in] sequence  std::string containing DNA sequence
+    /// \param[in] qualities std::string containing ASCII quality values
+    ///
+    /// \returns reference to this record.
+    ///
+    /// \sa SetSequenceAndQualities(const char* sequence,
+    ///     const size_t sequenceLength, const char* qualities)
+    ///
+    BamRecordImpl& SetSequenceAndQualities(const std::string& sequence,
+                                           const std::string& qualities = std::string());
+
+    /// \brief Sets the record's DNA sequence and quality values.
+    ///
+    /// The \p sequence must consist of IUPAC nucleotide codes {=ACMGRSVTWYHKDBN}.
+    /// The \p qualities, if not empty, must consist of 'phred'-style ASCII
+    /// quality values. \p qualities may be an empty string or NULL pointer in
+    /// cases where there are no such data available.
+    ///
+    /// \param[in] sequence         C-string containing DNA sequence
+    /// \param[in] sequenceLength   length of DNA sequence
+    /// \param[in] qualities        C-string containing 'phred-style' ASCII
+    ///                             quality values
+    ///
+    /// \note \p sequence does \b NOT have to be NULL-terminated. Length is
+    ///       explicitly determined by the value of \p sequenceLength provided.
+    ///
+    /// \returns reference to this record.
+    ///
+    BamRecordImpl& SetSequenceAndQualities(const char* sequence, const size_t sequenceLength,
+                                           const char* qualities = nullptr);
+
+    /// \brief Sets the record's DNA sequence and quality values.
+    ///
+    /// The \p encodedSequence should be preencoded/packed into the BAM binary
+    /// format. The \p qualities, if not empty, must consist of 'phred'-style
+    /// ASCII quality values. \p qualities may be an empty string or NULL
+    /// pointer in cases where there are no such data available.
+    ///
+    /// \param[in] encodedSequence      C-string containing BAM-format-encoded
+    ///                                 DNA sequence
+    /// \param[in] rawSequenceLength    length of DNA sequence (not the encoded
+    ///                                 length)
+    /// \param[in] qualities            C-string containing 'phred-style' ASCII
+    ///                                 quality values
+    ///
+    /// \note \p encodedSequence does \b NOT have to be NULL-terminated. Length
+    ///       is explicitly determined by the value of \p rawSequenceLength
+    ///       provided.
+    ///
+    /// \returns reference to this record.
+    ///
+    /// \sa SetSequenceAndQualities(const char* sequence,
+    ///     const size_t sequenceLength, const char* qualities)
+    ///
+    BamRecordImpl& SetPreencodedSequenceAndQualities(const char* encodedSequence,
+                                                     const size_t rawSequenceLength,
+                                                     const char* qualities = nullptr);
+
+    /// \}
+
+public:
+    /// \name Tag Data
+    /// \{
+
+    /// \returns record's full tag data as a TagCollection object
+    TagCollection Tags() const;
+
+    /// \brief Sets the record's full tag data via a TagCollection object
+    ///
+    BamRecordImpl& Tags(const TagCollection& tags);
+
+    /// \brief Adds a new tag to this record.
+    ///
+    /// \param[in] tagName  2-character tag name.
+    /// \param[in] value    Tag object that describes the type & value of data
+    ///                     to be added
+    ///
+    /// \note Any value that can be used to implicitly construct a Tag is valid.
+    /// \code
+    ///     string s;
+    ///     vector<uint32_t> v;
+    ///     record.AddTag("XX", s); // will add a string-type tag
+    ///     record.AddTag("YY", v); // will add a uint32-array-type tag
+    /// \endcode
+    ///
+    /// \returns true if tag was successfully added.
+    ///
+    bool AddTag(const std::string& tagName, const Tag& value);
+
+    /// \brief Adds a new tag to this record.
+    ///
+    /// This is an overloaded method.
+    ///
+    /// \param[in] tag      BamRecordTag enum
+    /// \param[in] value    Tag object that describes the type & value of data
+    ///                     to be added
+    /// \returns true if tag was successfully added.
+    ///
+    bool AddTag(const BamRecordTag tag, const Tag& value);
+
+    /// \brief Adds a new tag to this record, with an optional modifier.
+    ///
+    /// \param[in] tagName              2-character tag name.
+    /// \param[in] value                Tag object that describes the type &
+    ///                                 value of data to be added
+    /// \param[in] additionalModifier   optional extra modifier (for explicit
+    ///                                 modification of an otherwise const Tag)
+    ///
+    /// \note Any value that can be used to implicitly construct a Tag is valid.
+    /// \code
+    ///     char c;
+    ///     string h;
+    ///     record.AddTag("XX", c, TagModifier::ASCII_CHAR); // will add a char-type tag
+    ///     record.AddTag("YY", h, TagModifier::HEX_STRING); // will add a hex string-type tag
+    /// \endcode
+    ///
+    /// \returns true if tag was successfully added.
+    ///
+    bool AddTag(const std::string& tagName, const Tag& value, const TagModifier additionalModifier);
+
+    /// \brief Adds a new tag to this record, with an optional modifier.
+    ///
+    /// This is an overloaded method.
+    ///
+    /// \param[in] tag                  BamRecordTag enum.
+    /// \param[in] value                Tag object that describes the type &
+    ///                                 value of data to be added
+    /// \param[in] additionalModifier   optional extra modifier (for explicit
+    ///                                 modification of an otherwise const Tag)
+    ///
+    /// \returns true if tag was successfully added.
+    ///
+    bool AddTag(const BamRecordTag tag, const Tag& value, const TagModifier additionalModifier);
+
+    /// \brief Edits an existing tag on this record.
+    ///
+    /// \param[in] tagName      2-character tag name. Name must be present
+    ///                         (see HasTag)
+    /// \param[in] newValue     Tag object that describes the type & value of
+    ///                         new data to be added
+    ///
+    /// \note Any value that can be used to implicitly construct a Tag is valid.
+    /// \code
+    ///     string s;
+    ///     vector<uint32_t> v;
+    ///     record.EditTag("XX", s); // will overwrite tag XX with a string-type tag
+    ///     record.EditTag("YY", v); // will overwrite tag YY with a uint32-array-type tag
+    /// \endcode
+    ///
+    /// \returns true if tag was successfully edited.
+    ///
+    bool EditTag(const std::string& tagName, const Tag& newValue);
+
+    /// \brief Edits an existing tag on this record.
+    ///
+    /// This is an overloaded method.
+    ///
+    /// \param[in] tag          BamRecordTag enum
+    /// \param[in] newValue     Tag object that describes the type & value of
+    ///                         new data to be added
+    ///
+    /// \returns true if tag was successfully edited.
+    ///
+    bool EditTag(const BamRecordTag tag, const Tag& newValue);
+
+    /// \brief Edits an existing tag on this record.
+    ///
+    /// \param[in] tagName              2-character tag name. Name must be
+    ///                                 present (see HasTag)
+    /// \param[in] value                Tag object that describes the type &
+    ///                                 value of new data to be added
+    /// \param[in] additionalModifier   optional extra modifier (for explicit
+    ///                                 modification of an otherwise const Tag)
+    ///
+    /// \note Any value that can be used to implicitly construct a Tag is valid.
+    /// \code
+    ///     char c;
+    ///     string h;
+    ///     record.EditTag("XX", c, TagModifier::ASCII_CHAR); // will overwrite tag XX with a char-type tag
+    ///     record.EditTag("YY", h, TagModifier::HEX_STRING); // will overwrite tag YY with a hex string-type tag
+    /// \endcode
+    ///
+    /// \returns true if tag was successfully edited.
+    ///
+    bool EditTag(const std::string& tagName, const Tag& value,
+                 const TagModifier additionalModifier);
+
+    /// \brief Edits an existing tag on this record.
+    ///
+    /// This is an overloaded method.
+    ///
+    /// \param[in] tag                  BamRecordTag enum
+    /// \param[in] value                Tag object that describes the type &
+    ///                                 value of new data to be added
+    /// \param[in] additionalModifier   optional extra modifier (for explicit
+    ///                                 modification of an otherwise const Tag)
+    ///
+    /// \returns true if tag was successfully edited.
+    ///
+    bool EditTag(const BamRecordTag tag, const Tag& value, const TagModifier additionalModifier);
+
+    /// \returns true if a tag with this name is present in this record.
+    bool HasTag(const std::string& tagName) const;
+
+    /// \returns true if this tag is present in this record.
+    ///
+    /// This is an overloaded method.
+    ///
+    bool HasTag(const BamRecordTag tag) const;
+
+    /// \brief Removes an existing tag from this record.
+    ///
+    /// \param[in] tagName  2-character tag name.
+    ///
+    /// \returns true if tag was actually removed (i.e. false if tagName
+    ///          previously unknown)
+    /// \sa HasTag
+    ///
+    bool RemoveTag(const std::string& tagName);
+
+    /// \brief Removes an existing tag from this record.
+    ///
+    /// This is an overloaded method.
+    ///
+    /// \param[in] tag  BamRecordTag enum
+    ///
+    /// \returns true if tag was actually removed (i.e. false if \p tag
+    ///          previously unknown)
+    /// \sa HasTag
+    ///
+    bool RemoveTag(const BamRecordTag tag);
+
+    /// \brief Fetches a tag from this record.
+    ///
+    /// \param[in] tagName  2-character tag name.
+    ///
+    /// \returns Tag object for the requested name. If name is unknown, a
+    ///          default constructed Tag is returned (Tag::IsNull() is true).
+    ///
+    Tag TagValue(const std::string& tagName) const;
+
+    /// \brief Fetches a tag from this record.
+    ///
+    /// This is an overloaded method.
+    ///
+    /// \param[in] tag  BamRecordTag enum
+    ///
+    /// \returns Tag object for the requested \p tag. If the tag is unknown, a
+    ///          default constructed Tag is returned (Tag::IsNull() is true).
+    ///
+    Tag TagValue(const BamRecordTag tag) const;
+
+    // change above to Tag();
+
+    //    template<typename T>
+    //    T TagValue(const std::string& tagName) const;
+
+    /// \}
+
+private:
+    // returns a BamRecordImpl object, with a deep copy of @rawData contents
+    static BamRecordImpl FromRawData(const std::shared_ptr<bam1_t>& rawData);
+
+    // internal memory setup/expand methods
+    void InitializeData();
+    void MaybeReallocData();
+    void UpdateTagMap() const;  // allowed to be called from const methods
+                                // (lazy update on request)
+
+    // internal tag helper methods
+    bool AddTagImpl(const std::string& tagName, const Tag& value,
+                    const TagModifier additionalModifier);
+    bool RemoveTagImpl(const std::string& tagName);
+    int TagOffset(const std::string& tagName) const;
+
+    // internal CIGAR handling
+    void SetCigarData(const Cigar& cigar);
+
+    // core seq/qual logic shared by the public API
+    BamRecordImpl& SetSequenceAndQualitiesInternal(const char* sequence,
+                                                   const size_t sequenceLength,
+                                                   const char* qualities, bool isPreencoded);
+
+private:
+    // data members
+    std::shared_ptr<bam1_t> d_;
+    mutable std::map<uint16_t, int> tagOffsets_;
+
+    // friends
+    friend class internal::BamRecordMemory;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/BamRecordImpl.inl"
+
+#endif  // BAMRECORDIMPL_H
diff --git a/include/pbbam/BamRecordTag.h b/include/pbbam/BamRecordTag.h

new file mode 100644 (file)

index 0000000..c8de2cd
--- /dev/null
+++ b/include/pbbam/BamRecordTag.h
@@ -0,0 +1,60 @@
+// File Description
+/// \file BamRecordTag.h
+/// \brief Defines the BamRecordTag enum.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORDTAG_H
+#define BAMRECORDTAG_H
+
+namespace PacBio {
+namespace BAM {
+
+enum class BamRecordTag
+{
+    ALT_LABEL_QV,
+    ALT_LABEL_TAG,
+    BARCODE_QUALITY,
+    BARCODES,
+    CONTEXT_FLAGS,
+    DELETION_QV,
+    DELETION_TAG,
+    HOLE_NUMBER,
+    INSERTION_QV,
+    IPD,
+    LABEL_QV,
+    LONG_CIGAR,
+    MERGE_QV,
+    NUM_PASSES,
+    PKMEAN,
+    PKMEAN_2,
+    PKMID,
+    PKMID_2,
+    PRE_PULSE_FRAMES,
+    PULSE_CALL,
+    PULSE_CALL_WIDTH,
+    PULSE_EXCLUSION,
+    PULSE_MERGE_QV,
+    PULSE_WIDTH,
+    QUERY_END,
+    QUERY_START,
+    READ_ACCURACY,
+    READ_GROUP,
+    SCRAP_REGION_TYPE,
+    SCRAP_ZMW_TYPE,
+    SNR,
+    START_FRAME,
+    SUBSTITUTION_QV,
+    SUBSTITUTION_TAG,
+
+    // NOTE: the enumerators below (QUAL, SEQ) are not tags per se; they are
+    // "faked" as members of this enum to simplify data fetching, so keep them
+    // apart from the real tag entries listed above.
+    QUAL,
+    SEQ
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMRECORDTAG_H
diff --git a/include/pbbam/BamRecordView.h b/include/pbbam/BamRecordView.h

new file mode 100644 (file)

index 0000000..d8fb2a6
--- /dev/null
+++ b/include/pbbam/BamRecordView.h
@@ -0,0 +1,132 @@
+// File Description
+/// \file BamRecordView.h
+/// \brief Defines the BamRecordView class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORDVIEW_H
+#define BAMRECORDVIEW_H
+
+#include <cstdint>
+
+#include "pbbam/BamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief Provides a re-usable "view" onto a BamRecord
+///
+/// This class acts as a convenience wrapper for working with per-base BamRecord
+/// data. Most of these BamRecord methods take a list of parameters, to adjust
+/// how the underlying data are presented to client code. Often these parameters
+/// will be re-used for each BamRecord method call. Thus, to simplify such
+/// client code, a BamRecordView can be used to state those parameters once, and
+/// then simply request the desired fields.
+///
+/// \internal
+/// \todo Sync up method names with BamRecord
+/// \endinternal
+///
+class PBBAM_EXPORT BamRecordView
+{
+public:
+    /// \brief Constructs a view onto \p record using the supplied parameters.
+    ///
+    /// For frame or QV data, if \p aligned is true, a value of 0 (Accuracy or
+    /// QualityValue) will be used at each inserted or padded base location.
+    ///
+    /// \param[in] record           BamRecord data source.
+    /// \param[in] orientation      Orientation of output.
+    /// \param[in] aligned          if true, gaps/padding will be inserted, per
+    ///                             Cigar info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    /// \param[in] pulseBehavior    PulseBehavior to apply (defaults to PulseBehavior::ALL)
+    BamRecordView(const BamRecord& record, const Orientation orientation, const bool aligned,
+                  const bool exciseSoftClips,
+                  const PulseBehavior pulseBehavior = PulseBehavior::ALL);
+
+public:
+    /// \returns BamRecord::AltLabelQV with this view's parameters applied
+    QualityValues AltLabelQVs() const;
+
+    /// \returns BamRecord::AltLabelTag with this view's parameters applied
+    std::string AltLabelTags() const;
+
+    /// \returns BamRecord::DeletionQV with this view's parameters applied
+    QualityValues DeletionQVs() const;
+
+    /// \returns BamRecord::DeletionTag with this view's parameters applied
+    std::string DeletionTags() const;
+
+    /// \returns BamRecord::InsertionQV with this view's parameters applied
+    QualityValues InsertionQVs() const;
+
+    /// \returns BamRecord::IPD with this view's parameters applied
+    Frames IPD() const;
+
+    /// \returns BamRecord::LabelQV with this view's parameters applied
+    QualityValues LabelQVs() const;
+
+    /// \returns BamRecord::MergeQV with this view's parameters applied
+    QualityValues MergeQVs() const;
+
+    /// \returns BamRecord::PulseMergeQV with this view's parameters applied
+    QualityValues PulseMergeQVs() const;
+
+    /// \returns BamRecord::Pkmean with this view's parameters applied
+    std::vector<float> Pkmean() const;
+
+    /// \returns BamRecord::Pkmid with this view's parameters applied
+    std::vector<float> Pkmid() const;
+
+    /// \returns BamRecord::Pkmean2 with this view's parameters applied
+    std::vector<float> Pkmean2() const;
+
+    /// \returns BamRecord::Pkmid2 with this view's parameters applied
+    std::vector<float> Pkmid2() const;
+
+    /// \returns BamRecord::PreBaseFrames with this view's parameters applied
+    Frames PrebaseFrames() const;
+
+    /// \returns BamRecord::PrePulseFrames with this view's parameters applied
+    Frames PrePulseFrames() const;
+
+    /// \returns BamRecord::PulseCalls with this view's parameters applied
+    std::string PulseCalls() const;
+
+    /// \returns BamRecord::PulseCallWidth with this view's parameters applied
+    Frames PulseCallWidth() const;
+
+    /// \returns BamRecord::PulseWidths with this view's parameters applied
+    Frames PulseWidths() const;
+
+    /// \returns BamRecord::Qualities with this view's parameters applied
+    QualityValues Qualities() const;
+
+    /// \returns BamRecord::Sequence with this view's parameters applied
+    std::string Sequence() const;
+
+    /// \returns BamRecord::StartFrame with this view's parameters applied
+    std::vector<uint32_t> StartFrames() const;
+
+    /// \returns BamRecord::SubstitutionQV with this view's parameters applied
+    QualityValues SubstitutionQVs() const;
+
+    /// \returns BamRecord::SubstitutionTag with this view's parameters applied
+    std::string SubstitutionTags() const;
+
+private:
+    const BamRecord& record_;  // reference, not a copy; record must presumably outlive the view
+    Orientation orientation_;
+    bool aligned_;
+    bool exciseSoftClips_;
+    PulseBehavior pulseBehavior_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/BamRecordView.inl"
+
+#endif  // BAMRECORDVIEW_H
diff --git a/include/pbbam/BamTagCodec.h b/include/pbbam/BamTagCodec.h

new file mode 100644 (file)

index 0000000..68924bf
--- /dev/null
+++ b/include/pbbam/BamTagCodec.h
@@ -0,0 +1,90 @@
+// File Description
+/// \file BamTagCodec.h
+/// \brief Defines the BamTagCodec class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMTAGCODEC_H
+#define BAMTAGCODEC_H
+
+#include <cstdint>
+#include <vector>
+#include "pbbam/Config.h"
+#include "pbbam/TagCollection.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The BamTagCodec class provides binary encoding/decoding of %BAM tag
+///        data.
+///
+/// \note BamTagCodec is mostly an implementation and/or testing detail, and may
+///       be removed from the public API.
+///
+class PBBAM_EXPORT BamTagCodec
+{
+public:
+    /// \name Tag Collection Methods
+    /// \{
+
+    /// \brief Creates a TagCollection from raw BAM data.
+    ///
+    /// \param[in] data     BAM-formatted (binary) tag data
+    /// \returns TagCollection containing tag data
+    ///
+    static TagCollection Decode(const std::vector<uint8_t>& data);
+
+    /// \brief Creates binary BAM data from a TagCollection.
+    ///
+    /// \param[in] tags     TagCollection containing tag data
+    /// \returns vector of bytes (encoded BAM data)
+    ///
+    static std::vector<uint8_t> Encode(const PacBio::BAM::TagCollection& tags);
+
+    /// \}
+
+public:
+    /// \name Per-Tag Methods
+    /// \{
+
+    /// \brief Determines the SAM/BAM tag code for a Tag.
+    ///
+    /// \param[in] tag                  Tag object to check
+    /// \param[in] additionalModifier   optional extra modifier (allows explicit
+    ///                                 modification of an otherwise const Tag)
+    ///
+    /// \returns the SAM/BAM single char code for tag type
+    ///
+    static uint8_t TagTypeCode(const PacBio::BAM::Tag& tag,
+                               const TagModifier& additionalModifier = TagModifier::NONE);
+
+    /// \brief Encodes a single Tag's contents in %BAM binary
+    ///
+    /// \note This method does \b NOT encode the tag name & tag type. It does
+    ///       include the element type for array-type tags.
+    ///
+    /// \param[in] tag                  Tag object containing data to encode
+    /// \param[in] additionalModifier   optional extra modifier (allows explicit
+    ///                                 modification of an otherwise const Tag)
+    ///
+    /// \returns vector of bytes (encoded BAM data)
+    ///
+    static std::vector<uint8_t> ToRawData(
+        const PacBio::BAM::Tag& tag, const TagModifier& additionalModifier = TagModifier::NONE);
+
+    /// \brief Creates a Tag object from binary BAM data.
+    ///
+    /// \param[in] rawData      raw BAM bytes (assumed to be the result of
+    ///                         htslib's bam_aux_get())
+    ///
+    /// \returns resulting Tag object
+    /// \note \p rawData is a non-const pointer even though it is documented as an input.
+    static PacBio::BAM::Tag FromRawData(uint8_t* rawData);
+
+    /// \}
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMTAGCODEC_H
diff --git a/include/pbbam/BamWriter.h b/include/pbbam/BamWriter.h

new file mode 100644 (file)

index 0000000..eabaabd
--- /dev/null
+++ b/include/pbbam/BamWriter.h
@@ -0,0 +1,224 @@
+// File Description
+/// \file BamWriter.h
+/// \brief Defines the BamWriter class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMWRITER_H
+#define BAMWRITER_H
+
+#include <htslib/sam.h>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/Config.h"
+#include "pbbam/IRecordWriter.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamFile;
+
+namespace internal {
+class BamWriterPrivate;
+}
+
+/// \brief The BamWriter class provides a writing interface for creating
+///        new %BAM files.
+///
+/// \note The underlying buffered data may not be flushed to the file until the
+///       destructor is called. Trying to access the file (reading, stat-ing,
+///       indexing, etc.) before the BamWriter is destroyed yields undefined
+///       behavior. Enclose the BamWriter in some form of local scope (curly
+///       braces, a separate function, etc.) to ensure that its destructor is
+///       called before proceeding to read-based operations.
+///
+/// \code{.cpp}
+///  {
+///     BamWriter w(...);
+///     // write data
+///  }
+///  // now safe to access the new file
+/// \endcode
+///
+///
+class PBBAM_EXPORT BamWriter : public IRecordWriter
+{
+public:
+    /// \brief This enum allows you to control the compression level of the
+    ///        output %BAM file.
+    ///
+    /// Values are equivalent to zlib compression levels. See its documentation
+    /// for more details: http://www.zlib.net/manual.html
+    ///
+    enum CompressionLevel
+    {
+        CompressionLevel_0 = 0,
+        CompressionLevel_1 = 1,
+        CompressionLevel_2 = 2,
+        CompressionLevel_3 = 3,
+        CompressionLevel_4 = 4,
+        CompressionLevel_5 = 5,
+        CompressionLevel_6 = 6,
+        CompressionLevel_7 = 7,
+        CompressionLevel_8 = 8,
+        CompressionLevel_9 = 9,
+
+        DefaultCompression = -1,
+        NoCompression = CompressionLevel_0,
+        FastCompression = CompressionLevel_1,
+        BestCompression = CompressionLevel_9
+    };
+
+    /// \brief This enum allows you to control whether BAI bin numbers are
+    ///        calculated for output records.
+    ///
+    /// For most cases, the default behavior (ON) should be retained for maximum
+    /// compatibility with downstream tools (e.g. samtools index). Disabling bin
+    /// calculation should only be used if all records are known to never be
+    /// mapped, and even then only if profiling reveals the calculation to
+    /// affect extremely performance-sensitive, "critical paths".
+    ///
+    enum BinCalculationMode
+    {
+        BinCalculation_ON = 0,
+        BinCalculation_OFF
+    };
+
+    ///
+    /// \brief The Config struct provides a "parameter object" for BamWriter
+    ///        settings. This allows for writer configuration without having to
+    ///        refer to ordering of parameters, default values, etc.
+    ///
+    struct Config
+    {
+        Config() = default;
+
+        // zlib compression level
+        CompressionLevel compressionLevel = DefaultCompression;
+
+        // The number of threads for compression. If set to 0, BamWriter will
+        // attempt to determine a reasonable estimate. If set to 1, this will
+        // force single-threaded execution. No checks are made against an upper limit.
+        size_t numThreads = 4;
+
+        // If ON, ensures that proper BAI bin numbers are provided for all records.
+        BamWriter::BinCalculationMode binCalculationMode = BamWriter::BinCalculation_ON;
+
+        // If true, write to <filename>.tmp, and rename to <filename> in dtor.
+        // This allows downstream checks to see if BAM file may be truncated
+        // due to early termination (e.g. a thrown exception). If false, write
+        // directly to <filename>.
+        bool useTempFile = true;
+    };
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Opens a %BAM file for writing & writes the header information.
+    ///
+    /// \note Set \p filename to "-" for stdout.
+    ///
+    /// \param[in] filename         path to output %BAM file
+    /// \param[in] header           BamHeader object
+    /// \param[in] compressionLevel zlib compression level
+    /// \param[in] numThreads       number of threads for compression. If set to
+    ///                             0, BamWriter will attempt to determine a
+    ///                             reasonable estimate. If set to 1, this will
+    ///                             force single-threaded execution. No checks
+    ///                             are made against an upper limit.
+    ///
+    /// \param[in] binCalculationMode BAI bin calculation mode. The default
+    ///            behavior will ensure proper bin numbers are provided for all
+    ///            records written. This extra step may be turned off when bin
+    ///            numbers are not needed. Though if in doubt, keep the default.
+    ///
+    /// \param[in] useTempFile      If true, write to <filename>.tmp, and rename
+    ///                             to <filename>. This provides for downstream
+    ///                             checks to see if BAM file may be truncated
+    ///                             due to early termination (a thrown exception).
+    ///
+    /// \throws std::runtime_error if there was a problem opening the file for
+    ///         writing or if an error occurred while writing the header
+    ///
+    BamWriter(const std::string& filename, const BamHeader& header,
+              const BamWriter::CompressionLevel compressionLevel = BamWriter::DefaultCompression,
+              const size_t numThreads = 4,
+              const BinCalculationMode binCalculationMode = BamWriter::BinCalculation_ON,
+              const bool useTempFile = true);
+
+    ///
+    /// \brief Opens a %BAM file for writing & writes the header information.
+    ///
+    /// \param[in] filename     path to output %BAM file
+    /// \param[in] header       BamHeader object
+    /// \param[in] config       container for add'l configuration options
+    ///
+    /// \throws std::runtime_error if there was a problem opening the file for
+    ///         writing or if an error occurred while writing the header
+    ///
+    BamWriter(const std::string& filename, const BamHeader& header,
+              const BamWriter::Config& config);
+
+    /// Fully flushes all buffered data & closes file.
+    ~BamWriter() override;
+
+    BamWriter(const BamWriter&) = delete;
+    BamWriter& operator=(const BamWriter&) = delete;
+    BamWriter(BamWriter&&) = default;
+    BamWriter& operator=(BamWriter&&) = default;
+
+    /// \}
+
+public:
+    /// \name Data Writing & Resource Management
+    /// \{
+
+    /// \brief Try to flush any buffered data to file.
+    ///
+    /// \note The underlying implementation doesn't necessarily flush buffered
+    ///       data immediately, especially in a multithreaded writer situation.
+    ///       Let the BamWriter go out of scope to fully ensure flushing.
+    ///
+    /// \throws std::runtime_error if flush fails
+    ///
+    void TryFlush() override;
+
+    /// \brief Write a record to the output %BAM file.
+    ///
+    /// \param[in] record BamRecord object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    void Write(const BamRecord& record) override;
+
+    /// \brief Write a record to the output %BAM file.
+    ///
+    /// \param[in] record BamRecord object
+    /// \param[out] vOffset BGZF virtual offset to start of \p record
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    void Write(const BamRecord& record, int64_t* vOffset);
+
+    /// \brief Write a record to the output %BAM file.
+    ///
+    /// \param[in] recordImpl BamRecordImpl object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    void Write(const BamRecordImpl& recordImpl) override;
+
+    /// \}
+
+private:
+    std::unique_ptr<internal::BamWriterPrivate> d_;  // NOTE(review): no direct #include <memory>
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMWRITER_H
diff --git a/include/pbbam/BarcodeQuery.h b/include/pbbam/BarcodeQuery.h

new file mode 100644 (file)

index 0000000..d83966c
--- /dev/null
+++ b/include/pbbam/BarcodeQuery.h
@@ -0,0 +1,62 @@
+// File Description
+/// \file BarcodeQuery.h
+/// \brief Defines the BarcodeQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef BARCODEQUERY_H
+#define BARCODEQUERY_H
+
+#include <cstdint>
+#include <vector>
+#include "pbbam/Config.h"
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The BarcodeQuery class provides iterable access to a DataSet's %BAM
+///        records, limiting results to those matching a particular barcode.
+///
+/// Example:
+/// \include code/BarcodeQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT BarcodeQuery : public internal::IQuery
+{
+public:
+    /// \brief Creates a new BarcodeQuery, limiting record results to only those
+    ///        annotated with a particular barcode ID.
+    ///
+    /// \param[in] barcode  barcode ID used as the filtering criterion
+    /// \param[in] dataset  input data source(s)
+    ///
+    /// \sa BamRecord::Barcodes
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or PBI
+    ///         files.
+    ///
+    BarcodeQuery(const int16_t barcode, const DataSet& dataset);
+
+    ~BarcodeQuery() override;
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r) override;
+
+private:
+    struct BarcodeQueryPrivate;
+    std::unique_ptr<BarcodeQueryPrivate> d_;  // NOTE(review): no direct #include <memory>
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BARCODEQUERY_H
diff --git a/include/pbbam/Cigar.h b/include/pbbam/Cigar.h

new file mode 100644 (file)

index 0000000..9c58bfd
--- /dev/null
+++ b/include/pbbam/Cigar.h
@@ -0,0 +1,77 @@
+// File Description
+/// \file Cigar.h
+/// \brief Defines the Cigar class.
+//
+// Author: Derek Barnett
+
+#ifndef CIGAR_H
+#define CIGAR_H
+
+#include <string>
+#include <vector>
+#include "pbbam/CigarOperation.h"
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The Cigar class represents the CIGAR string used to report alignment
+///        characteristics in SAM/BAM.
+///
+/// \note Use of the 'M' operator is forbidden in PacBio BAMs. See
+///       CigarOperationType description for more information.
+///
+/// \sa https://samtools.github.io/hts-specs/SAMv1.pdf for more information on CIGAR in general.
+///
+class PBBAM_EXPORT Cigar : public std::vector<CigarOperation>
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a Cigar object from SAM/BAM string input
+    ///
+    /// \param [in] stdString   SAM/BAM formatted CIGAR data
+    /// \returns a Cigar object representing the input data
+    ///
+    /// \note This method may be removed from the public API in the future,
+    ///       as the constructor taking a std::string accomplishes the same end.
+    ///
+    static Cigar FromStdString(const std::string& stdString);
+
+    /// \brief Creates an empty Cigar.
+    Cigar() = default;
+
+    /// \brief Creates a Cigar object from SAM/BAM string input
+    ///
+    /// \param [in] cigarString   SAM/BAM formatted CIGAR data
+    /// \note Not declared explicit, so a std::string converts implicitly to Cigar.
+    Cigar(const std::string& cigarString);
+
+    Cigar(const Cigar&) = default;
+    Cigar(Cigar&&) = default;
+    Cigar& operator=(const Cigar&) = default;
+    Cigar& operator=(Cigar&&) = default;
+    ~Cigar() = default;
+
+    /// \}
+
+public:
+    /// \name Conversion Methods
+    /// \{
+
+    /// Converts Cigar object data to SAM/BAM formatted string
+    ///
+    /// \returns SAM/BAM formatted std::string
+    ///
+    std::string ToStdString() const;
+
+    /// \}
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/Cigar.inl"
+
+#endif  // CIGAR_H
diff --git a/include/pbbam/CigarOperation.h b/include/pbbam/CigarOperation.h

new file mode 100644 (file)

index 0000000..949cbd9
--- /dev/null
+++ b/include/pbbam/CigarOperation.h
@@ -0,0 +1,147 @@
+// File Description
+/// \file CigarOperation.h
+/// \brief Defines the CigarOperationType enum & CigarOperation class.
+//
+// Author: Derek Barnett
+
+#ifndef CIGAROPERATION_H
+#define CIGAROPERATION_H
+
+#include <cstdint>
+#include <stdexcept>
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace pbbamify {
+class Settings;
+}
+
+/// \brief Describes a CIGAR operation.
+///
+/// Bracketed character is the corresponding SAM/BAM character code.
+///
+/// \warning ALIGNMENT_MATCH ('M') is included in this enum to maintain
+///          consistency with htslib. However, as of PacBio BAM spec version
+///          3.0b7, this CIGAR operation is \b forbidden. Any attempt to read or
+///          write a record containing this operation will trigger a
+///          std::runtime_error. SEQUENCE_MATCH('=') or SEQUENCE_MISMATCH('X')
+///          should be used instead.
+///
+enum class CigarOperationType
+{
+    UNKNOWN_OP = -1,      ///< unknown/invalid CIGAR operator
+    ALIGNMENT_MATCH = 0,  ///< alignment match (can be a sequence match or mismatch) [M]
+    INSERTION,            ///< insertion to the reference [I]
+    DELETION,             ///< deletion from the reference [D]
+    REFERENCE_SKIP,       ///< skipped region from the reference [N]
+    SOFT_CLIP,            ///< soft clipping (clipped sequences present in SEQ) [S]
+    HARD_CLIP = 5,        ///< hard clipping (clipped sequences NOT present in SEQ) [H]
+    PADDING,              ///< padding (silent deletion from padded reference) [P]
+    SEQUENCE_MATCH,       ///< sequence match [=]
+    SEQUENCE_MISMATCH     ///< sequence mismatch [X]
+};
+
+/// \brief The CigarOperation class represents a single CIGAR operation
+///        (consisting of a type & length).
+///
+class PBBAM_EXPORT CigarOperation
+{
+public:
+    /// \name Operation Type Conversion Methods
+    /// \{
+
+    /// Converts a CigarOperationType enum value to its SAM/BAM character code.
+    ///
+    /// \param[in] type CigarOperationType value
+    /// \returns SAM/BAM character code
+    static char TypeToChar(const CigarOperationType type);
+
+    /// Converts a SAM/BAM character code to its CigarOperationType enum value.
+    ///
+    /// \param[in] c SAM/BAM character code
+    /// \returns CigarOperationType value
+    static CigarOperationType CharToType(const char c);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    CigarOperation() = default;
+    CigarOperation(char c, uint32_t length);
+    CigarOperation(CigarOperationType op, uint32_t length);
+
+    CigarOperation(const CigarOperation&) = default;
+    CigarOperation(CigarOperation&&) = default;
+    CigarOperation& operator=(const CigarOperation&) = default;
+    CigarOperation& operator=(CigarOperation&&) = default;
+    ~CigarOperation() = default;
+
+    /// \}
+
+public:
+    /// \returns operation type as SAM/BAM char code
+    inline char Char() const;
+
+    /// \returns operation length
+    inline uint32_t Length() const;
+
+    /// \returns operation type as CigarOperationType enum value
+    inline CigarOperationType Type() const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// Sets this operation type.
+    ///
+    /// \param[in] opChar SAM/BAM character code
+    /// \returns reference to this operation
+    inline CigarOperation& Char(const char opChar);
+
+    /// Sets this operation length.
+    ///
+    /// \param[in] length
+    /// \returns reference to this operation
+    inline CigarOperation& Length(const uint32_t length);
+
+    /// Sets this operation type.
+    ///
+    /// \param[in] opType CigarOperationType value
+    /// \returns reference to this operation
+    inline CigarOperation& Type(const CigarOperationType opType);
+
+    /// \}
+
+public:
+    /// \name Comparison Operators
+    /// \{
+
+    /// \returns true if both CIGAR operation type & length match
+    inline bool operator==(const CigarOperation& other) const;
+
+    /// \returns true if either CIGAR operation type or length differ
+    inline bool operator!=(const CigarOperation& other) const;
+
+    /// \}
+
+private:
+    CigarOperationType type_ = CigarOperationType::UNKNOWN_OP;
+    uint32_t length_ = 0;
+
+    // runtime disabling of Cigar validation
+    static bool validate_;
+    friend class pbbamify::Settings;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/CigarOperation.inl"
+
+#endif  // CIGAROPERATION_H
diff --git a/include/pbbam/ClipType.h b/include/pbbam/ClipType.h

new file mode 100644 (file)

index 0000000..fe13dee
--- /dev/null
+++ b/include/pbbam/ClipType.h
@@ -0,0 +1,30 @@
+// File Description
+/// \file ClipType.h
+/// \brief Defines the ClipType enum.
+//
+// Author: Derek Barnett
+
+#ifndef CLIPTYPE_H
+#define CLIPTYPE_H
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the modes supported by BamRecord clipping
+///        operations.
+///
+/// Methods like BamRecord::Clip accept Position parameters - which may be in
+/// either polymerase or reference coordinates. Using this enum as a flag
+/// indicates how the positions should be interpreted.
+///
+enum class ClipType
+{
+    CLIP_NONE,         ///< No clipping will be performed.
+    CLIP_TO_QUERY,     ///< Clipping positions are in polymerase coordinates.
+    CLIP_TO_REFERENCE  ///< Clipping positions are in genomic coordinates.
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // CLIPTYPE_H
diff --git a/include/pbbam/Compare.h b/include/pbbam/Compare.h

new file mode 100644 (file)

index 0000000..caf94b8
--- /dev/null
+++ b/include/pbbam/Compare.h
@@ -0,0 +1,466 @@
+// File Description
+/// \file Compare.h
+/// \brief Defines the Compare class & a number of function objects for
+///       comparing BamRecords.
+//
+// Author: Derek Barnett
+
+#ifndef COMPARE_H
+#define COMPARE_H
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <string>
+#include <utility>
+#include "pbbam/BamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The Compare class provides utilities for sorting collections of
+///        BamRecords.
+///
+/// \note The functors provided here currently only support std::less<T>
+///       comparisons (i.e. sorting by ascending value).
+///
+/// \include code/Compare.txt
+///
+struct PBBAM_EXPORT Compare
+{
+public:
+    /// \name Comparison Type
+    /// \{
+
+    /// \brief This enum defines the supported comparison types
+    ///        { ==, !=, <, <=, >, >=, & (contains), ~ (not contains) }.
+    ///
+    enum Type
+    {
+        EQUAL = 0,
+        NOT_EQUAL,
+        LESS_THAN,
+        LESS_THAN_EQUAL,
+        GREATER_THAN,
+        GREATER_THAN_EQUAL,
+        CONTAINS,
+        NOT_CONTAINS
+    };
+
+    /// \brief Convert operator string to Compare::Type.
+    ///
+    /// \include code/Compare_TypeFromOperator.txt
+    ///
+    /// \param[in] opString operator string. Can be C++-style operators
+    ///                     ("==", "!=", "<=", etc) or alpha equivalents
+    ///                     ("eq", "ne", "lte", etc).
+    ///
+    /// \returns comparison type from an operator string
+    /// \throws std::runtime_error if cannot convert opString to Compare::Type
+    /// \sa Compare::TypeToOperator
+    ///
+    static Compare::Type TypeFromOperator(const std::string& opString);
+
+    /// \brief Convert a Compare::Type to printable enum name.
+    ///
+    /// \include code/Compare_TypeToName.txt
+    ///
+    /// \param[in] type Compare::Type to convert
+    /// \returns the printable name for a Compare::Type enum value.
+    /// \throws std::runtime_error on unknown Compare::Type
+    ///
+    static std::string TypeToName(const Compare::Type& type);
+
+    /// \brief Convert a Compare::Type to printable operator.
+    ///
+    /// \param[in] type     Compare::Type to convert
+    /// \param[in] asAlpha  (optional) flag to print using alpha equivalents
+    ///                     e.g. "lte" rather than "<="
+    /// \returns the printable operator string
+    /// \throws std::runtime_error on unknown Compare::Type
+    ///
+    static std::string TypeToOperator(const Compare::Type& type, bool asAlpha = false);
+
+    /// \}
+
+public:
+    /// \name Comparison Function Objects
+    /// \{
+
+    /// %Base class for all BamRecord compare functors.
+    ///
+    /// Mostly used for method signatures that can accept any comparator.
+    ///
+    /// Custom comparators may be used by inheriting from this class.
+    ///
+    struct Base : public std::function<bool(const BamRecord&, const BamRecord&)>
+    {
+    };
+
+private:
+    /// \internal
+    ///
+    /// Exists to provide the typedef we'll use in the actual
+    /// MemberFunctionBase, since we need to use it in the template signature.
+    /// This keeps that a lot easier to read.
+    ///
+    template <typename ValueType>
+    struct MemberFunctionBaseHelper : public Compare::Base
+    {
+        using MemberFnType = ValueType (BamRecord::*)() const;
+    };
+
+public:
+    /// \brief %Base class for all BamRecord compare functors that take a
+    ///        BamRecord function pointer and compare on its return type.
+    ///
+    /// Derived comparators usually need only declare the return value &
+    /// function pointer in the template signature. This class implements the
+    /// basic method-calling machinery.
+    ///
+    /// Custom comparators will work for any BamRecord member function that does
+    /// not take any input parameters.
+    ///
+    template <typename ValueType, typename MemberFunctionBaseHelper<ValueType>::MemberFnType fn,
+              typename CompareType = std::less<ValueType> >
+    struct MemberFunctionBase : public Compare::MemberFunctionBaseHelper<ValueType>
+    {
+        bool operator()(const BamRecord& lhs, const BamRecord& rhs) const;
+    };
+
+public:
+    /// \brief Compares on BamRecord::AlignedEnd.
+    ///
+    /// Example:
+    /// \include code/Compare_AlignedEnd.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct AlignedEnd : public MemberFunctionBase<Position, &BamRecord::AlignedEnd>
+    {
+    };
+
+    /// \brief Compares on BamRecord::AlignedStart.
+    ///
+    /// Example:
+    /// \include code/Compare_AlignedStart.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct AlignedStart : public MemberFunctionBase<Position, &BamRecord::AlignedStart>
+    {
+    };
+
+    /// \brief Compares on BamRecord::AlignedStrand
+    ///
+    /// Example:
+    /// \include code/Compare_AlignedStrand.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct AlignedStrand : public MemberFunctionBase<Strand, &BamRecord::AlignedStrand>
+    {
+    };
+
+    /// \brief Compares on BamRecord::BarcodeForward.
+    ///
+    /// Example:
+    /// \include code/Compare_BarcodeForward.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct BarcodeForward : public MemberFunctionBase<int16_t, &BamRecord::BarcodeForward>
+    {
+    };
+
+    /// \brief Compares on BamRecord::BarcodeQuality.
+    ///
+    /// Example:
+    /// \include code/Compare_BarcodeQuality.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct BarcodeQuality : public MemberFunctionBase<uint8_t, &BamRecord::BarcodeQuality>
+    {
+    };
+
+    /// \brief Compares on BamRecord::BarcodeReverse.
+    ///
+    /// Example:
+    /// \include code/Compare_BarcodeReverse.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct BarcodeReverse : public MemberFunctionBase<int16_t, &BamRecord::BarcodeReverse>
+    {
+    };
+
+    /// \brief Compares on BamRecord::FullName.
+    ///
+    /// Example:
+    /// \include code/Compare_FullName.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct FullName : public MemberFunctionBase<std::string, &BamRecord::FullName>
+    {
+    };
+
+    /// \brief Compares on BamRecord::LocalContextFlags.
+    ///
+    /// Example:
+    /// \include code/Compare_LocalContextFlag.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct LocalContextFlag
+        : public MemberFunctionBase<LocalContextFlags, &BamRecord::LocalContextFlags>
+    {
+    };
+
+    /// \brief Compares on BamRecord::MapQuality.
+    ///
+    /// Example:
+    /// \include code/Compare_MapQuality.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct MapQuality : public MemberFunctionBase<uint8_t, &BamRecord::MapQuality>
+    {
+    };
+
+    /// \brief Compares on BamRecord::MovieName.
+    ///
+    /// Example:
+    /// \include code/Compare_MovieName.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct MovieName : public MemberFunctionBase<std::string, &BamRecord::MovieName>
+    {
+    };
+
+    /// \brief Provides an operator() that is essentially a no-op for
+    ///        comparing/sorting.
+    ///
+    /// If used in a sorting operation, then no change will occur.
+    ///
+    struct None : public Compare::Base
+    {
+        bool operator()(const BamRecord&, const BamRecord&) const;
+    };
+
+    /// \brief Compares on BamRecord::NumDeletedBases.
+    ///
+    /// Example:
+    /// \include code/Compare_NumDeletedBases.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct NumDeletedBases : public MemberFunctionBase<size_t, &BamRecord::NumDeletedBases>
+    {
+    };
+
+    /// \brief Compares on BamRecord::NumInsertedBases.
+    ///
+    /// Example:
+    /// \include code/Compare_NumInsertedBases.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct NumInsertedBases : public MemberFunctionBase<size_t, &BamRecord::NumInsertedBases>
+    {
+    };
+
+    /// \brief Compares on BamRecord::NumMatches.
+    ///
+    /// Example:
+    /// \include code/Compare_NumMatches.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct NumMatches : public MemberFunctionBase<size_t, &BamRecord::NumMatches>
+    {
+    };
+
+    /// \brief Compares on BamRecord::NumMismatches.
+    ///
+    /// Example:
+    /// \include code/Compare_NumMismatches.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct NumMismatches : public MemberFunctionBase<size_t, &BamRecord::NumMismatches>
+    {
+    };
+
+    /// \brief Compares on BamRecord::QueryEnd.
+    ///
+    /// Example:
+    /// \include code/Compare_QueryEnd.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct QueryEnd : public MemberFunctionBase<Position, &BamRecord::QueryEnd>
+    {
+    };
+
+    /// \brief Compares on BamRecord::QueryStart.
+    ///
+    /// Example:
+    /// \include code/Compare_QueryStart.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct QueryStart : public MemberFunctionBase<Position, &BamRecord::QueryStart>
+    {
+    };
+
+    /// \brief Compares on BamRecord::ReadAccuracy.
+    ///
+    /// Example:
+    /// \include code/Compare_ReadAccuracy.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReadAccuracy : public MemberFunctionBase<Accuracy, &BamRecord::ReadAccuracy>
+    {
+    };
+
+    /// \brief Compares on BamRecord::ReadGroupId.
+    ///
+    /// \note Even though the ReadGroupId string contains hex values, it is
+    ///       still just a std::string. Comparisons will use lexical, not
+    ///       numeric ordering. If numeric ordering is desired, use
+    ///       Compare::ReadGroupNumericId instead.
+    ///
+    /// Example:
+    /// \include code/Compare_ReadGroupId.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReadGroupId : public MemberFunctionBase<std::string, &BamRecord::ReadGroupId>
+    {
+    };
+
+    /// \brief Compares on BamRecord::ReadGroupNumericId.
+    ///
+    /// Example:
+    /// \include code/Compare_ReadGroupNumericId.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReadGroupNumericId : public MemberFunctionBase<int32_t, &BamRecord::ReadGroupNumericId>
+    {
+    };
+
+    /// \brief Compares on BamRecord::ReferenceEnd.
+    ///
+    /// Example:
+    /// \include code/Compare_ReferenceEnd.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReferenceEnd : public MemberFunctionBase<Position, &BamRecord::ReferenceEnd>
+    {
+    };
+
+    /// \brief Compares on BamRecord::ReferenceId.
+    ///
+    /// Example:
+    /// \include code/Compare_ReferenceId.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReferenceId : public MemberFunctionBase<int32_t, &BamRecord::ReferenceId>
+    {
+    };
+
+    /// \brief Compares on BamRecord::ReferenceName.
+    ///
+    /// Example:
+    /// \include code/Compare_ReferenceName.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReferenceName : public MemberFunctionBase<std::string, &BamRecord::ReferenceName>
+    {
+    };
+
+    /// \brief Compares on BamRecord::ReferenceStart.
+    ///
+    /// Example:
+    /// \include code/Compare_ReferenceStart.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct ReferenceStart : public MemberFunctionBase<Position, &BamRecord::ReferenceStart>
+    {
+    };
+
+    /// \brief Compares on BamRecord::HoleNumber.
+    ///
+    /// Example:
+    /// \include code/Compare_Zmw.txt
+    ///
+    /// \note Currently only supports std::less<T> comparisons (i.e. sorting by
+    ///       ascending value).
+    ///
+    struct Zmw : public MemberFunctionBase<int32_t, &BamRecord::HoleNumber>
+    {
+    };
+
+    /// \}
+
+    template <typename T>
+    static inline bool Check(const T& lhs, const T& rhs, const Compare::Type cmp)
+    {
+        switch (cmp) {
+            case Compare::EQUAL:
+                return lhs == rhs;
+            case Compare::LESS_THAN:
+                return lhs < rhs;
+            case Compare::LESS_THAN_EQUAL:
+                return lhs <= rhs;
+            case Compare::GREATER_THAN:
+                return lhs > rhs;
+            case Compare::GREATER_THAN_EQUAL:
+                return lhs >= rhs;
+            case Compare::NOT_EQUAL:
+                return lhs != rhs;
+            default:
+                assert(false);
+                throw std::runtime_error{"unsupported compare type requested"};
+        }
+    }
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/Compare.inl"
+
+#endif  // COMPARE_H
diff --git a/include/pbbam/CompositeBamReader.h b/include/pbbam/CompositeBamReader.h

new file mode 100644 (file)

index 0000000..6fb9e9c
--- /dev/null
+++ b/include/pbbam/CompositeBamReader.h
@@ -0,0 +1,228 @@
+// File Description
+/// \file CompositeBamReader.h
+/// \brief Defines the composite BAM readers, for working with multiple input
+///       files.
+//
+// Author: Derek Barnett
+
+#ifndef COMPOSITEBAMREADER_H
+#define COMPOSITEBAMREADER_H
+
+#include <deque>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+#include "pbbam/BaiIndexedBamReader.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamReader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/Config.h"
+#include "pbbam/DataSet.h"
+#include "pbbam/GenomicInterval.h"
+#include "pbbam/PbiIndexedBamReader.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+
+/// \internal
+/// \brief The CompositeMergeItem class provides a helper struct for composite
+///        readers, containing a single-file reader and its "next" record.
+///
+struct CompositeMergeItem
+{
+public:
+    std::unique_ptr<BamReader> reader;
+    BamRecord record;
+
+public:
+    CompositeMergeItem(std::unique_ptr<BamReader> rdr);
+    CompositeMergeItem(std::unique_ptr<BamReader> rdr, BamRecord rec);
+    CompositeMergeItem(CompositeMergeItem&&) = default;
+    CompositeMergeItem& operator=(CompositeMergeItem&&) = default;
+    ~CompositeMergeItem() = default;
+};
+
+/// \internal
+/// \brief The CompositeMergeItemSorter class provides a helper function object
+///        for ordering composite reader results.
+///
+/// Essentially just extracts a BamRecord from its parent CompositeMergeItem for
+/// further checks.
+///
+template <typename CompareType>
+struct CompositeMergeItemSorter
+    : public std::function<bool(const CompositeMergeItem&, const CompositeMergeItem&)>
+{
+    bool operator()(const CompositeMergeItem& lhs, const CompositeMergeItem& rhs);
+};
+
+}  // namespace internal
+
+/// \brief The GenomicIntervalCompositeBamReader class provides read access to
+///        multiple %BAM files, limiting results to a genomic region.
+///
+/// Requires a ".bai" file for each input %BAM file.
+///
+/// Results will be returned in order of genomic coordinate (first by reference
+/// ID, then by position).
+///
+class PBBAM_EXPORT GenomicIntervalCompositeBamReader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    GenomicIntervalCompositeBamReader(const GenomicInterval& interval,
+                                      const std::vector<BamFile>& bamFiles);
+    GenomicIntervalCompositeBamReader(const GenomicInterval& interval, const DataSet& dataset);
+
+    /// \}
+
+public:
+    /// \name Data Access
+    /// \{
+
+    /// Fetches next BAM record in the interval specified, storing in \p record
+    ///
+    /// \param[out] record
+    /// \returns true on success, false if no more data available.
+    ///
+    bool GetNext(BamRecord& record);
+
+    /// Sets a new genomic interval of interest.
+    ///
+    /// \returns reference to this reader
+    ///
+    GenomicIntervalCompositeBamReader& Interval(const GenomicInterval& interval);
+
+    /// \returns the current specified interval
+    ///
+    const GenomicInterval& Interval() const;
+
+    /// \}
+
+private:
+    void UpdateSort();
+
+private:
+    GenomicInterval interval_;
+    std::deque<internal::CompositeMergeItem> mergeItems_;
+    std::vector<std::string> filenames_;
+};
+
+/// \brief Provides read access to multiple %BAM files, limiting results to those
+///        passing a PbiFilter.
+///
+/// Requires a ".pbi" file for each input %BAM file.
+///
+/// \note The template parameter OrderByType is not fully implemented at this
+///       time. Use of comparison functor (e.g. Compare::Zmw) for this will
+///       currently result in the proper "next" value <b> at each iteration
+///       step, independently, but not over the full data set. </b> If all
+///       files' "order-by" data values are accessible in increasing order
+///       within each file, then the expected ordering will be observed.
+///       However, if these data are not sorted within a file, the final results
+///       will appear unordered. \n
+///       \n
+///           Example:\n
+///           file 1: { 1, 5, 2, 6 } \n
+///           file 2: { 3, 8, 4, 7 } \n
+///           results: { 1, 3, 5, 2, 6, 8, 4, 7 } \n
+///       \n
+///       This is a known issue and will be addressed in a future update. But in
+///       the meantime, use of Compare::None as the OrderByType is recommended,
+///       to explicitly indicate that no particular ordering is expected.
+///
+template <typename OrderByType>
+class PBBAM_EXPORT PbiFilterCompositeBamReader
+{
+public:
+    using value_type = internal::CompositeMergeItem;
+    using merge_sorter_type = internal::CompositeMergeItemSorter<OrderByType>;
+    using container_type = std::deque<value_type>;
+    using iterator = typename container_type::iterator;
+    using const_iterator = typename container_type::const_iterator;
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    PbiFilterCompositeBamReader(const PbiFilter& filter, const std::vector<BamFile>& bamFiles);
+    PbiFilterCompositeBamReader(const PbiFilter& filter, const DataSet& dataset);
+
+    /// \}
+
+public:
+    /// \name Data Access
+    /// \{
+
+    /// Fetches next BAM record passing the filter, storing in \p record
+    ///
+    /// \returns true on success, false if no more data available.
+    ///
+    bool GetNext(BamRecord& record);
+
+    /// Sets a new PBI filter
+    ///
+    /// \returns reference to this reader
+    ///
+    PbiFilterCompositeBamReader& Filter(const PbiFilter& filter);
+
+    uint32_t NumReads() const;
+
+    /// \}
+
+private:
+    void UpdateSort();
+
+private:
+    container_type mergeQueue_;
+    std::vector<std::string> filenames_;
+    uint32_t numReads_;
+};
+
+/// \brief The SequentialCompositeBamReader class provides read access to
+///        multiple %BAM files, reading through the entire contents of each
+///        file.
+///
+/// Input files will be accessed in the order provided to the constructor. Each
+/// file's contents will be exhausted before moving on to the next one (as
+/// opposed to a "round-robin" scheme).
+///
+class PBBAM_EXPORT SequentialCompositeBamReader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    SequentialCompositeBamReader(std::vector<BamFile> bamFiles);
+    SequentialCompositeBamReader(const DataSet& dataset);
+
+    /// \}
+
+public:
+    /// \name Data Access
+    /// \{
+
+    /// Fetches next BAM record from the input files, storing in \p record
+    ///
+    /// \returns true on success, false if no more data available.
+    ///
+    bool GetNext(BamRecord& record);
+
+    /// \}
+
+private:
+    std::deque<std::unique_ptr<BamReader> > readers_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/CompositeBamReader.inl"
+
+#endif  // COMPOSITEBAMREADER_H
diff --git a/include/pbbam/CompositeFastaReader.h b/include/pbbam/CompositeFastaReader.h

new file mode 100644 (file)

index 0000000..2e2a2b6
--- /dev/null
+++ b/include/pbbam/CompositeFastaReader.h
@@ -0,0 +1,64 @@
+// File Description
+/// \file CompositeFastaReader.h
+/// \brief Defines the composite FASTA reader, for working with multiple input
+///       files.
+//
+// Author: Derek Barnett
+
+#ifndef COMPOSITEFASTAREADER_H
+#define COMPOSITEFASTAREADER_H
+
+#include "pbbam/Config.h"
+#include "pbbam/DataSet.h"
+#include "pbbam/FastaReader.h"
+#include "pbbam/FastaSequence.h"
+
+#include <deque>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The CompositeFastaReader class provides read access to
+///        multiple FASTA files, reading through the entire contents of each
+///        file.
+///
+/// Input files will be accessed in the order provided to the constructor. Each
+/// file's contents will be exhausted before moving on to the next one (as
+/// opposed to a "round-robin" scheme).
+///
+class PBBAM_EXPORT CompositeFastaReader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    CompositeFastaReader(const std::vector<std::string>& fastaFiles);
+    CompositeFastaReader(const DataSet& dataset);
+
+    /// \}
+
+public:
+    /// \name Data Access
+    /// \{
+
+    /// Fetches next FASTA sequence.
+    ///
+    /// \returns true on success, false if no more data available.
+    ///
+    bool GetNext(FastaSequence& seq);
+
+    /// \}
+
+private:
+    std::deque<std::unique_ptr<FastaReader> > readers_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "internal/CompositeFastaReader.inl"
+
+#endif  // COMPOSITEFASTAREADER_H
diff --git a/include/pbbam/Config.h b/include/pbbam/Config.h

new file mode 100644 (file)

index 0000000..5690f07
--- /dev/null
+++ b/include/pbbam/Config.h
@@ -0,0 +1,49 @@
+// File Description
+/// \file Config.h
+/// \brief Defines library-wide macros & global variables.
+//
+// Author: Derek Barnett
+
+#ifndef PBBAM_CONFIG_H
+#define PBBAM_CONFIG_H
+
+/// \name Library Import/Export
+/// \{
+
+#ifndef PBBAM_EXPORT
+#if defined(WIN32)
+#define PBBAM_EXPORT __declspec(dllimport)
+#else
+#define PBBAM_EXPORT
+#endif
+#endif
+
+/// \}
+
+namespace PacBio {
+namespace BAM {
+
+/// \name Verbosity Settings
+/// \{
+
+/// \brief Sets the desired verbosity level of htslib warnings.
+///
+/// Change this value to allow debug/warning statements from htslib itself.
+/// The valid range seems to be [0-3], where 0 indicates OFF, and 3 is the
+/// most verbose.
+///
+/// By default, pbbam disables htslib statements to keep output channels clean.
+/// We rely on exceptions & their associated messages instead.
+///
+/// This global variable is obviously not thread-safe by any means. But as a
+/// debug flag, it is unlikely to cause any real issues. The worst case would be
+/// unexpected presence/absence of output statements.
+///
+extern int HtslibVerbosity;
+
+/// \}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBBAM_CONFIG_H
diff --git a/include/pbbam/DataSet.h b/include/pbbam/DataSet.h

new file mode 100644 (file)

index 0000000..c229302
--- /dev/null
+++ b/include/pbbam/DataSet.h
@@ -0,0 +1,811 @@
+// File Description
+/// \file DataSet.h
+/// \brief Defines the DataSet class.
+//
+// Author: Derek Barnett
+
+#ifndef DATASET_H
+#define DATASET_H
+
+#include <chrono>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+#include "pbbam/BamFile.h"
+#include "pbbam/Config.h"
+#include "pbbam/DataSetTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The DataSet class represents a %PacBio analysis dataset (e.g. from
+///        XML).
+///
+/// \nosubgrouping
+///
+/// It provides resource paths, filters, and metadata associated with a dataset
+/// under analysis.
+///
+class PBBAM_EXPORT DataSet
+{
+public:
+    /// \name DataSet Type
+    /// \{
+
+    /// \brief This enum defines the currently-supported DataSet types.
+    ///
+    enum TypeEnum
+    {
+        GENERIC = 0,
+        ALIGNMENT,
+        BARCODE,
+        CONSENSUS_ALIGNMENT,
+        CONSENSUS_READ,
+        CONTIG,
+        HDF_SUBREAD,
+        REFERENCE,
+        SUBREAD,
+        TRANSCRIPT,
+        TRANSCRIPT_ALIGNMENT
+    };
+
+    /// \brief Converts printable dataset type to type enum.
+    ///
+    /// \param[in] typeName printable dataset type
+    /// \returns dataset type enum
+    /// \throws std::runtime_error if \p typeName is unknown
+    ///
+    static DataSet::TypeEnum NameToType(const std::string& typeName);
+
+    /// \brief Converts dataset type enum to printable name.
+    ///
+    /// \param[in] type dataset type enum
+    /// \returns printable dataset type
+    /// \throws std::runtime_error if \p type is unknown
+    ///
+    static std::string TypeToName(const DataSet::TypeEnum& type);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Constructs an empty, generic DataSet.
+    ///
+    DataSet();
+
+    /// \brief Constructs an empty DataSet of the type specified.
+    ///
+    /// \param[in] type dataset type
+    /// \throws std::runtime_error if \p type is unknown
+    ///
+    DataSet(const DataSet::TypeEnum type);
+
+    /// \brief Constructs a DataSet from a %BAM file.
+    ///
+    /// This currently defaults to a SubreadSet, with an ExternalResource
+    /// pointing to BamFile::Filename.
+    ///
+    /// \param[in] bamFile  BamFile object
+    ///
+    DataSet(const BamFile& bamFile);
+
+    /// \brief Loads a DataSet from a file.
+    ///
+    /// \p filename may be one of the following types, indicated by its extension:\n
+    ///  - %BAM ("*.bam") \n
+    ///  - FOFN ("*.fofn") \n
+    ///  - FASTA ("*.fa" or "*.fasta") \n
+    ///  - DataSetXML ("*.xml") \n
+    ///
+    /// \param[in] filename  input filename
+    /// \throws std::runtime_error if \p filename has an unsupported extension,
+    ///         or if a valid DataSet could not be created from its contents
+    ///
+    DataSet(const std::string& filename);
+
+    /// \brief Constructs a DataSet from a list of files.
+    ///
+    /// \param[in] filenames  input filenames
+    /// \throws std::runtime_error if DataSet could not be created from
+    ///         \p filenames
+    ///
+    DataSet(const std::vector<std::string>& filenames);
+
+    DataSet(const DataSet& other);
+    DataSet(DataSet&&) = default;
+    DataSet& operator=(const DataSet& other);
+    DataSet& operator=(DataSet&&) = default;
+    ~DataSet() = default;
+
+    /// \brief Creates a DataSet from "raw" XML data.
+    ///
+    /// \param[in] xml DataSetXML text
+    ///
+    static DataSet FromXml(const std::string& xml);
+
+    /// \}
+
+public:
+    /// \name Operators
+    /// \{
+
+    /// \brief Merges DataSet contents.
+    ///
+    /// Adds contents of \p other to this dataset object
+    ///
+    /// \param[in] other  some other dataset to add to this one
+    /// \returns reference to this dataset object
+    ///
+    DataSet& operator+=(const DataSet& other);
+
+    /// \}
+
+public:
+    /// \name Serialization
+    /// \{
+
+    /// \brief Saves dataset XML to file.
+    ///
+    /// \param[in] outputFilename destination for XML contents
+    ///
+    /// \throws std::runtime_error if file could not be opened or if DataSet
+    ///         elements could not be converted to XML
+    ///
+    void Save(const std::string& outputFilename);
+
+    /// \brief Saves dataset XML to output stream, e.g. std::cout,
+    ///        std::stringstream.
+    ///
+    /// \param[out] out destination for XML contents
+    ///
+    /// \throws std::runtime_error if DataSet elements could not be converted to
+    ///         XML
+    ///
+    void SaveToStream(std::ostream& out);
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+    ///
+
+    /// \brief Fetches the value of a DataSet root element's attribute.
+    ///
+    /// These are the attributes attached to the root dataset element: \n
+    /// \verbatim <SubreadSet foo="x" bar="y" /> \endverbatim
+    ///
+    /// Built-in accessors exist for the standard attributes (e.g. CreatedAt)
+    /// but additional attributes can be used as well via these generic
+    /// Attribute methods.
+    ///
+    /// \param[in] name root element's attribute name
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& Attribute(const std::string& name) const;
+
+    /// \brief Fetches the value of dataset's CreatedAt attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& CreatedAt() const;
+
+    /// \brief Fetches the value of dataset's Format attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& Format() const;
+
+    /// \brief Fetches the value of dataset's MetaType attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& MetaType() const;
+
+    /// \brief Fetches the value of dataset's ModifiedAt attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& ModifiedAt() const;
+
+    /// \brief Fetches the value of dataset's Name attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& Name() const;
+
+    /// \brief Fetches the value of dataset's ResourceId attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& ResourceId() const;
+
+    /// \brief Fetches the value of dataset's Tags attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& Tags() const;
+
+    /// \brief Fetches the value of dataset's TimeStampedName attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& TimeStampedName() const;
+
+    /// \brief Fetches the value of dataset's UniqueId attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& UniqueId() const;
+
+    /// \brief Fetches the value of dataset's Version attribute.
+    ///
+    /// \returns const reference to attribute's value (empty string if not
+    ///          present)
+    ///
+    const std::string& Version() const;
+
+    /// \}
+
+public:
+    /// \name DataSet Type
+    /// \{
+
+    /// \brief Fetches the dataset's type.
+    ///
+    /// \returns dataset type enum
+    ///
+    PacBio::BAM::DataSet::TypeEnum Type() const;
+
+    /// \brief Fetches the dataset's type, as a printable name.
+    ///
+    /// \returns printable dataset type
+    ///
+    std::string TypeName() const;
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Fetches the dataset's Extensions element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::Extensions& Extensions() const;
+
+    /// \brief Fetches the dataset's ExternalResources element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::ExternalResources& ExternalResources() const;
+
+    /// \brief Fetches the dataset's Filters element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::Filters& Filters() const;
+
+    /// \brief Fetches the dataset's DataSetMetadata element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::DataSetMetadata& Metadata() const;
+
+    /// \brief Fetches the dataset's DataSets element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::SubDataSets& SubDataSets() const;
+
+    /// \}
+
+public:
+    /// \name Resource Handling
+    /// \{
+
+    /// \brief Returns all of this dataset's resource files, with relative
+    ///        filepaths already resolved.
+    ///
+    /// Includes both primary resources (e.g. subread BAM files), as well as all
+    /// secondary or child resources (e.g. index files, scraps BAM, etc).
+    ///
+    /// \returns vector of (resolved) filepaths
+    ///
+    /// \sa DataSet::ResolvedResourceIds
+    ///
+    std::vector<std::string> AllFiles() const;
+
+    /// \brief Returns this dataset's primary %BAM resources, with relative
+    ///        filepaths already resolved.
+    ///
+    /// Primary resources are those listed as top-level %ExternalResources, not
+    /// associated files (indices, references, scraps %BAMs, etc.).
+    ///
+    /// \returns vector of BamFiles
+    ///
+    /// \sa DataSet::ResolvedResourceIds
+    ///
+    std::vector<BamFile> BamFiles() const;
+
+    /// \brief Returns this dataset's primary FASTA resources, with relative
+    ///        filepaths already resolved.
+    ///
+    /// Primary resources are those listed as top-level %ExternalResources, not
+    /// associated files (indices, references, scraps %BAMs, etc.).
+    ///
+    /// \returns vector of filepaths to FASTA resources
+    ///
+    /// \sa DataSet::ResolvedResourceIds
+    ///
+    std::vector<std::string> FastaFiles() const;
+
+    /// \brief Returns all primary external resource filepaths, with relative
+    ///        paths resolved.
+    ///
+    /// Primary resources are those listed as top-level %ExternalResources, not
+    /// associated files (indices, references, scraps %BAMs, etc.).
+    ///
+    /// \sa ResolvePath
+    ///
+    /// \returns resourceIds
+    ///
+    std::vector<std::string> ResolvedResourceIds() const;
+
+    /// \brief Resolves a filepath (that may be relative to the dataset).
+    ///
+    /// A DataSet's resources may be described using absolute filepaths or with
+    /// relative paths. For absolute paths, nothing is changed from the input.
+    /// For relative paths, these are resolved using the DataSet's own path
+    /// as a starting point. A DataSet's own path will be one of:\n
+    ///  1 - the location of its XML or %BAM input file, e.g. created using
+    ///      DataSet("foo.xml") or DataSet("foo.bam")\n
+    ///  2 - application's current working directory for all other DataSet
+    ///      construction methods { DataSet(), DataSet(type),
+    ///      DataSet("foo.fofn") }\n
+    ///
+    /// \param[in] originalPath     input file path (absolute or relative)
+    /// \returns resolved path
+    ///
+    std::string ResolvePath(const std::string& originalPath) const;
+
+    /// \returns sequence chemistry info for all read groups in this dataset
+    ///
+    /// \sa ReadGroupInfo::SequencingChemistry
+    ///
+    std::set<std::string> SequencingChemistries() const;
+
+    /// \}
+
+public:
+    /// \name XML Namespace Handling
+    /// \{
+
+    /// \brief Access this dataset's namespace info.
+    ///
+    /// \returns const reference to dataset's NamespaceRegistry
+    ///
+    const NamespaceRegistry& Namespaces() const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \brief Fetches the value of a DataSet root element's attribute.
+    ///
+    /// These are the attributes attached to the root dataset element: \n
+    /// \verbatim <SubreadSet foo="x" bar="y" /> \endverbatim
+    ///
+    /// Built-in accessors exist for the standard attributes (e.g. CreatedAt)
+    /// but additional attributes can be used as well via these generic methods.
+    ///
+    /// A new attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] name root element's attribute name
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& Attribute(const std::string& name);
+
+    /// \brief Fetches the value of dataset's CreatedAt attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& CreatedAt();
+
+    /// \brief Fetches the value of dataset's Format attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& Format();
+
+    /// \brief Fetches the value of dataset's MetaType attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& MetaType();
+
+    /// \brief Fetches the value of dataset's ModifiedAt attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& ModifiedAt();
+
+    /// \brief Fetches the value of dataset's Name attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& Name();
+
+    /// \brief Fetches the value of dataset's ResourceId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& ResourceId();
+
+    /// \brief Fetches the value of dataset's Tags attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& Tags();
+
+    /// \brief Fetches the value of dataset's TimeStampedName attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& TimeStampedName();
+
+    /// \brief Fetches the value of dataset's UniqueId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& UniqueId();
+
+    /// \brief Fetches the value of dataset's Version attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute's value (empty string if this
+    ///          is a new attribute)
+    ///
+    std::string& Version();
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \brief Sets this dataset's XML attribute \p name, with \p value
+    ///
+    /// These are the attributes attached to the root dataset element: \n
+    /// \verbatim <SubreadSet foo="x" bar="y" /> \endverbatim
+    ///
+    /// Built-in accessors exist for the standard attributes (e.g. CreatedAt)
+    /// but additional attributes can be used as well via these generic methods.
+    ///
+    /// The attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] name   root element's attribute name
+    /// \param[in] value  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Attribute(const std::string& name, const std::string& value);
+
+    /// \brief Sets this dataset's CreatedAt attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] createdAt  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& CreatedAt(const std::string& createdAt);
+
+    /// \brief Sets this dataset's Format attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] format  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Format(const std::string& format);
+
+    /// \brief Sets this dataset's MetaType attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] metatype  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& MetaType(const std::string& metatype);
+
+    /// \brief Sets this dataset's ModifiedAt attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] modifiedAt  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& ModifiedAt(const std::string& modifiedAt);
+
+    /// \brief Sets this dataset's Name attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] name  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Name(const std::string& name);
+
+    /// \brief Sets this dataset's ResourceId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] resourceId  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& ResourceId(const std::string& resourceId);
+
+    /// \brief Sets this dataset's Tags attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] tags  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Tags(const std::string& tags);
+
+    /// \brief Sets this dataset's TimeStampedName attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] timeStampedName  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& TimeStampedName(const std::string& timeStampedName);
+
+    /// \brief Sets this dataset's UniqueId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] uuid  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& UniqueId(const std::string& uuid);
+
+    /// \brief Sets this dataset's Version attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] version  new value for the attribute
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Version(const std::string& version);
+
+    /// \}
+
+public:
+    /// \name DataSet Type
+    /// \{
+
+    /// \brief Edits dataset type.
+    ///
+    /// \param[in] type  new dataset type
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Type(const PacBio::BAM::DataSet::TypeEnum type);
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Fetches the dataset's Extensions element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::Extensions& Extensions();
+
+    /// \brief Fetches the dataset's ExternalResources element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::ExternalResources& ExternalResources();
+
+    /// \brief Fetches the dataset's Filters element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::Filters& Filters();
+
+    /// \brief Fetches the dataset's DataSetMetadata element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::DataSetMetadata& Metadata();
+
+    /// \brief Fetches the dataset's DataSets element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::SubDataSets& SubDataSets();
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Sets this dataset's Extensions element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] extensions  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Extensions(const PacBio::BAM::Extensions& extensions);
+
+    /// \brief Sets this dataset's ExternalResources element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] resources  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSet& ExternalResources(const PacBio::BAM::ExternalResources& resources);
+
+    /// \brief Sets this dataset's Filters element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] filters  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Filters(const PacBio::BAM::Filters& filters);
+
+    /// \brief Sets this dataset's DataSetMetadata element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] metadata  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSet& Metadata(const PacBio::BAM::DataSetMetadata& metadata);
+
+    /// \brief Sets this dataset's DataSets element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] subdatasets  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSet& SubDataSets(const PacBio::BAM::SubDataSets& subdatasets);
+
+    /// \}
+
+public:
+    /// \name XML Namespace Handling
+    /// \{
+
+    /// \brief Access this dataset's namespace info.
+    ///
+    /// \returns non-const reference to dataset's NamespaceRegistry
+    ///
+    NamespaceRegistry& Namespaces();
+
+    /// \}
+
+private:
+    std::unique_ptr<DataSetBase> d_;
+    std::string path_;
+};
+
+/// \name DataSet Timestamp Utilities
+/// \{
+
+/// \brief Fetches current time, in "DataSetXML format".
+///
+/// \returns DataSetXML formatted timestamp
+///
+/// \sa ToDataSetFormat
+///
+PBBAM_EXPORT std::string CurrentTimestamp();
+
+/// \brief Converts a time_point to "DataSetXML-formatted" timestamp.
+///
+/// This is the format used as a component of the DataSet::TimeStampedName
+/// (yymmdd_HHmmssttt).
+///
+/// \returns "DataSetXML-formatted" timestamp
+///
+PBBAM_EXPORT std::string ToDataSetFormat(const std::chrono::system_clock::time_point& tp);
+
+/// \brief Converts a time_t to "DataSetXML-formatted" timestamp.
+///
+/// This is the format used as a component of the DataSet::TimeStampedName
+/// (yymmdd_HHmmssttt).
+///
+/// \returns "DataSetXML-formatted" timestamp
+///
+PBBAM_EXPORT std::string ToDataSetFormat(const time_t& tp);
+
+/// \brief Converts a time_point to ISO-8601 formatted timestamp.
+///
+/// This is the format used in DataSet::CreatedAt and DataSet::ModifiedAt.
+///
+/// \returns ISO-8601 formatted timestamp
+///
+PBBAM_EXPORT std::string ToIso8601(const std::chrono::system_clock::time_point& tp);
+
+/// \brief Converts a time_t to ISO-8601 formatted timestamp.
+///
+/// This is the format used in DataSet::CreatedAt and DataSet::ModifiedAt.
+///
+/// \returns ISO-8601 formatted timestamp
+///
+PBBAM_EXPORT std::string ToIso8601(const time_t& t);
+
+/// \}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/DataSet.inl"
+
+#endif  // DATASET_H
diff --git a/include/pbbam/DataSetTypes.h b/include/pbbam/DataSetTypes.h

new file mode 100644 (file)

index 0000000..43da7f6
--- /dev/null
+++ b/include/pbbam/DataSetTypes.h
@@ -0,0 +1,879 @@
+// File Description
+/// \file DataSetTypes.h
+/// \brief Defines the public DataSet component classes.
+//
+// Author: Derek Barnett
+
+#ifndef DATASETTYPES_H
+#define DATASETTYPES_H
+
+#include <string>
+#include "pbbam/BamFile.h"
+#include "pbbam/Config.h"
+#include "pbbam/DataSetXsd.h"
+#include "pbbam/internal/DataSetBaseTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The DataSetMetadata class represents the %DataSetMetadata child
+///        element in DataSetXML.
+///
+/// A few top-level elements are built-in, but as pbbam is not primarily a
+/// DataSetXML API, most of the metadata hierarchy needs to be manually managed.
+///
+class PBBAM_EXPORT DataSetMetadata : public internal::DataSetElement
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Constructs a DataSetMetadata with required fields.
+    DataSetMetadata(const std::string& numRecords, const std::string& totalLength);
+
+    /// \}
+
+public:
+    /// \name Operators
+    /// \{
+
+    /// \brief Merges DataSetMetadata contents.
+    ///
+    /// Adds contents of \p other to this metadata object
+    ///
+    /// \param[in] other  some other metadata to add to this one
+    /// \returns reference to this object
+    ///
+    DataSetMetadata& operator+=(const DataSetMetadata& other);
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Fetches the text of the NumRecords element.
+    ///
+    /// \returns const reference to element text (empty string if not present)
+    ///
+    const std::string& NumRecords() const;
+
+    /// \brief Fetches the text of the TotalLength element.
+    ///
+    /// \returns const reference to element text (empty string if not present)
+    ///
+    const std::string& TotalLength() const;
+
+    /// \brief Fetches the Provenance element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::Provenance& Provenance() const;
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Fetches the text of the NumRecords element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to element text
+    ///
+    std::string& NumRecords();
+
+    /// \brief Fetches the text of the TotalLength element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to element text
+    ///
+    std::string& TotalLength();
+
+    /// \brief Fetches Provenance element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::Provenance& Provenance();
+
+    /// \}
+
+public:
+    /// \name Child Elements
+    /// \{
+
+    /// \brief Sets the text of the NumRecords element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns reference to this metadata object
+    ///
+    DataSetMetadata& NumRecords(const std::string& numRecords);
+
+    /// \brief Sets the text of the TotalLength element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns reference to this metadata object
+    ///
+    DataSetMetadata& TotalLength(const std::string& totalLength);
+
+    /// \brief Sets the Provenance child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns reference to this metadata object
+    ///
+    DataSetMetadata& Provenance(const PacBio::BAM::Provenance& provenance);
+
+    /// \}
+};
+
+/// \brief The ExtensionElement class represents an %ExtensionElement element in
+///        DataSetXML.
+///
+class PBBAM_EXPORT ExtensionElement : public internal::DataSetElement
+{
+public:
+    ExtensionElement();
+};
+
+/// \brief The Extensions class represents an %Extensions element in DataSetXML.
+///
+/// The Extensions element is essentially just a list of ExtensionElement
+/// objects.
+///
+class PBBAM_EXPORT Extensions : public internal::DataSetListElement<ExtensionElement>
+{
+public:
+    /// \brief Creates an empty extensions list.
+    Extensions();
+};
+
+class ExternalResources;
+
+/// \brief The ExternalResource class represents an %ExternalResource element in
+///        DataSetXML.
+///
+/// An ExternalResource can itself have a child element, ExternalResources, that
+/// lists related files (e.g. index files).
+///
+class PBBAM_EXPORT ExternalResource : public internal::IndexedDataType
+{
+public:
+    /// \brief Creates an ExternalResource from a BamFile object.
+    ///
+    /// The metatype & resourceId are automatically set.
+    /// \note Not marked explicit, so a BamFile converts implicitly to an ExternalResource.
+    ExternalResource(const BamFile& bamFile);
+
+    /// \brief Creates an ExternalResource with provided \p metatype and
+    ///        \p filename as resource ID.
+    ///
+    ExternalResource(const std::string& metatype, const std::string& filename);
+
+public:
+    /// \brief Fetches the resource's ExternalResources child element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::ExternalResources& ExternalResources() const;
+
+public:
+    /// \brief Fetches the resource's ExternalResources child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::ExternalResources& ExternalResources();
+
+    /// \brief Sets this resource's ExternalResources child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] resources  new value for the element
+    /// \returns reference to this resource object
+    ///
+    ExternalResource& ExternalResources(const PacBio::BAM::ExternalResources& resources);
+
+public:
+    /// \brief Converts an ExternalResource to a BamFile object.
+    ///
+    /// \returns corresponding BamFile object for this ExternalResource
+    /// \throws std::runtime_error if the %BAM file cannot be opened (e.g. it
+    ///         does not exist, is not a %BAM file, etc.)
+    ///
+    /// \deprecated Use the results from DataSet::BamFiles instead. This method
+    ///             cannot resolve relative filepaths and will be removed in the
+    ///             near future.
+    ///
+    BamFile ToBamFile() const;
+};
+
+/// \brief The ExternalResources class represents an %ExternalResources element
+///        in DataSetXML.
+///
+/// The ExternalResources element is essentially just a list of ExternalResource
+/// elements.
+///
+class PBBAM_EXPORT ExternalResources : public internal::DataSetListElement<ExternalResource>
+{
+public:
+    /// \brief Creates an empty resource list.
+    ExternalResources();
+
+    /// \brief Merges \p other resource list with this one.
+    ExternalResources& operator+=(const ExternalResources& other);
+
+public:
+    /// \brief Adds the ExternalResource \p ext to this list.
+    void Add(const ExternalResource& ext);
+
+    /// \brief Removes the ExternalResource \p ext from this list.
+    void Remove(const ExternalResource& ext);
+
+public:
+    /// \brief Converts resource list to BamFile objects.
+    ///
+    /// \deprecated Use DataSet::BamFiles instead. This method cannot resolve
+    ///             relative filepaths and will be removed in the near future.
+    /// \returns vector of BamFile objects
+    std::vector<BamFile> BamFiles() const;
+};
+
+/// \brief The FileIndex class represents a %FileIndex element in DataSetXML.
+///
+/// A FileIndex is used as an auxiliary to an ExternalResource, providing
+/// information about a data file's index file (e.g. for %BAM files, *.bai or
+/// *.pbi).
+///
+class PBBAM_EXPORT FileIndex : public internal::InputOutputDataType
+{
+public:
+    /// \brief Creates a FileIndex with the provided \p metatype, using
+    ///        \p filename as its resource ID.
+    ///
+    FileIndex(const std::string& metatype, const std::string& filename);
+};
+
+/// \brief The FileIndices class represents a %FileIndices element in DataSetXML.
+///
+/// The FileIndices element is essentially just a list of FileIndex elements,
+/// providing information about a data file's index files (e.g. for %BAM files
+/// this will usually be *.bai and/or *.pbi).
+///
+class PBBAM_EXPORT FileIndices : public internal::DataSetListElement<FileIndex>
+{
+public:
+    /// \brief Creates an empty index list.
+    FileIndices();
+
+public:
+    /// \brief Adds the FileIndex \p index to this list.
+    void Add(const FileIndex& index);
+
+    /// \brief Removes the FileIndex \p index from this list.
+    void Remove(const FileIndex& index);
+};
+
+/// \brief The Filter class represents a %Filter element in DataSetXML.
+///
+/// The Filter element allows analysis pipelines to describe filters on data
+/// that should be respected downstream, without needing to create filtered
+/// intermediate files.
+///
+/// A filter consists of a list of Property elements, each of which must be
+/// passed (logical AND) to pass the filter, e.g. property1 && property2 &&
+/// property3.
+///
+class PBBAM_EXPORT Filter : public internal::DataSetElement
+{
+public:
+    /// \brief Creates an empty filter.
+    Filter();
+
+public:
+    /// \brief Fetches the filter's Properties child element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::Properties& Properties() const;
+
+public:
+    /// \brief Fetches the filter's Properties child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::Properties& Properties();
+
+    /// \brief Sets this filter's Properties child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] properties new value for the element
+    /// \returns reference to this filter object
+    ///
+    Filter& Properties(const PacBio::BAM::Properties& properties);
+};
+
+/// \brief The Filters class represents a %Filters list element in DataSetXML.
+///
+/// The Filters element is essentially a list of Filter elements. For analysis
+/// purposes, the filters are combined with logical OR to determine which data
+/// passes, e.g. filter1 || filter2 || filter3.
+///
+class PBBAM_EXPORT Filters : public internal::DataSetListElement<Filter>
+{
+public:
+    /// \brief Creates an empty filter list.
+    Filters();
+
+    /// \brief Merges \p other filter list with this one.
+    Filters& operator+=(const Filters& other);
+
+public:
+    /// \brief Adds the Filter \p filter to this list.
+    void Add(const Filter& filter);
+
+    /// \brief Removes the Filter \p filter from this list.
+    void Remove(const Filter& filter);
+};
+
+/// \brief The ParentTool class represents a %ParentTool element in DataSetXML.
+/// \note Declares only a default constructor on top of internal::BaseEntityType.
+class PBBAM_EXPORT ParentTool : public internal::BaseEntityType
+{
+public:
+    /// \brief Creates an empty %ParentTool element.
+    ParentTool();
+};
+
+/// \brief The Property class represents a %Property element in DataSetXML.
+///
+/// A Property is the primary building block of %DataSetXML filtering. The
+/// %Property element describes a data record's property (or field), some value,
+/// and a comparison operator.
+///
+/// For example, one could filter all %BAM records with a read accuracy at or
+/// above 0.9. In C++ this could be constructed like:
+/// \code{.cpp}
+/// Property p("accuracy", "0.9", ">=");
+/// \endcode
+///
+class PBBAM_EXPORT Property : public internal::DataSetElement
+{
+public:
+    /// \brief Constructs a filter property from \p name, \p value, and comparison operator \p op.
+    Property(const std::string& name, const std::string& value, const std::string& op);
+
+public:
+    /// \brief Fetches the value of property's Name attribute.
+    ///
+    /// \returns const reference to attribute value
+    ///
+    const std::string& Name() const;
+
+    /// \brief Fetches the value of property's Operator attribute.
+    ///
+    /// \returns const reference to attribute value
+    ///
+    const std::string& Operator() const;
+
+    /// \brief Fetches the value of property's Value attribute.
+    ///
+    /// \returns const reference to attribute value
+    ///
+    const std::string& Value() const;
+
+public:
+    /// \brief Fetches the value of property's Name attribute.
+    ///
+    /// \returns non-const reference to attribute value
+    ///
+    std::string& Name();
+
+    /// \brief Fetches the value of property's Operator attribute.
+    ///
+    /// \returns non-const reference to attribute value
+    ///
+    std::string& Operator();
+
+    /// \brief Fetches the value of property's Value attribute.
+    ///
+    /// \returns non-const reference to attribute value
+    ///
+    std::string& Value();
+
+public:
+    /// \brief Sets this property's Name attribute.
+    ///
+    /// \param[in] name  new value for the attribute
+    /// \returns reference to this property object
+    ///
+    Property& Name(const std::string& name);
+
+    /// \brief Sets this property's Operator attribute.
+    ///
+    /// \param[in] op  new value for the attribute
+    /// \returns reference to this property object
+    ///
+    Property& Operator(const std::string& op);
+
+    /// \brief Sets this property's Value attribute.
+    ///
+    /// \param[in] value  new value for the attribute
+    /// \returns reference to this property object
+    ///
+    Property& Value(const std::string& value);
+};
+
+/// \brief The Properties class represents a %Properties list element in
+///        DataSetXML.
+///
+/// The Properties element is essentially a list of Property elements.
+///
+class PBBAM_EXPORT Properties : public internal::DataSetListElement<Property>
+{
+public:
+    /// \brief Creates an empty property list.
+    Properties();
+
+public:
+    /// \brief Adds the Property \p property to this list.
+    void Add(const Property& property);
+
+    /// \brief Removes the Property \p property from this list.
+    void Remove(const Property& property);
+};
+
+/// \brief The Provenance class represents a %Provenance element in DataSetXML.
+///
+class PBBAM_EXPORT Provenance : public internal::DataSetElement
+{
+public:
+    /// \brief Creates an empty provenance element.
+    Provenance();
+
+public:
+    /// \brief Fetches the value of CreatedBy attribute.
+    ///
+    /// \returns const reference to attribute value (empty string if not
+    ///          present)
+    ///
+    const std::string& CreatedBy() const;
+
+    /// \brief Fetches the value of CommonServicesInstanceId attribute.
+    ///
+    /// \returns const reference to attribute value (empty string if not
+    ///          present)
+    ///
+    const std::string& CommonServicesInstanceId() const;
+
+    /// \brief Fetches the value of CreatorUserId attribute.
+    ///
+    /// \returns const reference to attribute value (empty string if not
+    ///          present)
+    ///
+    const std::string& CreatorUserId() const;
+
+    /// \brief Fetches the value of ParentJobId attribute.
+    ///
+    /// \returns const reference to attribute value (empty string if not
+    ///          present)
+    ///
+    const std::string& ParentJobId() const;
+
+    /// \brief Fetches the ParentTool child element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::ParentTool& ParentTool() const;
+
+public:
+    /// \brief Fetches the value of CreatedBy attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute value (empty string if this is
+    ///          a new attribute)
+    ///
+    std::string& CreatedBy();
+
+    /// \brief Fetches the value of CommonServicesInstanceId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute value (empty string if this is
+    ///          a new attribute)
+    ///
+    std::string& CommonServicesInstanceId();
+
+    /// \brief Fetches the value of CreatorUserId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute value (empty string if this is
+    ///          a new attribute)
+    ///
+    std::string& CreatorUserId();
+
+    /// \brief Fetches the value of ParentJobId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to attribute value (empty string if this is
+    ///          a new attribute)
+    ///
+    std::string& ParentJobId();
+
+    /// \brief Fetches the ParentTool child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::ParentTool& ParentTool();
+
+public:
+    /// \brief Sets the CreatedBy attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] createdBy  new value for the attribute
+    /// \returns reference to this object
+    ///
+    Provenance& CreatedBy(const std::string& createdBy);
+
+    /// \brief Sets the CommonServicesInstanceId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] id  new value for the attribute
+    /// \returns reference to this object
+    ///
+    Provenance& CommonServicesInstanceId(const std::string& id);
+
+    /// \brief Sets the CreatorUserId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] id  new value for the attribute
+    /// \returns reference to this object
+    ///
+    Provenance& CreatorUserId(const std::string& id);
+
+    /// \brief Sets the ParentJobId attribute.
+    ///
+    /// This attribute will be created if it does not yet exist.
+    ///
+    /// \param[in] id  new value for the attribute
+    /// \returns reference to this object
+    ///
+    Provenance& ParentJobId(const std::string& id);
+
+    /// \brief Sets the ParentTool child element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] tool  new value for the element
+    /// \returns reference to this object
+    ///
+    Provenance& ParentTool(const PacBio::BAM::ParentTool& tool);
+};
+
+class SubDataSets;
+
+/// \brief The DataSetBase class provides the attributes & child elements shared
+///        by all dataset types.
+///
+/// Client code should not need to use this class directly. It should be
+/// considered as more of an implementation detail and may in fact be removed
+/// from public API in the future. The top-level DataSet is the recommended
+/// entry point.
+///
+class PBBAM_EXPORT DataSetBase : public internal::StrictEntityType
+{
+public:
+    /// \brief Creates a DataSetBase object, or one of its subclasses, from an
+    ///        XML element name (e.g. SubreadSet).
+    ///
+    static std::shared_ptr<DataSetBase> Create(const std::string& typeName);
+
+public:
+    /// \brief Creates an empty, generic DataSetBase.
+    DataSetBase();
+
+protected:
+    /// \brief Creates a DataSetBase with key values initialized.
+    DataSetBase(const std::string& metatype, const std::string& label, const XsdType& xsd);
+
+    /// \brief Returns a new DataSetBase deep copy (raw pointer; caller-owned? TODO confirm).
+    DataSetBase* DeepCopy() const;
+
+public:
+    /// \brief Merges dataset contents.
+    ///
+    /// Adds contents of \p other to this dataset object
+    ///
+    /// \param[in] other  some other dataset to add to this one
+    /// \returns reference to this dataset object
+    ///
+    DataSetBase& operator+=(const DataSetBase& other);
+
+public:
+    /// \brief Fetches the dataset's ExternalResources element.
+    ///
+    /// \returns const reference to child element
+    /// \throws std::runtime_error if element does not exist
+    ///
+    const PacBio::BAM::ExternalResources& ExternalResources() const;
+
+    /// \brief Fetches the dataset's Filters element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::Filters& Filters() const;
+
+    /// \brief Fetches the dataset's DataSetMetadata element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::DataSetMetadata& Metadata() const;
+
+    /// \brief Fetches the dataset's DataSets element.
+    ///
+    /// \returns const reference to child element
+    ///
+    const PacBio::BAM::SubDataSets& SubDataSets() const;
+
+public:
+    /// \brief Access this dataset's namespace info.
+    ///
+    /// \returns const reference to dataset's NamespaceRegistry
+    ///
+    const NamespaceRegistry& Namespaces() const;
+
+public:
+    /// \brief Fetches the dataset's ExternalResources element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::ExternalResources& ExternalResources();
+
+    /// \brief Fetches the dataset's Filters element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::Filters& Filters();
+
+    /// \brief Fetches the dataset's DataSetMetadata element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::DataSetMetadata& Metadata();
+
+    /// \brief Fetches the dataset's DataSets element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \returns non-const reference to child element
+    ///
+    PacBio::BAM::SubDataSets& SubDataSets();
+
+public:
+    /// \brief Sets this dataset's ExternalResources element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] resources  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSetBase& ExternalResources(const PacBio::BAM::ExternalResources& resources);
+
+    /// \brief Sets this dataset's Filters element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] filters  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSetBase& Filters(const PacBio::BAM::Filters& filters);
+
+    /// \brief Sets this dataset's DataSetMetadata element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] metadata  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSetBase& Metadata(const PacBio::BAM::DataSetMetadata& metadata);
+
+    /// \brief Sets this dataset's DataSets element.
+    ///
+    /// This element will be created if it does not yet exist.
+    ///
+    /// \param[in] subdatasets  new value for the element
+    /// \returns reference to this dataset object
+    ///
+    DataSetBase& SubDataSets(const PacBio::BAM::SubDataSets& subdatasets);
+
+public:
+    /// \brief Access this dataset's namespace info.
+    ///
+    /// \returns non-const reference to dataset's NamespaceRegistry
+    ///
+    NamespaceRegistry& Namespaces();
+
+private:
+    NamespaceRegistry registry_;
+};
+
+/// \brief The AlignmentSet class represents an %AlignmentSet root element in
+///        DataSetXML.
+/// \note Declares only a default constructor; the rest of its API is inherited from DataSetBase.
+class PBBAM_EXPORT AlignmentSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty AlignmentSet dataset.
+    AlignmentSet();
+};
+
+/// \brief The BarcodeSet class represents a %BarcodeSet root element in
+///        DataSetXML.
+/// \note Declares only a default constructor; the rest of its API is inherited from DataSetBase.
+class PBBAM_EXPORT BarcodeSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty BarcodeSet dataset.
+    BarcodeSet();
+};
+
+/// \brief The ConsensusAlignmentSet class represents a %ConsensusAlignmentSet
+///        root element in DataSetXML.
+/// \note Declares only a default constructor; the rest of its API is inherited from DataSetBase.
+class PBBAM_EXPORT ConsensusAlignmentSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty ConsensusAlignmentSet dataset.
+    ConsensusAlignmentSet();
+};
+
+/// \brief The ConsensusReadSet class represents a %ConsensusReadSet root
+///        element in DataSetXML.
+/// \note Declares only a default constructor; the rest of its API is inherited from DataSetBase.
+class PBBAM_EXPORT ConsensusReadSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty ConsensusReadSet dataset.
+    ConsensusReadSet();
+};
+
+/// \brief The ContigSet class represents a %ContigSet root element in
+///        DataSetXML.
+/// \note Declares only a default constructor; the rest of its API is inherited from DataSetBase.
+class PBBAM_EXPORT ContigSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty ContigSet dataset.
+    ContigSet();
+};
+
+/// \brief The HdfSubreadSet class represents an %HdfSubreadSet root element in
+///        DataSetXML.
+/// \note Declares only a default constructor; the rest of its API is inherited from DataSetBase.
+class PBBAM_EXPORT HdfSubreadSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty HdfSubreadSet dataset.
+    HdfSubreadSet();
+};
+
+/// \brief The ReferenceSet class represents a %ReferenceSet root element in
+///        DataSetXML.
+/// \note Declares only a default constructor; the rest of its API is inherited from DataSetBase.
+class PBBAM_EXPORT ReferenceSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty ReferenceSet dataset.
+    ReferenceSet();
+};
+
+/// \brief The SubDataSets class represents a %DataSets list element in
+///        DataSetXML.
+///
+/// The SubDataSets element is essentially a list of DataSets.
+///
+class PBBAM_EXPORT SubDataSets : public internal::DataSetListElement<DataSetBase>
+{
+public:
+    /// \brief Creates an empty list of sub-datasets.
+    SubDataSets();
+
+public:
+    /// \brief Adds \p other sub-dataset to this list.
+    SubDataSets& operator+=(const DataSetBase& other);  // single
+
+    /// \brief Adds \p other sub-dataset list to this list.
+    SubDataSets& operator+=(const SubDataSets& other);  // list
+
+public:
+    /// \brief Adds the sub-dataset \p subdataset to this list.
+    void Add(const DataSetBase& subdataset);
+
+    /// \brief Removes the sub-dataset \p subdataset from this list.
+    void Remove(const DataSetBase& subdataset);
+};
+
+/// \brief The SubreadSet class represents a %SubreadSet root element in
+///        DataSetXML.
+/// \note Declares only a default constructor; the rest of its API is inherited from DataSetBase.
+class PBBAM_EXPORT SubreadSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty SubreadSet dataset.
+    SubreadSet();
+};
+
+/// \brief The TranscriptSet class represents a %TranscriptSet root element in
+///        DataSetXML.
+/// \note Declares only a default constructor; the rest of its API is inherited from DataSetBase.
+class PBBAM_EXPORT TranscriptSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty TranscriptSet dataset.
+    TranscriptSet();
+};
+
+/// \brief The TranscriptAlignmentSet class represents a %TranscriptAlignmentSet
+///        root element in DataSetXML.
+/// \note Declares only a default constructor; the rest of its API is inherited from DataSetBase.
+class PBBAM_EXPORT TranscriptAlignmentSet : public DataSetBase
+{
+public:
+    /// \brief Creates an empty TranscriptAlignmentSet dataset.
+    TranscriptAlignmentSet();
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "internal/DataSetTypes.inl"
+
+#endif  // DATASETTYPES_H
diff --git a/include/pbbam/DataSetXsd.h b/include/pbbam/DataSetXsd.h

new file mode 100644 (file)

index 0000000..a711cd3
--- /dev/null
+++ b/include/pbbam/DataSetXsd.h
@@ -0,0 +1,120 @@
+// File Description
+/// \file DataSetXsd.h
+/// \brief Defines the XSD- and namespace-related classes for DataSetXML.
+//
+// Author: Derek Barnett
+
+#ifndef DATASETXSD_H
+#define DATASETXSD_H
+
+#include <map>
+#include <string>
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The XsdType enum defines the supported XSD namespaces.
+/// \note Scoped enum; it is the key type of NamespaceRegistry's internal std::map.
+enum class XsdType
+{
+    NONE,
+    AUTOMATION_CONSTRAINTS,
+    BASE_DATA_MODEL,
+    COLLECTION_METADATA,
+    COMMON_MESSAGES,
+    DATA_MODEL,
+    DATA_STORE,
+    DATASETS,
+    DECL_DATA,
+    PART_NUMBERS,
+    PRIMARY_METRICS,
+    REAGENT_KIT,
+    RIGHTS_AND_ROLES,
+    SAMPLE_INFO,
+    SEEDING_DATA
+};
+
+/// \brief The NamespaceInfo class provides XML namespace info (prefix & URI).
+///
+class PBBAM_EXPORT NamespaceInfo
+{
+public:
+    /// \brief Creates an empty entry.
+    ///
+    /// This constructor only exists for STL container compatibility.
+    ///
+    NamespaceInfo() = default;
+
+    /// \brief Creates a valid info entry from a namespace \p name (i.e. prefix) and \p uri.
+    NamespaceInfo(std::string name, std::string uri);
+
+public:
+    /// \brief Fetches namespace name (i.e. prefix).
+    const std::string& Name() const { return name_; }
+
+    /// \brief Fetches namespace URI.
+    const std::string& Uri() const { return uri_; }
+
+private:
+    std::string name_;
+    std::string uri_;
+};
+
+/// \brief The NamespaceRegistry class provides a per-dataset registry of XML
+///        namespace information.
+///
+/// This is used to format XML output - properly prefixing element labels with
+/// namespace as appropriate.
+///
+class PBBAM_EXPORT NamespaceRegistry
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    NamespaceRegistry();
+    NamespaceRegistry(const NamespaceRegistry&) = default;
+    NamespaceRegistry(NamespaceRegistry&&) = default;
+    NamespaceRegistry& operator=(const NamespaceRegistry&) = default;
+    NamespaceRegistry& operator=(NamespaceRegistry&&) = default;
+    ~NamespaceRegistry() = default;
+
+    /// \}
+
+public:
+    /// \name Registry Access
+    /// \{
+
+    /// \brief Fetches namespace info for the dataset's default XSD type.
+    const NamespaceInfo& DefaultNamespace() const;
+
+    /// \brief Fetches dataset's default XSD type (in-class initializer: XsdType::DATASETS).
+    XsdType DefaultXsd() const;
+
+    /// \brief Fetches namespace info for the requested XSD type.
+    const NamespaceInfo& Namespace(const XsdType& xsd) const;
+
+    /// \brief Registers namespace info for a particular XSD type.
+    void Register(const XsdType& xsd, const NamespaceInfo& namespaceInfo);
+
+    /// \brief Updates dataset's default XSD type.
+    void SetDefaultXsd(const XsdType& xsd);
+
+    /// \brief Fetches the XSD type for \p elementLabel.
+    XsdType XsdForElement(const std::string& elementLabel) const;
+
+    /// \brief Fetches the XSD type for a particular URI.
+    XsdType XsdForUri(const std::string& uri) const;
+
+    /// \}
+
+private:
+    std::map<XsdType, NamespaceInfo> data_;
+    XsdType defaultXsdType_ = XsdType::DATASETS;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // DATASETXSD_H
diff --git a/include/pbbam/EntireFileQuery.h b/include/pbbam/EntireFileQuery.h

new file mode 100644 (file)

index 0000000..f48e2c8
--- /dev/null
+++ b/include/pbbam/EntireFileQuery.h
@@ -0,0 +1,62 @@
+// File Description
+/// \file EntireFileQuery.h
+/// \brief Defines the EntireFileQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef ENTIREFILEQUERY_H
+#define ENTIREFILEQUERY_H
+
+#include <memory>
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The EntireFileQuery class provides iterable access to a DataSet's
+///        %BAM records, reading through the entire contents of each file.
+///
+/// Input files will be accessed in the order listed in the DataSet.
+///
+/// \include code/EntireFileQuery.txt
+///
+/// Iteration is not limited to only 'const' records. The files themselves will
+/// not be affected, but individual records may be modified if needed.
+///
+/// \include code/EntireFileQuery_NonConst.txt
+///
+/// \note DataSets can be implicitly constructed from %BAM filenames as well.
+///       Thus a single %BAM file can be read through using the following:
+///
+/// \include code/EntireFileQuery_BamFilename.txt
+///
+class PBBAM_EXPORT EntireFileQuery : public internal::IQuery
+{
+public:
+    /// \brief Creates a new EntireFileQuery, reading through the entire
+    ///        contents of a dataset.
+    ///
+    /// \param[in] dataset  input data source(s)
+    /// \throws std::runtime_error on failure to open/read underlying %BAM
+    ///         files.
+    /// \note Not marked explicit, so a DataSet converts implicitly to an EntireFileQuery.
+    EntireFileQuery(const PacBio::BAM::DataSet& dataset);
+    ~EntireFileQuery() override;
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r) override;
+
+private:
+    struct EntireFileQueryPrivate;
+    std::unique_ptr<EntireFileQueryPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ENTIREFILEQUERY_H
diff --git a/include/pbbam/FastaReader.h b/include/pbbam/FastaReader.h

new file mode 100644 (file)

index 0000000..5fd9505
--- /dev/null
+++ b/include/pbbam/FastaReader.h
@@ -0,0 +1,78 @@
+// File Description
+/// \file FastaReader.h
+/// \brief Defines the FastaReader class.
+//
+// Author: Derek Barnett
+
+#ifndef FASTAREADER_H
+#define FASTAREADER_H
+
+#include <memory>
+#include <vector>
+#include "pbbam/FastaSequence.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+struct FastaReaderPrivate;
+}
+
+///
+/// \brief The FastaReader provides sequential access to FASTA records.
+///
+class FastaReader
+{
+public:
+    ///
+    /// \brief Reads all FASTA sequences from a file.
+    ///
+    /// \param fn   FASTA filename
+    /// \return vector of FastaSequence results
+    ///
+    static std::vector<FastaSequence> ReadAll(const std::string& fn);
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    explicit FastaReader(const std::string& fn);
+    FastaReader(const FastaReader&) = delete;
+    FastaReader(FastaReader&&) = default;
+    FastaReader& operator=(const FastaReader&) = delete;
+    FastaReader& operator=(FastaReader&&) = default;
+    ~FastaReader();
+
+    /// \}
+
+public:
+    /// \name Sequence Access
+    /// \{
+
+    ///
+    /// \brief Fetches the next FASTA record into \p record.
+    ///
+    /// \code{.cpp}
+    ///
+    /// FastaReader reader{ fn };
+    /// FastaSequence f;
+    /// while (reader.GetNext(f)) {
+    ///     // do stuff with f
+    /// }
+    /// \endcode
+    ///
+    /// \param[out] record  receives the next FASTA record
+    /// \return success/failure
+    ///
+    bool GetNext(FastaSequence& record);
+
+    /// \}
+
+private:
+    std::unique_ptr<internal::FastaReaderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FASTAREADER_H
diff --git a/include/pbbam/FastaSequence.h b/include/pbbam/FastaSequence.h

new file mode 100644 (file)

index 0000000..c9bc08f
--- /dev/null
+++ b/include/pbbam/FastaSequence.h
@@ -0,0 +1,68 @@
+// File Description
+/// \file FastaSequence.h
+/// \brief Defines the FastaSequence class.
+//
+// Author: Derek Barnett
+
+#ifndef FASTASEQUENCE_H
+#define FASTASEQUENCE_H
+
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+///
+/// \brief The FastaSequence class represents a FASTA record (name & bases)
+///
+class FastaSequence
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    ///
+    /// \brief Creates a FASTA record from \p name and \p bases.
+    /// \param name   record name
+    /// \param bases  record bases
+    ///
+    explicit FastaSequence(std::string name, std::string bases);
+
+    FastaSequence() = default;
+    FastaSequence(const FastaSequence&) = default;
+    FastaSequence(FastaSequence&&) = default;
+    FastaSequence& operator=(const FastaSequence&) = default;
+    FastaSequence& operator=(FastaSequence&&) = default;
+    ~FastaSequence() = default;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    ///
+    /// \brief Fetches the record's name.
+    /// \return const reference to the name
+    ///
+    const std::string& Name() const;
+
+    ///
+    /// \brief Fetches the record's bases.
+    /// \return const reference to the bases
+    ///
+    const std::string& Bases() const;
+
+    /// \}
+
+private:
+    std::string name_;
+    std::string bases_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "internal/FastaSequence.inl"
+
+#endif  // FASTASEQUENCE_H
diff --git a/include/pbbam/FastaSequenceQuery.h b/include/pbbam/FastaSequenceQuery.h

new file mode 100644 (file)

index 0000000..10f8086
--- /dev/null
+++ b/include/pbbam/FastaSequenceQuery.h
@@ -0,0 +1,50 @@
+// File Description
+/// \file FastaSequenceQuery.h
+/// \brief Defines the FastaSequenceQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef FASTASEQUENCEQUERY_H
+#define FASTASEQUENCEQUERY_H
+
+#include "pbbam/DataSet.h"
+#include "pbbam/FastaSequence.h"
+#include "pbbam/internal/QueryBase.h"
+
+#include <memory>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+///
+/// \brief The FastaSequenceQuery class iterates over a DataSet's FastaSequence records.
+///
+class FastaSequenceQuery : public internal::QueryBase<FastaSequence>
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    FastaSequenceQuery(const PacBio::BAM::DataSet& dataset);
+    ~FastaSequenceQuery() override;
+
+    /// \}
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(FastaSequence& seq) override;
+
+private:
+    struct FastaSequenceQueryPrivate;
+    std::unique_ptr<FastaSequenceQueryPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FASTASEQUENCEQUERY_H
diff --git a/include/pbbam/FastqReader.h b/include/pbbam/FastqReader.h

new file mode 100644 (file)

index 0000000..330a062
--- /dev/null
+++ b/include/pbbam/FastqReader.h
@@ -0,0 +1,78 @@
+// File Description
+/// \file FastqReader.h
+/// \brief Defines the FastqReader class.
+//
+// Author: Derek Barnett
+
+#ifndef FASTQREADER_H
+#define FASTQREADER_H
+
+#include <memory>
+#include <vector>
+#include "pbbam/FastqSequence.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+struct FastqReaderPrivate;
+}
+
+///
+/// \brief The FastqReader provides sequential access to FASTQ records.
+///
+class FastqReader
+{
+public:
+    ///
+    /// \brief Reads all FASTQ sequences from a file.
+    ///
+    /// \param fn   FASTQ filename
+    /// \return vector of FastqSequence results
+    ///
+    static std::vector<FastqSequence> ReadAll(const std::string& fn);
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    explicit FastqReader(const std::string& fn);
+    FastqReader(const FastqReader&) = delete;
+    FastqReader(FastqReader&& other) = default;
+    FastqReader& operator=(const FastqReader&) = delete;
+    FastqReader& operator=(FastqReader&& other) = default;
+    ~FastqReader();
+
+    /// \}
+
+public:
+    /// \name Sequence Access
+    /// \{
+
+    ///
+    /// \brief Fetches the next FASTQ record into \p record.
+    ///
+    /// \code{.cpp}
+    ///
+    /// FastqReader reader{ fn };
+    /// FastqSequence f;
+    /// while (reader.GetNext(f)) {
+    ///     // do stuff with f
+    /// }
+    /// \endcode
+    ///
+    /// \param[out] record  receives the next FASTQ record
+    /// \return success/failure
+    ///
+    bool GetNext(FastqSequence& record);
+
+    /// \}
+
+private:
+    std::unique_ptr<internal::FastqReaderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FASTQREADER_H
diff --git a/include/pbbam/FastqSequence.h b/include/pbbam/FastqSequence.h

new file mode 100644 (file)

index 0000000..ce2e7c5
--- /dev/null
+++ b/include/pbbam/FastqSequence.h
@@ -0,0 +1,74 @@
+// File Description
+/// \file FastqSequence.h
+/// \brief Defines the FastqSequence class.
+//
+// Author: Derek Barnett
+
+#ifndef FASTQSEQUENCE_H
+#define FASTQSEQUENCE_H
+
+#include <pbbam/FastaSequence.h>
+#include <pbbam/QualityValues.h>
+
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+///
+/// \brief The FastqSequence class represents a FASTQ record (name, bases, and
+///        qualities)
+///
+class FastqSequence : public FastaSequence
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    ///
+    /// \brief FastqSequence
+    /// \param name
+    /// \param bases
+    /// \param qualities
+    ///
+    explicit FastqSequence(std::string name, std::string bases, QualityValues qualities);
+
+    ///
+    /// \brief FastqSequence
+    /// \param name
+    /// \param bases
+    /// \param qualities
+    ///
+    explicit FastqSequence(std::string name, std::string bases, std::string qualities);
+
+    FastqSequence() = default;
+    FastqSequence(const FastqSequence&) = default;
+    FastqSequence(FastqSequence&&) = default;
+    FastqSequence& operator=(const FastqSequence&) = default;
+    FastqSequence& operator=(FastqSequence&&) = default;
+    ~FastqSequence() = default;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    ///
+    /// \brief Qualities
+    /// \return
+    ///
+    const QualityValues& Qualities() const;
+
+    /// \}
+
+private:
+    QualityValues qualities_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "internal/FastqSequence.inl"
+
+#endif  // FASTQSEQUENCE_H
diff --git a/include/pbbam/FrameEncodingType.h b/include/pbbam/FrameEncodingType.h

new file mode 100644 (file)

index 0000000..da7a7a9
--- /dev/null
+++ b/include/pbbam/FrameEncodingType.h
@@ -0,0 +1,31 @@
+// File Description
+/// \file FrameEncodingType.h
+/// \brief Defines the FrameEncodingType enum.
+//
+// Author: Derek Barnett
+
+#ifndef FRAMEENCODINGTYPE_H
+#define FRAMEENCODINGTYPE_H
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the possible encoding modes used in Frames data
+/// (e.g. BamRecord::IPD or BamRecord::PulseWidth).
+///
+/// The LOSSY mode is the default in production output; LOSSLESS mode
+/// is used primarily for internal applications.
+///
+/// \sa https://github.com/PacificBiosciences/PacBioFileFormats/blob/3.0/BAM.rst
+///     for more information on pulse frame encoding schemes.
+///
+enum class FrameEncodingType
+{
+    LOSSY,    ///< 8-bit compression (using CodecV1) of frame data
+    LOSSLESS  ///< 16-bit native frame data
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FRAMEENCODINGTYPE_H
diff --git a/include/pbbam/Frames.h b/include/pbbam/Frames.h

new file mode 100644 (file)

index 0000000..982b2cb
--- /dev/null
+++ b/include/pbbam/Frames.h
@@ -0,0 +1,147 @@
+// File Description
+/// \file Frames.h
+/// \brief Defines the Frames class.
+//
+// Author: Derek Barnett
+
+#ifndef FRAMES_H
+#define FRAMES_H
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The Frames class represents pulse frame data.
+///
+/// Frame data may be stored in either their raw, 16-bit values or
+/// using a lossy, 8-bit compression scheme.
+///
+/// This class is used to store the data and convert between the 2 storage types.
+///
+class PBBAM_EXPORT Frames
+{
+public:
+    /// \name Conversion Methods
+    /// \{
+
+    /// \brief Constructs a Frames object from encoded (lossy, 8-bit) data.
+    ///
+    /// \note This method should probably not be needed often by client code
+    ///       working with frame data. It exists primarily for (internal)
+    ///       parsing & interpretation of the %BAM file contents. The method is
+    ///       available, though, should the conversion operation be needed.
+    ///
+    /// \param[in] codedData    encoded data
+    /// \returns Frames object
+    ///
+    static Frames Decode(const std::vector<uint8_t>& codedData);
+
+    /// \brief Creates encoded, compressed frame data from raw input data.
+    ///
+    /// \param[in] frames   raw frame data
+    /// \returns lossy, 8-bit encoded frame data
+    ///
+    static std::vector<uint8_t> Encode(const std::vector<uint16_t>& frames);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    Frames(std::vector<uint16_t> frames);
+
+    Frames();
+    Frames(const Frames&) = default;
+    Frames(Frames&&) = default;
+    Frames& operator=(const Frames&) = default;
+    Frames& operator=(Frames&&) = default;
+    ~Frames() = default;
+
+    /// \}
+
+public:
+    /// \name Access Data
+    /// \{
+
+    /// \returns Frame data in expanded (not encoded) form
+    std::vector<uint16_t>& DataRaw();
+    const std::vector<uint16_t>& Data() const;
+
+    /// \}
+
+public:
+    /// \name Conversion Methods
+    /// \{
+
+    /// \returns Frame data in (lossy, 8-bit) encoded form.
+    std::vector<uint8_t> Encode() const;
+
+    /// \}
+
+public:
+    /// \name Comparison Operators
+    /// \{
+
+    bool operator==(const Frames& other) const;
+    bool operator!=(const Frames& other) const;
+
+    /// \}
+
+public:
+    /// \name STL Compatibility
+    /// \{
+
+    /// \returns A const_iterator to the beginning of the sequence.
+    std::vector<uint16_t>::const_iterator cbegin() const;
+
+    /// \returns A const_iterator to the element past the end of the sequence.
+    std::vector<uint16_t>::const_iterator cend() const;
+
+    /// \returns A const_iterator to the beginning of the sequence.
+    std::vector<uint16_t>::const_iterator begin() const;
+
+    /// \returns A const_iterator to the element past the end of the sequence.
+    std::vector<uint16_t>::const_iterator end() const;
+
+    /// \returns An iterator to the beginning of the sequence.
+    std::vector<uint16_t>::iterator begin();
+
+    /// \returns An iterator to the element past the end of the sequence.
+    std::vector<uint16_t>::iterator end();
+
+    /// \returns The number of frame data points.
+    size_t size() const;
+
+    /// \returns True if the container is empty, false otherwise.
+    bool empty() const;
+
+    /// \}
+
+public:
+    /// \name Access Data
+    /// \{
+
+    /// Sets this record's data.
+    ///
+    /// \param[in] frames data in expanded (not encoded) form
+    /// \returns reference to this object
+    ///
+    Frames& Data(std::vector<uint16_t> frames);
+
+    /// \}
+
+private:
+    std::vector<uint16_t> data_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/Frames.inl"
+
+#endif  // FRAMES_H
diff --git a/include/pbbam/GenomicInterval.h b/include/pbbam/GenomicInterval.h

new file mode 100644 (file)

index 0000000..502552f
--- /dev/null
+++ b/include/pbbam/GenomicInterval.h
@@ -0,0 +1,152 @@
+// File Description
+/// \file GenomicInterval.h
+/// \brief Defines the GenomicInterval class.
+//
+// Author: Derek Barnett
+
+#ifndef GENOMICINTERVAL_H
+#define GENOMICINTERVAL_H
+
+#include <cstddef>
+#include <string>
+#include "pbbam/Config.h"
+#include "pbbam/Interval.h"
+#include "pbbam/Position.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The GenomicInterval class represents a genomic interval (reference
+///        name and 0-based coordinates).
+///
+class PBBAM_EXPORT GenomicInterval
+{
+public:
+    /// \name Constructors & Related Methods
+    ///  \{
+
+    /// \brief Creates an empty genomic interval
+    GenomicInterval() = default;
+
+    /// \brief Creates a genomic interval on sequence with \p name, using range:
+    ///       [\p start, \p stop)
+    GenomicInterval(std::string name, Position start, Position stop);
+
+    /// \brief Creates a genomic interval, using REGION string
+    ///
+    /// "<ref>:<start>-<stop>" ("chr8:200-600")
+    ///
+    /// \note The htslib/samtools REGION string expects start positions to be
+    ///       1-based. However, throughout pbbam (including the rest of this
+    ///       class), we stick to 0-based start coordinates. Thus, while the
+    ///       syntax matches that of samtools, we are using a 0-based start
+    ///       coordinate here.
+    ///
+    GenomicInterval(const std::string& zeroBasedRegionString);
+
+    GenomicInterval(const GenomicInterval&) = default;
+    GenomicInterval(GenomicInterval&&) = default;
+    GenomicInterval& operator=(const GenomicInterval&) = default;
+    GenomicInterval& operator=(GenomicInterval&&) = default;
+    ~GenomicInterval() = default;
+
+    /// \}
+
+public:
+    /// \name Comparison Operators
+    /// \{
+
+    /// \returns true if same id & underlying interval
+    bool operator==(const GenomicInterval& other) const;
+
+    /// \returns true if either ids or underlying intervals differ
+    bool operator!=(const GenomicInterval& other) const;
+
+    /// \}
+
+public:
+    /// \name Interval Operations
+    /// \{
+
+    /// \returns true if same id and underlying Interval::CoveredBy() other.
+    bool CoveredBy(const GenomicInterval& other) const;
+
+    /// \returns true if same id and underlying Interval::Covers() other.
+    bool Covers(const GenomicInterval& other) const;
+
+    /// \returns true if same id and underlying Interval::Intersects() other.
+    bool Intersects(const GenomicInterval& other) const;
+
+    /// \returns true if underlying Interval::IsValid(), and id/endpoints are
+    ///          non-negative.
+    ///
+    bool IsValid() const;
+
+    /// \returns length of underlying interval
+    size_t Length() const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \returns interval reference name
+    std::string Name() const;
+
+    /// \returns underlying Interval object
+    PacBio::BAM::Interval<Position> Interval() const;
+
+    /// \returns interval start coordinate
+    Position Start() const;
+
+    /// \returns interval stop coordinate
+    Position Stop() const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// Sets this interval's reference name.
+    ///
+    /// \param[in] name
+    /// \returns reference to this interval
+    ///
+    GenomicInterval& Name(std::string name);
+
+    /// Sets this interval's underlying Interval.
+    ///
+    /// \param[in] interval
+    /// \returns reference to this interval
+    ///
+    GenomicInterval& Interval(PacBio::BAM::Interval<Position> interval);
+
+    /// Sets this interval's start coordinate.
+    ///
+    /// \param[in] start
+    /// \returns reference to this interval
+    ///
+    GenomicInterval& Start(const Position start);
+
+    /// Sets this interval's stop coordinate.
+    ///
+    /// \param[in] stop
+    /// \returns reference to this interval
+    ///
+    GenomicInterval& Stop(const Position stop);
+
+    /// \}
+
+private:
+    std::string name_;
+    PacBio::BAM::Interval<Position> interval_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/GenomicInterval.inl"
+
+#endif  // GENOMICINTERVAL_H
diff --git a/include/pbbam/GenomicIntervalQuery.h b/include/pbbam/GenomicIntervalQuery.h

new file mode 100644 (file)

index 0000000..6f558ba
--- /dev/null
+++ b/include/pbbam/GenomicIntervalQuery.h
@@ -0,0 +1,75 @@
+// File Description
+/// \file GenomicIntervalQuery.h
+/// \brief Defines the GenomicIntervalQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef GENOMICINTERVALQUERY_H
+#define GENOMICINTERVALQUERY_H
+
+#include <memory>
+#include "pbbam/GenomicInterval.h"
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The GenomicIntervalQuery class provides iterable access to a
+///        DataSet's %BAM records, limiting results to those overlapping a
+///        GenomicInterval.
+///
+/// Example:
+/// \include code/GenomicIntervalQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".bai" index file.
+///       Use BamFile::EnsureStandardIndexExists before creating the query if
+///       one may not be present.
+///
+class PBBAM_EXPORT GenomicIntervalQuery : public internal::IQuery
+{
+public:
+    /// \brief Constructs a new GenomicIntervalQuery, limiting record results to
+    ///        only those overlapping a GenomicInterval.
+    ///
+    /// \param[in] interval genomic interval of interest
+    /// \param[in] dataset  input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or
+    ///         BAI files.
+    ///
+    GenomicIntervalQuery(const GenomicInterval& interval, const PacBio::BAM::DataSet& dataset);
+    ~GenomicIntervalQuery() override;
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r) override;
+
+public:
+    /// \brief Sets a new genomic interval.
+    ///
+    /// This allows the same dataset/query to be re-used over multiple regions of
+    /// interest:
+    ///
+    /// \include code/GenomicIntervalQuery_Reuse.txt
+    ///
+    /// \param[in] interval new genomic interval
+    /// \returns reference to this query
+    ///
+    GenomicIntervalQuery& Interval(const GenomicInterval& interval);
+
+    /// \returns Current genomic interval active on this query.
+    const GenomicInterval& Interval() const;
+
+private:
+    struct GenomicIntervalQueryPrivate;
+    std::unique_ptr<GenomicIntervalQueryPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // GENOMICINTERVALQUERY_H
diff --git a/include/pbbam/IRecordWriter.h b/include/pbbam/IRecordWriter.h

new file mode 100644 (file)

index 0000000..a1140a8
--- /dev/null
+++ b/include/pbbam/IRecordWriter.h
@@ -0,0 +1,55 @@
+// File Description
+/// \file IRecordWriter.h
+/// \brief Defines the IRecordWriter interface.
+//
+// Author: Derek Barnett
+
+#ifndef IRECORDWRITER_H
+#define IRECORDWRITER_H
+
+namespace PacBio {
+namespace BAM {
+
+class BamRecord;
+class BamRecordImpl;
+
+class IRecordWriter
+{
+public:
+    virtual ~IRecordWriter() = default;
+
+public:
+    /// \brief Try to flush any buffered data to file.
+    ///
+    /// \note The underlying implementation may not necessarily flush buffered
+    ///       data immediately, especially in a multithreaded writer situation.
+    ///       Let the writer go out of scope to fully ensure flushing.
+    ///
+    /// \throws std::runtime_error if flush fails
+    ///
+    virtual void TryFlush() = 0;
+
+    /// \brief Write a record to the output %BAM file.
+    ///
+    /// \param[in] record BamRecord object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    virtual void Write(const BamRecord& record) = 0;
+
+    /// \brief Write a record to the output %BAM file.
+    ///
+    /// \param[in] recordImpl BamRecordImpl object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    virtual void Write(const BamRecordImpl& recordImpl) = 0;
+
+protected:
+    IRecordWriter() = default;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // IRECORDWRITER_H
diff --git a/include/pbbam/IndexedBamWriter.h b/include/pbbam/IndexedBamWriter.h

new file mode 100644 (file)

index 0000000..b943480
--- /dev/null
+++ b/include/pbbam/IndexedBamWriter.h
@@ -0,0 +1,83 @@
+// File Description
+/// \file IndexedBamWriter.h
+/// \brief Defines the IndexedBamWriter class.
+//
+// Author: Derek Barnett
+
+#ifndef INDEXEDBAMWRITER_H
+#define INDEXEDBAMWRITER_H
+
+#include <memory>
+#include <string>
+
+#include "pbbam/Config.h"
+#include "pbbam/IRecordWriter.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamHeader;
+class BamRecord;
+class BamRecordImpl;
+
+namespace internal {
+class IndexedBamWriterPrivate;
+}
+
+///
+/// \brief The IndexedBamWriter class
+///
+///
+///
+///
+///
+///
+class IndexedBamWriter : public IRecordWriter
+{
+public:
+    ///
+    /// \brief IndexedBamWriter
+    ///
+    /// \param[in] outputFilename   path to output %BAM file
+    /// \param[in] header           BamHeader object
+    ///
+    /// \throws std::runtime_error if there was a problem opening the file for
+    ///         writing or if an error occurred while writing the header
+    ///
+    IndexedBamWriter(const std::string& outputFilename, const BamHeader& header);
+
+    ~IndexedBamWriter() override;
+
+    IndexedBamWriter(const IndexedBamWriter&) = delete;
+    IndexedBamWriter(IndexedBamWriter&&) = delete;
+    IndexedBamWriter& operator=(const IndexedBamWriter&) = delete;
+    IndexedBamWriter& operator=(IndexedBamWriter&&) = delete;
+
+public:
+    ///
+    /// \brief TryFlush
+    ///
+    void TryFlush() override;
+
+    ///
+    /// \brief Write
+    ///
+    /// \param[in] record
+    ///
+    void Write(const BamRecord& record) override;
+
+    ///
+    /// \brief Write
+    ///
+    /// \param[in] record
+    ///
+    void Write(const BamRecordImpl& record) override;
+
+private:
+    std::unique_ptr<internal::IndexedBamWriterPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // INDEXEDBAMWRITER_H
diff --git a/include/pbbam/IndexedFastaReader.h b/include/pbbam/IndexedFastaReader.h

new file mode 100644 (file)

index 0000000..6305d5a
--- /dev/null
+++ b/include/pbbam/IndexedFastaReader.h
@@ -0,0 +1,144 @@
+// File Description
+/// \file IndexedFastaReader.h
+/// \brief Defines the IndexedFastaReader class.
+//
+// Author: David Alexander
+
+#ifndef INDEXEDFASTAREADER_H
+#define INDEXEDFASTAREADER_H
+
+#include <htslib/faidx.h>
+#include <cstddef>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include "pbbam/Orientation.h"
+#include "pbbam/Position.h"
+
+namespace PacBio {
+namespace BAM {
+
+class GenomicInterval;
+class BamRecord;
+
+/// \brief The IndexedFastaReader class provides random-access to FASTA file
+///        data.
+///
+class IndexedFastaReader
+{
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    IndexedFastaReader() = delete;
+    IndexedFastaReader(const std::string& filename);
+    IndexedFastaReader(const IndexedFastaReader& src);
+    IndexedFastaReader(IndexedFastaReader&&) = default;
+    IndexedFastaReader& operator=(const IndexedFastaReader& rhs);
+    IndexedFastaReader& operator=(IndexedFastaReader&&) = default;
+    ~IndexedFastaReader();
+
+    /// \}
+
+public:
+    /// \name Sequence Access
+    /// \{
+
+    /// \brief Fetches FASTA sequence for desired interval.
+    ///
+    /// \param[in] id       reference sequence name
+    /// \param[in] begin    start position
+    /// \param[in] end      end position
+    ///
+    /// \returns sequence string at desired interval
+    ///
+    /// \throws std::runtime_error on failure to fetch sequence
+    ///
+    std::string Subsequence(const std::string& id, Position begin, Position end) const;
+
+    /// \brief Fetches FASTA sequence for desired interval.
+    ///
+    /// \param[in] interval desired interval
+    ///
+    /// \returns sequence string at desired interval
+    ///
+    /// \throws std::runtime_error on failure to fetch sequence
+    ///
+    std::string Subsequence(const GenomicInterval& interval) const;
+
+    /// \brief Fetches FASTA sequence for desired interval.
+    ///
+    /// \param[in] htslibRegion htslib/samtools-formatted REGION string
+    ///                         representing the desired interval
+    ///
+    /// \returns sequence string at desired interval
+    ///
+    /// \throws std::runtime_error on failure to fetch sequence
+    ///
+    std::string Subsequence(const char* htslibRegion) const;
+
+    /// \brief Fetches FASTA sequence corresponding to a BamRecord, oriented and
+    ///        gapped as requested.
+    ///
+    /// For example, "native" orientation and "gapped" will return the reference
+    /// sequence with gaps inserted, as would align against the read in "native"
+    /// orientation.
+    ///
+    /// \param[in] bamRecord        input BamRecord to derive interval/CIGAR
+    ///                             data
+    /// \param[in] orientation      orientation of output
+    /// \param[in] gapped           if true, gaps/padding will be inserted, per
+    ///                             record's CIGAR info.
+    /// \param[in] exciseSoftClips  if true, any soft-clipped positions will be
+    ///                             removed from query ends
+    ///
+    /// \returns sequence string over the record's interval
+    ///
+    /// \throws std::runtime_error on failure to fetch sequence
+    ///
+    std::string ReferenceSubsequence(const BamRecord& bamRecord,
+                                     const Orientation orientation = Orientation::GENOMIC,
+                                     const bool gapped = false,
+                                     const bool exciseSoftClips = false) const;
+
+    /// \}
+
+public:
+    /// \name File Attributes
+    /// \{
+
+    /// \returns true if FASTA file contains a sequence matching \p name
+    bool HasSequence(const std::string& name) const;
+
+    /// \returns the name of the sequence at a specific index in the FASTA file
+    std::string Name(const size_t idx) const;
+
+    /// \returns the names of all sequences stored in the FASTA file
+    std::vector<std::string> Names() const;
+
+    /// \returns number of sequences stored in FASTA file
+    int NumSequences() const;
+
+    /// \returns length of FASTA sequence
+    ///
+    /// \throws std::runtime_error if length could not be determined
+    ///
+    int SequenceLength(const std::string& name) const;
+
+    /// \}
+
+private:
+    std::string filename_;
+    faidx_t* handle_;
+
+private:
+    void Close();
+    bool Open(std::string filename);
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // INDEXEDFASTAREADER_H
diff --git a/include/pbbam/Interval.h b/include/pbbam/Interval.h

new file mode 100644 (file)

index 0000000..ed85203
--- /dev/null
+++ b/include/pbbam/Interval.h
@@ -0,0 +1,121 @@
+// File Description
+/// \file Interval.h
+/// \brief Defines the Interval class.
+//
+// Author: Derek Barnett
+
+#ifndef INTERVAL_H
+#define INTERVAL_H
+
+#include <cstddef>
+#include <string>
+#include "pbbam/Config.h"
+
+#define BOOST_ICL_USE_STATIC_BOUNDED_INTERVALS
+#include <boost/icl/discrete_interval.hpp>
+#include <boost/icl/interval_traits.hpp>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief Represents a half-open (right-open) interval [start, stop)
+///
+/// \note This class is agnostic whether the values are 0-based or 1-based.
+///       Client code should primarily work with GenomicInterval, which does
+///       enforce this distinction.
+///
+template <typename T>
+class Interval
+{
+public:
+    using interval_type = boost::icl::discrete_interval<T>;
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty interval [0,0)
+    Interval();
+
+    /// \brief Creates a 'singleton' interval [val,val+1)
+    Interval(const T val);
+
+    /// \brief Creates an interval from [start, stop)
+    Interval(const T start, const T stop);
+
+    Interval(const Interval<T>&) = default;
+    Interval(Interval&&) = default;
+    Interval& operator=(const Interval<T>&) = default;
+    Interval& operator=(Interval<T>&&) = default;
+    ~Interval() = default;
+
+    /// \}
+
+public:
+    /// \name Comparison Operators
+    /// \{
+
+    /// \returns true if both intervals share the same endpoints
+    bool operator==(const Interval<T>& other) const;
+
+    /// \returns true if either interval's endpoints differ
+    bool operator!=(const Interval<T>& other) const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \returns interval's start coordinate
+    T Start() const;
+
+    /// Sets this interval's start coordinate.
+    ///
+    /// \param[in] start
+    /// \returns reference to this interval
+    ///
+    Interval<T>& Start(const T& start);
+
+    /// \returns interval's stop coordinate
+    T Stop() const;
+
+    /// Sets this interval's stop coordinate.
+    ///
+    /// \param[in] stop
+    /// \returns reference to this interval
+    ///
+    Interval<T>& Stop(const T& stop);
+
+    /// \}
+
+public:
+    /// \name Interval Operations
+
+    /// \returns true if this interval is fully covered by (or contained in) \p other
+    bool CoveredBy(const Interval<T>& other) const;
+
+    /// \returns true if this interval covers (or contains) \p other
+    bool Covers(const Interval<T>& other) const;
+
+    /// \returns true if intervals intersect
+    bool Intersects(const Interval<T>& other) const;
+
+    /// \returns true if interval is valid (e.g. start < stop)
+    bool IsValid() const;
+
+    /// \returns interval length
+    size_t Length() const;
+
+    /// \}
+
+private:
+    interval_type data_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/Interval.inl"
+
+#endif  // INTERVAL_H
diff --git a/include/pbbam/LocalContextFlags.h b/include/pbbam/LocalContextFlags.h

new file mode 100644 (file)

index 0000000..ae19a70
--- /dev/null
+++ b/include/pbbam/LocalContextFlags.h
@@ -0,0 +1,43 @@
+// File Description
+/// \file LocalContextFlags.h
+/// \brief Defines the LocalContextFlags enum & helper method(s).
+//
+// Author: Lance Hepler
+
+#ifndef LOCALCONTEXTFLAGS_H
+#define LOCALCONTEXTFLAGS_H
+
+#include <cstdint>
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The LocalContextFlags enum defines the flags that can be used
+///        to describe a subread's "local context", i.e. whether it is
+///        flanked by barcodes/adapters or its pass orientation.
+///
+enum LocalContextFlags : uint8_t
+{
+    NO_LOCAL_CONTEXT = 0,  ///< No context information available
+    ADAPTER_BEFORE = 1,    ///< Adapter precedes subread
+    ADAPTER_AFTER = 2,     ///< Adapter follows subread
+    BARCODE_BEFORE = 4,    ///< Barcode precedes subread
+    BARCODE_AFTER = 8,     ///< Barcode follows subread
+    FORWARD_PASS = 16,     ///< Subread's orientation is 'forward pass'
+    REVERSE_PASS = 32      ///< Subread's orientation is 'reverse pass'
+};
+
+/// \returns a LocalContextFlags value containing the result of the bitwise-OR
+///          operation of \p lhs and \p rhs.
+// constexpr is implicitly inline
+constexpr LocalContextFlags operator|(const LocalContextFlags lhs, const LocalContextFlags rhs)
+{
+    return static_cast<LocalContextFlags>(static_cast<int>(lhs) | static_cast<int>(rhs));
+}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // LOCALCONTEXTFLAGS_H
diff --git a/include/pbbam/MD5.h b/include/pbbam/MD5.h

new file mode 100644 (file)

index 0000000..37885aa
--- /dev/null
+++ b/include/pbbam/MD5.h
@@ -0,0 +1,22 @@
+// File Description
+/// \file MD5.h
+/// \brief Defines basic MD5 hash utilities
+//
+// Author: Brett Bowman
+
+#ifndef MD5_H
+#define MD5_H
+
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief MD5 hash of a string as a 32-digit hexadecimal string
+///
+std::string MD5Hash(const std::string& str);
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // MD5_H
diff --git a/include/pbbam/MakeUnique.h b/include/pbbam/MakeUnique.h

new file mode 100644 (file)

index 0000000..bb96229
--- /dev/null
+++ b/include/pbbam/MakeUnique.h
@@ -0,0 +1,29 @@
+// File Description
+/// \file MakeUnique.h
+/// \brief Provides a std::make_unique fallback for pre-C++14 compilers.
+//
+// Author: Derek Barnett
+
+#ifndef PBBAM_MAKE_UNIQUE_H
+#define PBBAM_MAKE_UNIQUE_H
+
+// Only include if in pre-C++14 mode
+//
+#if __cplusplus <= 201103L
+
+#include <cstddef>
+#include <memory>
+
+namespace std {
+
+template <typename T, typename... Args>
+inline std::unique_ptr<T> make_unique(Args&&... args)
+{
+    return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+}  // namespace std
+
+#endif  // < C++14
+
+#endif  // PBBAM_MAKE_UNIQUE_H
diff --git a/include/pbbam/MoveAppend.h b/include/pbbam/MoveAppend.h

new file mode 100644 (file)

index 0000000..c24e727
--- /dev/null
+++ b/include/pbbam/MoveAppend.h
@@ -0,0 +1,50 @@
+// Author: Derek Barnett
+
+#ifndef MOVEAPPEND_H
+#define MOVEAPPEND_H
+
+#include <iterator>
+#include <utility>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief Appends content of src vector to dst vector using move semantics.
+///
+/// \param[in]     src  Input vector that will be empty after execution
+/// \param[in,out] dst  Output vector that will be appended to
+///
+template <typename T>
+inline void MoveAppend(std::vector<T>& src, std::vector<T>& dst) noexcept
+{
+    if (dst.empty()) {
+        dst = std::move(src);
+    } else {
+        dst.reserve(dst.size() + src.size());
+        std::move(src.begin(), src.end(), std::back_inserter(dst));
+        src.clear();
+    }
+}
+
+/// \brief Appends content of src vector to dst vector using move semantics.
+///
+/// \param[in]     src  Input vector (rvalue) whose elements are moved out
+/// \param[in,out] dst  Output vector that will be appended to
+///
+template <typename T>
+inline void MoveAppend(std::vector<T>&& src, std::vector<T>& dst) noexcept
+{
+    if (dst.empty()) {
+        dst = std::move(src);
+    } else {
+        dst.reserve(dst.size() + src.size());
+        std::move(src.begin(), src.end(), std::back_inserter(dst));
+        src.clear();
+    }
+}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // MOVEAPPEND_H
diff --git a/include/pbbam/Orientation.h b/include/pbbam/Orientation.h

new file mode 100644 (file)

index 0000000..637e5a5
--- /dev/null
+++ b/include/pbbam/Orientation.h
@@ -0,0 +1,34 @@
+// File Description
+/// \file Orientation.h
+/// \brief Defines the Orientation enum.
+//
+// Author: Derek Barnett
+
+#ifndef ORIENTATION_H
+#define ORIENTATION_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the orientations recognized by BamRecord, for
+///        presenting "per-base" data.
+///
+/// Orientation::NATIVE indicates that data should be presented in the subread's
+/// original form.
+///
+/// Orientation::GENOMIC indicates that data should be presented relative to
+/// genomic forward strand. This means that data will be reversed (or
+/// reverse-complemented) if the subread was aligned to the reverse strand.
+///
+enum class Orientation
+{
+    NATIVE,  ///< Present data in 'raw' original orientation, regardless of aligned Strand
+    GENOMIC  ///< Present data in aligned orientation, always relative to Strand::FORWARD.
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ORIENTATION_H
diff --git a/include/pbbam/PbiBasicTypes.h b/include/pbbam/PbiBasicTypes.h

new file mode 100644 (file)

index 0000000..a8820bb
--- /dev/null
+++ b/include/pbbam/PbiBasicTypes.h
@@ -0,0 +1,81 @@
+// File Description
+/// \file PbiBasicTypes.h
+/// \brief Defines the basic data structures used in PBI lookups.
+//
+// Author: Derek Barnett
+
+#ifndef PBIBASICTYPES_H
+#define PBIBASICTYPES_H
+
+#include <cstddef>
+#include <cstdint>
+#include <deque>
+#include <utility>
+#include <vector>
+#include "pbbam/Compare.h"
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The IndexResultBlock class represents a contiguous group of records
+///        returned from a PBI lookup.
+///
+/// Contiguous reads that satisfy a PBI lookup query will be merged down into a
+/// single block. This helps to minimize the number of seeks in subsequent read
+/// operations.
+///
+/// A PBI-enabled reader or query can iterate over a list of IndexResultBlocks;
+/// for each block, seeking to the first record and then sequentially reading
+/// 'numReads_' consecutive records before needing to seek again.
+///
+struct PBBAM_EXPORT IndexResultBlock
+{
+public:
+    IndexResultBlock(size_t idx, size_t numReads);
+
+    IndexResultBlock() = default;
+    IndexResultBlock(const IndexResultBlock&) = default;
+    IndexResultBlock(IndexResultBlock&&) = default;
+    IndexResultBlock& operator=(const IndexResultBlock&) = default;
+    IndexResultBlock& operator=(IndexResultBlock&&) = default;
+    ~IndexResultBlock() = default;
+
+public:
+    bool operator==(const IndexResultBlock& other) const;
+    bool operator!=(const IndexResultBlock& other) const;
+
+public:
+    size_t firstIndex_ = 0;  ///< index of block's first record in BAM/PBI files (e.g. i-th record)
+    size_t numReads_ = 0;    ///< number of reads in this block
+    int64_t virtualOffset_ = -1;  ///< virtual offset of first record in this block
+};
+
+/// \brief container of PBI result blocks
+///
+using IndexResultBlocks = std::deque<IndexResultBlock>;
+
+/// \brief container of raw PBI indices
+///
+/// This is the primary result of PbiFilter-associated classes. This raw list
+/// can participate in set operations (union, intersect) for compound filters,
+/// and then be merged down into IndexResultBlocks for actual data file
+/// random-access.
+///
+using IndexList = std::vector<size_t>;
+
+/// \brief pair representing a range of PBI indices: where interval
+///        is [first, second)
+///
+/// Used primarily by the PBI's CoordinateSortedData components.
+///
+/// \sa PbiReferenceEntry, PbiRawReferenceData, & ReferenceLookupData
+///
+using IndexRange = std::pair<size_t, size_t>;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/PbiBasicTypes.inl"
+
+#endif  // PBIBASICTYPES_H
diff --git a/include/pbbam/PbiBuilder.h b/include/pbbam/PbiBuilder.h

new file mode 100644 (file)

index 0000000..5c4c241
--- /dev/null
+++ b/include/pbbam/PbiBuilder.h
@@ -0,0 +1,181 @@
+// File Description
+/// \file PbiBuilder.h
+/// \brief Defines the PbiBuilder class.
+//
+// Author: Derek Barnett
+
+#ifndef PBIBUILDER_H
+#define PBIBUILDER_H
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamRecord;
+class PbiRawData;
+
+namespace internal {
+class PbiBuilderPrivate;
+}
+
+/// \brief The PbiBuilder class constructs PBI index data from %BAM record data.
+///
+/// Records are added one-by-one. This allows for either whole-file indexing of
+/// existing %BAM files or for indexing "on-the-fly" alongside a %BAM file as it
+/// is generated.
+///
+/// For simple PBI creation from existing %BAM files, see PbiFile::CreateFrom.
+/// This is the recommended approach, unless finer control or additional
+/// processing is needed.
+///
+class PBBAM_EXPORT PbiBuilder
+{
+public:
+    /// \brief This enum allows you to control the compression level of the
+    ///        output PBI file.
+    ///
+    /// Values are equivalent to zlib compression levels. See its documentation
+    /// for more details: http://www.zlib.net/manual.html
+    ///
+    enum CompressionLevel
+    {
+        CompressionLevel_0 = 0,
+        CompressionLevel_1 = 1,
+        CompressionLevel_2 = 2,
+        CompressionLevel_3 = 3,
+        CompressionLevel_4 = 4,
+        CompressionLevel_5 = 5,
+        CompressionLevel_6 = 6,
+        CompressionLevel_7 = 7,
+        CompressionLevel_8 = 8,
+        CompressionLevel_9 = 9,
+
+        DefaultCompression = -1,
+        NoCompression = CompressionLevel_0,
+        FastCompression = CompressionLevel_1,
+        BestCompression = CompressionLevel_9
+    };
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Initializes builder to write data to \p pbiFilename.
+    ///
+    /// \param[in] pbiFilename      output filename
+    /// \param[in] compressionLevel zlib compression level
+    /// \param[in] numThreads       number of threads for compression. If set to
+    ///                             0, PbiBuilder will attempt to determine a
+    ///                             reasonable estimate. If set to 1, this will
+    ///                             force single-threaded execution. No checks
+    ///                             are made against an upper limit.
+    ///
+    /// \throws std::runtime_error if PBI file cannot be opened for writing
+    ///
+    PbiBuilder(const std::string& pbiFilename,
+               const PbiBuilder::CompressionLevel compressionLevel = PbiBuilder::DefaultCompression,
+               const size_t numThreads = 4);
+
+    /// \brief Initializes builder to write data to \p pbiFilename.
+    ///
+    /// Reference data-tracking structures will be initialized to expect
+    /// \p numReferenceSequences. (This is useful so that we can mark any
+    /// references that lack observed data appropriately).
+    ///
+    /// \param[in] pbiFilename              output filename
+    /// \param[in] numReferenceSequences    number of possible reference
+    ///                                     sequences, e.g. BamHeader::NumSequences
+    /// \param[in] compressionLevel zlib compression level
+    /// \param[in] numThreads       number of threads for compression. If set to
+    ///                             0, PbiBuilder will attempt to determine a
+    ///                             reasonable estimate. If set to 1, this will
+    ///                             force single-threaded execution. No checks
+    ///                             are made against an upper limit.
+    ///
+    /// \throws std::runtime_error if PBI file cannot be opened for writing
+    ///
+    PbiBuilder(const std::string& pbiFilename, const size_t numReferenceSequences,
+               const PbiBuilder::CompressionLevel compressionLevel = PbiBuilder::DefaultCompression,
+               const size_t numThreads = 4);
+
+    /// \brief Initializes builder to write data to \p pbiFilename.
+    ///
+    /// Reference data-tracking structures will be initialized to expect
+    /// \p numReferenceSequences, but only if \p isCoordinateSorted is true.
+    ///
+    /// \param[in] pbiFilename              output filename
+    /// \param[in] numReferenceSequences    number of possible reference
+    ///                                     sequences, e.g. BamHeader::NumSequences
+    /// \param[in] isCoordinateSorted       if false, disables reference
+    ///                                     sequence tracking
+    ///                                     (BamHeader::SortOrder != "coordinate")
+    /// \param[in] compressionLevel zlib compression level
+    /// \param[in] numThreads       number of threads for compression. If set to
+    ///                             0, PbiBuilder will attempt to determine a
+    ///                             reasonable estimate. If set to 1, this will
+    ///                             force single-threaded execution. No checks
+    ///                             are made against an upper limit.
+    ///
+    /// \throws std::runtime_error if PBI file cannot be opened for writing
+    ///
+    PbiBuilder(const std::string& pbiFilename, const size_t numReferenceSequences,
+               const bool isCoordinateSorted,
+               const PbiBuilder::CompressionLevel compressionLevel = PbiBuilder::DefaultCompression,
+               const size_t numThreads = 4);
+
+    /// \brief Destroys builder, writing its data out to PBI file.
+    ///
+    ///
+    /// \note Exceptions are swallowed. Use Close() if you want to catch them.
+    ///
+    ~PbiBuilder() noexcept;
+
+    /// \}
+
+public:
+    /// \name Index Building
+    /// \{
+
+    /// \brief Adds \p record's data to underlying raw data structure.
+    ///
+    /// \note \p vOffset is a BGZF \b virtual offset into the %BAM file. To get
+    ///          this value, you should use one of the following: \n
+    ///        - while reading existing %BAM: BamReader::VirtualTell \n
+    ///        - while writing new %BAM:      BamWriter::Write(const BamRecord& record, int64_t* vOffset) \n
+    ///
+    ///
+    /// To build a PBI index while generating a %BAM file:
+    /// \include code/PbiBuilder_WithWriter.txt
+    ///
+    /// To build a PBI index from an existing %BAM file:
+    /// \include code/PbiBuilder_WithReader.txt
+    ///
+    /// \param[in] record   input BamRecord to pull index data from
+    /// \param[in] vOffset  \b virtual offset into %BAM file where record begins
+    ///
+    void AddRecord(const BamRecord& record, const int64_t vOffset);
+
+    /// \brief Writes data out to PBI file & closes builder.
+    ///
+    /// \note Any exceptions are thrown to caller. If you don't care about
+    ///       catching exceptions with file I/O, just let the builder go out of
+    ///       scope and data will be written, but exceptions swallowed (to avoid
+    ///       throwing from destructor).
+    ///
+    void Close();
+
+    /// \}
+
+private:
+    std::unique_ptr<internal::PbiBuilderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBIBUILDER_H
diff --git a/include/pbbam/PbiFile.h b/include/pbbam/PbiFile.h

new file mode 100644 (file)

index 0000000..5ec10f2
--- /dev/null
+++ b/include/pbbam/PbiFile.h
@@ -0,0 +1,107 @@
+// File Description
+/// \file PbiFile.h
+/// \brief Defines the PbiFile enums, typedefs, and methods.
+//
+// Author: Derek Barnett
+
+#ifndef PBIFILE_H
+#define PBIFILE_H
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include "pbbam/Config.h"
+#include "pbbam/PbiBuilder.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamFile;
+
+struct PbiFile
+{
+
+    /// \brief This enum describes the PBI file sections
+    ///
+    enum Section
+    {
+        BASIC = 0x0000,      ///< BasicData     (required)
+        MAPPED = 0x0001,     ///< MappedData    (always optional)
+        REFERENCE = 0x0002,  ///< ReferenceData (always optional)
+        BARCODE = 0x0004,    ///< BarcodeData   (always optional)
+
+        ALL = BASIC | MAPPED | REFERENCE | BARCODE  ///< Synonym for 'all sections'
+    };
+
+    /// \brief Helper typedef for storing multiple Section flags.
+    ///
+    using Sections = uint16_t;
+
+    /// \brief This enum describes the PBI file version.
+    enum VersionEnum
+    {
+        Version_3_0_0 = 0x030000,  ///< v3.0.0
+        Version_3_0_1 = 0x030001,  ///< v3.0.1
+
+        CurrentVersion = Version_3_0_1  ///< Synonym for the current PBI version.
+    };
+
+    ///
+    /// \brief The BasicField enum
+    ///
+    enum class BasicField
+    {
+        RG_ID,
+        Q_START,
+        Q_END,
+        ZMW,
+        READ_QUALITY,
+        CONTEXT_FLAG,
+        VIRTUAL_OFFSET
+    };
+
+    ///
+    /// \brief The MappedField enum
+    ///
+    enum class MappedField
+    {
+        T_ID,
+        T_START,
+        T_END,
+        A_START,
+        A_END,
+        N_M,
+        N_MM,
+        N_INS,
+        N_DEL,
+        MAP_QUALITY,
+        STRAND
+    };
+
+    ///
+    /// \brief The BarcodeField enum
+    ///
+    enum class BarcodeField
+    {
+        BC_FORWARD,
+        BC_REVERSE,
+        BC_QUALITY
+    };
+
+    /// \brief Builds PBI index data from the supplied %BAM file and writes a
+    ///        ".pbi" file.
+    ///
+    /// \param[in] bamFile source %BAM file
+    ///
+    /// \throws std::runtime_error if index file could not be created
+    ///
+    static void CreateFrom(
+        const BamFile& bamFile,
+        const PbiBuilder::CompressionLevel compressionLevel = PbiBuilder::DefaultCompression,
+        const size_t numThreads = 4);
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBIFILE_H
diff --git a/include/pbbam/PbiFilter.h b/include/pbbam/PbiFilter.h

new file mode 100644 (file)

index 0000000..0af1be4
--- /dev/null
+++ b/include/pbbam/PbiFilter.h
@@ -0,0 +1,247 @@
+// File Description
+/// \file PbiFilter.h
+/// \brief Defines the PbiFilter class & helper 'concept'.
+//
+// Author: Derek Barnett
+
+#ifndef PBIFILTER_H
+#define PBIFILTER_H
+
+#include <boost/concept_check.hpp>
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <tuple>
+#include "pbbam/DataSet.h"
+#include "pbbam/PbiBasicTypes.h"
+#include "pbbam/PbiRawData.h"
+#include "pbbam/Unused.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+struct PbiFilterPrivate;
+}
+
+/// \brief The PbiFilterConcept class provides compile-time enforcement of the
+///        required interface for PbiFilter's child filters.
+///
+template <typename T>
+struct PbiFilterConcept
+{
+    BOOST_CONCEPT_USAGE(PbiFilterConcept)
+    {
+        // All PBI filters (built-in or client-defined) need only provide this
+        // interface:
+        //
+        //    bool Accepts(const PbiRawData& index, const size_t row) const;
+        //
+        PbiRawData index;
+        auto result = filter.Accepts(index, 0);  // must compile for T to model the concept
+        UNUSED(result);
+    }
+
+private:
+    T filter;  // instance of the candidate type, used by the usage check above
+    //    PbiRawData index;
+};
+
+/// \brief The PbiFilter class provides a mechanism for performing PBI-enabled
+///        lookups.
+///
+/// The PbiFilter API is designed to be flexible, both built-in and for
+/// client-side customization. Built-in filters are provided for common queries,
+/// and client code can define and use custom filters as well. More complex
+/// filtering rules can be constructed via composition of simpler child filters.
+///
+/// Filter objects used as children of PbiFilter need only provide a method that
+/// matches this signature:
+///
+/// \include code/PbiFilter_Interface.txt
+///
+/// This requirement is enforced internally, using the PbiFilterConcept to
+/// require a compatible interface without requiring inheritance. This approach
+/// allows composition of heterogeneous filter types without worrying about a
+/// class hierarchy, pointer ownership across library/client boundaries, etc.
+///
+/// Thus a client application can define a custom filter if the built-in filters
+/// do not quite meet requirements. This filter may then be used in further
+/// PbiFilter composition, or passed directly to PbiFilterQuery.
+///
+/// \include code/PbiFilter_CustomFilter.txt
+///
+/// As mentioned above, complex filters can be built up using multiple "child"
+/// filters. These complex filters are constructed by using either
+/// PbiFilter::Union (logical-OR over all direct children) or
+/// PbiFilter::Intersection (logical-AND over direct children).
+///
+/// \include code/PbiFilter_Composition.txt
+///
+class PBBAM_EXPORT PbiFilter
+{
+public:
+    enum CompositionType
+    {
+        INTERSECT,
+        UNION
+    };
+
+public:
+    /// \name Set Operations
+    /// \{
+
+    /// \brief Creates a PbiFilter that acts as an intersection of the input
+    ///        filters.
+    ///
+    /// A record must satisfy \b all of this filter's direct "child" filters.
+    ///
+    /// \param[in] filters  vector of child filters
+    /// \returns composite filter
+    ///
+    static PbiFilter Intersection(std::vector<PbiFilter> filters);
+
+    /// \brief Creates a PbiFilter that acts as a union of the input filters.
+    ///
+    /// A record must satisfy \b any of this filter's direct "child" filters.
+    ///
+    /// \param[in] filters  vector of child filters
+    /// \returns composite filter
+    ///
+    static PbiFilter Union(std::vector<PbiFilter> filters);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a PbiFilter from a %DataSet's described filters.
+    ///
+    /// A DataSet may contain a Filters element, itself a list of Filter
+    /// elements. Each Filter element will contain a Properties element, itself
+    /// a list of Property elements.
+    ///
+    /// The Filters hierarchy looks like this (in its XML output):
+    /// \verbinclude examples/plaintext/PbiFilter_DataSetXmlFilters.txt
+    ///
+    /// The resulting PbiFilter represents a union over all Filter elements,
+    /// with each Filter element requiring an intersection of all of its
+    /// Property criteria. These Property elements are mapped to built-in PBI
+    /// filter types. To use the labels in the example XML above, the filter
+    /// created here is equivalent to:
+    ///
+    /// (A && B) || (C && D)
+    ///
+    /// If a DataSet lacks any Filters, then an empty PbiFilter will be created
+    /// - corresponding to the dataset's entire contents.
+    ///
+    /// \param[in] dataset  dataset that may contain filters
+    /// \returns composite filter
+    ///
+    static PbiFilter FromDataSet(const DataSet& dataset);
+
+public:
+    /// \brief Creates an empty filter.
+    ///
+    /// \note An empty filter will result in all records being returned, e.g.
+    ///       for query iteration.
+    ///
+    /// \param[in] type composition type. Any additional child filters added to
+    ///                 this composite will be treated according to this type.
+    ///                 If INTERSECT, a record must match all child filters. If
+    ///                 UNION, a record must match any child filter.
+    ///
+    PbiFilter(const CompositionType type = INTERSECT);
+
+    /// \brief Creates a composite filter (of INTERSECT type) with an initial
+    ///        child filter.
+    ///
+    /// \note T must satisfy PbiFilterConcept
+    ///
+    /// \param[in] filter initial child filter
+    ///
+    template <typename T>
+    PbiFilter(T filter);
+
+    /// \brief Creates composite filter (of INTERSECT type) with a list of
+    ///        initial child filters.
+    ///
+    /// \param[in] filters initial child filters
+    ///
+    PbiFilter(std::vector<PbiFilter> filters);
+
+    PbiFilter(const PbiFilter&);
+    PbiFilter(PbiFilter&&) noexcept = default;
+    PbiFilter& operator=(const PbiFilter&);
+    PbiFilter& operator=(PbiFilter&&) noexcept = default;
+    ~PbiFilter() = default;
+
+    /// \}
+
+public:
+    /// \name Composition
+    /// \{
+
+    /// \brief Adds a new child filter of type T.
+    ///
+    /// \param[in] filter   additional child filter. Type T must satisfy
+    ///                     PbiFilterConcept.
+    /// \returns reference to this filter
+    ///
+    template <typename T>
+    PbiFilter& Add(T filter);
+
+    /// \brief Adds a new child filter.
+    ///
+    /// \param[in] filter   additional child filter
+    /// \returns reference to this filter
+    ///
+    PbiFilter& Add(PbiFilter filter);
+
+    /// \brief Add child filters.
+    ///
+    /// \param[in] filters  additional child filters
+    /// \returns reference to this filter
+    ///
+    PbiFilter& Add(std::vector<PbiFilter> filters);
+
+    /// \returns true if this filter has no child filters.
+    bool IsEmpty() const;
+
+    /// \returns number of child filters
+    size_t NumChildren() const;
+
+    /// \returns filter type (intersect, union)
+    CompositionType Type() const;
+
+    /// \}
+
+public:
+    /// \name Lookup
+    /// \{
+
+    /// \brief Performs the PBI index lookup, combining child results for a
+    ///        composite filter.
+    ///
+    /// \param[in] idx  PBI (raw) index object
+    /// \param[in] row  record number in %BAM/PBI files
+    ///
+    /// \returns true if record at \p row passes this filter criteria,
+    ///          including children (if any)
+    ///
+    bool Accepts(const BAM::PbiRawData& idx, const size_t row) const;
+
+    /// \}
+
+private:
+    std::unique_ptr<internal::PbiFilterPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/PbiFilterTypes.h"
+#include "pbbam/internal/PbiFilter.inl"
+
+#endif  // PBIFILTER_H
diff --git a/include/pbbam/PbiFilterQuery.h b/include/pbbam/PbiFilterQuery.h

new file mode 100644 (file)

index 0000000..1469ef9
--- /dev/null
+++ b/include/pbbam/PbiFilterQuery.h
@@ -0,0 +1,75 @@
+// File Description
+/// \file PbiFilterQuery.h
+/// \brief Defines the PbiFilterQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef PBIFILTERQUERY_H
+#define PBIFILTERQUERY_H
+
+#include <vector>
+#include "pbbam/Config.h"
+#include "pbbam/PbiFilter.h"
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The PbiFilterQuery class provides iterable access to a DataSet's %BAM
+///        records, limiting results to those matching filter criteria.
+///
+/// Example:
+/// \include code/PbiFilterQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT PbiFilterQuery : public internal::IQuery
+{
+public:
+    ///
+    /// \brief Creates a new PbiFilterQuery, limiting record results to only
+    ///        those matching filter criteria defined in the DataSet XML.
+    ///
+    /// \param[in] dataset input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or
+    ///         PBI files.
+    ///
+    PbiFilterQuery(const DataSet& dataset);
+
+    /// \brief Creates a new PbiFilterQuery, limiting record results to only
+    ///        those matching filter criteria.
+    ///
+    /// \param[in] filter   filtering criteria
+    /// \param[in] dataset  input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or
+    ///         PBI files.
+    ///
+    PbiFilterQuery(const PbiFilter& filter, const DataSet& dataset);
+
+    ~PbiFilterQuery() override;
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r) override;
+
+    /// \brief Returns the number of records that pass the provided filter.
+    ///
+    uint32_t NumReads() const;
+
+private:
+    struct PbiFilterQueryPrivate;
+    std::unique_ptr<PbiFilterQueryPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBIFILTERQUERY_H
diff --git a/include/pbbam/PbiFilterTypes.h b/include/pbbam/PbiFilterTypes.h

new file mode 100644 (file)

index 0000000..fe82243
--- /dev/null
+++ b/include/pbbam/PbiFilterTypes.h
@@ -0,0 +1,899 @@
+// File Description
+/// \file PbiFilterTypes.h
+/// \brief Defines the built-in PBI filters.
+//
+// Author: Derek Barnett
+
+#ifndef PBIFILTERTYPES_H
+#define PBIFILTERTYPES_H
+
+#include <boost/optional.hpp>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include "pbbam/Compare.h"
+#include "pbbam/PbiFile.h"
+#include "pbbam/PbiFilter.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+
+/// \internal
+///
+/// Provides basic container for value/compare-type pair
+///
+template <typename T>
+struct FilterBase
+{
+public:
+    T value_;
+    boost::optional<std::vector<T> > multiValue_;
+    Compare::Type cmp_;
+
+protected:
+    FilterBase(T value, const Compare::Type cmp);
+    FilterBase(std::vector<T> values, const Compare::Type cmp = Compare::EQUAL);
+
+protected:
+    bool CompareHelper(const T& lhs) const;
+
+private:
+    bool CompareSingleHelper(const T& lhs) const;
+    bool CompareMultiHelper(const T& lhs) const;
+};
+
+/// \internal
+///
+/// Dispatches the lookup to BarcodeLookupData
+///
+template <typename T, PbiFile::BarcodeField field>
+struct BarcodeDataFilterBase : public FilterBase<T>
+{
+protected:
+    BarcodeDataFilterBase(T value, const Compare::Type cmp);
+    BarcodeDataFilterBase(std::vector<T> values, const Compare::Type cmp = Compare::EQUAL);
+
+public:
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+};
+
+/// \internal
+///
+/// Dispatches the lookup to BasicLookupData
+///
+template <typename T, PbiFile::BasicField field>
+struct BasicDataFilterBase : public FilterBase<T>
+{
+protected:
+    BasicDataFilterBase(T value, const Compare::Type cmp);
+    BasicDataFilterBase(std::vector<T> values, const Compare::Type cmp = Compare::EQUAL);
+
+public:
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+};
+
+/// \internal
+///
+/// Dispatches the lookup to MappedLookupData
+///
+template <typename T, PbiFile::MappedField field>
+struct MappedDataFilterBase : public FilterBase<T>
+{
+protected:
+    MappedDataFilterBase(T value, const Compare::Type cmp);
+    MappedDataFilterBase(std::vector<T> values, const Compare::Type cmp = Compare::EQUAL);
+
+public:
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+};
+
+}  // namespace internal
+
+/// \brief The PbiAlignedEndFilter class provides a PbiFilter-compatible filter
+///        on aligned end.
+///
+/// Example: \include code/PbiAlignedEndFilter.txt
+///
+/// \sa BamRecord::AlignedEnd
+///
+struct PbiAlignedEndFilter
+    : public internal::MappedDataFilterBase<uint32_t, PbiFile::MappedField::A_END>
+{
+public:
+    /// \brief Creates a filter on aligned end.
+    ///
+    /// \param[in] position value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiAlignedEndFilter(const uint32_t position, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiAlignedLengthFilter class provides a PbiFilter-compatible
+///        filter on aligned length.
+///
+/// Example: \include code/PbiAlignedLengthFilter.txt
+///
+/// \sa BamRecord::AlignedEnd, BamRecord::AlignedStart
+///
+struct PbiAlignedLengthFilter : public internal::FilterBase<uint32_t>
+{
+public:
+    /// \brief Creates a filter on aligned length.
+    ///
+    /// \param[in] length value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiAlignedLengthFilter(const uint32_t length, const Compare::Type cmp = Compare::EQUAL);
+
+public:
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+};
+
+/// \brief The PbiAlignedStartFilter class provides a PbiFilter-compatible
+///        filter on aligned start.
+///
+/// Example: \include code/PbiAlignedStartFilter.txt
+///
+/// \sa BamRecord::AlignedStart
+///
+struct PbiAlignedStartFilter
+    : public internal::MappedDataFilterBase<uint32_t, PbiFile::MappedField::A_START>
+{
+public:
+    /// \brief Creates a filter on aligned start.
+    ///
+    /// \param[in] position value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiAlignedStartFilter(const uint32_t position, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiAlignedStrandFilter class provides a PbiFilter-compatible
+///        filter on aligned strand.
+///
+/// Example: \include code/PbiAlignedStrandFilter.txt
+///
+/// \sa BamRecord::AlignedStrand
+///
+struct PbiAlignedStrandFilter
+    : public internal::MappedDataFilterBase<Strand, PbiFile::MappedField::STRAND>
+{
+public:
+    /// \brief Creates a strand filter.
+    ///
+    /// \param[in] strand  strand value to compare on
+    /// \param[in] cmp     compare type
+    ///
+    PbiAlignedStrandFilter(const Strand strand, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiBarcodeFilter class provides a PbiFilter-compatible filter on
+///        barcode ID.
+///
+/// Any record with this barcode ID (forward or reverse) will pass this filter.
+///
+/// Example: \include code/PbiBarcodeFilter.txt
+///
+/// \sa BamRecord::BarcodeForward, BamRecord::BarcodeReverse
+///
+struct PbiBarcodeFilter
+{
+public:
+    /// \brief Creates a single-value barcode filter.
+    ///
+    /// \param[in] barcode  barcode ID to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiBarcodeFilter(const int16_t barcode, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' barcode filter.
+    ///
+    /// \note With the default compare type (Compare::EQUAL), records will match
+    ///       at least one value from the whitelist, exactly, in either
+    ///       bc_forward or bc_reverse.
+    /// \param[in] whitelist  barcode IDs to compare on
+    /// \param[in] cmp        compare type (defaults to Compare::EQUAL)
+    ///
+    PbiBarcodeFilter(std::vector<int16_t> whitelist, const Compare::Type cmp = Compare::EQUAL);
+
+public:
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+    PbiFilter compositeFilter_;
+};
+
+/// \brief The PbiBarcodeForwardFilter class provides a PbiFilter-compatible
+///        filter on forward barcode ID.
+///
+/// Example: \include code/PbiBarcodeForwardFilter.txt
+///
+/// \sa BamRecord::BarcodeForward
+///
+struct PbiBarcodeForwardFilter
+    : public internal::BarcodeDataFilterBase<int16_t, PbiFile::BarcodeField::BC_FORWARD>
+{
+public:
+    /// \brief Creates a single-value forward barcode filter.
+    ///
+    /// \param[in] bcFwdId  (forward) barcode ID to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiBarcodeForwardFilter(const int16_t bcFwdId, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' forward barcode filter.
+    ///
+    /// \note With the default compare type (Compare::EQUAL), records will match
+    ///       at least one value from the whitelist, exactly, in bc_forward.
+    ///
+    /// \param[in] whitelist  barcode IDs to compare on
+    /// \param[in] cmp        compare type (defaults to Compare::EQUAL)
+    ///
+    PbiBarcodeForwardFilter(std::vector<int16_t> whitelist,
+                            const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiBarcodeQualityFilter class provides a PbiFilter-compatible
+///        filter on barcode quality.
+///
+/// Example: \include code/PbiBarcodeQualityFilter.txt
+///
+/// \sa BamRecord::BarcodeQuality
+///
+struct PbiBarcodeQualityFilter
+    : public internal::BarcodeDataFilterBase<uint8_t, PbiFile::BarcodeField::BC_QUALITY>
+{
+public:
+    /// \brief Creates a single-value barcode quality filter.
+    ///
+    /// \param[in] bcQuality    barcode quality to compare on
+    /// \param[in] cmp          compare type
+    ///
+    PbiBarcodeQualityFilter(const uint8_t bcQuality, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiBarcodeReverseFilter class provides a PbiFilter-compatible
+///        filter on reverse barcode ID.
+///
+/// Example: \include code/PbiBarcodeReverseFilter.txt
+///
+/// \sa BamRecord::BarcodeReverse
+///
+struct PbiBarcodeReverseFilter
+    : public internal::BarcodeDataFilterBase<int16_t, PbiFile::BarcodeField::BC_REVERSE>
+{
+public:
+    /// \brief Creates a single-value reverse barcode filter.
+    ///
+    /// \param[in] bcRevId  (reverse) barcode ID to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiBarcodeReverseFilter(const int16_t bcRevId, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' reverse barcode filter.
+    ///
+    /// \note With the default compare type (Compare::EQUAL), records will match
+    ///       at least one value from the whitelist, exactly, in bc_reverse.
+    ///
+    /// \param[in] whitelist  barcode IDs to compare on
+    /// \param[in] cmp        compare type (defaults to Compare::EQUAL)
+    ///
+    PbiBarcodeReverseFilter(std::vector<int16_t> whitelist,
+                            const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiBarcodesFilter class provides a PbiFilter-compatible filter on
+///        both forward & reverse barcode IDs.
+///
+/// A record must match both IDs to pass the filter.
+///
+/// Example: \include code/PbiBarcodesFilter.txt
+///
+/// \sa BamRecord::Barcodes
+///
+struct PbiBarcodesFilter
+{
+public:
+    /// \brief Creates a barcodes filter from a std::pair of IDs.
+    ///
+    /// pair.first -> BarcodeForward\n
+    /// pair.second -> BarcodeReverse
+    ///
+    /// \param[in] barcodes barcode IDs to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiBarcodesFilter(const std::pair<int16_t, int16_t> barcodes,
+                      const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a barcodes filter from forward & reverse IDs.
+    ///
+    /// \param[in] bcForward    forward barcode ID to compare on
+    /// \param[in] bcReverse    reverse barcode ID to compare on
+    /// \param[in] cmp          compare type
+    ///
+    PbiBarcodesFilter(const int16_t bcForward, const int16_t bcReverse,
+                      const Compare::Type cmp = Compare::EQUAL);
+
+public:
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+    PbiFilter compositeFilter_;
+};
+
+/// \brief The PbiIdentityFilter class provides a PbiFilter-compatible filter on
+///        read identity (% aligned match).
+///
+/// Read identity is equivalent to: 1.0 - (nMM + nDel + nIns)/readLength.
+///
+/// Example: \include code/PbiIdentityFilter.txt
+///
+struct PbiIdentityFilter : public internal::FilterBase<float>
+{
+public:
+    /// \brief Creates a read identity filter.
+    ///
+    /// \param[in] identity value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiIdentityFilter(const float identity, const Compare::Type cmp = Compare::EQUAL);
+
+public:
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+};
+
+/// \brief The PbiLocalContextFilter class provides a PbiFilter-compatible
+///        filter on local context (adapter, barcode, etc.).
+///
+/// The primary Compare::Type operators intended for this filter are:
+/// Compare::EQUAL, Compare::NOT_EQUAL, Compare::CONTAINS, and
+/// Compare::NOT_CONTAINS.
+///
+/// Example: \include code/PbiLocalContextFilter.txt
+///
+struct PbiLocalContextFilter
+    : public internal::BasicDataFilterBase<LocalContextFlags, PbiFile::BasicField::CONTEXT_FLAG>
+{
+public:
+    PbiLocalContextFilter(const LocalContextFlags& flags, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiMapQualityFilter class provides a PbiFilter-compatible filter on
+///        mapping quality.
+///
+/// Example: \include code/PbiMapQualityFilter.txt
+///
+/// \sa BamRecord::MapQuality
+///
+struct PbiMapQualityFilter
+    : public internal::MappedDataFilterBase<uint8_t, PbiFile::MappedField::MAP_QUALITY>
+{
+public:
+    /// \brief Creates a map quality filter.
+    ///
+    /// \param[in] mapQual  value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiMapQualityFilter(const uint8_t mapQual, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiMovieNameFilter class provides a PbiFilter-compatible filter
+///        on movie name.
+///
+/// Example: \include code/PbiMovieNameFilter.txt
+///
+/// \sa BamRecord::MovieName
+///
+struct PbiMovieNameFilter
+{
+public:
+    /// \brief Creates a single-value movie name filter.
+    ///
+    /// \param[in] movieName    movie name to compare on
+    /// \param[in] cmp          compare type
+    ///
+    /// \note With the default Compare::EQUAL, records match movie name exactly.
+    ///
+    PbiMovieNameFilter(const std::string& movieName, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' movie name filter.
+    ///
+    /// \note With the default compare type (Compare::EQUAL), records will match
+    ///       at least one value from the whitelist, exactly.
+    ///
+    /// \param[in] whitelist    movie names to compare on
+    /// \param[in] cmp          compare type
+    ///
+    PbiMovieNameFilter(const std::vector<std::string>& whitelist,
+                       const Compare::Type cmp = Compare::EQUAL);
+
+public:
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+    PbiFilter compositeFilter_;
+    Compare::Type cmp_;
+};
+
+/// \brief The PbiNumDeletedBasesFilter class provides a PbiFilter-compatible
+///        filter on the number of deleted bases.
+///
+/// Example: \include code/PbiNumDeletedBasesFilter.txt
+///
+/// \sa BamRecord::NumDeletedBases
+///
+struct PbiNumDeletedBasesFilter
+    : public internal::MappedDataFilterBase<size_t, PbiFile::MappedField::N_DEL>
+{
+public:
+    /// \brief Creates a filter on the number of deleted bases.
+    ///
+    /// \param[in] numDeletions value to compare on
+    /// \param[in] cmp          compare type
+    ///
+    PbiNumDeletedBasesFilter(const size_t numDeletions, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiNumInsertedBasesFilter class provides a PbiFilter-compatible
+///        filter on the number of inserted bases.
+///
+/// Example: \include code/PbiNumInsertedBasesFilter.txt
+///
+/// \sa BamRecord::NumInsertedBases
+///
+struct PbiNumInsertedBasesFilter
+    : public internal::MappedDataFilterBase<size_t, PbiFile::MappedField::N_INS>
+{
+public:
+    /// \brief Creates a filter on the number of inserted bases.
+    ///
+    /// \param[in] numInsertions    value to compare on
+    /// \param[in] cmp              compare type
+    ///
+    PbiNumInsertedBasesFilter(const size_t numInsertions, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiNumMatchesFilter class provides a PbiFilter-compatible filter
+///        on the number of matched bases.
+///
+/// Example: \include code/PbiNumMatchesFilter.txt
+///
+/// \sa BamRecord::NumMatches
+///
+struct PbiNumMatchesFilter
+    : public internal::MappedDataFilterBase<size_t, PbiFile::MappedField::N_M>
+{
+public:
+    /// \brief Creates a filter on the number of matched bases.
+    ///
+    /// \param[in] numMatchedBases  value to compare on
+    /// \param[in] cmp              compare type
+    ///
+    PbiNumMatchesFilter(const size_t numMatchedBases, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiNumMismatchesFilter class provides a PbiFilter-compatible
+///        filter on the number of mismatched bases.
+///
+/// Example: \include code/PbiNumMismatchesFilter.txt
+///
+/// \sa BamRecord::NumMismatches
+///
+struct PbiNumMismatchesFilter
+    : public internal::MappedDataFilterBase<size_t, PbiFile::MappedField::N_MM>
+{
+public:
+    /// \brief Creates a filter on the number of mismatched bases.
+    ///
+    /// \param[in] numMismatchedBases   value to compare on
+    /// \param[in] cmp                  compare type
+    ///
+    PbiNumMismatchesFilter(const size_t numMismatchedBases,
+                           const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiQueryEndFilter class provides a PbiFilter-compatible filter
+///        on query end.
+///
+/// Example: \include code/PbiQueryEndFilter.txt
+///
+/// \sa BamRecord::QueryEnd
+///
+struct PbiQueryEndFilter : public internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::Q_END>
+{
+public:
+    /// \brief Creates a filter on query end position.
+    ///
+    /// \param[in] position value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiQueryEndFilter(const int32_t position, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiQueryLengthFilter class provides a PbiFilter-compatible filter
+///        on query length.
+///
+/// queryLength = (queryEnd - queryStart)
+///
+/// Example: \include code/PbiQueryLengthFilter.txt
+///
+/// \sa BamRecord::QueryEnd, BamRecord::QueryStart
+///
+struct PbiQueryLengthFilter : public internal::FilterBase<int32_t>
+{
+public:
+    /// \brief Creates a filter on query length
+    ///
+    /// \param[in] length   value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiQueryLengthFilter(const int32_t length, const Compare::Type cmp = Compare::EQUAL);
+
+public:
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+};
+
+/// \brief The PbiQueryNameFilter class provides a PbiFilter-compatible filter
+///        on query name.
+///
+/// Example: \include code/PbiQueryNameFilter.txt
+///
+/// \sa BamRecord::FullName
+///
+struct PbiQueryNameFilter
+{
+public:
+    /// \brief Creates a single-value query name filter.
+    ///
+    /// \param[in] qname    query name to compare on
+    /// \param[in] cmp      compare type
+    ///
+    /// \note With the default Compare::EQUAL, records match query name exactly.
+    ///
+    PbiQueryNameFilter(const std::string& qname, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' query name filter.
+    ///
+    /// \note With the default compare type (Compare::EQUAL), records will match
+    ///       at least one value from the whitelist, exactly.
+    ///
+    /// \param[in] whitelist    query names to compare on
+    /// \param[in] cmp          compare type
+    ///
+    PbiQueryNameFilter(const std::vector<std::string>& whitelist,
+                       const Compare::Type cmp = Compare::EQUAL);
+
+    PbiQueryNameFilter(const PbiQueryNameFilter& other);
+    ~PbiQueryNameFilter();
+
+public:
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+    struct PbiQueryNameFilterPrivate;
+    std::unique_ptr<PbiQueryNameFilterPrivate> d_;
+};
+
+/// \brief The PbiQueryStartFilter class provides a PbiFilter-compatible filter
+///        on query start.
+///
+/// Example: \include code/PbiQueryStartFilter.txt
+///
+/// \sa BamRecord::QueryStart
+///
+struct PbiQueryStartFilter
+    : public internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::Q_START>
+{
+public:
+    /// \brief Creates a filter on query start position.
+    ///
+    /// \param[in] position value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiQueryStartFilter(const int32_t position, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiReadAccuracyFilter class provides a PbiFilter-compatible filter
+///        on read accuracy.
+///
+/// Example: \include code/PbiReadAccuracyFilter.txt
+///
+/// \sa BamRecord::ReadAccuracy
+///
+struct PbiReadAccuracyFilter
+    : public internal::BasicDataFilterBase<Accuracy, PbiFile::BasicField::READ_QUALITY>
+{
+public:
+    /// \brief Creates a filter on read accuracy.
+    ///
+    /// \param[in] accuracy value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiReadAccuracyFilter(const Accuracy accuracy, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiReadGroupFilter class provides a PbiFilter-compatible filter
+///        on read group.
+///
+/// Example: \include code/PbiReadGroupFilter.txt
+///
+/// \sa BamRecord::ReadGroup,
+///     BamRecord::ReadGroupId,
+///     BamRecord::ReadGroupNumericId
+///
+struct PbiReadGroupFilter
+    : public internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::RG_ID>
+{
+public:
+    /// \brief Creates a filter on read group (numeric) ID value
+    ///
+    /// \param[in] rgId     numeric read group ID
+    /// \param[in] cmp      compare type
+    ///
+    /// \sa BamRecord::ReadGroupNumericId
+    ///
+    PbiReadGroupFilter(const int32_t rgId, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a filter on printable read group ID value
+    ///
+    /// \param[in] rgId     read group ID string
+    /// \param[in] cmp      compare type
+    ///
+    /// \sa BamRecord::ReadGroupId
+    ///
+    PbiReadGroupFilter(const std::string& rgId, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a filter on read group (object).
+    ///
+    /// \param[in] rg   read group object
+    /// \param[in] cmp  compare type
+    ///
+    /// \sa BamRecord::ReadGroup
+    ///
+    PbiReadGroupFilter(const ReadGroupInfo& rg, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' filter on read group numeric IDs.
+    ///
+    /// \note With the default compare type (Compare::EQUAL), records will match
+    ///       at least one value from the whitelist, exactly.
+    ///
+    /// \param[in] whitelist    read group IDs to compare on
+    /// \param[in] cmp          compare type
+    ///
+    PbiReadGroupFilter(std::vector<int32_t> whitelist, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' filter on read group printable IDs.
+    ///
+    /// \note With the default compare type (Compare::EQUAL), records will match
+    ///       at least one value from the whitelist, exactly.
+    ///
+    /// \param[in] whitelist    read group ID strings to compare on
+    /// \param[in] cmp          compare type
+    ///
+    PbiReadGroupFilter(const std::vector<std::string>& whitelist,
+                       const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' filter using read group objects.
+    ///
+    /// \note With the default compare type (Compare::EQUAL), records will match
+    ///       at least one value from the whitelist, exactly.
+    ///
+    /// \param[in] whitelist    read group objects to compare on
+    /// \param[in] cmp          compare type
+    ///
+    PbiReadGroupFilter(const std::vector<ReadGroupInfo>& whitelist,
+                       const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiReferenceEndFilter class provides a PbiFilter-compatible
+///        filter on reference end.
+///
+/// Example: \include code/PbiReferenceEndFilter.txt
+///
+/// \sa BamRecord::ReferenceEnd
+///
+struct PbiReferenceEndFilter
+    : public internal::MappedDataFilterBase<uint32_t, PbiFile::MappedField::T_END>
+{
+public:
+    /// \brief Creates a filter on reference end.
+    ///
+    /// \param[in] tEnd     value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiReferenceEndFilter(const uint32_t tEnd, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiReferenceIdFilter class provides a PbiFilter-compatible
+///        filter on reference ID.
+///
+/// Example: \include code/PbiReferenceIdFilter.txt
+///
+/// \sa BamRecord::ReferenceId
+///
+struct PbiReferenceIdFilter
+    : public internal::MappedDataFilterBase<int32_t, PbiFile::MappedField::T_ID>
+{
+public:
+    /// \brief Creates a single-value reference ID filter.
+    ///
+    /// \param[in] tId  reference ID to compare on
+    /// \param[in] cmp  compare type
+    ///
+    PbiReferenceIdFilter(const int32_t tId, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' reference ID filter.
+    ///
+    /// \note With the default compare type (Compare::EQUAL), records will match
+    ///       at least one value from the whitelist, exactly.
+    ///
+    /// \param[in] whitelist    reference IDs to compare on
+    /// \param[in] cmp          compare type
+    ///
+    PbiReferenceIdFilter(std::vector<int32_t> whitelist, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiReferenceNameFilter class provides a PbiFilter-compatible
+///        filter on reference name.
+///
+/// Example: \include code/PbiReferenceNameFilter.txt
+///
+/// \sa BamRecord::ReferenceName
+///
+struct PbiReferenceNameFilter
+{
+public:
+    /// \brief Creates a single-value reference name filter.
+    ///
+    /// \param[in] rname    reference name to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiReferenceNameFilter(std::string rname, Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' reference name filter.
+    ///
+    /// \note With the default compare type (Compare::EQUAL), records will match
+    ///       at least one value from the whitelist, exactly.
+    ///
+    /// \param[in] whitelist    reference names to compare on
+    /// \param[in] cmp          compare type
+    ///
+    PbiReferenceNameFilter(std::vector<std::string> whitelist,
+                           const Compare::Type cmp = Compare::EQUAL);
+
+public:
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+    mutable bool initialized_ = false;
+    mutable PbiFilter subFilter_;
+    std::string rname_;
+    boost::optional<std::vector<std::string> > rnameWhitelist_;
+    Compare::Type cmp_;
+
+private:
+    // marked const so we can delay setup of filter in Accepts(), once we have
+    // access to PBI/BAM input. modified values marked mutable accordingly
+    void Initialize(const PbiRawData& idx) const;
+};
+
+/// \brief The PbiReferenceStartFilter class provides a PbiFilter-compatible
+///        filter on reference start.
+///
+/// Example: \include code/PbiReferenceStartFilter.txt
+///
+/// \sa BamRecord::ReferenceStart
+///
+struct PbiReferenceStartFilter
+    : public internal::MappedDataFilterBase<uint32_t, PbiFile::MappedField::T_START>
+{
+public:
+    /// \brief Creates a filter on reference start.
+    ///
+    /// \param[in] tStart   value to compare on
+    /// \param[in] cmp      compare type
+    ///
+    PbiReferenceStartFilter(const uint32_t tStart, const Compare::Type cmp = Compare::EQUAL);
+};
+
+/// \brief The PbiZmwFilter class provides a PbiFilter-compatible filter on
+///        ZMW hole number.
+///
+/// Example: \include code/PbiZmwFilter.txt
+///
+/// \sa BamRecord::HoleNumber
+///
+struct PbiZmwFilter : public internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::ZMW>
+{
+public:
+    /// \brief Creates a single-value ZMW hole number filter.
+    ///
+    /// \param[in] zmw  value to compare on
+    /// \param[in] cmp  compare type
+    ///
+    PbiZmwFilter(const int32_t zmw, const Compare::Type cmp = Compare::EQUAL);
+
+    /// \brief Creates a 'whitelisted' ZMW hole number filter.
+    ///
+    /// \note With the default compare type (Compare::EQUAL), records will match
+    ///       at least one value from the whitelist, exactly.
+    ///
+    /// \param[in] whitelist    ZMW hole numbers to compare on
+    /// \param[in] cmp          compare type
+    ///
+    PbiZmwFilter(std::vector<int32_t> whitelist, const Compare::Type cmp = Compare::EQUAL);
+};
+
+// ----------------------------------------------
+// NOTE: modulo filtering only enabled for ZMW.
+//
+// I need to generalize more if we're going to use
+// this on more fields.
+// ----------------------------------------------
+
+enum class FilterHash
+{
+    UNSIGNED_LONG_CAST,
+    BOOST_HASH_COMBINE,
+};
+
+struct PbiZmwModuloFilter
+{
+    PbiZmwModuloFilter(const uint32_t denominator, const uint32_t value,
+                       const FilterHash hashtype = FilterHash::UNSIGNED_LONG_CAST,
+                       const Compare::Type = Compare::EQUAL);
+
+    /// \brief Performs the actual index lookup.
+    ///
+    /// Most client code should not need to use this method directly.
+    ///
+    bool Accepts(const PbiRawData& idx, const size_t row) const;
+
+private:
+    uint32_t denominator_;
+    uint32_t value_;
+    FilterHash hash_;
+    Compare::Type cmp_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/PbiFilterTypes.inl"
+
+#endif  // PBIFILTERTYPES_H
diff --git a/include/pbbam/PbiIndexedBamReader.h b/include/pbbam/PbiIndexedBamReader.h

new file mode 100644 (file)

index 0000000..7d2ed6b
--- /dev/null
+++ b/include/pbbam/PbiIndexedBamReader.h
@@ -0,0 +1,118 @@
+// File Description
+/// \file PbiIndexedBamReader.h
+/// \brief Defines the PbiIndexedBamReader class.
+//
+// Author: Derek Barnett
+
+#ifndef PBIINDEXEDBAMREADER_H
+#define PBIINDEXEDBAMREADER_H
+
+#include <string>
+#include "pbbam/BamFile.h"
+#include "pbbam/BamReader.h"
+#include "pbbam/PbiBasicTypes.h"
+#include "pbbam/PbiFilter.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+struct PbiIndexedBamReaderPrivate;
+}
+
+/// \brief The PbiIndexedBamReader class provides read-only iteration over %BAM
+///        records, limited to some filtering criteria.
+///
+/// The PacBio BAM index (*.pbi) is used to allow random-access operations.
+///
+class PBBAM_EXPORT PbiIndexedBamReader : public BamReader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Constructs %BAM reader, with an initial filter.
+    ///
+    /// All reads that satisfy the filter will be available.
+    ///
+    /// \param[in] filter       PbiFilter or compatible object
+    /// \param[in] bamFilename  input %BAM filename
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be
+    ///         read
+    ///
+    PbiIndexedBamReader(PbiFilter filter, const std::string& bamFilename);
+
+    /// \brief Constructs %BAM reader, with an initial filter.
+    ///
+    /// All reads that satisfy the filter will be available.
+    ///
+    /// \param[in] filter       PbiFilter or compatible object
+    /// \param[in] bamFile      input BamFile object
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be
+    ///         read
+    ///
+    PbiIndexedBamReader(PbiFilter filter, BamFile bamFile);
+
+    /// \brief Constructs %BAM reader, with no initial filter.
+    ///
+    /// Useful for delaying either specifying the filtering criteria or
+    /// performing the PBI lookups.
+    ///
+    /// \param[in] bamFilename  input %BAM filename
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be
+    ///         read
+    ///
+    PbiIndexedBamReader(const std::string& bamFilename);
+
+    /// \brief Constructs %BAM reader, with no initial filter.
+    ///
+    /// Useful for delaying either specifying the filtering criteria or
+    /// performing the PBI lookups.
+    ///
+    /// \param[in] bamFile      input BamFile object
+    ///
+    /// \throws std::runtime_error if either file (*.bam or *.pbi) cannot be
+    ///         read
+    ///
+    PbiIndexedBamReader(BamFile bamFile);
+
+    ~PbiIndexedBamReader() override;
+
+    /// \}
+
+public:
+    /// \name Filtering & Index Data
+    /// \{
+
+    /// \returns the current filter active on this reader
+    const PbiFilter& Filter() const;
+
+    uint32_t NumReads() const;
+
+    //    /// \returns the reader's underlying index data
+    //    const PbiIndex& Index() const;
+
+public:
+    /// \brief Sets a new filter on the reader.
+    ///
+    /// \param[in] filter   new filter to set on this reader
+    /// \returns reference to this reader
+    ///
+    PbiIndexedBamReader& Filter(PbiFilter filter);
+
+    /// \}
+
+protected:
+    int ReadRawData(BGZF* bgzf, bam1_t* b) override;
+
+private:
+    std::unique_ptr<internal::PbiIndexedBamReaderPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBIINDEXEDBAMREADER_H
diff --git a/include/pbbam/PbiRawData.h b/include/pbbam/PbiRawData.h

new file mode 100644 (file)

index 0000000..b340dee
--- /dev/null
+++ b/include/pbbam/PbiRawData.h
@@ -0,0 +1,486 @@
+// File Description
+/// \file PbiRawData.h
+/// \brief Defines the classes used for working with raw PBI data.
+//
+// Author: Derek Barnett
+
+#ifndef PBIRAWDATA_H
+#define PBIRAWDATA_H
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <vector>
+#include "pbbam/Config.h"
+#include "pbbam/PbiFile.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamRecord;
+class DataSet;
+
+/// \brief The PbiRawBarcodeData class represents the raw data stored in the
+///        "BarcodeData" section of the PBI index.
+///
+class PBBAM_EXPORT PbiRawBarcodeData
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty data structure, preallocating space for a known
+    ///        number of records.
+    PbiRawBarcodeData(uint32_t numReads);
+
+    PbiRawBarcodeData() = default;
+    PbiRawBarcodeData(const PbiRawBarcodeData&) = default;
+    PbiRawBarcodeData(PbiRawBarcodeData&&) = default;
+    PbiRawBarcodeData& operator=(const PbiRawBarcodeData&) = default;
+    PbiRawBarcodeData& operator=(PbiRawBarcodeData&&) = default;
+    ~PbiRawBarcodeData() = default;
+
+    /// \}
+
+public:
+    /// \name Index Construction
+    /// \{
+
+    /// \brief Adds a record's barcode data.
+    ///
+    /// \param[in] b    %BAM record
+    ///
+    void AddRecord(const BamRecord& b);
+
+    /// \}
+
+public:
+    /// \name Raw Data Containers
+    /// \{
+
+    std::vector<int16_t> bcForward_;
+    std::vector<int16_t> bcReverse_;
+    std::vector<int8_t> bcQual_;
+
+    /// \}
+};
+
+/// \brief The PbiRawMappedData class represents the raw data stored in the
+///        "MappedData" section of the PBI index.
+///
+class PBBAM_EXPORT PbiRawMappedData
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty data structure, preallocating space for a known
+    ///        number of records.
+    PbiRawMappedData(uint32_t numReads);
+
+    PbiRawMappedData() = default;
+    PbiRawMappedData(const PbiRawMappedData&) = default;
+    PbiRawMappedData(PbiRawMappedData&&) = default;
+    PbiRawMappedData& operator=(const PbiRawMappedData&) = default;
+    PbiRawMappedData& operator=(PbiRawMappedData&&) = default;
+    ~PbiRawMappedData() = default;
+
+    /// \}
+
+public:
+    /// \name Index Construction
+    /// \{
+
+    /// \brief Adds a record's mapping data.
+    ///
+    /// \param[in] b    %BAM record
+    ///
+    void AddRecord(const BamRecord& b);
+
+    /// \}
+
+public:
+    /// \name Index Data Query
+    /// \{
+
+    /// \brief Calculates the number of deleted bases for a particular record.
+    ///
+    /// Convenience method. Equivalent to:
+    /// \code{.cpp}
+    /// NumDeletedAndInsertedBasesAt(i).first;
+    /// \endcode
+    ///
+    /// \param[in] recordIndex  i-th record
+    /// \returns number of deleted bases
+    ///
+    uint32_t NumDeletedBasesAt(size_t recordIndex) const;
+
+    /// \brief Calculates the number of inserted bases for a particular record.
+    ///
+    /// Convenience method. Equivalent to:
+    /// \code{.cpp}
+    /// NumDeletedAndInsertedBasesAt(i).second;
+    /// \endcode
+    ///
+    /// \param[in] recordIndex  i-th record
+    /// \returns number of inserted bases
+    ///
+    uint32_t NumInsertedBasesAt(size_t recordIndex) const;
+
+    /// \brief Calculates the number of deleted & inserted bases for a
+    ///        particular record.
+    ///
+    /// \param[in] recordIndex  i-th record in the data set
+    /// \returns a pair consisting of (numDeletions,numInsertions)
+    ///
+    std::pair<uint32_t, uint32_t> NumDeletedAndInsertedBasesAt(size_t recordIndex) const;
+
+    /// \}
+
+public:
+    /// \name Raw Data Containers
+    /// \{
+
+    std::vector<int32_t> tId_;
+    std::vector<uint32_t> tStart_;
+    std::vector<uint32_t> tEnd_;
+    std::vector<uint32_t> aStart_;
+    std::vector<uint32_t> aEnd_;
+    std::vector<uint8_t> revStrand_;
+    std::vector<uint32_t> nM_;
+    std::vector<uint32_t> nMM_;
+    std::vector<uint8_t> mapQV_;
+
+    /// \}
+};
+
+/// \brief The PbiReferenceEntry class represents a single reference in the PBI
+///        CoordinateSorted section.
+///
+/// A reference entry consists of an associated reference ID (tId), as well as
+/// start and end indices into the %BAM or PBI.
+///
+/// \note Rows are given in the interval [start, end).
+///
+class PBBAM_EXPORT PbiReferenceEntry
+{
+public:
+    using ID = uint32_t;
+    using Row = uint32_t;
+
+public:
+    static const ID UNMAPPED_ID;
+    static const Row UNSET_ROW;
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a default entry.
+    ///
+    /// - default ID:   PbiReferenceEntry::UNMAPPED_ID \n
+    /// - default rows: PbiReferenceEntry::UNSET_ROW
+    ///
+    PbiReferenceEntry();
+
+    /// \brief Creates a reference entry, with no rows set.
+    ///
+    /// - default rows: PbiReferenceEntry::UNSET_ROW
+    ///
+    PbiReferenceEntry(ID id);
+
+    /// \brief Creates a reference entry, with rows set.
+    ///
+    PbiReferenceEntry(ID id, Row beginRow, Row endRow);
+
+    PbiReferenceEntry(const PbiReferenceEntry&) = default;
+    PbiReferenceEntry(PbiReferenceEntry&&) = default;
+    PbiReferenceEntry& operator=(const PbiReferenceEntry&) = default;
+    PbiReferenceEntry& operator=(PbiReferenceEntry&&) = default;
+    ~PbiReferenceEntry() = default;
+
+    bool operator==(const PbiReferenceEntry& other) const;
+
+    /// \}
+
+public:
+    /// \name Reference Data Members
+    /// \{
+
+    ID tId_;
+    Row beginRow_;
+    Row endRow_;
+
+    /// \}
+};
+
+/// \brief The PbiRawReferenceData class represents the raw data stored in the
+///        "CoordinateSortedData" section of the PBI index.
+///
+class PBBAM_EXPORT PbiRawReferenceData
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty data structure, preallocating space for a
+    ///        number of references.
+    ///
+    /// This constructor is recommended as this is the safest way to ensure that
+    /// references without observed mappings are included in the final output.
+    ///
+    PbiRawReferenceData(uint32_t numRefs);
+
+    PbiRawReferenceData() = default;
+    PbiRawReferenceData(const PbiRawReferenceData&) = default;
+    PbiRawReferenceData(PbiRawReferenceData&&) = default;
+    PbiRawReferenceData& operator=(const PbiRawReferenceData&) = default;
+    PbiRawReferenceData& operator=(PbiRawReferenceData&&) = default;
+    ~PbiRawReferenceData() = default;
+
+    /// \}
+
+public:
+    /// \name Raw Data Containers
+    /// \{
+
+    std::vector<PbiReferenceEntry> entries_;
+
+    /// \}
+};
+
+/// \brief The PbiRawBasicData class represents the raw data stored in the
+///        "BasicData" section of the PBI index.
+///
+class PBBAM_EXPORT PbiRawBasicData
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty data structure, preallocating space for a known
+    ///        number of records.
+    PbiRawBasicData(uint32_t numReads);
+
+    PbiRawBasicData() = default;
+    PbiRawBasicData(const PbiRawBasicData&) = default;
+    PbiRawBasicData(PbiRawBasicData&&) = default;
+    PbiRawBasicData& operator=(const PbiRawBasicData&) = default;
+    PbiRawBasicData& operator=(PbiRawBasicData&&) = default;
+    ~PbiRawBasicData() = default;
+
+    /// \}
+
+public:
+    /// \name Index Construction
+    /// \{
+
+    /// \brief Adds a record's basic data.
+    ///
+    /// \param[in] b        %BAM record
+    /// \param[in] offset   \b virtual file offset where record begins
+    ///
+    void AddRecord(const BamRecord& b, int64_t offset);
+
+    /// \}
+
+public:
+    /// \name Raw Data Containers
+    /// \{
+
+    std::vector<int32_t> rgId_;
+    std::vector<int32_t> qStart_;
+    std::vector<int32_t> qEnd_;
+    std::vector<int32_t> holeNumber_;
+    std::vector<float> readQual_;
+    std::vector<uint8_t> ctxtFlag_;
+    std::vector<int64_t> fileOffset_;
+    std::vector<uint16_t> fileNumber_;
+
+    /// \}
+};
+
+/// \brief The PbiRawData class provides a representation of raw PBI index
+///        data, used mostly for construction or I/O.
+///
+/// The PbiRawData class itself provides access to a few high-level attributes
+/// (e.g. version, number of records, etc.). The actual index data is stored
+/// in its member components:
+///     PbiRawBasicData,
+///     PbiRawMappedData,
+///     PbiRawReferenceData, &
+///     PbiRawBarcodeData .
+///
+class PBBAM_EXPORT PbiRawData
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Loads raw PBI data from a file.
+    ///
+    /// \param[in] pbiFilename      ".pbi" filename
+    ///
+    /// \throws std::runtime_error if file contents cannot be loaded properly
+    ///
+    PbiRawData(std::string pbiFilename);
+
+    /// \brief Loads raw, aggregate PBI data from a dataset.
+    ///
+    /// This constructor creates a raw index object that contains an aggregation
+    /// of index data across the dataset.
+    ///
+    /// \note ReferenceData (the per-reference table for coordinate-sorted data)
+    ///       is not currently available for the index aggregate. All other
+    ///       per-record data sections will be present.
+    ///
+    /// \param[in] dataset  DataSet object
+    ///
+    /// \throws std::runtime_error if file(s) contents cannot be loaded properly
+    ///
+    explicit PbiRawData(const DataSet& dataset);
+
+    PbiRawData() = default;
+    PbiRawData(const PbiRawData&) = default;
+    PbiRawData(PbiRawData&&) = default;
+    PbiRawData& operator=(const PbiRawData&) = default;
+    PbiRawData& operator=(PbiRawData&&) = default;
+    ~PbiRawData() = default;
+
+    /// \}
+
+public:
+    /// \name PBI General Attributes
+    /// \{
+
+    /// \returns true if index has BarcodeData section
+    bool HasBarcodeData() const;
+
+    /// \returns true if index has MappedData section
+    bool HasMappedData() const;
+
+    /// \returns true if index has ReferenceData section
+    bool HasReferenceData() const;
+
+    /// \returns true if index has \b section
+    /// \param[in] section PbiFile::Section identifier
+    ///
+    bool HasSection(const PbiFile::Section section) const;
+
+    /// \returns index filename ("*.pbi")
+    ///
+    /// \note Returns an empty string if the underlying data was calculated in
+    ///       code or aggregated from a DataSet, rather than loaded from a
+    ///       single PBI file.
+    ///
+    std::string Filename() const;
+
+    /// \returns enum flags representing the file sections present
+    PbiFile::Sections FileSections() const;
+
+    /// \returns the number of records in the PBI(s)
+    uint32_t NumReads() const;
+
+    /// \returns the PBI file's version
+    PbiFile::VersionEnum Version() const;
+
+    /// \}
+
+public:
+    /// \name Raw Data Components
+    /// \{
+
+    /// \returns const reference to BarcodeData lookup structure
+    ///
+    /// May be empty, check result of HasBarcodeData.
+    ///
+    const PbiRawBarcodeData& BarcodeData() const;
+
+    /// \returns const reference to BasicData lookup structure
+    const PbiRawBasicData& BasicData() const;
+
+    /// \returns const reference to MappedData lookup structure
+    ///
+    /// May be empty, check result of HasMappedData.
+    ///
+    const PbiRawMappedData& MappedData() const;
+
+    /// \returns const reference to reference data lookup structure
+    ///
+    /// May be empty, check result of HasReferenceData.
+    ///
+    const PbiRawReferenceData& ReferenceData() const;
+
+    /// \}
+
+public:
+    /// \name PBI General Attributes
+    /// \{
+
+    /// \brief Sets the file section flags.
+    ///
+    /// \param[in] sections     section flags
+    /// \returns reference to this index
+    ///
+    PbiRawData& FileSections(PbiFile::Sections sections);
+
+    /// \brief Sets the number of indexed records.
+    ///
+    /// \param[in] num  number of records
+    /// \returns reference to this index
+    ///
+    PbiRawData& NumReads(uint32_t num);
+
+    /// \brief Sets PBI file version.
+    ///
+    /// \param[in] version  file version
+    /// \returns reference to this index
+    ///
+    PbiRawData& Version(PbiFile::VersionEnum version);
+
+    /// \}
+
+public:
+    /// \name Raw Data Components
+    /// \{
+
+    /// \returns reference to BarcodeData lookup structure
+    ///
+    /// May be empty, check result of HasBarcodeData.
+    ///
+    PbiRawBarcodeData& BarcodeData();
+
+    /// \returns reference to BasicData lookup structure
+    PbiRawBasicData& BasicData();
+
+    /// \returns reference to MappedData lookup structure
+    ///
+    /// May be empty, check result of HasMappedData.
+    ///
+    PbiRawMappedData& MappedData();
+
+    /// \returns reference to reference data lookup structure
+    ///
+    /// May be empty, check result of HasReferenceData.
+    ///
+    PbiRawReferenceData& ReferenceData();
+
+    /// \}
+
+private:
+    std::string filename_;
+    PbiFile::VersionEnum version_ = PbiFile::CurrentVersion;
+    PbiFile::Sections sections_ = PbiFile::ALL;
+    uint32_t numReads_ = 0;
+    PbiRawBarcodeData barcodeData_;
+    PbiRawMappedData mappedData_;
+    PbiRawReferenceData referenceData_;
+    PbiRawBasicData basicData_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/PbiRawData.inl"
+
+#endif  // PBIRAWDATA_H
diff --git a/include/pbbam/Position.h b/include/pbbam/Position.h

new file mode 100644 (file)

index 0000000..15cb7f5
--- /dev/null
+++ b/include/pbbam/Position.h
@@ -0,0 +1,32 @@
+// File Description
+/// \file Position.h
+/// \brief Defines the Position typedef.
+//
+// Author: Derek Barnett
+
+#ifndef POSITION_H
+#define POSITION_H
+
+#include <cstdint>
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This type is used to refer to genomic positions.
+/// \typedef typedef int32_t PacBio::BAM::Position
+///
+/// We use a signed integer because SAM/BAM uses the -1 value to indicate
+/// unknown or unmapped positions.
+///
+using Position = int32_t;
+
+/// \brief This constant is widely used as a "missing" or "invalid" position
+///        marker.
+///
+static const Position UnmappedPosition{-1};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // POSITION_H
diff --git a/include/pbbam/ProgramInfo.h b/include/pbbam/ProgramInfo.h

new file mode 100644 (file)

index 0000000..e758877
--- /dev/null
+++ b/include/pbbam/ProgramInfo.h
@@ -0,0 +1,184 @@
+// File Description
+/// \file ProgramInfo.h
+/// \brief Defines the ProgramInfo class.
+//
+// Author: Derek Barnett
+
+#ifndef PROGRAMINFO_H
+#define PROGRAMINFO_H
+
+#include <map>
+#include <string>
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ProgramInfo class represents a program entry (\@PG) in the SAM
+///        header.
+///
+class PBBAM_EXPORT ProgramInfo
+{
+public:
+    /// \name Conversion & Validation
+    /// \{
+
+    /// \brief Creates a ProgramInfo object from SAM-formatted text.
+    ///
+    /// \param[in] sam  SAM-formatted text
+    /// \returns program info object
+    ///
+    static ProgramInfo FromSam(const std::string& sam);
+
+    /// \brief Converts a ProgramInfo object to its SAM-formatted text.
+    ///
+    /// \param[in] prog     input ProgramInfo object
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    static std::string ToSam(const ProgramInfo& prog);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a program info object with an ID.
+    ///
+    /// \param[in] id       program ID (\@PG:ID)
+    ///
+    ProgramInfo(std::string id);
+
+    ProgramInfo() = default;
+    ProgramInfo(const ProgramInfo&) = default;
+    ProgramInfo(ProgramInfo&&) = default;
+    ProgramInfo& operator=(const ProgramInfo&) = default;
+    ProgramInfo& operator=(ProgramInfo&&) = default;
+    ~ProgramInfo() = default;
+
+    /// \}
+
+public:
+    /// \name Conversion & Validation
+    /// \{
+
+    /// \returns true if program info is valid
+    ///
+    /// Currently this checks to see that ProgramInfo::Id does not contain an
+    /// empty string.
+    ///
+    bool IsValid() const;
+
+    /// \brief Converts this object to its SAM-formatted text.
+    ///
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    std::string ToSam() const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \returns string value of \@PG:CL
+    std::string CommandLine() const;
+
+    /// \returns any non-standard tags added to the \@PG entry
+    ///
+    /// Result map consists of {tagName => value}.
+    ///
+    std::map<std::string, std::string> CustomTags() const;
+
+    /// \returns string value of \@PG:DS
+    std::string Description() const;
+
+    /// \returns string value of \@PG:ID
+    std::string Id() const;
+
+    /// \returns string value of \@PG:PN
+    std::string Name() const;
+
+    /// \returns string value of \@PG:PP
+    std::string PreviousProgramId() const;
+
+    /// \returns string value of \@PG:VN
+    std::string Version() const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \brief Sets the value for \@PG:CL
+    ///
+    /// \param[in] cmd      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& CommandLine(std::string cmd);
+
+    /// \brief Sets a new collection of non-standard tags.
+    ///
+    /// Custom tag map entries should consist of {tagName => value}.
+    ///
+    /// \param[in] custom      new tags
+    /// \returns reference to this object
+    ///
+    ProgramInfo& CustomTags(std::map<std::string, std::string> custom);
+
+    /// \brief Sets the value for \@PG:DS
+    ///
+    /// \param[in] description      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& Description(std::string description);
+
+    /// \brief Sets the value for \@PG:ID
+    ///
+    /// \param[in] id      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& Id(std::string id);
+
+    /// \brief Sets the value for \@PG:PN
+    ///
+    /// \param[in] name      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& Name(std::string name);
+
+    /// \brief Sets the value for \@PG:PP
+    ///
+    /// \param[in] id      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& PreviousProgramId(std::string id);
+
+    /// \brief Sets the value for \@PG:VN
+    ///
+    /// \param[in] version      new value
+    /// \returns reference to this object
+    ///
+    ProgramInfo& Version(std::string version);
+
+    /// \}
+
+private:
+    std::string commandLine_;        // CL:<CommandLine>
+    std::string description_;        // DS:<Description>
+    std::string id_;                 // ID:<ID>  * must be unique for valid SAM *
+    std::string name_;               // PN:<Name>
+    std::string previousProgramId_;  // PP:<PreviousProgramID>
+    std::string version_;            // VN:<Version>
+
+    // custom attributes
+    std::map<std::string, std::string> custom_;  // tag => value
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/ProgramInfo.inl"
+
+#endif  // PROGRAMINFO_H
diff --git a/include/pbbam/PulseBehavior.h b/include/pbbam/PulseBehavior.h

new file mode 100644 (file)

index 0000000..494b7cc
--- /dev/null
+++ b/include/pbbam/PulseBehavior.h
@@ -0,0 +1,25 @@
+// File Description
+/// \file PulseBehavior.h
+/// \brief Defines the PulseBehavior enum.
+//
+// Author: Derek Barnett
+
+#ifndef PULSEBEHAVIOR_H
+#define PULSEBEHAVIOR_H
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the pulsecall modes supported by BamRecord tag
+///        accessors.
+///
+enum class PulseBehavior
+{
+    BASECALLS_ONLY,  ///< "Squashed" pulses not included, only basecalls.
+    ALL              ///< All pulses included.
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PULSEBEHAVIOR_H
diff --git a/include/pbbam/PulseExclusionReason.h b/include/pbbam/PulseExclusionReason.h

new file mode 100644 (file)

index 0000000..c6c2c9e
--- /dev/null
+++ b/include/pbbam/PulseExclusionReason.h
@@ -0,0 +1,28 @@
+// File Description
+/// \file PulseExclusionReason.h
+/// \brief Defines the PulseExclusionReason enum.
+//
+// Author: Derek Barnett
+
+#ifndef PULSE_EXCLUSION_REASON_H
+#define PULSE_EXCLUSION_REASON_H
+
+#include <cstdint>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the possible pulse exclusion reasons
+///
+enum class PulseExclusionReason : uint8_t
+{
+    BASE = 0,
+    SHORT_PULSE,
+    BURST,
+    PAUSE
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PULSE_EXCLUSION_REASON_H
diff --git a/include/pbbam/QNameQuery.h b/include/pbbam/QNameQuery.h

new file mode 100644 (file)

index 0000000..8861fce
--- /dev/null
+++ b/include/pbbam/QNameQuery.h
@@ -0,0 +1,58 @@
+// File Description
+/// \file QNameQuery.h
+/// \brief Defines the QNameQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef QNAMEQUERY_H
+#define QNAMEQUERY_H
+
+#include <memory>
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The QNameQuery class provides iterable access to a DataSet's records,
+///        with each iteration of the query returning a contiguous block of
+///        records that share a name.
+///
+/// There is no random-access here. It is simply a sequential read-through,
+/// grouping contiguous results that share a BamRecord::FullName.
+///
+/// \note The name is not ideal - but for legacy reasons, it will remain as-is
+///       for now. It will likely become something more explicit, like
+///       "SequentialQNameGroupQuery", so that the name "QNameQuery" will be
+///       available for a built-in query on a QNAME filter (or whitelist). This
+///       will make it more consistent with other queries (ReadAccuracyQuery,
+///       SubreadLengthQuery, ZmwQuery, etc).
+///
+class PBBAM_EXPORT QNameQuery : public internal::IGroupQuery
+{
+public:
+    /// \brief Creates a new QNameQuery.
+    ///
+    /// \param[in] dataset      input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM files
+    ///
+    QNameQuery(const DataSet& dataset);
+    ~QNameQuery() override;
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(std::vector<BamRecord>& records) override;
+
+private:
+    struct QNameQueryPrivate;
+    std::unique_ptr<QNameQueryPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // QNAMEQUERY_H
diff --git a/include/pbbam/QualityValue.h b/include/pbbam/QualityValue.h

new file mode 100644 (file)

index 0000000..310b99d
--- /dev/null
+++ b/include/pbbam/QualityValue.h
@@ -0,0 +1,84 @@
+// File Description
+/// \file QualityValue.h
+/// \brief Defines the QualityValue class.
+//
+// Author: Derek Barnett
+
+#ifndef QUALITYVALUE_H
+#define QUALITYVALUE_H
+
+#include <cstdint>
+#include <string>
+#include <vector>
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The QualityValue class represents a FASTQ-compatible quality value.
+///
+/// Integers are clamped to [0, 93] (corresponding to ASCII printable chars
+/// [!-~]).
+///
+/// Use QualityValue::FromFastq for constructing entries from FASTQ encoding
+/// characters. Otherwise, the resulting QualityValue will be interpreted using
+/// the character's numeric value (ignoring the FASTQ offset of 33).
+///
+class PBBAM_EXPORT QualityValue
+{
+public:
+    static const uint8_t MAX;
+
+public:
+    /// \name Conversion Methods
+    /// \{
+
+    /// \brief Creates a QualityValue from a FASTQ-encoding character.
+    ///
+    /// \param[in] c    FASTQ character
+    /// \returns quality value representing (c - 33)
+    ///
+    static QualityValue FromFastq(const char c);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    ///  \{
+
+    /// \brief Creates a QualityValue with specified value.
+    ///
+    /// \param[in] value    quality value
+    ///
+    QualityValue(const uint8_t value = 0);
+
+    QualityValue(const QualityValue&) = default;
+    QualityValue(QualityValue&&) = default;
+    QualityValue& operator=(const QualityValue&) = default;
+    QualityValue& operator=(QualityValue&&) = default;
+    ~QualityValue() = default;
+
+    /// \}
+
+public:
+    /// \name Conversion Methods
+    /// \{
+
+    /// \returns the FASTQ-encoding char for this QualityValue
+    char Fastq() const;
+
+    /// \returns the integer value of this QualityValue
+    operator uint8_t() const;
+
+    /// \}
+
+private:
+    uint8_t value_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/QualityValue.inl"
+
+#endif  // QUALITYVALUE_H
diff --git a/include/pbbam/QualityValues.h b/include/pbbam/QualityValues.h

new file mode 100644 (file)

index 0000000..6d7cd72
--- /dev/null
+++ b/include/pbbam/QualityValues.h
@@ -0,0 +1,134 @@
+// File Description
+/// \file QualityValues.h
+/// \brief Defines the QualityValues class.
+//
+// Author: Derek Barnett
+
+#ifndef QUALITYVALUES_H
+#define QUALITYVALUES_H
+
+#include <cstdint>
+#include <string>
+#include <vector>
+#include "pbbam/QualityValue.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The QualityValues class represents a sequence of FASTQ-compatible
+///        quality values. See QualityValue documentation for more details.
+///
+class PBBAM_EXPORT QualityValues : public std::vector<QualityValue>
+{
+public:
+    /// \brief Creates a QualityValues object from a FASTQ-encoded string.
+    ///
+    /// \param[in] fastq    FASTQ-encoded string
+    /// \returns corresponding QualityValues object
+    ///
+    static QualityValues FromFastq(const std::string& fastq);
+
+public:
+    /// \name Constructors & Related Methods
+    ///  \{
+
+    /// \brief Default constructor - creates an empty QualityValues object.
+
+    /// \brief Creates a QualityValues object from a FASTQ-encoded string.
+    ///
+    /// \param[in] fastqString  FASTQ-encoded string
+    ///
+    explicit QualityValues(const std::string& fastqString);
+
+    /// \brief Creates a QualityValues object from a vector of QualityValue
+    ///        elements.
+    ///
+    /// \param[in] quals    vector of QualityValue elements
+    ///
+    QualityValues(std::vector<QualityValue> quals);
+
+    /// \brief Creates a QualityValues object from a vector of (numeric) quality
+    ///        values.
+    ///
+    /// \param[in] quals    vector of quality value numbers
+    ///
+    explicit QualityValues(const std::vector<uint8_t>& quals);
+
+    /// \brief Creates a QualityValues object from the contents of the range:
+    ///        [first, last)
+    ///
+    /// \param[in] first    input iterator, whose element is a numeric quality
+    /// \param[in] last     input iterator, whose element is a numeric quality
+    ///
+    QualityValues(const std::vector<uint8_t>::const_iterator first,
+                  const std::vector<uint8_t>::const_iterator last);
+
+    /// \brief Creates a QualityValues object from the contents of the range:
+    ///        [first, last)
+    ///
+    /// \param[in] first    input iterator, whose element is a QualityValue
+    /// \param[in] last     input iterator, whose element is a QualityValue
+    ///
+    QualityValues(const QualityValues::const_iterator first,
+                  const QualityValues::const_iterator last);
+
+    QualityValues() = default;
+    QualityValues(const QualityValues&) = default;
+    QualityValues(QualityValues&&) = default;
+    QualityValues& operator=(const QualityValues&) = default;
+    QualityValues& operator=(QualityValues&&) = default;
+    ~QualityValues() = default;
+
+    QualityValues& operator=(std::vector<QualityValue> quals);
+
+    /// \}
+
+public:
+    /// \name Comparison Operators
+    /// \{
+
+    bool operator==(const std::string& other) const;
+    bool operator!=(const std::string& other) const;
+
+    /// \}
+
+public:
+    /// \name Iterators
+    /// \{
+
+    /// \returns a const_iterator to the beginning of the sequence
+    std::vector<QualityValue>::const_iterator cbegin() const;
+
+    /// \returns a const_iterator to the element following the last element
+    std::vector<QualityValue>::const_iterator cend() const;
+
+    /// \returns a const_iterator to the beginning of the sequence
+    std::vector<QualityValue>::const_iterator begin() const;
+
+    /// \returns a const_iterator to the element following the last element
+    std::vector<QualityValue>::const_iterator end() const;
+
+    /// \returns an iterator to the beginning of the sequence
+    std::vector<QualityValue>::iterator begin();
+
+    /// \returns an iterator to the element following the last element
+    std::vector<QualityValue>::iterator end();
+
+    /// \}
+
+public:
+    /// \name Conversion Methods
+    /// \{
+
+    /// \returns the FASTQ-encoded string for this sequence of quality values
+    std::string Fastq() const;
+
+    /// \}
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/QualityValues.inl"
+
+#endif  // QUALITYVALUES_H
diff --git a/include/pbbam/ReadAccuracyQuery.h b/include/pbbam/ReadAccuracyQuery.h

new file mode 100644 (file)

index 0000000..10f5027
--- /dev/null
+++ b/include/pbbam/ReadAccuracyQuery.h
@@ -0,0 +1,68 @@
+// File Description
+/// \file ReadAccuracyQuery.h
+/// \brief Defines the ReadAccuracyQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef READACCURACYQUERY_H
+#define READACCURACYQUERY_H
+
+#include <vector>
+#include "pbbam/Accuracy.h"
+#include "pbbam/Compare.h"
+#include "pbbam/Config.h"
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ReadAccuracyQuery class provides iterable access to a DataSet's
+///        %BAM records, limiting results to those matching a read accuracy
+///        criterion.
+///
+/// Example:
+/// \include code/ReadAccuracyQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT ReadAccuracyQuery : public internal::IQuery
+{
+public:
+    /// \brief Creates a new ReadAccuracyQuery, limiting record results to only
+    ///        those matching a read accuracy criterion.
+    ///
+    /// \param[in] accuracy     read accuracy value
+    /// \param[in] compareType  compare operator
+    /// \param[in] dataset      input data source(s)
+    ///
+    /// \sa BamRecord::ReadAccuracy
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or PBI
+    ///         files.
+    ///
+    ReadAccuracyQuery(const Accuracy accuracy, const Compare::Type compareType,
+                      const DataSet& dataset);
+
+    ~ReadAccuracyQuery() override;
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r) override;
+
+    uint32_t NumReads() const;
+
+private:
+    struct ReadAccuracyQueryPrivate;
+    std::unique_ptr<ReadAccuracyQueryPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // READACCURACYQUERY_H
diff --git a/include/pbbam/ReadGroupInfo.h b/include/pbbam/ReadGroupInfo.h

new file mode 100644 (file)

index 0000000..c5d810d
--- /dev/null
+++ b/include/pbbam/ReadGroupInfo.h
@@ -0,0 +1,596 @@
+// File Description
+/// \file ReadGroupInfo.h
+/// \brief Defines the ReadGroupInfo class.
+//
+// Author: Derek Barnett
+
+#ifndef READGROUPINFO_H
+#define READGROUPINFO_H
+
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <string>
+#include "pbbam/Config.h"
+#include "pbbam/exception/InvalidSequencingChemistryException.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum describes the base features that may be present in a read
+///        group's records.
+///
+/// This information is stored in its description (\@RG:DS).
+///
+enum class BaseFeature
+{
+    DELETION_QV,
+    DELETION_TAG,
+    INSERTION_QV,
+    MERGE_QV,
+    SUBSTITUTION_QV,
+    SUBSTITUTION_TAG,
+    IPD,
+    PULSE_WIDTH,
+    PKMID,
+    PKMEAN,
+    PKMID2,
+    PKMEAN2,
+    LABEL,
+    LABEL_QV,
+    ALT_LABEL,
+    ALT_LABEL_QV,
+    PULSE_MERGE_QV,
+    PULSE_CALL,
+    PRE_PULSE_FRAMES,
+    PULSE_CALL_WIDTH,
+    START_FRAME,
+    PULSE_EXCLUSION
+};
+
+/// \brief This enum describes the encoding types used for frame data within a
+///        read group's records.
+///
+/// This information is stored in its description (\@RG:DS).
+///
+enum class FrameCodec
+{
+    RAW,
+    V1
+};
+
+/// \brief This enum describes the experimental design of the barcodes within a
+///        read group's records.
+///
+/// This information is stored in its description (\@RG:DS).
+///
+enum class BarcodeModeType
+{
+    NONE,
+    SYMMETRIC,
+    ASYMMETRIC,
+    TAILED
+};
+
+/// \brief This enum describes the type of value encoded by barcode quality,
+///        within a read group's records.
+///
+/// This information is stored in its description (\@RG:DS).
+///
+enum class BarcodeQualityType
+{
+    NONE,
+    SCORE,
+    PROBABILITY
+};
+
+/// \brief This enum describes the instrument type / platform model,
+///        within a read group's records.
+///
+/// This information is stored in its platform model tag (\@RG:PM).
+///
+enum class PlatformModelType
+{
+    ASTRO,
+    RS,
+    SEQUEL
+};
+
+/// \brief The ReadGroupInfo class represents a read group entry (\@RG) in the
+///        SAM header.
+///
+class PBBAM_EXPORT ReadGroupInfo
+{
+public:
+    /// \name Conversion & Validation
+    /// \{
+
+    /// \brief Creates a ReadGroupInfo object from SAM-formatted text.
+    ///
+    /// \param[in] sam  SAM-formatted text
+    /// \returns read group info object
+    ///
+    static ReadGroupInfo FromSam(const std::string& sam);
+
+    /// \brief Converts a ReadGroupInfo object to its SAM-formatted text.
+    ///
+    /// \param[in] rg     input ReadGroupInfo object
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    static std::string ToSam(const ReadGroupInfo& rg);
+
+    /// \brief Converts a read group ID (string) to its numeric value.
+    ///
+    /// \param[in] rgId     read group ID string
+    /// \returns numeric value of ID
+    ///
+    static int32_t IdToInt(const std::string& rgId);
+
+    /// \brief Converts a read group ID number to its string representation.
+    ///
+    /// \param[in] id     read group ID number
+    /// \returns hexadecimal string representation of ID
+    ///
+    static std::string IntToId(const int32_t id);
+
+    /// \returns sequencing chemistry from (bindingKit, sequencingKit,
+    ///          basecallerVersion)
+    ///
+    static std::string SequencingChemistryFromTriple(const std::string& bindingKit,
+                                                     const std::string& sequencingKit,
+                                                     const std::string& basecallerVersion);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates an empty read group of UNKNOWN read type.
+    ReadGroupInfo();
+
+    /// \brief Creates a read group info object with an ID.
+    ///
+    /// \param[in] id   string representation of read group ID
+    ///
+    ReadGroupInfo(std::string id);
+
+    /// \brief Creates a read group info object from a movie name & read type.
+    ///
+    /// \param[in] movieName    sequencing movie name
+    /// \param[in] readType     string version of record type
+    ///
+    /// \sa RecordType
+    ///
+    ReadGroupInfo(std::string movieName, std::string readType);
+
+    /// \brief Creates a read group info object from a movie name, read type,
+    ///        and platform model.
+    ///
+    /// \param[in] movieName    sequencing movie name
+    /// \param[in] readType     string version of record type
+    /// \param[in] platform     platform model type
+    ///
+    /// \sa RecordType
+    ///
+    ReadGroupInfo(std::string movieName, std::string readType, PlatformModelType platform);
+
+    ReadGroupInfo(const ReadGroupInfo&) = default;
+    ReadGroupInfo(ReadGroupInfo&&) = default;
+    ReadGroupInfo& operator=(const ReadGroupInfo&) = default;
+    ReadGroupInfo& operator=(ReadGroupInfo&&) = default;
+    ~ReadGroupInfo() = default;
+
+    /// \}
+
+public:
+    /// \name Comparison Operators
+    /// \{
+
+    bool operator==(const ReadGroupInfo& other) const;
+
+    /// \}
+
+public:
+    /// \name Conversion & Validation
+    /// \{
+
+    /// \returns true if read group info is valid
+    ///
+    /// Currently this checks to see that ReadGroupInfo::Id does not contain an
+    /// empty string.
+    ///
+    bool IsValid() const;
+
+    /// \brief Converts this object to its SAM-formatted text.
+    ///
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    std::string ToSam() const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \returns the number of barcode sequences in BarcodeFile
+    ///
+    /// \throws std::runtime_error if barcode data not set.
+    ///         Check HasBarcodeData if this data may be absent.
+    ///
+    size_t BarcodeCount() const;
+
+    /// \returns name of FASTA file containing barcode sequences
+    ///
+    /// \throws std::runtime_error if barcode data not set.
+    ///         Check HasBarcodeData if this data may be absent.
+    ///
+    std::string BarcodeFile() const;
+
+    /// \returns MD5 hash of the contents of BarcodeFile
+    ///
+    /// \throws std::runtime_error if barcode data not set.
+    ///         Check HasBarcodeData if this data may be absent.
+    ///
+    std::string BarcodeHash() const;
+
+    /// \returns experimental design type of barcodes
+    ///
+    /// \throws std::runtime_error if barcode data not set.
+    ///         Check HasBarcodeData if this data may be absent.
+    ///
+    BarcodeModeType BarcodeMode() const;
+
+    /// \returns type of value encoded in the 'bq' tag
+    ///
+    /// \throws std::runtime_error if barcode data is not set.
+    ///         Check HasBarcodeData if this data may be absent.
+    ///
+    BarcodeQualityType BarcodeQuality() const;
+
+    /// \returns basecaller version number (e.g. "2.1")
+    std::string BasecallerVersion() const;
+
+    /// \returns tag name in use for the specified base feature
+    std::string BaseFeatureTag(BaseFeature feature) const;
+
+    /// \returns binding kit part number (e.g. "100236500")
+    std::string BindingKit() const;
+
+    /// \returns true if reads are classified as spike-in controls
+    bool Control() const;
+
+    /// \returns any non-standard tags added to the \@RG entry
+    ///
+    /// Result map consists of {tagName => value}.
+    ///
+    std::map<std::string, std::string> CustomTags() const;
+
+    /// \returns string value of \@RG:DT
+    std::string Date() const;
+
+    /// \returns string value of \@RG:FO
+    std::string FlowOrder() const;
+
+    /// \returns frame rate in Hz
+    std::string FrameRateHz() const;
+
+    /// \returns true if read group has barcode data
+    bool HasBarcodeData() const;
+
+    /// \returns true if read group has an entry for the specified base feature
+    bool HasBaseFeature(BaseFeature feature) const;
+
+    /// \returns string value of \@RG:ID
+    std::string Id() const;
+
+    /// \returns codec type in use for IPD
+    FrameCodec IpdCodec() const;
+
+    /// \returns string value of \@RG:KS
+    std::string KeySequence() const;
+
+    /// \returns string value of \@RG:LB
+    std::string Library() const;
+
+    /// \returns movie name (stored in \@RG:PU)
+    std::string MovieName() const;
+
+    /// \returns string value of \@RG:PL
+    std::string Platform() const;
+
+    /// \returns platform model type (stored in \@RG:PM)
+    PlatformModelType PlatformModel() const;
+
+    /// \returns string value of \@RG:PI
+    std::string PredictedInsertSize() const;
+
+    /// \returns string value of \@RG:PG
+    std::string Programs() const;
+
+    /// \returns codec type in use for PulseWidth
+    FrameCodec PulseWidthCodec() const;
+
+    /// \returns string value of read type
+    std::string ReadType() const;
+
+    /// \returns string value of \@RG:SM
+    std::string Sample() const;
+
+    /// \returns string value of \@RG:CN
+    std::string SequencingCenter() const;
+
+    /// \returns sequencing chemistry name
+    std::string SequencingChemistry() const;
+
+    /// \returns sequencing kit part number
+    std::string SequencingKit() const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \brief Sets read group's barcode data.
+    ///
+    /// Barcode fields are either absent or all must be present.
+    ///
+    /// \param[in] barcodeFile      barcode filename
+    /// \param[in] barcodeHash      MD5 hash of barcode file
+    /// \param[in] barcodeCount     number of records in barcode file
+    /// \param[in] barcodeMode      experimental design of barcodes
+    /// \param[in] barcodeQuality   type of barcode quality value
+    ///
+    /// \sa BarcodeFile \n
+    ///     BarcodeHash \n
+    ///     BarcodeCount \n
+    ///     BarcodeMode \n
+    ///     BarcodeQuality \n
+    ///     ReadGroupInfo::ClearBarcodeData
+    ///
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& BarcodeData(std::string barcodeFile, std::string barcodeHash,
+                               size_t barcodeCount, BarcodeModeType barcodeMode,
+                               BarcodeQualityType barcodeQuality);
+
+    /// \brief Sets the basecaller version number.
+    ///
+    /// \param[in] versionNumber   new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& BasecallerVersion(std::string versionNumber);
+
+    /// \brief Sets the tag to be used for a particular base feature.
+    ///
+    /// \param[in] feature      feature type being updated
+    /// \param[in] tag          new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& BaseFeatureTag(BaseFeature feature, std::string tag);
+
+    /// \brief Sets the binding kit part number.
+    ///
+    /// \param[in] kitNumber    new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& BindingKit(std::string kitNumber);
+
+    /// \brief Removes all barcode data from this read group.
+    ///
+    /// \returns reference to this read group
+    ///
+    ReadGroupInfo& ClearBarcodeData();
+
+    /// \brief Removes all base features from this read group.
+    ///
+    /// \returns reference to this read group
+    ///
+    ReadGroupInfo& ClearBaseFeatures();
+
+    /// \brief Sets whether read group's records are classified as spike-in
+    ///        controls.
+    ///
+    /// \param[in] ctrl     true if records are spike-in controls
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Control(bool ctrl);
+
+    /// \brief Sets a new collection of non-standard tags.
+    ///
+    /// Custom tag map entries should consist of {tagName => value}.
+    ///
+    /// \param[in] custom      new tags
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& CustomTags(std::map<std::string, std::string> custom);
+
+    /// \brief Sets the value for \@RG:DT
+    ///
+    /// \param[in] date      new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Date(std::string date);
+
+    /// \brief Sets the value for \@RG:FO
+    ///
+    /// \param[in] order     new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& FlowOrder(std::string order);
+
+    /// \brief Sets the frame rate.
+    ///
+    /// \param[in] frameRateHz     string value of frame rate in Hz
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& FrameRateHz(std::string frameRateHz);
+
+    /// \brief Sets the read group's ID.
+    ///
+    /// \param[in] id     string value of ID
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Id(std::string id);
+
+    /// \brief Sets the read group's ID, from movie name & read type
+    ///
+    /// \param[in] movieName    sequencing movie name
+    /// \param[in] readType     string version of read type
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Id(const std::string& movieName, const std::string& readType);
+
+    /// \brief Sets the codec type used for IPD
+    ///
+    /// \param[in] codec    codec type
+    /// \param[in] tag      IPD tag
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& IpdCodec(FrameCodec codec, std::string tag = std::string());
+
+    /// \brief Sets the value for \@RG:KS
+    ///
+    /// \param[in] sequence      new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& KeySequence(std::string sequence);
+
+    /// \brief Sets the value for \@RG:LB
+    ///
+    /// \param[in] library      new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Library(std::string library);
+
+    /// \brief Sets the value for movie name (stored in \@RG:PU).
+    ///
+    /// \param[in] movieName    new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& MovieName(std::string movieName);
+
+    /// \brief Sets the value for \@RG:PI
+    ///
+    /// \param[in] size         new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& PredictedInsertSize(std::string size);
+
+    /// \brief Sets the value for \@RG:PG
+    ///
+    /// \param[in] programs     new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Programs(std::string programs);
+
+    /// \brief Sets the value for \@RG:PM
+    ///
+    /// \param[in] platform     new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& PlatformModel(PlatformModelType platform);
+
+    /// \brief Sets the codec type used for PulseWidth
+    ///
+    /// \param[in] codec    codec type
+    /// \param[in] tag      pulse width tag
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& PulseWidthCodec(FrameCodec codec, std::string tag = std::string());
+
+    /// \brief Sets the read type.
+    ///
+    /// \param[in] type    new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& ReadType(std::string type);
+
+    /// \brief Removes a particular base feature from this read group.
+    ///
+    /// \param[in] feature      feature to remove
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& RemoveBaseFeature(BaseFeature feature);
+
+    /// \brief Sets the value for \@RG:SM
+    ///
+    /// \param[in] sample       new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& Sample(std::string sample);
+
+    /// \brief Sets the value for \@RG:CN
+    ///
+    /// \param[in] center       new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& SequencingCenter(std::string center);
+
+    /// \brief Sets the sequencing kit part number.
+    ///
+    /// \param[in] kitNumber    new value
+    /// \returns reference to this object
+    ///
+    ReadGroupInfo& SequencingKit(std::string kitNumber);
+
+    /// \}
+
+private:
+    std::string id_;                   // ID * must be unique for valid SAM *
+    std::string sequencingCenter_;     // CN
+    std::string date_;                 // DT * (ISO-8601) *
+    std::string flowOrder_;            // FO
+    std::string keySequence_;          // KS
+    std::string library_;              // LB
+    std::string programs_;             // PG
+    std::string predictedInsertSize_;  // PI
+    std::string movieName_;            // PU
+    std::string sample_;               // SM
+
+    PlatformModelType platformModel_ = PlatformModelType::SEQUEL;  // PM
+
+    // DS:<Description> components
+    std::string readType_;
+    std::string bindingKit_;
+    std::string sequencingKit_;
+    std::string basecallerVersion_;
+    mutable std::string sequencingChemistry_;
+    std::string frameRateHz_;
+    bool control_ = false;
+    FrameCodec ipdCodec_ = FrameCodec::V1;
+    FrameCodec pulseWidthCodec_ = FrameCodec::V1;
+    bool hasBarcodeData_ = false;
+    std::string barcodeFile_;
+    std::string barcodeHash_;
+    size_t barcodeCount_ = 0;
+    BarcodeModeType barcodeMode_ = BarcodeModeType::NONE;
+    BarcodeQualityType barcodeQuality_ = BarcodeQualityType::NONE;
+    std::map<BaseFeature, std::string> features_;
+
+    // custom attributes
+    std::map<std::string, std::string> custom_;  // tag => value
+
+private:
+    std::string EncodeSamDescription() const;
+    void DecodeSamDescription(const std::string& description);
+};
+
+/// \brief Creates a read group ID from a movie name & read type.
+///
+/// \param[in] movieName    sequencing movie name
+/// \param[in] readType     string version of read type
+///
+/// \returns hexadecimal string read group ID
+///
+PBBAM_EXPORT
+std::string MakeReadGroupId(const std::string& movieName, const std::string& readType);
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/ReadGroupInfo.inl"
+
+#endif  // READGROUPINFO_H
diff --git a/include/pbbam/RecordType.h b/include/pbbam/RecordType.h

new file mode 100644 (file)

index 0000000..8ca115c
--- /dev/null
+++ b/include/pbbam/RecordType.h
@@ -0,0 +1,46 @@
+// File Description
+/// \file RecordType.h
+/// \brief Defines the RecordType enum.
+//
+// Author: Derek Barnett
+
+#ifndef RECORDTYPE_H
+#define RECORDTYPE_H
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief Enumerates the record types that may appear in a PacBio BAM file.
+///
+/// \sa ReadGroupInfo::ReadType
+///
+enum class RecordType
+{
+    ZMW = 0,         ///< Polymerase read
+    HQREGION = 1,    ///< High-quality region
+    SUBREAD = 2,     ///< Subread
+    CCS = 3,         ///< Circular consensus sequence
+    SCRAP = 4,       ///< Additional sequence (barcodes, adapters, etc.)
+    UNKNOWN = 5,     ///< Unknown read type
+    TRANSCRIPT = 6,  ///< Transcript
+
+    POLYMERASE = ZMW  ///< \deprecated as of PacBio BAM spec v 3.0.4 (use RecordType::ZMW instead)
+};
+
+///
+/// \brief Tells whether a record type is CCS or Transcript.
+///
+/// Records of these two types treat queryStart/End identically, and several
+/// call sites need to test for that, hence this small helper.
+///
+/// \param[in] type   record type to classify
+///
+/// \returns true if \p type is RecordType::CCS or RecordType::TRANSCRIPT
+///
+inline bool IsCcsOrTranscript(const RecordType type)
+{
+    switch (type) {
+        case RecordType::CCS:
+        case RecordType::TRANSCRIPT:
+            return true;
+        default:
+            return false;
+    }
+}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // RECORDTYPE_H
diff --git a/include/pbbam/SamTagCodec.h b/include/pbbam/SamTagCodec.h

new file mode 100644 (file)

index 0000000..209dc6d
--- /dev/null
+++ b/include/pbbam/SamTagCodec.h
@@ -0,0 +1,60 @@
+// File Description
+/// \file SamTagCodec.h
+/// \brief Defines the SamTagCodec class.
+//
+// Author: Derek Barnett
+
+#ifndef SAMTAGCODEC_H
+#define SAMTAGCODEC_H
+
+#include <string>
+#include "pbbam/Config.h"
+#include "pbbam/TagCollection.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The SamTagCodec class provides text-based encoding/decoding of %BAM
+///        tag data.
+///
+/// \note SamTagCodec is mostly an implementation and/or testing detail, and may
+///       be removed from the public API.
+///
+class PBBAM_EXPORT SamTagCodec
+{
+public:
+    /// \name Tag Collection Methods
+    /// \{
+
+    /// \brief Creates a TagCollection from SAM-formatted tag data.
+    ///
+    /// \param[in] tagString    SAM-formatted string
+    /// \returns resulting tag collection
+    ///
+    static TagCollection Decode(const std::string& tagString);
+
+    /// \brief Creates SAM-formatted string from a TagCollection.
+    ///
+    /// \param[in] tags     TagCollection containing tag data
+    /// \returns SAM-formatted string
+    ///
+    static std::string Encode(const PacBio::BAM::TagCollection& tags);
+};
+
+///
+/// \brief Builds a single tab-prefixed "<tag>:<value>" field.
+///
+/// \param tag    tag name
+/// \param value  tag value
+///
+/// \return the text "\t" + tag + ":" + value
+///
+inline std::string MakeSamTag(std::string tag, std::string value)
+{
+    std::string field;
+    field.reserve(tag.size() + value.size() + 2);  // +2: leading tab & colon
+    field += '\t';
+    field += tag;
+    field += ':';
+    field += value;
+    return field;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // SAMTAGCODEC_H
diff --git a/include/pbbam/SamWriter.h b/include/pbbam/SamWriter.h

new file mode 100644 (file)

index 0000000..2411c21
--- /dev/null
+++ b/include/pbbam/SamWriter.h
@@ -0,0 +1,100 @@
+// File Description
+/// \file SamWriter.h
+/// \brief Defines the SamWriter class.
+//
+// Author: Derek Barnett
+
+#ifndef SAMWRITER_H
+#define SAMWRITER_H
+
+#include <memory>
+#include <string>
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/IRecordWriter.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+class SamWriterPrivate;
+}
+
+/// \brief The SamWriter class provides a writing interface for creating
+///        new SAM files.
+///
+/// \note The underlying buffered data may not be flushed to the file until the
+///       destructor is called. Trying to access the file (reading, stat-ing,
+///       indexing, etc.) before the SamWriter is destroyed yields undefined
+///       behavior. Enclose the SamWriter in some form of local scope (curly
+///       braces, a separate function, etc.) to ensure that its destructor is
+///       called before proceeding to read-based operations.
+///
+/// \code{.cpp}
+///  {
+///     SamWriter w(...);
+///     // write data
+///  }
+///  // now safe to access the new file
+/// \endcode
+///
+///
+class SamWriter : public IRecordWriter  // NOTE(review): no PBBAM_EXPORT here, unlike sibling classes - confirm intended
+{
+public:
+    /// \brief Opens a SAM file for writing & writes the header information.
+    ///
+    /// \note Set \p filename to "-" for stdout.
+    ///
+    /// \param[in] filename     path to output SAM file
+    /// \param[in] header       BamHeader object
+    ///
+    /// \throws std::runtime_error if there was a problem opening the file for
+    ///         writing or if an error occurred while writing the header
+    ///
+    SamWriter(std::string filename, const BamHeader& header);
+
+    /// Fully flushes all buffered data & closes file.
+    ///
+    ~SamWriter() override;
+
+    SamWriter(const SamWriter&) = delete;             // copying is disabled
+    SamWriter(SamWriter&&) = default;                 // moving is allowed
+    SamWriter& operator=(const SamWriter&) = delete;  // copy-assignment is disabled
+    SamWriter& operator=(SamWriter&&) = default;      // move-assignment is allowed
+
+public:
+    /// \brief Try to flush any buffered data to file.
+    ///
+    /// \note The underlying implementation may not necessarily flush buffered
+    ///       data immediately, especially in a multithreaded writer situation.
+    ///       Let the SamWriter go out of scope to fully ensure flushing.
+    ///
+    /// \throws std::runtime_error if flush fails
+    ///
+    void TryFlush() override;
+
+    /// \brief Write a record to the output SAM file.
+    ///
+    /// \param[in] record BamRecord object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    void Write(const BamRecord& record) override;
+
+    /// \brief Write a record to the output SAM file.
+    ///
+    /// \param[in] recordImpl BamRecordImpl object
+    ///
+    /// \throws std::runtime_error on failure to write
+    ///
+    void Write(const BamRecordImpl& recordImpl) override;
+
+private:
+    std::unique_ptr<internal::SamWriterPrivate> d_;  // owns the forward-declared implementation object
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // SAMWRITER_H
diff --git a/include/pbbam/SequenceInfo.h b/include/pbbam/SequenceInfo.h

new file mode 100644 (file)

index 0000000..4816862
--- /dev/null
+++ b/include/pbbam/SequenceInfo.h
@@ -0,0 +1,194 @@
+// File Description
+/// \file SequenceInfo.h
+/// \brief Defines the SequenceInfo class.
+//
+// Author: Derek Barnett
+
+#ifndef SEQUENCEINFO_H
+#define SEQUENCEINFO_H
+
+#include <map>
+#include <string>
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The SequenceInfo class represents a sequence entry (\@SQ) in the SAM
+///        header.
+///
+class PBBAM_EXPORT SequenceInfo
+{
+public:
+    /// \name Conversion & Validation
+    /// \{
+
+    /// \brief Creates a SequenceInfo object from SAM-formatted text.
+    ///
+    /// \param[in] sam  SAM-formatted text
+    /// \returns sequence info object
+    ///
+    static SequenceInfo FromSam(const std::string& sam);
+
+    /// \brief Converts a SequenceInfo object to its SAM-formatted text.
+    ///
+    /// \param[in] seq     input SequenceInfo object
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    static std::string ToSam(const SequenceInfo& seq);
+
+    /// \}
+
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a sequence info object with name & (optional) length.
+    ///
+    /// \param[in] name       sequence name (\@SQ:SN)
+    /// \param[in] length     sequence length (\@SQ:LN)
+    /// \note Not declared explicit, so a lone std::string converts implicitly.
+    SequenceInfo(std::string name, std::string length = "0");
+
+    SequenceInfo() = default;
+    SequenceInfo(const SequenceInfo&) = default;
+    SequenceInfo(SequenceInfo&&) = default;
+    SequenceInfo& operator=(const SequenceInfo&) = default;
+    SequenceInfo& operator=(SequenceInfo&&) = default;
+    ~SequenceInfo() = default;
+
+    /// \}
+
+public:
+    /// \name Operators
+    /// \{
+
+    bool operator==(const SequenceInfo& other) const;
+    bool operator!=(const SequenceInfo& other) const;
+
+    /// \}
+
+public:
+    /// \name Conversion & Validation
+    /// \{
+
+    /// \returns true if sequence info is valid
+    ///
+    /// Currently this checks to see that Name is non-empty and Length is within
+    /// the accepted range.
+    ///
+    bool IsValid() const;
+
+    /// \brief Converts this object to its SAM-formatted text.
+    ///
+    /// \returns SAM-formatted text (no trailing newline)
+    ///
+    std::string ToSam() const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \returns string value of \@SQ:AS
+    std::string AssemblyId() const;
+
+    /// \returns string value of \@SQ:M5
+    std::string Checksum() const;
+
+    /// \returns any non-standard tags added to the \@SQ entry
+    ///
+    /// Result map consists of {tagName => value}.
+    ///
+    std::map<std::string, std::string> CustomTags() const;
+
+    /// \returns string value of \@SQ:LN
+    std::string Length() const;
+
+    /// \returns string value of \@SQ:SN
+    std::string Name() const;
+
+    /// \returns string value of \@SQ:SP
+    std::string Species() const;
+
+    /// \returns string value of \@SQ:UR
+    std::string Uri() const;
+
+    /// \}
+
+public:
+    /// \name Attributes
+    /// \{
+
+    /// \brief Sets the value for \@SQ:AS
+    ///
+    /// \param[in] id      new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& AssemblyId(std::string id);
+
+    /// \brief Sets the value for \@SQ:M5
+    ///
+    /// \param[in] checksum      new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& Checksum(std::string checksum);
+
+    /// \brief Sets a new collection of non-standard tags.
+    ///
+    /// Custom tag map entries should consist of {tagName => value}.
+    ///
+    /// \param[in] custom      new tags
+    /// \returns reference to this object
+    ///
+    SequenceInfo& CustomTags(std::map<std::string, std::string> custom);
+
+    /// \brief Sets the value for \@SQ:LN
+    ///
+    /// \param[in] length      new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& Length(std::string length);
+
+    /// \brief Sets the value for \@SQ:SN
+    ///
+    /// \param[in] name      new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& Name(std::string name);
+
+    /// \brief Sets the value for \@SQ:SP
+    ///
+    /// \param[in] species     new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& Species(std::string species);
+
+    /// \brief Sets the value for \@SQ:UR
+    ///
+    /// \param[in] uri      new value
+    /// \returns reference to this object
+    ///
+    SequenceInfo& Uri(std::string uri);
+
+    /// \}
+
+private:
+    std::string name_;        // SN:<Name>    * must be unique for valid SAM *
+    std::string length_;      // LN:<Length>  * must be within [0 - 2^31-1] *
+    std::string assemblyId_;  // AS:<AssemblyId>
+    std::string checksum_;    // M5:<Checksum>
+    std::string species_;     // SP:<Species>
+    std::string uri_;         // UR:<URI>
+
+    // custom attributes
+    std::map<std::string, std::string> custom_;  // tag => value
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/SequenceInfo.inl"
+
+#endif  // SEQUENCEINFO_H
diff --git a/include/pbbam/Strand.h b/include/pbbam/Strand.h

new file mode 100644 (file)

index 0000000..6bc5cc1
--- /dev/null
+++ b/include/pbbam/Strand.h
@@ -0,0 +1,27 @@
+// File Description
+/// \file Strand.h
+/// \brief Defines the Strand enum.
+//
+// Author: Derek Barnett
+
+#ifndef STRAND_H
+#define STRAND_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief Strand orientation, as used when reporting alignment-related
+///        information.
+///
+enum class Strand
+{
+    FORWARD = 0,  ///< Forward strand
+    REVERSE = 1   ///< Reverse strand
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // STRAND_H
diff --git a/include/pbbam/StringUtilities.h b/include/pbbam/StringUtilities.h

new file mode 100644 (file)

index 0000000..9fff48a
--- /dev/null
+++ b/include/pbbam/StringUtilities.h
@@ -0,0 +1,48 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_STRINGUTILITIES_H
+#define PBBAM_STRINGUTILITIES_H
+
+#include <algorithm>
+#include <cctype>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief Breaks a string into fields separated by a delimiter character.
+///
+/// Fields are extracted with std::getline, so an empty field between two
+/// adjacent delimiters is kept, while a single trailing delimiter does not
+/// produce a final empty field. An empty input yields an empty vector.
+///
+/// \param[in] line     input string
+/// \param[in] delim    character to split on
+///
+/// \returns vector of tokens
+///
+inline std::vector<std::string> Split(const std::string& line, const char delim = '\t')
+{
+    std::istringstream input{line};
+    std::vector<std::string> fields;
+    for (std::string field; std::getline(input, field, delim);) {
+        fields.emplace_back(field);
+    }
+    return fields;
+}
+
+/// \brief Remove all whitespace from input string (start, end, & internal)
+///
+/// A character counts as whitespace if std::isspace reports it as such.
+///
+/// \param[in] input    original string
+///
+/// \returns new string with no whitespace
+///
+inline std::string RemoveAllWhitespace(std::string input)
+{
+    // std::isspace is only defined for values representable as unsigned char
+    // (or EOF). Taking each element as unsigned char keeps bytes >= 0x80 from
+    // reaching it as negative values where plain char is signed.
+    input.erase(std::remove_if(input.begin(), input.end(),
+                               [](const unsigned char c) { return std::isspace(c) != 0; }),
+                input.end());
+    return input;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBBAM_STRINGUTILITIES_H
diff --git a/include/pbbam/SubreadLengthQuery.h b/include/pbbam/SubreadLengthQuery.h

new file mode 100644 (file)

index 0000000..8eeef3d
--- /dev/null
+++ b/include/pbbam/SubreadLengthQuery.h
@@ -0,0 +1,66 @@
+// File Description
+/// \file SubreadLengthQuery.h
+/// \brief Defines the SubreadLengthQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef SUBREADLENGTHQUERY_H
+#define SUBREADLENGTHQUERY_H
+
+#include <cstdint>
+#include <vector>
+#include "pbbam/Compare.h"
+#include "pbbam/Config.h"
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The SubreadLengthQuery class provides iterable access to a DataSet's
+///        %BAM records, limiting results to those matching a subread length
+///        criterion.
+///
+/// Example:
+/// \include code/SubreadLengthQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT SubreadLengthQuery : public internal::IQuery
+{
+public:
+    /// \brief Creates a new SubreadLengthQuery, limiting record results to only
+    ///        those matching a subread length criterion.
+    ///
+    /// \param[in] length       subread length value
+    /// \param[in] compareType  compare operator
+    /// \param[in] dataset      input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or PBI
+    ///         files.
+    ///
+    SubreadLengthQuery(const int32_t length, const Compare::Type compareType,
+                       const DataSet& dataset);
+
+    ~SubreadLengthQuery();  // declared only; not defined in this header
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r) override;
+
+    uint32_t NumReads() const;  ///< NOTE(review): undocumented; presumably the number of matching records - confirm
+
+private:
+    struct SubreadLengthQueryPrivate;
+    std::unique_ptr<SubreadLengthQueryPrivate> d_;  // NOTE(review): <memory> is not included directly by this header
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // SUBREADLENGTHQUERY_H
diff --git a/include/pbbam/Tag.h b/include/pbbam/Tag.h

new file mode 100644 (file)

index 0000000..1604e9a
--- /dev/null
+++ b/include/pbbam/Tag.h
@@ -0,0 +1,418 @@
+// File Description
+/// \file Tag.h
+/// \brief Defines the Tag class.
+//
+// Author: Derek Barnett
+
+#ifndef TAG_H
+#define TAG_H
+
+#include <boost/variant.hpp>
+#include <cstdint>
+#include <string>
+#include <vector>
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum is used to describe the exact (C++) data type held by a
+///        Tag.
+///
+enum class TagDataType
+{
+    INVALID = 0,       ///< boost::blank
+    INT8,              ///< int8_t
+    UINT8,             ///< uint8_t
+    INT16,             ///< int16_t
+    UINT16,            ///< uint16_t
+    INT32 = 5,         ///< int32_t
+    UINT32,            ///< uint32_t
+    FLOAT,             ///< float
+    STRING,            ///< std::string
+    INT8_ARRAY,        ///< std::vector<int8_t>
+    UINT8_ARRAY = 10,  ///< std::vector<uint8_t>
+    INT16_ARRAY,       ///< std::vector<int16_t>
+    UINT16_ARRAY,      ///< std::vector<uint16_t>
+    INT32_ARRAY,       ///< std::vector<int32_t>
+    UINT32_ARRAY,      ///< std::vector<uint32_t>
+    FLOAT_ARRAY = 15   ///< std::vector<float>
+};
+
+/// \brief This enum provides additional instructions on interpreting the tag's
+///        value.
+///
+/// Some C++ data types (e.g. std::string) may represent more than one BAM tag
+/// type ('H' vs 'Z'). Thus a TagModifier may be used to indicate how to
+/// properly distinguish between these shared data types.
+///
+enum class TagModifier
+{
+    /// \brief This value indicates that the tag has no modifiers set.
+    ///
+    NONE = 0,
+
+    /// \brief This modifier marks an integer as ASCII.
+    ///
+    /// SAM/BAM has the concept of an ASCII character that is distinct from an
+    /// 8-bit integer. However, there is no such pure separation in C++ - as
+    /// int8_t/uint8_t are likely implemented as typedefs around char/unsigned
+    /// char. Thus this modifier can be used to indicate a tag's value should be
+    /// interpreted as a printable, ASCII character.
+    ///
+    ASCII_CHAR,
+
+    /// \brief This modifier marks std::string data as "hex string", rather than
+    ///        a regular string.
+    ///
+    /// SAM/BAM has a distinction between regular strings and "Hex format"
+    /// strings. However, they are both manipulated in C++ via std::string. Thus
+    /// this modifier can be used to indicate that a tag's string data should be
+    /// interpreted as "Hex format" rather than a regular, literal string.
+    ///
+    HEX_STRING
+};
+
+/// \brief The Tag class represents a SAM/BAM record tag value.
+///
+/// SAM/BAM tags may store values from a variety of types: varying fixed-width
+/// integers, strings, arrays of data, etc.
+///
+/// The Tag class allow tags to be handled in a generic fashion, while
+/// maintaining a high level of type-safety. Only those types recognized by the
+/// SAM/BAM format are allowed, and extracting the value from a tag is subject
+/// to allowed conversion rules, as well.
+///
+// Inspired by (but greatly simplified & modified from) the boost::variant
+// wrapper approach taken by DynamicCpp (https://code.google.com/p/dynamic-cpp)
+//
+class PBBAM_EXPORT Tag
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a Tag from a signed 8-bit integer or character.
+    ///
+    /// Without a TagModifier, the resulting Tag will be annotated as containing
+    /// an 8-bit integer, whether the input \p value was an integer or a char.
+    /// For ASCII tags, use one of these methods:
+    /// \include code/Tag_AsciiCtor.txt
+    ///
+    Tag(int8_t value);
+
+    /// \brief Creates a Tag from a signed 8-bit integer or character,
+    ///        applying the provided modifier.
+    ///
+    /// This method allows direct construction of an ASCII character, rather
+    /// than an 8-bit integer (e.g. Tag('A', TagModifier::ASCII_CHAR) ).
+    ///
+    /// \throws runtime_error if \p mod is not valid for int8_t data
+    ///
+    Tag(int8_t value, const TagModifier mod);
+
+    /// \brief Creates a Tag from an unsigned 8-bit integer or character.
+    ///
+    /// Without a TagModifier, the resulting Tag will be annotated as containing
+    /// an 8-bit unsigned integer, whether the input \p value was an integer or
+    /// a char. For ASCII tags, use one of these methods:
+    /// \include code/Tag_AsciiCtor.txt
+    ///
+    Tag(uint8_t value);
+
+    /// \brief Creates a Tag from 16-bit integer.
+    Tag(int16_t value);
+
+    /// \brief Creates a Tag from 16-bit unsigned integer.
+    Tag(uint16_t value);
+
+    /// \brief Creates a Tag from 32-bit signed integer.
+    Tag(int32_t value);
+
+    /// \brief Creates a Tag from 32-bit unsigned integer.
+    Tag(uint32_t value);
+
+    /// \brief Creates a Tag from floating-point value.
+    Tag(float value);
+
+    /// \brief Creates a Tag from string data.
+    Tag(std::string value);
+
+    /// \brief Creates a Tag from string data, adding modifier.
+    ///
+    /// \throws runtime_error if \p mod is not valid for string data
+    ///
+    Tag(std::string value, TagModifier mod);
+
+    /// \brief Creates a Tag from a vector of 8-bit integers.
+    Tag(std::vector<int8_t> value);
+
+    /// \brief Creates a Tag from a vector of 8-bit unsigned integers.
+    Tag(std::vector<uint8_t> value);
+
+    /// \brief Creates a Tag from a vector of 16-bit integers.
+    Tag(std::vector<int16_t> value);
+
+    /// \brief Creates a Tag from a vector of 16-bit unsigned integers.
+    Tag(std::vector<uint16_t> value);
+
+    /// \brief Creates a Tag from a vector of 32-bit integers.
+    Tag(std::vector<int32_t> value);
+
+    /// \brief Creates a Tag from a vector of 32-bit unsigned integers.
+    Tag(std::vector<uint32_t> value);
+
+    /// \brief Creates a Tag from a vector of floating-point values.
+    Tag(std::vector<float> value);
+
+    Tag() = default;
+    Tag(const Tag&) = default;
+    Tag(Tag&&) = default;
+    Tag& operator=(const Tag&) = default;
+    Tag& operator=(Tag&&) = default;
+    ~Tag() = default;
+
+    Tag& operator=(boost::blank value);
+    Tag& operator=(int8_t value);
+    Tag& operator=(uint8_t value);
+    Tag& operator=(int16_t value);
+    Tag& operator=(uint16_t value);
+    Tag& operator=(int32_t value);
+    Tag& operator=(uint32_t value);
+    Tag& operator=(float value);
+    Tag& operator=(std::string value);
+    Tag& operator=(std::vector<int8_t> value);
+    Tag& operator=(std::vector<uint8_t> value);
+    Tag& operator=(std::vector<int16_t> value);
+    Tag& operator=(std::vector<uint16_t> value);
+    Tag& operator=(std::vector<int32_t> value);
+    Tag& operator=(std::vector<uint32_t> value);
+    Tag& operator=(std::vector<float> value);
+
+    bool operator==(const Tag& other) const;
+    bool operator!=(const Tag& other) const;
+
+    /// \}
+
+public:
+    /// \name Data Conversion & Validation
+    /// \{
+
+    /// \brief Converts the tag value to an ASCII character.
+    ///
+    /// Tag must hold an integral type, within the valid ASCII range [33-127].
+    ///
+    /// \returns ASCII character value
+    /// \throws std::runtime_error if not ASCII-compatible
+    ///
+    char ToAscii() const;
+
+    /// \returns tag data as signed 8-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    int8_t ToInt8() const;
+
+    /// \returns tag data as unsigned 8-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    uint8_t ToUInt8() const;
+
+    /// \returns tag data as signed 16-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    int16_t ToInt16() const;
+
+    /// \returns tag data as unsigned 16-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    uint16_t ToUInt16() const;
+
+    /// \returns tag data as signed 32-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    int32_t ToInt32() const;
+
+    /// \returns tag data as unsigned 32-bit (casting if needed)
+    /// \throws std::runtime_error if not integral data, or out of valid range
+    uint32_t ToUInt32() const;
+
+    /// \returns tag data as float
+    /// \throws std::runtime_error if tag does not contain a value of
+    ///         explicit type: float
+    float ToFloat() const;
+
+    /// \returns tag data as std::string
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::string
+    std::string ToString() const;
+
+    /// \returns tag data as std::vector<int8_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<int8_t>
+    std::vector<int8_t> ToInt8Array() const;
+
+    /// \returns tag data as std::vector<uint8_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<uint8_t>
+    std::vector<uint8_t> ToUInt8Array() const;
+
+    /// \returns tag data as std::vector<int16_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<int16_t>
+    std::vector<int16_t> ToInt16Array() const;
+
+    /// \returns tag data as std::vector<uint16_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<uint16_t>
+    std::vector<uint16_t> ToUInt16Array() const;
+
+    /// \returns tag data as std::vector<int32_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<int32_t>
+    std::vector<int32_t> ToInt32Array() const;
+
+    /// \returns tag data as std::vector<uint32_t>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<uint32_t>
+    std::vector<uint32_t> ToUInt32Array() const;
+
+    /// \returns tag data as std::vector<float>
+    /// \throws std::runtime_error if tag does not contain a value of explicit
+    ///         type: std::vector<float>
+    std::vector<float> ToFloatArray() const;
+
+    /// \}
+
+public:
+    /// \name Data Type Checks
+    /// \{
+
+    /// \returns true if tag is null (e.g. default-constructed)
+    bool IsNull() const;
+
+    /// \returns true if tag contains a value of type: int8_t
+    bool IsInt8() const;
+
+    /// \returns true if tag contains a value of type: uint8_t
+    bool IsUInt8() const;
+
+    /// \returns true if tag contains a value of type: int16_t
+    bool IsInt16() const;
+
+    /// \returns true if tag contains a value of type: uint16_t
+    bool IsUInt16() const;
+
+    /// \returns true if tag contains a value of type: int32_t
+    bool IsInt32() const;
+
+    /// \returns true if tag contains a value of type: uint32_t
+    bool IsUInt32() const;
+
+    /// \returns true if tag contains a value of type: float
+    bool IsFloat() const;
+
+    /// \returns true if tag contains a value of type: std::string
+    bool IsString() const;
+
+    /// \returns true if tag contains a value of type: std::string \b AND has a
+    ///          TagModifier of TagModifier::HEX_STRING
+    bool IsHexString() const;
+
+    /// \returns true if tag contains a value of type: std::vector<int8_t>
+    bool IsInt8Array() const;
+
+    /// \returns true if tag contains a value of type: std::vector<uint8_t>
+    bool IsUInt8Array() const;
+
+    /// \returns true if tag contains a value of type: std::vector<int16_t>
+    bool IsInt16Array() const;
+
+    /// \returns true if tag contains a value of type: std::vector<uint16_t>
+    bool IsUInt16Array() const;
+
+    /// \returns true if tag contains a value of type: std::vector<int32_t>
+    bool IsInt32Array() const;
+
+    /// \returns true if tag contains a value of type: std::vector<uint32_t>
+    bool IsUInt32Array() const;
+
+    /// \returns true if tag contains a value of type: std::vector<float>
+    bool IsFloatArray() const;
+
+    /// \returns true if tag contains a value with any signed integer type
+    bool IsSignedInt() const;
+
+    /// \returns true if tag contains a value with any unsigned integer type
+    bool IsUnsignedInt() const;
+
+    /// \returns true if tag contains a value with any integer type
+    bool IsIntegral() const;
+
+    /// \returns true if tag contains a value with any integer or float type
+    bool IsNumeric() const;
+
+    /// \returns true if tag contains a vector containing signed integers
+    bool IsSignedArray() const;
+
+    /// \returns true if tag contains a vector containing unsigned integers
+    bool IsUnsignedArray() const;
+
+    /// \returns true if tag contains a vector containing integers
+    bool IsIntegralArray() const;
+
+    /// \returns true if tag contains a vector (integers or floats)
+    bool IsArray() const;
+
+    /// \}
+
+public:
+    /// \name Type & Modifier Attributes
+    /// \{
+
+    /// \returns enum value for current tag data
+    TagDataType Type() const;
+
+    /// \returns printable type name for current tag data
+    std::string Typename() const;
+
+    /// \returns true if tag data modifier \p m is set
+    bool HasModifier(const TagModifier m) const;
+
+    /// \returns current tag data modifier
+    TagModifier Modifier() const;
+
+    /// \brief Sets tag data modifier.
+    ///
+    /// \param[in] m    new modifier value
+    ///
+    /// \returns reference to this tag
+    Tag& Modifier(const TagModifier m);
+
+    /// \}
+
+private:
+    // clang-format off
+    // NOTE - keep this synced with TagDataType enum ordering
+    using var_t = boost::variant<boost::blank, // <-- default constructor creates variant of this type
+                                 int8_t,
+                                 uint8_t,
+                                 int16_t,
+                                 uint16_t,
+                                 int32_t,
+                                 uint32_t,
+                                 float,
+                                 std::string,
+                                 std::vector<int8_t>,
+                                 std::vector<uint8_t>,
+                                 std::vector<int16_t>,
+                                 std::vector<uint16_t>,
+                                 std::vector<int32_t>,
+                                 std::vector<uint32_t>,
+                                 std::vector<float> >;
+
+    var_t data_;
+    TagModifier modifier_ = TagModifier::NONE;
+    // clang-format on
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/Tag.inl"
+
+#endif  // TAG_H
diff --git a/include/pbbam/TagCollection.h b/include/pbbam/TagCollection.h

new file mode 100644 (file)

index 0000000..6faf234
--- /dev/null
+++ b/include/pbbam/TagCollection.h
@@ -0,0 +1,33 @@
+// File Description
+/// \file TagCollection.h
+/// \brief Defines the TagCollection class.
+//
+// Author: Derek Barnett
+
+#ifndef TAGCOLLECTION_H
+#define TAGCOLLECTION_H
+
+#include <map>
+#include <string>
+#include "pbbam/Config.h"
+#include "pbbam/Tag.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The TagCollection class is a collection (or "dictionary") of Tag
+///        values, built directly on std::map.
+///
+/// Each entry is keyed by the tag's name, a 2-character string.
+///
+class PBBAM_EXPORT TagCollection : public std::map<std::string, Tag>
+{
+public:
+    /// \returns true if an entry keyed by \p name is present, false otherwise
+    inline bool Contains(const std::string& name) const { return find(name) != cend(); }
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // TAGCOLLECTION_H
diff --git a/include/pbbam/Unused.h b/include/pbbam/Unused.h

new file mode 100644 (file)

index 0000000..6b72ff9
--- /dev/null
+++ b/include/pbbam/Unused.h
@@ -0,0 +1,15 @@
+#ifndef PBBAM_UNUSED_H
+#define PBBAM_UNUSED_H
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief No-op helper: takes any value by const reference and ignores it.
+template <class Ignored>
+void UNUSED(const Ignored& /*value*/)
+{}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBBAM_UNUSED_H
diff --git a/include/pbbam/Validator.h b/include/pbbam/Validator.h

new file mode 100644 (file)

index 0000000..dae4b48
--- /dev/null
+++ b/include/pbbam/Validator.h
@@ -0,0 +1,156 @@
+// File Description
+/// \file Validator.h
+/// \brief Defines the Validator class.
+//
+// Author: Derek Barnett
+
+#ifndef VALIDATOR_H
+#define VALIDATOR_H
+
+#include <cstddef>
+#include <limits>
+#include "pbbam/Config.h"
+#include "pbbam/exception/ValidationException.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamFile;
+class BamHeader;
+class BamRecord;
+class ReadGroupInfo;
+
+/// \brief The Validator class provides validation for %BAM data.
+///
+/// There are 2 ways to use this class. If you are only concerned with a quick &
+/// dirty, yes/no validation, then you can use the IsValid() methods. This will
+/// swallow the specific cause of the failure, but you don't have to catch an
+/// exception and handle it in your client code. If you want to know,
+/// specifically, what failed, then you can use the Validate*() methods that
+/// will throw a ValidationException if the object is invalid. This exception
+/// will provide more details as to what failed and why.
+///
+/// See documentation for Config.h for details on building pbbam with
+/// auto-validation enabled.
+///
+class PBBAM_EXPORT Validator
+{
+public:
+    /// \brief Checks that a %BAM file conforms to the %PacBio specification.
+    ///
+    /// When \p entireFile is false, this method only checks file metadata. If
+    /// \p entireFile is true, all records are checked as well.
+    ///
+    /// \param[in] file         %BAM file to validate
+    /// \param[in] entireFile   check records in addition to metadata
+    /// \returns true if \p file passes validation checks
+    ///
+    /// \sa Validator::ValidateFileMetadata, Validator::ValidateEntireFile
+    ///
+    static bool IsValid(const BamFile& file, const bool entireFile);
+
+    /// \brief Checks that a %BAM header conforms to the %PacBio specification.
+    ///
+    /// \returns true if \p header passes validation checks
+    ///
+    /// \sa Validator::Validate(const BamHeader& header)
+    ///
+    static bool IsValid(const BamHeader& header);
+
+    /// \brief Checks that a %BAM read group conforms to the %PacBio
+    ///        specification.
+    ///
+    /// \returns true if \p rg passes validation checks
+    ///
+    /// \sa Validator::Validate(const ReadGroupInfo& rg)
+    ///
+    static bool IsValid(const ReadGroupInfo& rg);
+
+    /// \brief Checks that a %BAM record conforms to the %PacBio specification.
+    ///
+    /// \returns true if \p record passes validation checks
+    ///
+    /// \sa Validator::Validate(const BamRecord& record)
+    ///
+    static bool IsValid(const BamRecord& record);
+
+public:
+    Validator() = delete;
+
+    /// \brief Checks that a %BAM file's header conforms to the
+    ///        %PacBio specification.
+    ///
+    /// This validation step checks the SAM/%BAM version number, sort order,
+    /// PacBioBAM version number, and calls Validate(readGroup) internally for
+    /// all read groups.
+    ///
+    /// \param[in] header       %BAM header to validate
+    /// \param[in] maxErrors    maximum number of errors to allow before throwing
+    ///
+    /// \throws ValidationException if \p header fails validation checks
+    ///
+    static void Validate(const BamHeader& header,
+                         const size_t maxErrors = std::numeric_limits<size_t>::max());
+
+    /// \brief Checks that a %BAM read group conforms to the %PacBio
+    ///        specification.
+    ///
+    /// \param[in] rg           %BAM read group to validate
+    /// \param[in] maxErrors    maximum number of errors to allow before throwing
+    ///
+    /// \throws ValidationException if \p rg fails validation checks
+    ///
+    static void Validate(const ReadGroupInfo& rg,
+                         const size_t maxErrors = std::numeric_limits<size_t>::max());
+
+    /// \brief Checks that a %BAM record conforms to the %PacBio specification.
+    ///
+    /// \param[in] record       %BAM record to validate
+    /// \param[in] maxErrors    maximum number of errors to allow before throwing
+    ///
+    /// \throws ValidationException if \p record fails validation checks
+    ///
+    static void Validate(const BamRecord& record,
+                         const size_t maxErrors = std::numeric_limits<size_t>::max());
+
+    /// \brief Checks that a %BAM file's (entire) contents conform to the
+    ///        %PacBio specification.
+    ///
+    /// This is equivalent to:
+    ///
+    /// \code
+    /// Validator::ValidateFileMetadata(file);
+    /// EntireFileQuery query(file);
+    /// for (const BamRecord& record : query)
+    ///     Validator::Validate(record);
+    /// \endcode
+    ///
+    /// \param[in] file         %BAM file to validate
+    /// \param[in] maxErrors    maximum number of errors to allow before throwing
+    ///
+    /// \throws ValidationException if \p file fails validation checks
+    ///
+    static void ValidateEntireFile(const BamFile& file,
+                                   const size_t maxErrors = std::numeric_limits<size_t>::max());
+
+    /// \brief Checks that a %BAM file's metadata conforms to the
+    ///        %PacBio specification.
+    ///
+    /// This validation step checks the filename, ensures EOF marker, and
+    /// presence of PBI. It also calls Validate(file.Header()) internally.
+    ///
+    /// \param[in] file         %BAM file to validate
+    /// \param[in] maxErrors    maximum number of errors to allow before throwing
+    ///
+    /// \throws ValidationException if \p file fails validation checks
+    ///
+    static void ValidateFileMetadata(const BamFile& file,
+                                     const size_t maxErrors = std::numeric_limits<size_t>::max());
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "internal/Validator.inl"
+
+#endif  // VALIDATOR_H
diff --git a/include/pbbam/ZmwGroupQuery.h b/include/pbbam/ZmwGroupQuery.h

new file mode 100644 (file)

index 0000000..9c91f52
--- /dev/null
+++ b/include/pbbam/ZmwGroupQuery.h
@@ -0,0 +1,59 @@
+// File Description
+/// \file ZmwGroupQuery.h
+/// \brief Defines the ZmwGroupQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef ZMWGROUPQUERY_H
+#define ZMWGROUPQUERY_H
+
+#include <cstdint>
+#include <vector>
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ZmwGroupQuery class provides iterable access to a DataSet's
+///        %BAM records, limiting results to those matching a ZMW hole number
+///        whitelist, and grouping those results by hole number.
+///
+/// Example:
+/// \include code/ZmwGroupQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT ZmwGroupQuery : public internal::IGroupQuery
+{
+public:
+    /// \brief Creates a new ZmwGroupQuery, limiting record results to only
+    ///        those matching a ZMW hole number criterion.
+    ///
+    /// \param[in] zmwWhitelist     vector of allowed ZMW hole numbers
+    /// \param[in] dataset          input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or
+    ///         PBI files.
+    ///
+    ZmwGroupQuery(const std::vector<int32_t>& zmwWhitelist, const DataSet& dataset);
+    ~ZmwGroupQuery();
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(std::vector<BamRecord>& records) override;
+
+private:
+    struct ZmwGroupQueryPrivate;
+    std::unique_ptr<ZmwGroupQueryPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ZMWGROUPQUERY_H
diff --git a/include/pbbam/ZmwQuery.h b/include/pbbam/ZmwQuery.h

new file mode 100644 (file)

index 0000000..7b96d74
--- /dev/null
+++ b/include/pbbam/ZmwQuery.h
@@ -0,0 +1,61 @@
+// File Description
+/// \file ZmwQuery.h
+/// \brief Defines the ZmwQuery class.
+//
+// Author: Derek Barnett
+
+#ifndef ZMWQUERY_H
+#define ZMWQUERY_H
+
+#include <cstdint>
+#include <vector>
+#include "pbbam/Config.h"
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ZmwQuery class provides iterable access to a DataSet's
+///        %BAM records, limiting results to those matching a ZMW hole number
+///        whitelist.
+///
+/// Example:
+/// \include code/ZmwQuery.txt
+///
+/// \note Currently, all %BAM files must have a corresponding ".pbi" index file.
+///       Use BamFile::EnsurePacBioIndexExists before creating the query if one
+///       may not be present.
+///
+class PBBAM_EXPORT ZmwQuery : public internal::IQuery
+{
+public:
+    /// \brief Creates a new ZmwQuery, limiting record results to only
+    ///        those matching a ZMW hole number criterion.
+    ///
+    /// \param[in] zmwWhitelist     vector of allowed ZMW hole numbers
+    /// \param[in] dataset          input data source(s)
+    ///
+    /// \throws std::runtime_error on failure to open/read underlying %BAM or
+    ///         PBI files.
+    ///
+    ZmwQuery(std::vector<int32_t> zmwWhitelist, const DataSet& dataset);
+
+    ~ZmwQuery();
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(BamRecord& r) override;
+
+private:
+    struct ZmwQueryPrivate;
+    std::unique_ptr<ZmwQueryPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ZMWQUERY_H
diff --git a/include/pbbam/ZmwType.h b/include/pbbam/ZmwType.h

new file mode 100644 (file)

index 0000000..6447cc7
--- /dev/null
+++ b/include/pbbam/ZmwType.h
@@ -0,0 +1,28 @@
+// File Description
+/// \file ZmwType.h
+/// \brief Defines the ZmwType enum.
+//
+// Author: Armin Töpfer
+
+#ifndef ZMWTYPE_H
+#define ZMWTYPE_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the different ZMW categories of scraps
+///
+enum class ZmwType : char
+{
+    CONTROL = 'C',
+    MALFORMED = 'M',
+    NORMAL = 'N',
+    SENTINEL = 'S'
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ZMWTYPE_H
diff --git a/include/pbbam/ZmwTypeMap.h b/include/pbbam/ZmwTypeMap.h

new file mode 100644 (file)

index 0000000..8d871d4
--- /dev/null
+++ b/include/pbbam/ZmwTypeMap.h
@@ -0,0 +1,30 @@
+// File Description
+/// \file ZmwTypeMap.h
+/// \brief Defines the ZmwTypeMap class.
+//
+// Author: Armin Töpfer
+
+#ifndef ZMWTYPEMAP_H
+#define ZMWTYPEMAP_H
+
+#include <map>
+
+#include "pbbam/Config.h"
+#include "pbbam/ZmwType.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ZmwTypeMap class provides mapping between char codes and
+///        ZmwType enum keys.
+///
+class ZmwTypeMap
+{
+public:
+    static std::map<char, ZmwType> ParseChar;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ZMWTYPEMAP_H
diff --git a/include/pbbam/exception/BundleChemistryMappingException.h b/include/pbbam/exception/BundleChemistryMappingException.h

new file mode 100644 (file)

index 0000000..2849e04
--- /dev/null
+++ b/include/pbbam/exception/BundleChemistryMappingException.h
@@ -0,0 +1,43 @@
+// File Description
+/// \file BundleChemistryMappingException.h
+/// \brief Defines the BundleChemistryMappingException class.
+//
+// Author: Derek Barnett, Lance Hepler
+
+#ifndef BUNDLECHEMISTRYMAPPINGEXCEPTION_H
+#define BUNDLECHEMISTRYMAPPINGEXCEPTION_H
+
+#include <exception>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The BundleChemistryMappingException class represents an exception
+///        that will be thrown when an invalid sequencing chemistry mapping
+///        is encountered; what() yields "invalid <mappingXml>: <msg>".
+///
+class BundleChemistryMappingException : public std::exception
+{
+public:
+    BundleChemistryMappingException(std::string mappingXml, std::string msg)
+        : mappingXml_(std::move(mappingXml))
+        , what_("invalid " + mappingXml_ + ": " + msg)
+    {
+    }
+
+    // This is a work around for the Intel PHI compiler (icpc)
+    ~BundleChemistryMappingException() throw() {}
+
+public:
+    const char* what() const noexcept override { return what_.c_str(); }
+
+protected:
+    std::string mappingXml_;  // must stay declared before what_, which reads it
+    std::string what_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BUNDLECHEMISTRYMAPPINGEXCEPTION_H
diff --git a/include/pbbam/exception/InvalidSequencingChemistryException.h b/include/pbbam/exception/InvalidSequencingChemistryException.h

new file mode 100644 (file)

index 0000000..501aa2e
--- /dev/null
+++ b/include/pbbam/exception/InvalidSequencingChemistryException.h
@@ -0,0 +1,61 @@
+// File Description
+/// \file InvalidSequencingChemistryException.h
+/// \brief Defines the InvalidSequencingChemistryException class.
+//
+// Author: Derek Barnett
+
+#ifndef INVALIDSEQUENCINGCHEMISTRYEXCEPTION_H
+#define INVALIDSEQUENCINGCHEMISTRYEXCEPTION_H
+
+#include <exception>
+#include <sstream>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The InvalidSequencingChemistryException class represents an exception
+///        that will be thrown when an unsupported combination of binding kit,
+///        sequencing kit, and basecaller version is encountered.
+///
+class InvalidSequencingChemistryException : public std::exception
+{
+public:
+    InvalidSequencingChemistryException(std::string bindingKit, std::string sequencingKit,
+                                        std::string basecallerVersion)
+        : bindingKit_(std::move(bindingKit))
+        , sequencingKit_(std::move(sequencingKit))
+        , basecallerVersion_(std::move(basecallerVersion))
+    {
+        std::ostringstream message;
+        message << "unsupported sequencing chemistry combination:\n";
+        message << "    binding kit:        " << bindingKit_ << '\n';
+        message << "    sequencing kit:     " << sequencingKit_ << '\n';
+        message << "    basecaller version: " << basecallerVersion_ << '\n';
+        what_ = message.str();
+    }
+
+    // This is a work around for the Intel PHI compiler (icpc)
+    ~InvalidSequencingChemistryException() throw() {}
+
+public:
+    /// \returns the binding kit supplied at construction
+    const std::string& BindingKit() const { return bindingKit_; }
+    /// \returns the sequencing kit supplied at construction
+    const std::string& SequencingKit() const { return sequencingKit_; }
+    /// \returns the basecaller version supplied at construction
+    const std::string& BasecallerVersion() const { return basecallerVersion_; }
+
+    const char* what() const noexcept override { return what_.c_str(); }
+
+protected:
+    std::string bindingKit_;
+    std::string sequencingKit_;
+    std::string basecallerVersion_;
+    std::string what_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // INVALIDSEQUENCINGCHEMISTRYEXCEPTION_H
diff --git a/include/pbbam/exception/ValidationException.h b/include/pbbam/exception/ValidationException.h

new file mode 100644 (file)

index 0000000..0ff7d1d
--- /dev/null
+++ b/include/pbbam/exception/ValidationException.h
@@ -0,0 +1,58 @@
+// File Description
+/// \file ValidationException.h
+/// \brief Defines the ValidationException class.
+//
+// Author: Derek Barnett
+
+#ifndef VALIDATIONEXCEPTION_H
+#define VALIDATIONEXCEPTION_H
+
+#include <map>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The ValidationException class represents an exception that will be
+///        thrown when any error is encountered using the Validator API. In
+///        addition to a default display message, it provides programmatic
+///        access to all reported error messages.
+///
+/// \sa Validator::Validate(const BamRecord& record)
+///
+class ValidationException : public std::runtime_error
+{
+public:
+    using ErrorList = std::vector<std::string>;
+    using ErrorMap = std::map<std::string, ErrorList>;
+
+public:
+    ValidationException(ErrorMap fileErrors, ErrorMap readGroupErrors, ErrorMap recordErrors);
+
+    // This is a work around for the Intel PHI compiler (icpc)
+    ~ValidationException() throw() {}
+
+public:
+    const ErrorMap& FileErrors() const;
+    const ErrorMap& ReadGroupErrors() const;
+    const ErrorMap& RecordErrors() const;
+
+    const char* what() const noexcept override;
+
+private:
+    ErrorMap fileErrors_;
+    ErrorMap readGroupErrors_;
+    ErrorMap recordErrors_;
+    std::string msg_;
+
+private:
+    void FormatMessage();
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VALIDATIONEXCEPTION_H
diff --git a/include/pbbam/internal/Accuracy.inl b/include/pbbam/internal/Accuracy.inl

new file mode 100644 (file)

index 0000000..49f8d12
--- /dev/null
+++ b/include/pbbam/internal/Accuracy.inl
@@ -0,0 +1,25 @@
+// File Description
+/// \file Accuracy.inl
+/// \brief Inline implementations for the Accuracy class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Accuracy.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline Accuracy::Accuracy(float accuracy)
+{
+    // Store the value, then pull it back into [Accuracy::MIN, Accuracy::MAX].
+    accuracy_ = accuracy;
+    if (accuracy < Accuracy::MIN)
+        accuracy_ = Accuracy::MIN;
+    else if (accuracy > Accuracy::MAX)
+        accuracy_ = Accuracy::MAX;
+}
+
+inline Accuracy::operator float() const { return accuracy_; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/BamHeader.inl b/include/pbbam/internal/BamHeader.inl

new file mode 100644 (file)

index 0000000..c5bfd5d
--- /dev/null
+++ b/include/pbbam/internal/BamHeader.inl
@@ -0,0 +1,103 @@
+// File Description
+/// \file BamHeader.inl
+/// \brief Inline implementations for the BamHeader class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamHeader.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+class BamHeaderPrivate
+{
+public:
+    std::string version_;
+    std::string pacbioBamVersion_;
+    std::string sortOrder_;
+    std::map<std::string, std::string> headerLineCustom_;
+
+    std::map<std::string, ReadGroupInfo> readGroups_; // id => read group info
+    std::map<std::string, ProgramInfo> programs_;     // id => program info
+    std::vector<std::string> comments_;
+
+    // we need to preserve insertion order, use lookup for access by name
+    std::vector<SequenceInfo> sequences_;
+    std::map<std::string, int32_t> sequenceIdLookup_;
+};
+
+} // namespace internal
+
+inline BamHeader::BamHeader()
+    : d_{std::make_shared<internal::BamHeaderPrivate>()}
+{ }
+
+inline BamHeader BamHeader::operator+(const BamHeader& other) const
+{ return DeepCopy() += other; }
+
+inline BamHeader& BamHeader::AddComment(std::string comment)
+{ d_->comments_.push_back(std::move(comment)); return *this; }
+
+inline BamHeader& BamHeader::AddProgram(ProgramInfo pg)
+{ d_->programs_[pg.Id()] = std::move(pg); return *this; }
+
+inline BamHeader& BamHeader::AddReadGroup(ReadGroupInfo readGroup)
+{ d_->readGroups_[readGroup.Id()] = std::move(readGroup); return *this; }
+
+inline BamHeader& BamHeader::ClearComments()
+{ d_->comments_.clear(); return* this; }
+
+inline BamHeader& BamHeader::ClearPrograms()
+{ d_->programs_.clear(); return *this; }
+
+inline BamHeader& BamHeader::ClearReadGroups()
+{ d_->readGroups_.clear(); return *this; }
+
+inline std::vector<std::string> BamHeader::Comments() const
+{ return d_->comments_; }
+
+inline BamHeader& BamHeader::Comments(std::vector<std::string> comments)
+{ d_->comments_ = std::move(comments); return *this; }
+
+inline bool BamHeader::HasProgram(const std::string& id) const
+{ return d_->programs_.find(id) != d_->programs_.cend(); }
+
+inline bool BamHeader::HasReadGroup(const std::string& id) const
+{ return d_->readGroups_.find(id) != d_->readGroups_.cend(); }
+
+inline bool BamHeader::HasSequence(const std::string& name) const
+{ return d_->sequenceIdLookup_.find(name) != d_->sequenceIdLookup_.cend(); }
+
+inline size_t BamHeader::NumSequences() const
+{ return d_->sequences_.size(); }
+
+inline std::string BamHeader::PacBioBamVersion() const
+{ return d_->pacbioBamVersion_; }
+
+inline SequenceInfo BamHeader::Sequence(const int32_t id) const
+{ return d_->sequences_.at(id); }
+
+inline std::string BamHeader::SequenceLength(const int32_t id) const
+{ return Sequence(id).Length(); }
+
+inline std::string BamHeader::SequenceName(const int32_t id) const
+{ return Sequence(id).Name(); }
+
+inline std::vector<SequenceInfo> BamHeader::Sequences() const
+{ return d_->sequences_; }
+
+inline std::string BamHeader::SortOrder() const
+{ return d_->sortOrder_; }
+
+inline BamHeader& BamHeader::SortOrder(std::string order)
+{ d_->sortOrder_ = std::move(order); return *this; }
+
+inline std::string BamHeader::Version() const
+{ return d_->version_; }
+
+inline BamHeader& BamHeader::Version(std::string version)
+{ d_->version_ = std::move(version); return *this; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/BamRecord.inl b/include/pbbam/internal/BamRecord.inl

new file mode 100644 (file)

index 0000000..ae90855
--- /dev/null
+++ b/include/pbbam/internal/BamRecord.inl
@@ -0,0 +1,51 @@
+// File Description
+/// \file BamRecord.inl
+/// \brief Inline implementations for the BamRecord class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline BamRecord BamRecord::Clipped(const BamRecord& input,
+                                    const ClipType clipType,
+                                    const PacBio::BAM::Position start,
+                                    const PacBio::BAM::Position end)
+{
+    return input.Clipped(clipType, start, end);
+}
+
+inline BamRecord BamRecord::Clipped(const ClipType clipType,
+                                    const PacBio::BAM::Position start,
+                                    const PacBio::BAM::Position end) const
+{
+    BamRecord result(*this);
+    result.Clip(clipType, start, end);
+    return result;
+}
+
+inline BamRecord BamRecord::Mapped(const BamRecord& input,
+                                   const int32_t referenceId,
+                                   const Position refStart,
+                                   const Strand strand,
+                                   const Cigar& cigar,
+                                   const uint8_t mappingQuality)
+{
+    return input.Mapped(referenceId, refStart, strand, cigar, mappingQuality);
+}
+
+inline BamRecord BamRecord::Mapped(const int32_t referenceId,
+                                   const Position refStart,
+                                   const Strand strand,
+                                   const Cigar& cigar,
+                                   const uint8_t mappingQuality) const
+{
+    BamRecord result(*this);
+    result.Map(referenceId, refStart, strand, cigar, mappingQuality);
+    return result;
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/BamRecordBuilder.inl b/include/pbbam/internal/BamRecordBuilder.inl

new file mode 100644 (file)

index 0000000..67926ee
--- /dev/null
+++ b/include/pbbam/internal/BamRecordBuilder.inl
@@ -0,0 +1,43 @@
+// File Description
+/// \file BamRecordBuilder.inl
+/// \brief Inline implementations for the BamRecordBuilder class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamRecordBuilder.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline BamRecordBuilder& BamRecordBuilder::Bin(const uint32_t bin)
+{ core_.bin = bin; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::Flag(const uint32_t flag)
+{ core_.flag = flag; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::InsertSize(const int32_t iSize)
+{ core_.isize = iSize; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::MapQuality(const uint8_t mapQual)
+{ core_.qual = mapQual; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::MatePosition(const int32_t pos)
+{ core_.mpos = pos; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::MateReferenceId(const int32_t id)
+{ core_.mtid = id; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::Position(const int32_t pos)
+{ core_.pos = pos; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::Qualities(std::string qualities)
+{ qualities_ = std::move(qualities); return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::ReferenceId(const int32_t id)
+{ core_.tid = id; return *this; }
+
+inline BamRecordBuilder& BamRecordBuilder::Tags(TagCollection tags)
+{ tags_ = std::move(tags); return *this; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/BamRecordImpl.inl b/include/pbbam/internal/BamRecordImpl.inl

new file mode 100644 (file)

index 0000000..03533e5
--- /dev/null
+++ b/include/pbbam/internal/BamRecordImpl.inl
@@ -0,0 +1,181 @@
+// File Description
+/// \file BamRecordImpl.inl
+/// \brief Inline implementations for the BamRecordImpl class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamRecordImpl.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline uint32_t BamRecordImpl::Bin() const
+{ return d_->core.bin; }
+
+inline BamRecordImpl& BamRecordImpl::Bin(uint32_t bin)
+{ d_->core.bin = bin; return *this; }
+
+inline uint32_t BamRecordImpl::Flag() const
+{ return d_->core.flag; }
+
+inline BamRecordImpl& BamRecordImpl::Flag(uint32_t flag)
+{ d_->core.flag = flag; return *this; }
+
+inline int32_t BamRecordImpl::InsertSize() const
+{ return d_->core.isize; }
+
+inline BamRecordImpl& BamRecordImpl::InsertSize(int32_t iSize)
+{ d_->core.isize = iSize; return *this; }
+
+inline uint8_t BamRecordImpl::MapQuality() const
+{ return d_->core.qual; }
+
+inline BamRecordImpl& BamRecordImpl::MapQuality(uint8_t mapQual)
+{ d_->core.qual = mapQual; return *this; }
+
+inline PacBio::BAM::Position BamRecordImpl::MatePosition() const
+{ return d_->core.mpos; }
+
+inline BamRecordImpl& BamRecordImpl::MatePosition(PacBio::BAM::Position pos)
+{ d_->core.mpos = pos; return *this; }
+
+inline int32_t BamRecordImpl::MateReferenceId() const
+{ return d_->core.mtid; }
+
+inline BamRecordImpl& BamRecordImpl::MateReferenceId(int32_t id)
+{ d_->core.mtid = id; return *this; }
+
+inline PacBio::BAM::Position BamRecordImpl::Position() const
+{ return d_->core.pos; }
+
+inline BamRecordImpl& BamRecordImpl::Position(PacBio::BAM::Position pos)
+{ d_->core.pos = pos; return *this; }
+
+inline int32_t BamRecordImpl::ReferenceId() const
+{ return d_->core.tid; }
+
+inline BamRecordImpl& BamRecordImpl::ReferenceId(int32_t id)
+{ d_->core.tid = id; return *this; }
+
+inline bool BamRecordImpl::IsDuplicate() const
+{ return (d_->core.flag & BamRecordImpl::DUPLICATE) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetDuplicate(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::DUPLICATE;
+    else    d_->core.flag &= ~BamRecordImpl::DUPLICATE;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsFailedQC() const
+{ return (d_->core.flag & BamRecordImpl::FAILED_QC) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetFailedQC(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::FAILED_QC;
+    else    d_->core.flag &= ~BamRecordImpl::FAILED_QC;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsFirstMate() const
+{ return (d_->core.flag & BamRecordImpl::MATE_1) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetFirstMate(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::MATE_1;
+    else    d_->core.flag &= ~BamRecordImpl::MATE_1;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsMapped() const
+{ return (d_->core.flag & BamRecordImpl::UNMAPPED) == 0; }  // inverted sense: true when the UNMAPPED bit is clear
+
+inline BamRecordImpl& BamRecordImpl::SetMapped(bool ok)
+{
+    if (ok) d_->core.flag &= ~BamRecordImpl::UNMAPPED;  // mapped => clear the UNMAPPED bit
+    else    d_->core.flag |=  BamRecordImpl::UNMAPPED;  // not mapped => set the UNMAPPED bit
+    return *this;
+}
+
+inline bool BamRecordImpl::IsMateMapped() const
+{ return (d_->core.flag & BamRecordImpl::MATE_UNMAPPED) == 0; }  // inverted sense: true when the MATE_UNMAPPED bit is clear
+
+inline BamRecordImpl& BamRecordImpl::SetMateMapped(bool ok)
+{
+    if (ok) d_->core.flag &= ~BamRecordImpl::MATE_UNMAPPED;  // mate mapped => clear the MATE_UNMAPPED bit
+    else    d_->core.flag |=  BamRecordImpl::MATE_UNMAPPED;  // mate not mapped => set the MATE_UNMAPPED bit
+    return *this;
+}
+
+inline bool BamRecordImpl::IsMateReverseStrand() const
+{ return (d_->core.flag & BamRecordImpl::MATE_REVERSE_STRAND) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetMateReverseStrand(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::MATE_REVERSE_STRAND;
+    else    d_->core.flag &= ~BamRecordImpl::MATE_REVERSE_STRAND;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsPaired() const
+{ return (d_->core.flag & BamRecordImpl::PAIRED) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetPaired(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::PAIRED;
+    else    d_->core.flag &= ~BamRecordImpl::PAIRED;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsPrimaryAlignment() const
+{ return (d_->core.flag & BamRecordImpl::SECONDARY) == 0; }  // inverted sense: true when the SECONDARY bit is clear
+
+inline BamRecordImpl& BamRecordImpl::SetPrimaryAlignment(bool ok)
+{
+    if (ok) d_->core.flag &= ~BamRecordImpl::SECONDARY;  // primary => clear the SECONDARY bit
+    else    d_->core.flag |=  BamRecordImpl::SECONDARY;  // not primary => set the SECONDARY bit
+    return *this;
+}
+
+inline bool BamRecordImpl::IsProperPair() const
+{ return (d_->core.flag & BamRecordImpl::PROPER_PAIR) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetProperPair(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::PROPER_PAIR;
+    else    d_->core.flag &= ~BamRecordImpl::PROPER_PAIR;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsReverseStrand() const
+{ return (d_->core.flag & BamRecordImpl::REVERSE_STRAND) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetReverseStrand(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::REVERSE_STRAND;
+    else    d_->core.flag &= ~BamRecordImpl::REVERSE_STRAND;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsSecondMate() const
+{ return (d_->core.flag & BamRecordImpl::MATE_2) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetSecondMate(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::MATE_2;
+    else    d_->core.flag &= ~BamRecordImpl::MATE_2;
+    return *this;
+}
+
+inline bool BamRecordImpl::IsSupplementaryAlignment() const
+{ return (d_->core.flag & BamRecordImpl::SUPPLEMENTARY) != 0; }
+
+inline BamRecordImpl& BamRecordImpl::SetSupplementaryAlignment(bool ok)
+{
+    if (ok) d_->core.flag |=  BamRecordImpl::SUPPLEMENTARY;
+    else    d_->core.flag &= ~BamRecordImpl::SUPPLEMENTARY;
+    return *this;
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/BamRecordView.inl b/include/pbbam/internal/BamRecordView.inl

new file mode 100644 (file)

index 0000000..72ef22c
--- /dev/null
+++ b/include/pbbam/internal/BamRecordView.inl
@@ -0,0 +1,94 @@
+// File Description
+/// \file BamRecordView.inl
+/// \brief Inline implementations for the BamRecordView class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/BamRecordView.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline BamRecordView::BamRecordView(const BamRecord& record,
+                                    const Orientation orientation,
+                                    const bool aligned,
+                                    const bool exciseSoftClips,
+                                    const PulseBehavior pulseBehavior)
+    : record_(record)
+    , orientation_{orientation}
+    , aligned_{aligned}
+    , exciseSoftClips_{exciseSoftClips}
+    , pulseBehavior_{pulseBehavior}
+{ }
+
+inline QualityValues BamRecordView::AltLabelQVs() const
+{ return record_.AltLabelQV(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline std::string BamRecordView::AltLabelTags() const
+{ return record_.AltLabelTag(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline QualityValues BamRecordView::DeletionQVs() const
+{ return record_.DeletionQV(orientation_, aligned_, exciseSoftClips_); }
+
+inline std::string BamRecordView::DeletionTags() const
+{ return record_.DeletionTag(orientation_, aligned_, exciseSoftClips_); }
+
+inline QualityValues BamRecordView::InsertionQVs() const
+{ return record_.InsertionQV(orientation_, aligned_, exciseSoftClips_); }
+
+inline Frames BamRecordView::IPD() const
+{ return record_.IPD(orientation_, aligned_, exciseSoftClips_); }
+
+inline Frames BamRecordView::PrebaseFrames() const
+{ return record_.IPD(orientation_, aligned_, exciseSoftClips_); }  // same underlying call as IPD(); NOTE(review): presumably an intentional alias - confirm
+
+inline QualityValues BamRecordView::LabelQVs() const
+{ return record_.LabelQV(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline QualityValues BamRecordView::MergeQVs() const
+{ return record_.MergeQV(orientation_, aligned_, exciseSoftClips_); }
+
+inline QualityValues BamRecordView::PulseMergeQVs() const
+{ return record_.PulseMergeQV(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline std::vector<float> BamRecordView::Pkmean() const
+{ return record_.Pkmean(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline std::vector<float> BamRecordView::Pkmid() const
+{ return record_.Pkmid(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline std::vector<float> BamRecordView::Pkmean2() const
+{ return record_.Pkmean2(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline std::vector<float> BamRecordView::Pkmid2() const
+{ return record_.Pkmid2(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline Frames BamRecordView::PrePulseFrames() const
+{ return record_.PrePulseFrames(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline std::string BamRecordView::PulseCalls() const
+{ return record_.PulseCall(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline Frames BamRecordView::PulseCallWidth() const
+{ return record_.PulseCallWidth(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline Frames BamRecordView::PulseWidths() const
+{ return record_.PulseWidth(orientation_, aligned_, exciseSoftClips_); }
+
+inline QualityValues BamRecordView::Qualities() const
+{ return record_.Qualities(orientation_, aligned_, exciseSoftClips_); }
+
+inline std::string BamRecordView::Sequence() const
+{ return record_.Sequence(orientation_, aligned_, exciseSoftClips_); }
+
+inline std::vector<uint32_t> BamRecordView::StartFrames() const
+{ return record_.StartFrame(orientation_, aligned_, exciseSoftClips_, pulseBehavior_); }
+
+inline QualityValues BamRecordView::SubstitutionQVs() const
+{ return record_.SubstitutionQV(orientation_, aligned_, exciseSoftClips_); }
+
+inline std::string BamRecordView::SubstitutionTags() const
+{ return record_.SubstitutionTag(orientation_, aligned_, exciseSoftClips_); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/Cigar.inl b/include/pbbam/internal/Cigar.inl

new file mode 100644 (file)

index 0000000..a877b72
--- /dev/null
+++ b/include/pbbam/internal/Cigar.inl
@@ -0,0 +1,16 @@
+// File Description
+/// \file Cigar.inl
+/// \brief Inline implementations for the Cigar class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Cigar.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline Cigar Cigar::FromStdString(const std::string& stdString)
+{ return Cigar(stdString); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/CigarOperation.inl b/include/pbbam/internal/CigarOperation.inl

new file mode 100644 (file)

index 0000000..0c5a5aa
--- /dev/null
+++ b/include/pbbam/internal/CigarOperation.inl
@@ -0,0 +1,57 @@
+// File Description
+/// \file CigarOperation.inl
+/// \brief Inline implementations for the CigarOperation class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/CigarOperation.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline CigarOperation::CigarOperation(char c, uint32_t length)
+    : type_{CigarOperation::CharToType(c)}  // operation type is derived from its character code
+    , length_{length}
+{
+    #ifndef PBBAM_PERMISSIVE_CIGAR  // the 'M' check is compiled out when this macro is defined
+        if (validate_ && (type_ == CigarOperationType::ALIGNMENT_MATCH))  // validate_ gates the check at runtime
+            throw std::runtime_error{"CIGAR operation 'M' is not allowed in PacBio BAM files. Use 'X/=' instead."};
+    #endif
+}
+
+inline CigarOperation::CigarOperation(CigarOperationType op, uint32_t length)
+    : type_{op}
+    , length_{length}
+{
+    #ifndef PBBAM_PERMISSIVE_CIGAR  // the 'M' check is compiled out when this macro is defined
+        if (validate_ && (type_ == CigarOperationType::ALIGNMENT_MATCH))  // validate_ gates the check at runtime
+            throw std::runtime_error{"CIGAR operation 'M' is not allowed in PacBio BAM files. Use 'X/=' instead."};
+    #endif
+}
+
+inline uint32_t CigarOperation::Length() const
+{ return length_; }
+
+inline CigarOperation& CigarOperation::Length(const uint32_t length)
+{ length_ = length; return *this; }
+
+inline CigarOperationType CigarOperation::Type() const
+{ return type_; }
+
+inline CigarOperation &CigarOperation::Type(const CigarOperationType opType)
+{ type_ = opType; return *this; }
+
+inline char CigarOperation::Char() const
+{ return CigarOperation::TypeToChar(type_); }
+
+inline CigarOperation &CigarOperation::Char(const char opChar)
+{ type_ = CigarOperation::CharToType(opChar); return *this; }
+
+inline bool CigarOperation::operator==(const CigarOperation& other) const
+{ return type_ == other.type_ && length_ == other.length_; }
+
+inline bool CigarOperation::operator!=(const CigarOperation& other) const
+{ return !(*this == other); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/Compare.inl b/include/pbbam/internal/Compare.inl

new file mode 100644 (file)

index 0000000..663556f
--- /dev/null
+++ b/include/pbbam/internal/Compare.inl
@@ -0,0 +1,43 @@
+// File Description
+/// \file Compare.inl
+/// \brief Inline implementations for the Compare class & inner classes.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Compare.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+template <typename T, T> struct MemberFnProxy;  // primary template: declared only, never defined
+
+template<typename T, typename R, typename... Args, R (T::*fn)(Args...)const>
+struct MemberFnProxy<R (T::*)(Args...)const, fn>  // specialization for pointers to const member functions
+{
+    static R call(const T& obj, Args&&... args)
+    {
+        return (obj.*fn)(std::forward<Args>(args)...);  // invoke the compile-time-bound member function on obj
+    }
+};
+
+} // namespace internal
+
+template<typename ValueType,
+         typename Compare::MemberFunctionBaseHelper<ValueType>::MemberFnType fn,
+         typename CompareType>
+inline bool Compare::MemberFunctionBase<ValueType, fn, CompareType>::operator()(const BamRecord& lhs,
+                                                                                const BamRecord& rhs) const
+{
+    using MemberFnTypeImpl = typename Compare::MemberFunctionBaseHelper<ValueType>::MemberFnType;
+    using Proxy = internal::MemberFnProxy<MemberFnTypeImpl, fn>;  // wraps the call to member function `fn`
+
+    CompareType cmp;  // default-constructed comparator, created on every call
+    return cmp(Proxy::call(lhs), Proxy::call(rhs));  // compare fn's result on lhs against fn's result on rhs
+}
+
+inline bool Compare::None::operator()(const BamRecord&, const BamRecord&) const
+{ return false; }  // both records are ignored; the result is always false
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/CompositeBamReader.inl b/include/pbbam/internal/CompositeBamReader.inl

new file mode 100644 (file)

index 0000000..ebca5e9
--- /dev/null
+++ b/include/pbbam/internal/CompositeBamReader.inl
@@ -0,0 +1,341 @@
+// File Description\r
+/// \file CompositeBamReader.inl\r
+/// \brief Inline implementations for the composite BAM readers, for\r
+///        working with multiple input files.\r
+//\r
+// Author: Derek Barnett\r
+\r
+#include "pbbam/CompositeBamReader.h"\r
+#include <algorithm>\r
+#include <set>\r
+#include <sstream>\r
+#include <stdexcept>\r
+\r
+#include "pbbam/MakeUnique.h"\r
+\r
+namespace PacBio {\r
+namespace BAM {\r
+namespace internal {\r
+\r
+// -----------------------------------\r
+// Merging helpers\r
+// -----------------------------------\r
+\r
+inline CompositeMergeItem::CompositeMergeItem(std::unique_ptr<BamReader> rdr)\r
+    : reader{std::move(rdr)}\r
+{ }\r
+\r
+inline CompositeMergeItem::CompositeMergeItem(std::unique_ptr<BamReader> rdr,\r
+                                              BamRecord rec)\r
+    : reader{std::move(rdr)}\r
+    , record{std::move(rec)}\r
+{ }\r
+\r
+template<typename CompareType>\r
+inline bool CompositeMergeItemSorter<CompareType>::operator()(const CompositeMergeItem& lhs,\r
+                                                              const CompositeMergeItem& rhs)\r
+{\r
+    const auto& l = lhs.record;  // only the items' records take part in the comparison\r
+    const auto& r = rhs.record;\r
+    return CompareType()(l, r);  // a fresh CompareType is default-constructed for each call\r
+}\r
+\r
+} // namespace internal\r
+\r
+// -----------------------------------\r
+// GenomicIntervalCompositeBamReader\r
+// -----------------------------------\r
+\r
+inline GenomicIntervalCompositeBamReader::GenomicIntervalCompositeBamReader(const GenomicInterval& interval,\r
+                                                                            const std::vector<BamFile>& bamFiles)\r
+{\r
+    filenames_.reserve(bamFiles.size());\r
+    for(const auto& bamFile : bamFiles)\r
+        filenames_.push_back(bamFile.Filename());\r
+    Interval(interval);\r
+}\r
+\r
+inline GenomicIntervalCompositeBamReader::GenomicIntervalCompositeBamReader(const GenomicInterval& interval,\r
+                                                                            const DataSet& dataset)\r
+    : GenomicIntervalCompositeBamReader{interval, dataset.BamFiles()}\r
+{ }\r
+\r
+inline bool GenomicIntervalCompositeBamReader::GetNext(BamRecord& record)\r
+{\r
+    // nothing left to read\r
+    if (mergeItems_.empty())\r
+        return false;\r
+\r
+    // non-destructive 'pop' of first item from queue\r
+    auto firstIter = mergeItems_.begin();\r
+    auto firstItem = internal::CompositeMergeItem{ std::move(firstIter->reader), std::move(firstIter->record) };\r
+    mergeItems_.pop_front();  // removes the (now moved-from) front entry\r
+\r
+    // store its record in our output record\r
+    std::swap(record, firstItem.record);  // caller's previous contents land in firstItem.record, passed to GetNext below\r
+\r
+    // try fetch 'next' from first item's reader\r
+    // if successful, re-insert it into container & re-sort on our new values\r
+    // otherwise, this item will go out of scope & reader destroyed\r
+    if (firstItem.reader->GetNext(firstItem.record)) {\r
+        mergeItems_.push_front(std::move(firstItem));\r
+        UpdateSort();\r
+    }\r
+\r
+    // return success\r
+    return true;\r
+}\r
+\r
+inline const GenomicInterval& GenomicIntervalCompositeBamReader::Interval() const\r
+{ return interval_; }\r
+\r
+inline GenomicIntervalCompositeBamReader& GenomicIntervalCompositeBamReader::Interval(const GenomicInterval& interval)\r
+{\r
+    std::deque<internal::CompositeMergeItem> updatedMergeItems;\r
+    std::set<std::string> filesToCreate{filenames_.cbegin(), filenames_.cend()};\r
+\r
+    // update existing readers\r
+    while (!mergeItems_.empty()) {\r
+\r
+        // non-destructive 'pop' of first item from queue\r
+        auto firstIter = mergeItems_.begin();\r
+        internal::CompositeMergeItem firstItem{ std::move(firstIter->reader), std::move(firstIter->record) };\r
+        mergeItems_.pop_front();\r
+\r
+        // reset interval\r
+        auto* baiReader = dynamic_cast<BaiIndexedBamReader*>(firstItem.reader.get());\r
+        assert(baiReader);\r
+        baiReader->Interval(interval);\r
+\r
+        // try fetch 'next' from first item's reader\r
+        // if successful, re-insert it into container & re-sort on our new values\r
+        // otherwise, this item will go out of scope & reader destroyed\r
+        if (firstItem.reader->GetNext(firstItem.record)) {\r
+            updatedMergeItems.push_front(std::move(firstItem));\r
+            filesToCreate.erase(firstItem.reader->Filename());\r
+        }\r
+    }\r
+\r
+    // create readers for files that were not 'active' for the previous\r
+    std::vector<std::string> missingBai;\r
+    for (auto&& fn : filesToCreate) {\r
+        BamFile bamFile{ fn };\r
+        if (bamFile.StandardIndexExists()) {\r
+            internal::CompositeMergeItem item{ std::unique_ptr<BamReader>{ new BaiIndexedBamReader{ interval, std::move(bamFile) } } };\r
+            if (item.reader->GetNext(item.record))\r
+                updatedMergeItems.push_back(std::move(item));\r
+            // else not an error, simply no data matching interval\r
+        }\r
+        else {\r
+            // maybe handle PBI-backed interval searches if BAI missing, but for now treat as error\r
+            missingBai.push_back(bamFile.Filename());\r
+        }\r
+    }\r
+\r
+    // throw if any files missing BAI\r
+    if (!missingBai.empty()) {\r
+        std::ostringstream e;\r
+        e << "failed to open GenomicIntervalCompositeBamReader because the following files are missing a BAI file:\n";\r
+        for (const auto& fn : missingBai)\r
+            e << "  " << fn << '\n';\r
+        throw std::runtime_error{e.str()};\r
+    }\r
+\r
+    // update our actual container and return\r
+    mergeItems_ = std::move(updatedMergeItems);\r
+    UpdateSort();\r
+    return *this;\r
+}\r
+\r
+struct OrderByPosition\r
+{\r
+    static inline bool less_than(const BamRecord& lhs, const BamRecord& rhs)\r
+    {\r
+        const int32_t lhsId = lhs.ReferenceId();\r
+        const int32_t rhsId = rhs.ReferenceId();\r
+        if (lhsId == -1) return false;  // id -1 never sorts before anything (presumably "no reference"/unmapped - confirm)\r
+        if (rhsId == -1) return true;   // any other id sorts before an id of -1\r
+\r
+        if (lhsId == rhsId)\r
+            return lhs.ReferenceStart() < rhs.ReferenceStart();  // same reference: order by start position\r
+        else return lhsId < rhsId;  // different references: order by reference id\r
+    }\r
+\r
+    static inline bool equals(const BamRecord& lhs, const BamRecord& rhs)\r
+    {\r
+        return lhs.ReferenceId() == rhs.ReferenceId() &&  // equal only if both id and start match\r
+               lhs.ReferenceStart() == rhs.ReferenceStart();\r
+    }\r
+};\r
+\r
+struct PositionSorter : std::binary_function<internal::CompositeMergeItem, internal::CompositeMergeItem, bool>\r
+{\r
+    bool operator()(const internal::CompositeMergeItem& lhs,\r
+                    const internal::CompositeMergeItem& rhs)\r
+    {\r
+        const BamRecord& l = lhs.record;\r
+        const BamRecord& r = rhs.record;\r
+        return OrderByPosition::less_than(l, r);\r
+    }\r
+};\r
+\r
+inline void GenomicIntervalCompositeBamReader::UpdateSort()\r
+{ std::sort(mergeItems_.begin(), mergeItems_.end(), PositionSorter{ }); }\r
+\r
+// ------------------------------\r
+// PbiFilterCompositeBamReader\r
+// ------------------------------\r
+\r
+template<typename OrderByType>\r
+inline PbiFilterCompositeBamReader<OrderByType>::PbiFilterCompositeBamReader(const PbiFilter& filter,\r
+                                                                             const std::vector<BamFile>& bamFiles)\r
+    : numReads_{0}\r
+{\r
+    filenames_.reserve(bamFiles.size());\r
+    for(const auto& bamFile : bamFiles)\r
+        filenames_.push_back(bamFile.Filename());\r
+    Filter(filter);\r
+}\r
+\r
+template<typename OrderByType>\r
+inline PbiFilterCompositeBamReader<OrderByType>::PbiFilterCompositeBamReader(const PbiFilter& filter,\r
+                                                                             const DataSet& dataset)\r
+    : PbiFilterCompositeBamReader{filter, dataset.BamFiles()}\r
+{ }\r
+\r
+template<typename OrderByType>\r
+inline bool PbiFilterCompositeBamReader<OrderByType>::GetNext(BamRecord& record)\r
+{\r
+    // nothing left to read\r
+    if (mergeQueue_.empty())\r
+        return false;\r
+\r
+    // non-destructive 'pop' of first item from queue\r
+    auto firstIter = mergeQueue_.begin();\r
+    value_type firstItem{ std::move(firstIter->reader), std::move(firstIter->record) };\r
+    mergeQueue_.pop_front();  // removes the (now moved-from) front entry\r
+\r
+    // store its record in our output record\r
+    std::swap(record, firstItem.record);  // caller's previous contents land in firstItem.record, passed to GetNext below\r
+\r
+    // try fetch 'next' from first item's reader\r
+    // if successful, re-insert it into container & re-sort on our new values\r
+    // otherwise, this item will go out of scope & reader destroyed\r
+    if (firstItem.reader->GetNext(firstItem.record)) {\r
+        mergeQueue_.push_front(std::move(firstItem));\r
+        UpdateSort();\r
+    }\r
+\r
+    // return success\r
+    return true;\r
+}\r
+\r
+template<typename OrderByType>\r
+inline PbiFilterCompositeBamReader<OrderByType>&\r
+PbiFilterCompositeBamReader<OrderByType>::Filter(const PbiFilter& filter)\r
+{\r
+    container_type updatedMergeItems;\r
+    std::set<std::string> filesToCreate{ filenames_.cbegin(), filenames_.cend() };\r
+\r
+    // update existing readers\r
+    while (!mergeQueue_.empty()) {\r
+\r
+        // non-destructive 'pop' of first item from queue\r
+        auto firstIter = mergeQueue_.begin();\r
+        internal::CompositeMergeItem firstItem{ std::move(firstIter->reader), std::move(firstIter->record) };\r
+        mergeQueue_.pop_front();\r
+\r
+        // reset request\r
+        auto* pbiReader = dynamic_cast<PbiIndexedBamReader*>(firstItem.reader.get());\r
+        assert(pbiReader);\r
+        pbiReader->Filter(filter);\r
+\r
+        // try fetch 'next' from first item's reader\r
+        // if successful, re-insert it into container & re-sort on our new values\r
+        // otherwise, this item will go out of scope & reader destroyed\r
+        if (firstItem.reader->GetNext(firstItem.record)) {\r
+            updatedMergeItems.push_front(std::move(firstItem));\r
+            filesToCreate.erase(firstItem.reader->Filename());\r
+        }\r
+    }\r
+\r
+    // create readers for files that were not 'active' for the previous\r
+    std::vector<std::string> missingPbi;\r
+    for (auto&& fn : filesToCreate) {\r
+        const BamFile bamFile{ fn };\r
+        if (bamFile.PacBioIndexExists()) {\r
+            auto item = internal::CompositeMergeItem{ std::unique_ptr<BamReader>{ new PbiIndexedBamReader{ filter, std::move(bamFile) } } };\r
+            if (item.reader->GetNext(item.record))\r
+                updatedMergeItems.push_back(std::move(item));\r
+            // else not an error, simply no data matching filter\r
+        }\r
+        else\r
+            missingPbi.push_back(fn);\r
+    }\r
+\r
+    // throw if any files missing PBI\r
+    if (!missingPbi.empty()) {\r
+        std::ostringstream e;\r
+        e << "failed to open PbiFilterCompositeBamReader because the following files are missing a PBI file:\n";\r
+        for (const auto& fn : missingPbi)\r
+            e << "  " << fn << '\n';\r
+        throw std::runtime_error{e.str()};\r
+    }\r
+\r
+\r
+    // update our actual container, store num matching reads, sort & and return\r
+    mergeQueue_ = std::move(updatedMergeItems);\r
+\r
+    numReads_ = 0;\r
+    for (const auto& item : mergeQueue_)\r
+    {\r
+        auto* pbiReader = dynamic_cast<PbiIndexedBamReader*>(item.reader.get());\r
+        numReads_ += pbiReader->NumReads();\r
+    }\r
+\r
+    UpdateSort();\r
+    return *this;\r
+}\r
+\r
+template<typename OrderByType>\r
+inline uint32_t PbiFilterCompositeBamReader<OrderByType>::NumReads() const\r
+{\r
+    return numReads_;\r
+}\r
+\r
+template<typename OrderByType>\r
+inline void PbiFilterCompositeBamReader<OrderByType>::UpdateSort()\r
+{ std::stable_sort(mergeQueue_.begin(), mergeQueue_.end(), merge_sorter_type{}); }\r
+\r
+// ------------------------------\r
+// SequentialCompositeBamReader\r
+// ------------------------------\r
+\r
+inline SequentialCompositeBamReader::SequentialCompositeBamReader(std::vector<BamFile> bamFiles)\r
+{\r
+    for (auto&& bamFile : bamFiles)\r
+        readers_.emplace_back(std::make_unique<BamReader>(std::move(bamFile)));\r
+}\r
+\r
+inline SequentialCompositeBamReader::SequentialCompositeBamReader(const DataSet& dataset)\r
+    : SequentialCompositeBamReader{dataset.BamFiles()}\r
+{ }\r
+\r
+inline bool SequentialCompositeBamReader::GetNext(BamRecord& record)\r
+{\r
+    // try first reader, if successful return true\r
+    // else pop reader and try next, until all readers exhausted\r
+    while (!readers_.empty()) {\r
+        auto& reader = readers_.front();\r
+        if (reader->GetNext(record))\r
+            return true;\r
+        else\r
+            readers_.pop_front();\r
+    }\r
+\r
+    // no readers available\r
+    return false;\r
+}\r
+\r
+} // namespace BAM\r
+} // namespace PacBio\r
diff --git a/include/pbbam/internal/CompositeFastaReader.inl b/include/pbbam/internal/CompositeFastaReader.inl

new file mode 100644 (file)

index 0000000..96c1f5e
--- /dev/null
+++ b/include/pbbam/internal/CompositeFastaReader.inl
@@ -0,0 +1,42 @@
+// File Description
+/// \file CompositeFastaReader.inl
+/// \brief Inline implementation for the composite FASTA reader, for
+///        working with multiple input files.
+//
+// Author: Derek Barnett
+
+#include "pbbam/CompositeFastaReader.h"
+
+#include "pbbam/MakeUnique.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline CompositeFastaReader::CompositeFastaReader(const std::vector<std::string>& fastaFiles)
+{
+    for (const auto& fn : fastaFiles)
+        readers_.emplace_back(std::make_unique<FastaReader>(fn));
+}
+
+inline CompositeFastaReader::CompositeFastaReader(const DataSet& dataset)
+    : CompositeFastaReader{dataset.FastaFiles()}
+{ }
+
+inline bool CompositeFastaReader::GetNext(FastaSequence& seq)
+{
+    // Readers are consumed strictly in order. The front reader supplies
+    // sequences until it reports no more data; it is then discarded and the
+    // following reader takes over.
+    for (; !readers_.empty(); readers_.pop_front()) {
+        const bool gotSequence = readers_.front()->GetNext(seq);
+        if (gotSequence)
+            return true;
+        // front reader is exhausted: the loop step drops it
+    }
+
+    // every reader has been drained
+    return false;
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/DataSet.inl b/include/pbbam/internal/DataSet.inl

new file mode 100644 (file)

index 0000000..24e15cc
--- /dev/null
+++ b/include/pbbam/internal/DataSet.inl
@@ -0,0 +1,166 @@
+// File Description
+/// \file DataSet.inl
+/// \brief Inline implementations for the DataSet class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/DataSet.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Attribute(name): read-only lookup, mutable lookup, and chainable setter; each forwards to d_.
+inline const std::string& DataSet::Attribute(const std::string& name) const
+{
+    return d_->Attribute(name);
+}
+
+inline std::string& DataSet::Attribute(const std::string& name)
+{
+    return d_->Attribute(name);
+}
+
+inline DataSet& DataSet::Attribute(const std::string& name, const std::string& value)
+{
+    d_->Attribute(name, value);
+    return *this;
+}
+
+// CreatedAt: read-only getter, mutable getter, and chainable setter; each forwards to d_.
+inline const std::string& DataSet::CreatedAt() const
+{
+    return d_->CreatedAt();
+}
+
+inline std::string& DataSet::CreatedAt()
+{
+    return d_->CreatedAt();
+}
+
+inline DataSet& DataSet::CreatedAt(const std::string& createdAt)
+{
+    d_->CreatedAt(createdAt);
+    return *this;
+}
+
+// Extensions: read-only getter, mutable getter, and chainable setter; each forwards to d_.
+inline const PacBio::BAM::Extensions& DataSet::Extensions() const
+{
+    return d_->Extensions();
+}
+
+inline PacBio::BAM::Extensions& DataSet::Extensions()
+{
+    return d_->Extensions();
+}
+
+inline DataSet& DataSet::Extensions(const PacBio::BAM::Extensions& extensions)
+{
+    d_->Extensions(extensions);
+    return *this;
+}
+
+// ExternalResources: read-only getter, mutable getter, and chainable setter; each forwards to d_.
+inline const PacBio::BAM::ExternalResources& DataSet::ExternalResources() const
+{
+    return d_->ExternalResources();
+}
+
+inline PacBio::BAM::ExternalResources& DataSet::ExternalResources()
+{
+    return d_->ExternalResources();
+}
+
+inline DataSet& DataSet::ExternalResources(const PacBio::BAM::ExternalResources& resources)
+{
+    d_->ExternalResources(resources);
+    return *this;
+}
+
+// Filters: read-only getter, mutable getter, and chainable setter; each forwards to d_.
+inline const PacBio::BAM::Filters& DataSet::Filters() const
+{
+    return d_->Filters();
+}
+
+inline PacBio::BAM::Filters& DataSet::Filters()
+{
+    return d_->Filters();
+}
+
+inline DataSet& DataSet::Filters(const PacBio::BAM::Filters& filters)
+{
+    d_->Filters(filters);
+    return *this;
+}
+
+// Format: read-only getter, mutable getter, and chainable setter; each forwards to d_.
+inline const std::string& DataSet::Format() const
+{
+    return d_->Format();
+}
+
+inline std::string& DataSet::Format()
+{
+    return d_->Format();
+}
+
+inline DataSet& DataSet::Format(const std::string& format)
+{
+    d_->Format(format);
+    return *this;
+}
+
+// Metadata: read-only getter, mutable getter, and chainable setter; each forwards to d_.
+inline const PacBio::BAM::DataSetMetadata& DataSet::Metadata() const
+{
+    return d_->Metadata();
+}
+
+inline PacBio::BAM::DataSetMetadata& DataSet::Metadata()
+{
+    return d_->Metadata();
+}
+
+inline DataSet& DataSet::Metadata(const PacBio::BAM::DataSetMetadata& metadata)
+{
+    d_->Metadata(metadata);
+    return *this;
+}
+
+// MetaType: read-only getter, mutable getter, and chainable setter; each forwards to d_.
+inline const std::string& DataSet::MetaType() const
+{
+    return d_->MetaType();
+}
+
+inline std::string& DataSet::MetaType()
+{
+    return d_->MetaType();
+}
+
+inline DataSet& DataSet::MetaType(const std::string& metatype)
+{
+    d_->MetaType(metatype);
+    return *this;
+}
+
+// ModifiedAt: read-only getter, mutable getter, and chainable setter; each forwards to d_.
+inline const std::string& DataSet::ModifiedAt() const
+{
+    return d_->ModifiedAt();
+}
+
+inline std::string& DataSet::ModifiedAt()
+{
+    return d_->ModifiedAt();
+}
+
+inline DataSet& DataSet::ModifiedAt(const std::string& modifiedAt)
+{
+    d_->ModifiedAt(modifiedAt);
+    return *this;
+}
+
+// Name: read-only getter, mutable getter, and chainable setter; each forwards to d_.
+inline const std::string& DataSet::Name() const
+{
+    return d_->Name();
+}
+
+inline std::string& DataSet::Name()
+{
+    return d_->Name();
+}
+
+inline DataSet& DataSet::Name(const std::string& name)
+{
+    d_->Name(name);
+    return *this;
+}
+
+// ResourceId: read-only getter, mutable getter, and chainable setter; each forwards to d_.
+inline const std::string& DataSet::ResourceId() const
+{
+    return d_->ResourceId();
+}
+
+inline std::string& DataSet::ResourceId()
+{
+    return d_->ResourceId();
+}
+
+inline DataSet& DataSet::ResourceId(const std::string& resourceId)
+{
+    d_->ResourceId(resourceId);
+    return *this;
+}
+
+// SubDataSets: read-only getter, mutable getter, and chainable setter; each forwards to d_.
+inline const PacBio::BAM::SubDataSets& DataSet::SubDataSets() const
+{
+    return d_->SubDataSets();
+}
+
+inline PacBio::BAM::SubDataSets& DataSet::SubDataSets()
+{
+    return d_->SubDataSets();
+}
+
+inline DataSet& DataSet::SubDataSets(const PacBio::BAM::SubDataSets& subdatasets)
+{
+    d_->SubDataSets(subdatasets);
+    return *this;
+}
+
+// Tags: read-only getter, mutable getter, and chainable setter; each forwards to d_.
+inline const std::string& DataSet::Tags() const
+{
+    return d_->Tags();
+}
+
+inline std::string& DataSet::Tags()
+{
+    return d_->Tags();
+}
+
+inline DataSet& DataSet::Tags(const std::string& tags)
+{
+    d_->Tags(tags);
+    return *this;
+}
+
+// TimeStampedName: read-only getter, mutable getter, and chainable setter; each forwards to d_.
+inline const std::string& DataSet::TimeStampedName() const
+{
+    return d_->TimeStampedName();
+}
+
+inline std::string& DataSet::TimeStampedName()
+{
+    return d_->TimeStampedName();
+}
+
+inline DataSet& DataSet::TimeStampedName(const std::string& timeStampedName)
+{
+    d_->TimeStampedName(timeStampedName);
+    return *this;
+}
+
+// Type(): maps the current TypeName() to its enum value via NameToType().
+inline PacBio::BAM::DataSet::TypeEnum DataSet::Type() const
+{
+    return DataSet::NameToType(TypeName());
+}
+
+// Type(type): relabels d_ with TypeToName(type); returns *this for chaining.
+inline DataSet& DataSet::Type(const DataSet::TypeEnum type)
+{
+    d_->Label(DataSet::TypeToName(type));
+    return *this;
+}
+
+// Returns a std::string copy of d_'s local-name label.
+inline std::string DataSet::TypeName() const
+{
+    return d_->LocalNameLabel().to_string();
+}
+
+// UniqueId: read-only getter, mutable getter, and chainable setter; each forwards to d_.
+inline const std::string& DataSet::UniqueId() const
+{
+    return d_->UniqueId();
+}
+
+inline std::string& DataSet::UniqueId()
+{
+    return d_->UniqueId();
+}
+
+inline DataSet& DataSet::UniqueId(const std::string& uuid)
+{
+    d_->UniqueId(uuid);
+    return *this;
+}
+
+// Version: read-only getter, mutable getter, and chainable setter; each forwards to d_.
+inline const std::string& DataSet::Version() const
+{
+    return d_->Version();
+}
+
+inline std::string& DataSet::Version()
+{
+    return d_->Version();
+}
+
+inline DataSet& DataSet::Version(const std::string& version)
+{
+    d_->Version(version);
+    return *this;
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/DataSetBaseTypes.h b/include/pbbam/internal/DataSetBaseTypes.h

new file mode 100644 (file)

index 0000000..2812e67
--- /dev/null
+++ b/include/pbbam/internal/DataSetBaseTypes.h
@@ -0,0 +1,137 @@
+// Author: Derek Barnett
+
+#ifndef DATASETBASETYPES_H
+#define DATASETBASETYPES_H
+
+#include <string>
+#include "pbbam/Config.h"
+#include "pbbam/internal/DataSetElement.h"
+#include "pbbam/internal/DataSetListElement.h"
+
+namespace PacBio {
+namespace BAM {
+
+class DataSetMetadata;
+class Extensions;
+class ExternalResources;
+class FileIndices;
+class Filters;
+class Properties;
+class Provenance;
+
+namespace internal {
+
+class BaseEntityType : public DataSetElement  // DataSetElement plus named accessors for common entity fields
+{
+protected:  // constructor is protected, so only derived types (or friends) can create one
+    BaseEntityType(const std::string& label, const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+
+public:  // read-only accessors; the std::string ones are attribute lookups (see DataSetBaseTypes.inl)
+    const std::string& CreatedAt() const;
+    const std::string& Description() const;
+    const PacBio::BAM::Extensions& Extensions() const;  // not defined in DataSetBaseTypes.inl
+    const std::string& Format() const;
+    const std::string& ModifiedAt() const;
+    const std::string& Name() const;
+    const std::string& ResourceId() const;
+    const std::string& Tags() const;
+    const std::string& Version() const;
+    // mutable accessors: return a modifiable reference to the stored value
+    std::string& CreatedAt();
+    std::string& Description();
+    PacBio::BAM::Extensions& Extensions();
+    std::string& Format();
+    std::string& ModifiedAt();
+    std::string& Name();
+    std::string& ResourceId();
+    std::string& Tags();
+    std::string& Version();
+    // setters: store the value and return *this so calls can be chained
+    BaseEntityType& CreatedAt(const std::string& createdAt);
+    BaseEntityType& Description(const std::string& description);
+    BaseEntityType& Extensions(const PacBio::BAM::Extensions& extensions);
+    BaseEntityType& Format(const std::string& format);
+    BaseEntityType& ModifiedAt(const std::string& modifiedAt);
+    BaseEntityType& Name(const std::string& name);
+    BaseEntityType& ResourceId(const std::string& resourceId);
+    BaseEntityType& Tags(const std::string& tags);
+    BaseEntityType& Version(const std::string& version);
+};
+
+class DataEntityType : public BaseEntityType  // BaseEntityType plus value/identity accessors
+{
+protected:  // constructor is protected, so only derived types (or friends) can create one
+    DataEntityType(const std::string& label, const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+
+public:  // read-only accessors
+    const std::string& Checksum() const;      // text of the "Checksum" child element
+    const std::string& EncodedValue() const;  // text of the "EncodedValue" child element
+    const std::string& MetaType() const;      // this and the remaining fields are attributes
+    const std::string& SimpleValue() const;
+    const std::string& TimeStampedName() const;
+    const std::string& UniqueId() const;
+    const std::string& ValueDataType() const;
+    // mutable accessors: return a modifiable reference to the stored value
+    std::string& Checksum();
+    std::string& EncodedValue();
+    std::string& MetaType();
+    std::string& SimpleValue();
+    std::string& TimeStampedName();
+    std::string& UniqueId();
+    std::string& ValueDataType();
+    // setters: store the value and return *this so calls can be chained
+    DataEntityType& Checksum(const std::string& checksum);
+    DataEntityType& EncodedValue(const std::string& encodedValue);
+    DataEntityType& MetaType(const std::string& metatype);
+    DataEntityType& SimpleValue(const std::string& simpleValue);
+    DataEntityType& TimeStampedName(const std::string& timeStampedName);
+    DataEntityType& UniqueId(const std::string& uuid);
+    DataEntityType& ValueDataType(const std::string& valueDataType);
+};
+
+class StrictEntityType : public BaseEntityType  // BaseEntityType plus MetaType/TimeStampedName/UniqueId
+{
+protected:  // constructor is protected; unlike BaseEntityType it also takes a metatype
+    StrictEntityType(const std::string& metatype, const std::string& label,
+                     const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+
+public:  // read-only accessors; all three are attribute lookups (see DataSetBaseTypes.inl)
+    const std::string& MetaType() const;
+    const std::string& TimeStampedName() const;
+    const std::string& UniqueId() const;
+    // mutable accessors: return a modifiable reference to the stored attribute
+    std::string& MetaType();
+    std::string& TimeStampedName();
+    std::string& UniqueId();
+    // setters: store the attribute and return *this so calls can be chained
+    StrictEntityType& MetaType(const std::string& metatype);
+    StrictEntityType& TimeStampedName(const std::string& timeStampedName);
+    StrictEntityType& UniqueId(const std::string& uuid);
+};
+
+class InputOutputDataType : public StrictEntityType  // adds no members of its own, only a constructor
+{
+protected:  // constructor additionally takes a filename; how it is stored is defined outside this header
+    InputOutputDataType(const std::string& metatype, const std::string& filename,
+                        const std::string& label, const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+};
+
+class IndexedDataType : public InputOutputDataType  // InputOutputDataType plus FileIndices accessors
+{
+protected:  // same parameter list as the InputOutputDataType constructor
+    IndexedDataType(const std::string& metatype, const std::string& filename,
+                    const std::string& label, const XsdType& xsd = XsdType::BASE_DATA_MODEL);
+
+public:  // FileIndices: read-only getter, mutable getter, chainable setter (defined outside the .inl)
+    const PacBio::BAM::FileIndices& FileIndices() const;
+    PacBio::BAM::FileIndices& FileIndices();
+    IndexedDataType& FileIndices(const PacBio::BAM::FileIndices& indices);
+};
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/DataSetBaseTypes.inl"
+
+#endif  // DATASETBASETYPES_H
diff --git a/include/pbbam/internal/DataSetBaseTypes.inl b/include/pbbam/internal/DataSetBaseTypes.inl

new file mode 100644 (file)

index 0000000..722a629
--- /dev/null
+++ b/include/pbbam/internal/DataSetBaseTypes.inl
@@ -0,0 +1,185 @@
+// Author: Derek Barnett
+
+#include "pbbam/internal/DataSetBaseTypes.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// ----------------
+// BaseEntityType
+// ----------------
+
+// "CreatedAt" attribute: read-only view, mutable reference, and chainable setter.
+inline const std::string& BaseEntityType::CreatedAt() const
+{
+    return Attribute("CreatedAt");
+}
+
+inline std::string& BaseEntityType::CreatedAt()
+{
+    return Attribute("CreatedAt");
+}
+
+inline BaseEntityType& BaseEntityType::CreatedAt(const std::string& createdAt)
+{
+    Attribute("CreatedAt", createdAt);
+    return *this;
+}
+
+// "Description" attribute: read-only view, mutable reference, and chainable setter.
+inline const std::string& BaseEntityType::Description() const
+{
+    return Attribute("Description");
+}
+
+inline std::string& BaseEntityType::Description()
+{
+    return Attribute("Description");
+}
+
+inline BaseEntityType& BaseEntityType::Description(const std::string& description)
+{
+    Attribute("Description", description);
+    return *this;
+}
+
+// "Format" attribute: read-only view, mutable reference, and chainable setter.
+inline const std::string& BaseEntityType::Format() const
+{
+    return Attribute("Format");
+}
+
+inline std::string& BaseEntityType::Format()
+{
+    return Attribute("Format");
+}
+
+inline BaseEntityType& BaseEntityType::Format(const std::string& format)
+{
+    Attribute("Format", format);
+    return *this;
+}
+
+// "ModifiedAt" attribute: read-only view, mutable reference, and chainable setter.
+inline const std::string& BaseEntityType::ModifiedAt() const
+{
+    return Attribute("ModifiedAt");
+}
+
+inline std::string& BaseEntityType::ModifiedAt()
+{
+    return Attribute("ModifiedAt");
+}
+
+inline BaseEntityType& BaseEntityType::ModifiedAt(const std::string& modifiedAt)
+{
+    Attribute("ModifiedAt", modifiedAt);
+    return *this;
+}
+
+// "Name" attribute: read-only view, mutable reference, and chainable setter.
+inline const std::string& BaseEntityType::Name() const
+{
+    return Attribute("Name");
+}
+
+inline std::string& BaseEntityType::Name()
+{
+    return Attribute("Name");
+}
+
+inline BaseEntityType& BaseEntityType::Name(const std::string& name)
+{
+    Attribute("Name", name);
+    return *this;
+}
+
+// "ResourceId" attribute: read-only view, mutable reference, and chainable setter.
+inline const std::string& BaseEntityType::ResourceId() const
+{
+    return Attribute("ResourceId");
+}
+
+inline std::string& BaseEntityType::ResourceId()
+{
+    return Attribute("ResourceId");
+}
+
+inline BaseEntityType& BaseEntityType::ResourceId(const std::string& resourceId)
+{
+    Attribute("ResourceId", resourceId);
+    return *this;
+}
+
+// "Tags" attribute: read-only view, mutable reference, and chainable setter.
+inline const std::string& BaseEntityType::Tags() const
+{
+    return Attribute("Tags");
+}
+
+inline std::string& BaseEntityType::Tags()
+{
+    return Attribute("Tags");
+}
+
+inline BaseEntityType& BaseEntityType::Tags(const std::string& tags)
+{
+    Attribute("Tags", tags);
+    return *this;
+}
+
+// "Version" attribute: read-only view, mutable reference, and chainable setter.
+inline const std::string& BaseEntityType::Version() const
+{
+    return Attribute("Version");
+}
+
+inline std::string& BaseEntityType::Version()
+{
+    return Attribute("Version");
+}
+
+inline BaseEntityType& BaseEntityType::Version(const std::string& version)
+{
+    Attribute("Version", version);
+    return *this;
+}
+
+// ----------------
+// DataEntityType
+// ----------------
+
+// Text of the "Checksum" child element: read-only view, mutable reference, and chainable setter.
+inline const std::string& DataEntityType::Checksum() const
+{
+    return ChildText("Checksum");
+}
+
+inline std::string& DataEntityType::Checksum()
+{
+    return ChildText("Checksum");
+}
+
+inline DataEntityType& DataEntityType::Checksum(const std::string& checksum)
+{
+    ChildText("Checksum", checksum);
+    return *this;
+}
+
+// Text of the "EncodedValue" child element: read-only view, mutable reference, and chainable setter.
+inline const std::string& DataEntityType::EncodedValue() const
+{
+    return ChildText("EncodedValue");
+}
+
+inline std::string& DataEntityType::EncodedValue()
+{
+    return ChildText("EncodedValue");
+}
+
+inline DataEntityType& DataEntityType::EncodedValue(const std::string& encodedValue)
+{
+    ChildText("EncodedValue", encodedValue);
+    return *this;
+}
+
+// "MetaType" attribute: read-only view, mutable reference, and chainable setter.
+inline const std::string& DataEntityType::MetaType() const
+{
+    return Attribute("MetaType");
+}
+
+inline std::string& DataEntityType::MetaType()
+{
+    return Attribute("MetaType");
+}
+
+inline DataEntityType& DataEntityType::MetaType(const std::string& metatype)
+{
+    Attribute("MetaType", metatype);
+    return *this;
+}
+
+// "SimpleValue" attribute: read-only view, mutable reference, and chainable setter.
+inline const std::string& DataEntityType::SimpleValue() const
+{
+    return Attribute("SimpleValue");
+}
+
+inline std::string& DataEntityType::SimpleValue()
+{
+    return Attribute("SimpleValue");
+}
+
+inline DataEntityType& DataEntityType::SimpleValue(const std::string& simpleValue)
+{
+    Attribute("SimpleValue", simpleValue);
+    return *this;
+}
+
+// "TimeStampedName" attribute: read-only view, mutable reference, and chainable setter.
+inline const std::string& DataEntityType::TimeStampedName() const
+{
+    return Attribute("TimeStampedName");
+}
+
+inline std::string& DataEntityType::TimeStampedName()
+{
+    return Attribute("TimeStampedName");
+}
+
+inline DataEntityType& DataEntityType::TimeStampedName(const std::string& timeStampedName)
+{
+    Attribute("TimeStampedName", timeStampedName);
+    return *this;
+}
+
+// "UniqueId" attribute: read-only view, mutable reference, and chainable setter.
+inline const std::string& DataEntityType::UniqueId() const
+{
+    return Attribute("UniqueId");
+}
+
+inline std::string& DataEntityType::UniqueId()
+{
+    return Attribute("UniqueId");
+}
+
+inline DataEntityType& DataEntityType::UniqueId(const std::string& uuid)
+{
+    Attribute("UniqueId", uuid);
+    return *this;
+}
+
+// "ValueDataType" attribute: read-only view, mutable reference, and chainable setter.
+inline const std::string& DataEntityType::ValueDataType() const
+{
+    return Attribute("ValueDataType");
+}
+
+inline std::string& DataEntityType::ValueDataType()
+{
+    return Attribute("ValueDataType");
+}
+
+inline DataEntityType& DataEntityType::ValueDataType(const std::string& valueDataType)
+{
+    Attribute("ValueDataType", valueDataType);
+    return *this;
+}
+
+// ----------------
+// StrictEntityType
+// ----------------
+
+// "MetaType" attribute: read-only view, mutable reference, and chainable setter.
+inline const std::string& StrictEntityType::MetaType() const
+{
+    return Attribute("MetaType");
+}
+
+inline std::string& StrictEntityType::MetaType()
+{
+    return Attribute("MetaType");
+}
+
+inline StrictEntityType& StrictEntityType::MetaType(const std::string& metatype)
+{
+    Attribute("MetaType", metatype);
+    return *this;
+}
+
+// "TimeStampedName" attribute: read-only view, mutable reference, and chainable setter.
+inline const std::string& StrictEntityType::TimeStampedName() const
+{
+    return Attribute("TimeStampedName");
+}
+
+inline std::string& StrictEntityType::TimeStampedName()
+{
+    return Attribute("TimeStampedName");
+}
+
+inline StrictEntityType& StrictEntityType::TimeStampedName(const std::string& timeStampedName)
+{
+    Attribute("TimeStampedName", timeStampedName);
+    return *this;
+}
+
+// "UniqueId" attribute: read-only view, mutable reference, and chainable setter.
+inline const std::string& StrictEntityType::UniqueId() const
+{
+    return Attribute("UniqueId");
+}
+
+inline std::string& StrictEntityType::UniqueId()
+{
+    return Attribute("UniqueId");
+}
+
+inline StrictEntityType& StrictEntityType::UniqueId(const std::string& uuid)
+{
+    Attribute("UniqueId", uuid);
+    return *this;
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/DataSetElement.h b/include/pbbam/internal/DataSetElement.h

new file mode 100644 (file)

index 0000000..45a7c75
--- /dev/null
+++ b/include/pbbam/internal/DataSetElement.h
@@ -0,0 +1,162 @@
+// Author: Derek Barnett
+
+#ifndef DATASETELEMENT_H
+#define DATASETELEMENT_H
+
+#include "pbbam/DataSetXsd.h"
+
+#include <algorithm>
+#include <boost/utility/string_ref.hpp>
+#include <cassert>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+class XmlName  // an element name stored as one string plus the extents of its prefix and local part
+{
+    //    qualified name
+    //       |
+    //  --------------
+    // <pbns:node_name >
+    //  ---- ---------
+    //   |        |
+    //  prefix    local name
+
+public:
+    XmlName(std::string fullName, bool verbatim = false);  // splits at the first ':'; a leading ':' counts as no prefix
+    XmlName(const std::string& localName, const std::string& prefix);  // joins as "prefix:localName"; verbatim is set to true
+
+    XmlName(const XmlName&) = default;
+    XmlName(XmlName&&) = default;
+    XmlName& operator=(const XmlName&) = default;
+    XmlName& operator=(XmlName&&) = default;
+    ~XmlName() = default;
+
+public:  // equality compares the qualified name only; the verbatim flag is not considered
+    bool operator==(const XmlName& other) const;
+    bool operator!=(const XmlName& other) const;
+
+public:  // LocalName() and Prefix() return views into qualifiedName_, not copies
+    const boost::string_ref LocalName() const;
+    const boost::string_ref Prefix() const;
+    const std::string& QualifiedName() const;
+    bool Verbatim() const;
+
+private:
+    std::string qualifiedName_;  // full name text, including the prefix and ':' when present
+    size_t prefixSize_;          // length of the prefix; 0 when there is none
+    size_t localNameOffset_;     // index of the first local-name character (just past the ':' if prefixed)
+    size_t localNameSize_;       // length of the local name
+    bool verbatim_;              // flag recorded at construction and reported by Verbatim()
+};
+
+struct FromInputXml  // empty tag type: selects the DataSetElement constructor that stores its label verbatim
+{
+};
+
+class DataSetElement  // tree node: a label, text, string attributes, and child elements held by value
+{
+public:
+    DataSetElement(const std::string& label, const XsdType& xsd = XsdType::NONE);  // label stored non-verbatim
+    DataSetElement(const std::string& label, const FromInputXml& fromInputXml,
+                   const XsdType& xsd = XsdType::NONE);  // tag overload: label stored verbatim
+
+    DataSetElement(const DataSetElement&) = default;
+    DataSetElement(DataSetElement&&) = default;
+    DataSetElement& operator=(const DataSetElement&) = default;
+    DataSetElement& operator=(DataSetElement&&) = default;
+    virtual ~DataSetElement() = default;
+
+public:  // equality compares xsd, label, text, attributes and children
+    bool operator==(const DataSetElement& other) const;
+    bool operator!=(const DataSetElement& other) const;
+
+public:  // read access (plus mutable-reference overloads)
+    const std::string& Attribute(const std::string& name) const;  // SharedNullString() when the name is absent
+    std::string& Attribute(const std::string& name);              // inserts an empty value when the name is absent
+    const std::map<std::string, std::string>& Attributes() const;
+    std::map<std::string, std::string>& Attributes();
+    bool HasAttribute(const std::string& name) const;
+
+    const std::vector<DataSetElement>& Children() const;
+    std::vector<DataSetElement>& Children();
+    bool HasChild(const std::string& label) const;  // matches a child's local name or its qualified name
+
+    const boost::string_ref LocalNameLabel() const;
+    const boost::string_ref PrefixLabel() const;
+    const std::string& QualifiedNameLabel() const;
+    bool IsVerbatimLabel() const;
+
+    const std::string& Text() const;
+    std::string& Text();
+
+    const XsdType& Xsd() const;
+
+public:  // setters
+    void Attribute(const std::string& name, const std::string& value);
+    void Label(const std::string& label);  // replaces the label; the new name is stored verbatim
+    void Text(const std::string& text);
+
+public:  // counts
+    size_t NumAttributes() const;
+    size_t NumChildren() const;
+
+public:  // child management; Child<T>/operator[] return the stored child cast to T by static_cast
+    void AddChild(const DataSetElement& e);     // appends a copy of e
+    void RemoveChild(const DataSetElement& e);  // erases every child that compares equal to e
+
+    template <typename T>
+    const T& Child(size_t index) const;  // bounds-checked through std::vector::at
+
+    template <typename T>
+    T& Child(size_t index);
+
+    template <typename T>
+    const T& Child(const std::string& label) const;  // no insertion; a missing label reaches at() with an invalid index
+
+    template <typename T>
+    T& Child(const std::string& label);  // appends a new child named label when none matches
+
+    template <typename T>
+    const T& operator[](size_t index) const;
+
+    template <typename T>
+    T& operator[](size_t index);
+
+    template <typename T = DataSetElement>
+    const T& operator[](const std::string& label) const;
+
+    template <typename T = DataSetElement>
+    T& operator[](const std::string& label);
+
+protected:
+    static const std::string& SharedNullString();  // returned by the const lookups when nothing is found
+
+public:  // text of a named child
+    const std::string& ChildText(const std::string& label) const;  // SharedNullString() when the child is absent
+    std::string& ChildText(const std::string& label);              // creates the child when it is absent
+    void ChildText(const std::string& label, const std::string& text);
+
+protected:
+    XsdType xsd_;
+    XmlName label_;
+    std::string text_;
+    std::map<std::string, std::string> attributes_;
+    std::vector<DataSetElement> children_;
+
+private:
+    int IndexOf(const std::string& label) const;  // position of the first matching child, or -1
+};
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/DataSetElement.inl"
+
+#endif  // DATASETELEMENT_H
diff --git a/include/pbbam/internal/DataSetElement.inl b/include/pbbam/internal/DataSetElement.inl

new file mode 100644 (file)

index 0000000..beb18f5
--- /dev/null
+++ b/include/pbbam/internal/DataSetElement.inl
@@ -0,0 +1,262 @@
+// Author: Derek Barnett
+
+#include "pbbam/internal/DataSetElement.h"
+
+#include <iostream>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// ----------------
+// DataSetElement
+// ----------------
+
+/// Creates an element for schema \p xsd whose label is parsed from \p label (non-verbatim).
+inline DataSetElement::DataSetElement(const std::string& label, const XsdType& xsd)
+    : xsd_(xsd), label_(label)
+{
+}
+
+/// Tag-dispatched overload: same as the two-argument constructor, but the label is
+/// stored with its verbatim flag set. The FromInputXml argument itself is unused.
+inline DataSetElement::DataSetElement(const std::string& label,
+                                      const FromInputXml& /*tag*/,
+                                      const XsdType& xsd)
+    : xsd_(xsd), label_(label, true)
+{
+}
+
+inline bool DataSetElement::operator==(const DataSetElement& other) const
+{
+    return xsd_   == other.xsd_   &&
+           label_ == other.label_ &&
+           text_  == other.text_  &&
+           attributes_ == other.attributes_ &&
+           children_   == other.children_;
+}
+
+inline bool DataSetElement::operator!=(const DataSetElement& other) const
+{ return !(*this == other); }
+
+// operator[] overloads are thin aliases for the matching Child<T>() overloads.
+template<typename T>
+const T& DataSetElement::operator[](size_t index) const
+{
+    return Child<T>(index);
+}
+
+template<typename T>
+T& DataSetElement::operator[](size_t index)
+{
+    return Child<T>(index);
+}
+
+template<typename T>
+const T& DataSetElement::operator[](const std::string& label) const
+{
+    return Child<T>(label);
+}
+
+template<typename T>
+T& DataSetElement::operator[](const std::string& label)
+{
+    return Child<T>(label);
+}
+
+/// Appends a copy of \p e to the end of the child list.
+inline void DataSetElement::AddChild(const DataSetElement& e)
+{
+    children_.push_back(e);
+}
+
+/// Mutable lookup: returns the stored value, inserting an empty one if \p name is absent.
+inline std::string& DataSetElement::Attribute(const std::string& name)
+{
+    return attributes_[name];
+}
+
+/// Read-only lookup: never inserts; yields SharedNullString() if \p name is absent.
+inline const std::string& DataSetElement::Attribute(const std::string& name) const
+{
+    const auto found = attributes_.find(name);
+    return (found != attributes_.cend()) ? found->second : SharedNullString();
+}
+
+/// Sets (or overwrites) the attribute \p name with \p value.
+inline void DataSetElement::Attribute(const std::string& name, const std::string& value)
+{
+    attributes_[name] = value;
+}
+
+// Attributes(): direct access to the whole attribute map (read-only and mutable overloads).
+inline const std::map<std::string, std::string>& DataSetElement::Attributes() const
+{
+    return attributes_;
+}
+
+inline std::map<std::string, std::string>& DataSetElement::Attributes()
+{
+    return attributes_;
+}
+
+// Child<T>(index): the child at `index`, viewed as T through static_cast.
+// The lookup goes through std::vector::at, so the index is bounds-checked.
+template<typename T>
+inline const T& DataSetElement::Child(size_t index) const
+{
+    const DataSetElement& element = children_.at(index);
+    return static_cast<const T&>(element);
+}
+
+template<typename T>
+inline T& DataSetElement::Child(size_t index)
+{
+    DataSetElement& element = children_.at(index);
+    return static_cast<T&>(element);
+}
+
+// Child<T>(label) const: the first child matching `label`; never inserts.
+// IndexOf() reports "not found" as -1; converted to size_t that becomes the largest
+// index value, so the bounds-checked index overload rejects it instead of returning a child.
+template<typename T>
+inline const T& DataSetElement::Child(const std::string& label) const
+{
+    const size_t position = static_cast<size_t>(IndexOf(label));
+    return Child<T>(position);
+}
+
+// Child<T>(label): the first child matching `label`; when none exists a new,
+// empty child named `label` is appended and returned instead.
+template<typename T>
+inline T& DataSetElement::Child(const std::string& label)
+{
+    const int found = IndexOf(label);
+    if (found < 0) {
+        // not present yet: create it at the back and hand that one out
+        AddChild(DataSetElement(label));
+        return Child<T>(NumChildren()-1);
+    }
+
+    assert(static_cast<size_t>(found) < NumChildren());
+    return Child<T>(found);
+}
+
+// Children(): direct access to the child list (read-only and mutable overloads).
+inline const std::vector<DataSetElement>& DataSetElement::Children() const
+{
+    return children_;
+}
+
+inline std::vector<DataSetElement>& DataSetElement::Children()
+{
+    return children_;
+}
+
+/// Read-only text of the first child matching \p label.
+///
+/// \returns the child's text, or SharedNullString() when no child matches.
+///          Nothing is inserted.
+inline const std::string& DataSetElement::ChildText(const std::string& label) const
+{
+    // Single scan: the index found here is reused instead of searching again.
+    const int i = IndexOf(label);
+    if (i < 0)
+        return SharedNullString();
+    return children_[static_cast<size_t>(i)].Text();
+}
+
+/// Mutable text of the first child matching \p label.
+///
+/// If no child matches, an empty child named \p label is appended first, so the
+/// returned reference is always valid for in-place editing.
+inline std::string& DataSetElement::ChildText(const std::string& label)
+{
+    // The non-const Child(label) already creates a missing child, so one lookup suffices.
+    return Child<DataSetElement>(label).Text();
+}
+
+/// True when an attribute called \p name is stored on this element.
+inline bool DataSetElement::HasAttribute(const std::string& name) const
+{
+    return attributes_.count(name) != 0;
+}
+
+/// True when IndexOf() finds a child matching \p label.
+inline bool DataSetElement::HasChild(const std::string& label) const
+{
+    const int position = IndexOf(label);
+    return position != -1;
+}
+
+inline int DataSetElement::IndexOf(const std::string& label) const
+{
+    const size_t count = NumChildren();
+    for (size_t i = 0; i < count; ++i) {
+        const DataSetElement& child = children_.at(i);
+        if (child.LocalNameLabel() == label || child.label_ == label)
+            return i;
+    }
+    return -1;
+}
+
+// Label accessors: each forwards to the corresponding XmlName accessor on label_.
+inline const boost::string_ref DataSetElement::LocalNameLabel() const
+{
+    return label_.LocalName();
+}
+
+inline const boost::string_ref DataSetElement::PrefixLabel() const
+{
+    return label_.Prefix();
+}
+
+inline const std::string& DataSetElement::QualifiedNameLabel() const
+{
+    return label_.QualifiedName();
+}
+
+//inline std::string& DataSetElement::Label()
+//{ return label_.QualifiedName(); }
+
+/// Replaces this element's label with \p label, stored with the verbatim flag set.
+inline void DataSetElement::Label(const std::string& label)
+{
+    label_ = XmlName{label, true};
+}
+
+/// Number of attributes stored on this element.
+inline size_t DataSetElement::NumAttributes() const
+{
+    return attributes_.size();
+}
+
+/// Number of direct children of this element.
+inline size_t DataSetElement::NumChildren() const
+{
+    return children_.size();
+}
+
+/// Erases every direct child that compares equal to \p e.
+///
+/// \p e may safely refer to one of this element's own children.
+inline void DataSetElement::RemoveChild(const DataSetElement& e)
+{
+    // std::remove takes its value by reference and shifts elements while it runs.
+    // If `e` is one of children_, it would be overwritten mid-pass and the remaining
+    // comparisons would use the wrong value. Compare against a private copy instead.
+    const DataSetElement target(e);
+    children_.erase(std::remove(children_.begin(), children_.end(), target),
+                    children_.end());
+}
+
+inline void DataSetElement::ChildText(const std::string& label,
+                                         const std::string& text)
+{
+    if (!HasChild(label)) {
+        DataSetElement e(label);
+        e.Text(text);
+        AddChild(e);
+    } else {
+        Child<DataSetElement>(label).Text(text);
+    }
+}
+
+/// Reports the verbatim flag of this element's label.
+inline bool DataSetElement::IsVerbatimLabel() const
+{
+    return label_.Verbatim();
+}
+
+// Text: read-only view, mutable reference, and setter for this element's own text.
+inline const std::string& DataSetElement::Text() const
+{
+    return text_;
+}
+
+inline std::string& DataSetElement::Text()
+{
+    return text_;
+}
+
+inline void DataSetElement::Text(const std::string& text)
+{
+    text_ = text;
+}
+
+/// The XsdType this element was constructed with.
+inline const XsdType& DataSetElement::Xsd() const
+{
+    return xsd_;
+}
+
+// ----------------
+// XmlName
+// ----------------
+
+inline XmlName::XmlName(std::string fullName, bool verbatim)
+    : qualifiedName_(std::move(fullName))
+    , prefixSize_(0)
+    , localNameOffset_(0)
+    , localNameSize_(0)
+    , verbatim_(verbatim)
+{
+    const size_t colonFound = qualifiedName_.find(':');
+    if (colonFound == std::string::npos || colonFound == 0)
+        localNameSize_ = qualifiedName_.size();
+    else {
+        prefixSize_ = colonFound;
+        localNameSize_ = (qualifiedName_.size() - colonFound) - 1;
+    }
+
+    // adjust for colon if prefix present
+    localNameOffset_ = prefixSize_;
+    if (prefixSize_ != 0)
+        ++localNameOffset_;
+}
+
+inline XmlName::XmlName(const std::string& localName,
+                        const std::string& prefix)
+    : prefixSize_(prefix.size())
+    , localNameOffset_(prefixSize_)
+    , localNameSize_(localName.size())
+    , verbatim_(true)
+{
+    qualifiedName_.clear();
+    qualifiedName_.reserve(localNameSize_+ prefixSize_ + 1);
+    qualifiedName_.append(prefix);
+    if (!qualifiedName_.empty())
+        qualifiedName_.append(1, ':');
+    qualifiedName_.append(localName);
+
+    // adjust for colon if prefix present
+    if (prefixSize_ != 0)
+        ++localNameOffset_;
+}
+
+/// Two names are equal when their qualified-name strings match; the verbatim flag is ignored.
+inline bool XmlName::operator==(const XmlName& other) const
+{
+    return qualifiedName_ == other.qualifiedName_;
+}
+
+/// Negation of operator==.
+inline bool XmlName::operator!=(const XmlName& other) const
+{
+    return !operator==(other);
+}
+
+/// View of the local-name part; it points into qualifiedName_ rather than copying it.
+inline const boost::string_ref XmlName::LocalName() const
+{
+    const char* const start = qualifiedName_.data() + localNameOffset_;
+    return boost::string_ref(start, localNameSize_);
+}
+
+/// View of the prefix part (empty when there is none); also points into qualifiedName_.
+inline const boost::string_ref XmlName::Prefix() const
+{
+    const char* const start = qualifiedName_.data();
+    return boost::string_ref(start, prefixSize_);
+}
+
+/// The complete stored name.
+inline const std::string& XmlName::QualifiedName() const
+{
+    return qualifiedName_;
+}
+
+/// The verbatim flag recorded at construction.
+inline bool XmlName::Verbatim() const
+{
+    return verbatim_;
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/DataSetListElement.h b/include/pbbam/internal/DataSetListElement.h

new file mode 100644 (file)

index 0000000..f2d2a36
--- /dev/null
+++ b/include/pbbam/internal/DataSetListElement.h
@@ -0,0 +1,84 @@
+// Author: Derek Barnett
+
+#ifndef DATASETLISTELEMENT_H
+#define DATASETLISTELEMENT_H
+
+#include "pbbam/internal/DataSetElement.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+//
+// adds iterators for convenience
+//
+template <class T>
+class DataSetListElement;
+
+template <class T>
+class DataSetListIteratorBase  // state shared by the mutable and const list iterators
+{
+public:  // iterators compare equal when both the parent pointer and the index match
+    bool operator==(const DataSetListIteratorBase<T>& other) const;
+    bool operator!=(const DataSetListIteratorBase<T>& other) const;
+
+protected:  // only the derived iterator types construct and advance the base
+    DataSetListIteratorBase(const DataSetListElement<T>* parent, size_t i);
+    void ReadNext();
+
+protected:
+    const DataSetListElement<T>* parent_;  // list this iterator walks (begin()/end() pass `this`)
+    size_t index_;                         // child position; end() uses the parent's NumChildren()
+};
+
+template <class T>
+class DataSetListIterator : public DataSetListIteratorBase<T>  // iterator handing out mutable T
+{
+public:
+    DataSetListIterator(const DataSetListElement<T>* parent, size_t i);
+    T& operator*();
+    T* operator->();
+    DataSetListIterator<T>& operator++();    // pre-increment
+    DataSetListIterator<T> operator++(int);  // post-increment
+};
+
+template <class T>
+class DataSetListConstIterator : public DataSetListIteratorBase<T>  // iterator handing out const T
+{
+public:
+    DataSetListConstIterator(const DataSetListElement<T>* parent, size_t i);
+    const T& operator*() const;
+    const T* operator->() const;
+    DataSetListConstIterator<T>& operator++();    // pre-increment
+    DataSetListConstIterator<T> operator++(int);  // post-increment
+};
+
+template <class T>
+class DataSetListElement : public DataSetElement  // DataSetElement whose children are accessed as T
+{
+public:
+    DataSetListElement(const std::string& label, const XsdType& xsd = XsdType::NONE);
+
+    // child access through index
+public:
+    const T& operator[](size_t index) const;  // bounds-checked; child is cast to T by static_cast
+    T& operator[](size_t index);
+    size_t Size() const;                      // same value as NumChildren()
+
+    // child access through iterators
+public:
+    DataSetListIterator<T> begin();             // position 0
+    DataSetListConstIterator<T> begin() const;
+    DataSetListConstIterator<T> cbegin() const;
+    DataSetListIterator<T> end();               // position NumChildren()
+    DataSetListConstIterator<T> end() const;
+    DataSetListConstIterator<T> cend() const;
+};
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/DataSetListElement.inl"
+
+#endif  // DATASETLISTELEMENT_H
diff --git a/include/pbbam/internal/DataSetListElement.inl b/include/pbbam/internal/DataSetListElement.inl

new file mode 100644 (file)

index 0000000..e1de18b
--- /dev/null
+++ b/include/pbbam/internal/DataSetListElement.inl
@@ -0,0 +1,146 @@
+// Author: Derek Barnett
+
+#include "pbbam/internal/DataSetListElement.h"
+#include <cassert>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// --------------------
+// DataSetListElement
+// --------------------
+
+// Passes label and xsd straight through to the DataSetElement base.
+template <typename T>
+inline DataSetListElement<T>::DataSetListElement(const std::string& label, const XsdType& xsd)
+    : DataSetElement(label, xsd)
+{
+}
+
+// Read-only indexed access: children_.at(index), viewed as a const T.
+template <typename T>
+inline const T& DataSetListElement<T>::operator[](size_t index) const
+{
+    const auto& child = children_.at(index);
+    return static_cast<const T&>(child);
+}
+
+// Mutable indexed access: children_.at(index), viewed as a T.
+template <typename T>
+inline T& DataSetListElement<T>::operator[](size_t index)
+{
+    auto& child = children_.at(index);
+    return static_cast<T&>(child);
+}
+
+// Number of list entries, i.e. whatever NumChildren() reports.
+template <typename T>
+inline size_t DataSetListElement<T>::Size() const
+{
+    return NumChildren();
+}
+
+// Mutable range: [begin(), end()) spans child indices 0 .. NumChildren().
+template <typename T>
+inline DataSetListIterator<T> DataSetListElement<T>::begin()
+{
+    using Iterator = DataSetListIterator<T>;
+    return Iterator(this, 0);
+}
+
+// Const overload; yields the same iterator as cbegin().
+template <typename T>
+inline DataSetListConstIterator<T> DataSetListElement<T>::begin() const
+{
+    return cbegin();
+}
+
+// Read-only iterator positioned at child index 0.
+template <typename T>
+inline DataSetListConstIterator<T> DataSetListElement<T>::cbegin() const
+{
+    using ConstIterator = DataSetListConstIterator<T>;
+    return ConstIterator(this, 0);
+}
+
+// Mutable iterator positioned one past the last child.
+template <typename T>
+inline DataSetListIterator<T> DataSetListElement<T>::end()
+{
+    using Iterator = DataSetListIterator<T>;
+    return Iterator(this, NumChildren());
+}
+
+// Const overload; yields the same iterator as cend().
+template <typename T>
+inline DataSetListConstIterator<T> DataSetListElement<T>::end() const
+{
+    return cend();
+}
+
+// Read-only iterator positioned one past the last child.
+template <typename T>
+inline DataSetListConstIterator<T> DataSetListElement<T>::cend() const
+{
+    using ConstIterator = DataSetListConstIterator<T>;
+    return ConstIterator(this, NumChildren());
+}
+
+// -------------------------
+// DataSetListIteratorBase
+// -------------------------
+
+// Two iterators are equal when they refer to the same parent element
+// and sit at the same child index.
+template <typename T>
+inline bool DataSetListIteratorBase<T>::operator==(const DataSetListIteratorBase<T>& other) const
+{
+    if (parent_ != other.parent_)
+        return false;
+    return index_ == other.index_;
+}
+
+// Negation of operator==.
+template <typename T>
+inline bool DataSetListIteratorBase<T>::operator!=(const DataSetListIteratorBase<T>& other) const
+{
+    return !operator==(other);
+}
+
+// Records the element being iterated and the starting child index.
+template <typename T>
+inline DataSetListIteratorBase<T>::DataSetListIteratorBase(const DataSetListElement<T>* parent,
+                                                           size_t i)
+    : parent_(parent), index_(i)
+{
+}
+
+// Steps to the next child index. If the index is already at (or beyond) the
+// parent's child count, the index is left alone and parent_ is cleared instead.
+//
+// NOTE(review): parent_ is dereferenced unconditionally, so calling this again
+// after parent_ has been cleared would be invalid - confirm callers never do so.
+template <typename T>
+inline void DataSetListIteratorBase<T>::ReadNext()
+{
+    if (index_ < parent_->NumChildren())
+        ++index_;
+    else
+        parent_ = nullptr;
+}
+
+// ---------------------
+// DataSetListIterator
+// ---------------------
+
+// Hands parent and start index to the shared iterator base.
+template <typename T>
+inline DataSetListIterator<T>::DataSetListIterator(const DataSetListElement<T>* parent, size_t i)
+    : DataSetListIteratorBase<T>(parent, i)
+{
+}
+
+// Current child, fetched from the parent via Child<T>(index).
+template <typename T>
+inline T& DataSetListIterator<T>::operator*()
+{
+    return this->parent_->template Child<T>(this->index_);
+}
+
+// Address of the current child.
+template <typename T>
+inline T* DataSetListIterator<T>::operator->()
+{
+    T& current = **this;
+    return &current;
+}
+
+// Pre-increment: advance through the base's ReadNext(), return this iterator.
+template <typename T>
+inline DataSetListIterator<T>& DataSetListIterator<T>::operator++()
+{
+    this->ReadNext();
+    return *this;
+}
+
+// Post-increment: advance, but return a copy taken before the advance.
+template <typename T>
+inline DataSetListIterator<T> DataSetListIterator<T>::operator++(int)
+{
+    DataSetListIterator<T> previous(*this);
+    operator++();
+    return previous;
+}
+
+// --------------------------
+// DataSetListConstIterator
+// --------------------------
+
+// Hands parent and start index to the shared iterator base.
+template <typename T>
+inline DataSetListConstIterator<T>::DataSetListConstIterator(const DataSetListElement<T>* parent,
+                                                             size_t i)
+    : DataSetListIteratorBase<T>(parent, i)
+{
+}
+
+// Current child (read-only), fetched from the parent via Child<T>(index).
+template <typename T>
+inline const T& DataSetListConstIterator<T>::operator*() const
+{
+    return this->parent_->template Child<T>(this->index_);
+}
+
+// Address of the current child (read-only).
+template <typename T>
+inline const T* DataSetListConstIterator<T>::operator->() const
+{
+    const T& current = **this;
+    return &current;
+}
+
+// Pre-increment: advance through the base's ReadNext(), return this iterator.
+template <typename T>
+inline DataSetListConstIterator<T>& DataSetListConstIterator<T>::operator++()
+{
+    this->ReadNext();
+    return *this;
+}
+
+// Post-increment: advance, but return a copy taken before the advance.
+template <typename T>
+inline DataSetListConstIterator<T> DataSetListConstIterator<T>::operator++(int)
+{
+    DataSetListConstIterator<T> previous(*this);
+    operator++();
+    return previous;
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/DataSetTypes.inl b/include/pbbam/internal/DataSetTypes.inl

new file mode 100644 (file)

index 0000000..c2b0608
--- /dev/null
+++ b/include/pbbam/internal/DataSetTypes.inl
@@ -0,0 +1,119 @@
+// File Description
+/// \file DataSetTypes.inl
+/// \brief Inline implementations for the public DataSet component classes.
+//
+// Author: Derek Barnett
+
+#include "pbbam/DataSetTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+// -------------
+// DataSetBase
+// --------------
+
+// Read-only access to this dataset's namespace registry (registry_).
+inline const NamespaceRegistry& DataSetBase::Namespaces() const
+{
+    return registry_;
+}
+
+// Mutable access to this dataset's namespace registry (registry_).
+inline NamespaceRegistry& DataSetBase::Namespaces()
+{
+    return registry_;
+}
+
+// ---------------------
+// DataSetMetadata
+// ---------------------
+
+// "NumRecords" child text: const getter, mutable getter, chainable setter.
+inline const std::string& DataSetMetadata::NumRecords() const
+{
+    return ChildText("NumRecords");
+}
+
+inline std::string& DataSetMetadata::NumRecords()
+{
+    return ChildText("NumRecords");
+}
+
+inline DataSetMetadata& DataSetMetadata::NumRecords(const std::string& numRecords)
+{
+    ChildText("NumRecords", numRecords);
+    return *this;
+}
+
+// "TotalLength" child text: const getter, mutable getter, chainable setter.
+inline const std::string& DataSetMetadata::TotalLength() const
+{
+    return ChildText("TotalLength");
+}
+
+inline std::string& DataSetMetadata::TotalLength()
+{
+    return ChildText("TotalLength");
+}
+
+inline DataSetMetadata& DataSetMetadata::TotalLength(const std::string& totalLength)
+{
+    ChildText("TotalLength", totalLength);
+    return *this;
+}
+
+// ----------
+// Property
+// ----------
+
+inline const std::string& Property::Name() const
+{ return Attribute("Name"); }
+
+inline std::string& Property::Name()
+{ return Attribute("Name"); }
+
+inline Property& Property::Name(const std::string& name)
+{ Attribute("Name", name); return *this; }
+
+inline const std::string& Property::Operator() const
+{ return Attribute("Operator"); }
+
+inline std::string& Property::Operator()
+{ return Attribute("Operator"); }
+
+inline Property& Property::Operator(const std::string& op)
+{ Attribute("Operator", op); return *this; }
+
+inline const std::string& Property::Value() const
+{ return Attribute("Value"); }
+
+inline std::string& Property::Value()
+{ return Attribute("Value"); }
+
+inline Property& Property::Value(const std::string& value)
+{ Attribute("Value", value); return *this; }
+
+// ------------
+// Provenance
+// ------------
+
+inline const std::string& Provenance::CreatedBy() const
+{ return Attribute("CreatedBy"); }
+
+inline std::string& Provenance::CreatedBy()
+{ return Attribute("CreatedBy"); }
+
+inline Provenance& Provenance::CreatedBy(const std::string& createdBy)
+{ Attribute("CreatedBy", createdBy); return *this; }
+
+inline const std::string& Provenance::CommonServicesInstanceId() const
+{ return ChildText("CommonServicesInstanceId"); }
+
+inline std::string& Provenance::CommonServicesInstanceId()
+{ return ChildText("CommonServicesInstanceId"); }
+
+inline Provenance& Provenance::CommonServicesInstanceId(const std::string& id)
+{ ChildText("CommonServicesInstanceId", id); return *this; }
+
+inline const std::string& Provenance::CreatorUserId() const
+{ return ChildText("CreatorUserId"); }
+
+inline std::string& Provenance::CreatorUserId()
+{ return ChildText("CreatorUserId"); }
+
+inline Provenance& Provenance::CreatorUserId(const std::string& id)
+{ ChildText("CreatorUserId", id); return *this; }
+
+inline const std::string& Provenance::ParentJobId() const
+{ return ChildText("ParentJobId"); }
+
+inline std::string& Provenance::ParentJobId()
+{ return ChildText("ParentJobId"); }
+
+inline Provenance& Provenance::ParentJobId(const std::string& id)
+{ ChildText("ParentJobId", id); return *this; }
+
+inline Provenance& Provenance::ParentTool(const PacBio::BAM::ParentTool& tool)
+{ ParentTool() = tool; return *this; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/FastaSequence.inl b/include/pbbam/internal/FastaSequence.inl

new file mode 100644 (file)

index 0000000..a4a3b96
--- /dev/null
+++ b/include/pbbam/internal/FastaSequence.inl
@@ -0,0 +1,25 @@
+// File Description
+/// \file FastaSequence.inl
+/// \brief Inline implementations for the FastaSequence class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/FastaSequence.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Takes ownership of both strings by moving them into the members.
+inline FastaSequence::FastaSequence(std::string name, std::string bases)
+    : name_(std::move(name)), bases_(std::move(bases))
+{
+}
+
+// Stored base string.
+inline const std::string& FastaSequence::Bases() const
+{
+    return bases_;
+}
+
+// Stored sequence name.
+inline const std::string& FastaSequence::Name() const
+{
+    return name_;
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/FastqSequence.inl b/include/pbbam/internal/FastqSequence.inl

new file mode 100644 (file)

index 0000000..9df2d73
--- /dev/null
+++ b/include/pbbam/internal/FastqSequence.inl
@@ -0,0 +1,30 @@
+// File Description
+/// \file FastqSequence.inl
+/// \brief Inline implementations for the FastqSequence class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/FastqSequence.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline FastqSequence::FastqSequence(std::string name,
+                                    std::string bases,
+                                    QualityValues qualities)
+    : FastaSequence{std::move(name), std::move(bases)}
+    , qualities_{std::move(qualities)}
+{ }
+
+inline FastqSequence::FastqSequence(std::string name,
+                                    std::string bases,
+                                    std::string qualities)
+    : FastaSequence{std::move(name), std::move(bases)}
+    , qualities_{QualityValues::FromFastq(qualities)}
+{ }
+
+inline const QualityValues& FastqSequence::Qualities() const
+{ return qualities_; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/Frames.inl b/include/pbbam/internal/Frames.inl

new file mode 100644 (file)

index 0000000..ad951c0
--- /dev/null
+++ b/include/pbbam/internal/Frames.inl
@@ -0,0 +1,55 @@
+// File Description
+/// \file Frames.inl
+/// \brief Inline implementations for the Frames class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Frames.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline const std::vector<uint16_t>& Frames::Data() const
+{ return data_; }
+
+inline std::vector<uint16_t>& Frames::DataRaw()
+{ return data_; }
+
+inline std::vector<uint8_t> Frames::Encode() const
+{ return Frames::Encode(data_); }
+
+inline Frames& Frames::Data(std::vector<uint16_t> frames)
+{ data_ = std::move(frames); return *this; }
+
+inline std::vector<uint16_t>::const_iterator Frames::begin() const
+{ return data_.begin(); }
+
+inline std::vector<uint16_t>::iterator Frames::begin()
+{ return data_.begin(); }
+
+inline std::vector<uint16_t>::const_iterator Frames::cbegin() const
+{ return data_.cbegin(); }
+
+inline std::vector<uint16_t>::const_iterator Frames::cend() const
+{ return data_.cend(); }
+
+inline std::vector<uint16_t>::const_iterator Frames::end() const
+{ return data_.end(); }
+
+inline std::vector<uint16_t>::iterator Frames::end()
+{ return data_.end(); }
+
+inline size_t Frames::size() const
+{ return data_.size(); }
+
+inline bool Frames::empty() const
+{ return data_.empty(); }
+
+inline bool Frames::operator==(const Frames& other) const
+{ return data_ == other.data_; }
+
+inline bool Frames::operator!=(const Frames& other) const
+{ return !(*this == other); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/GenomicInterval.inl b/include/pbbam/internal/GenomicInterval.inl

new file mode 100644 (file)

index 0000000..0aae19f
--- /dev/null
+++ b/include/pbbam/internal/GenomicInterval.inl
@@ -0,0 +1,54 @@
+// File Description
+/// \file GenomicInterval.inl
+/// \brief Inline implementations for the GenomicInterval class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/GenomicInterval.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline std::string GenomicInterval::Name() const
+{ return name_; }
+
+inline GenomicInterval& GenomicInterval::Name(std::string name)
+{ name_ = std::move(name); return *this; }
+
+inline PacBio::BAM::Interval<Position> GenomicInterval::Interval() const
+{ return interval_; }
+
+inline GenomicInterval& GenomicInterval::Interval(PacBio::BAM::Interval<Position> interval)
+{ interval_ = std::move(interval); return *this; }
+
+inline bool GenomicInterval::IsValid() const
+{
+    return !name_.empty() &&
+           interval_.Start() >= 0 &&
+           interval_.Stop()  >= 0 &&
+           interval_.IsValid();
+}
+
+inline size_t GenomicInterval::Length() const
+{ return interval_.Length(); }
+
+inline Position GenomicInterval::Start() const
+{ return interval_.Start(); }
+
+inline GenomicInterval& GenomicInterval::Start(const Position start)
+{ interval_.Start(start); return *this; }
+
+inline Position GenomicInterval::Stop() const
+{ return interval_.Stop(); }
+
+inline GenomicInterval& GenomicInterval::Stop(const Position stop)
+{ interval_.Stop(stop); return *this; }
+
+inline bool GenomicInterval::operator==(const GenomicInterval& other) const
+{ return name_ == other.name_ && interval_ == other.interval_; }
+
+inline bool GenomicInterval::operator!=(const GenomicInterval& other) const
+{ return !(*this == other); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/Interval.inl b/include/pbbam/internal/Interval.inl

new file mode 100644 (file)

index 0000000..b804b15
--- /dev/null
+++ b/include/pbbam/internal/Interval.inl
@@ -0,0 +1,78 @@
+// File Description
+/// \file Interval.inl
+/// \brief Inline implementations for the Interval class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Interval.h"
+
+namespace PacBio {
+namespace BAM {
+
+template<typename T>
+inline Interval<T>::Interval()
+    : data_{boost::icl::discrete_interval<T>::right_open(0,0)}
+{ }
+
+template<typename T>
+inline Interval<T>::Interval(const T val)
+    : data_{boost::icl::discrete_interval<T>::right_open(val,val+1)}
+{ }
+
+template<typename T>
+inline Interval<T>::Interval(const T start, const T stop)
+    : data_{boost::icl::discrete_interval<T>::right_open(start,stop)}
+{ }
+
+template<typename T>
+inline bool Interval<T>::operator==(const Interval<T>& other) const
+{ return data_ == other.data_; }
+
+template<typename T>
+inline bool Interval<T>::operator!=(const Interval<T>& other) const
+{ return !(data_ == other.data_); }
+
+template<typename T>
+inline bool Interval<T>::CoveredBy(const Interval<T>& other) const
+{ return boost::icl::within(data_, other.data_); }
+
+template<typename T>
+inline bool Interval<T>::Covers(const Interval<T>& other) const
+{ return boost::icl::contains(data_, other.data_); }
+
+template<typename T>
+inline bool Interval<T>::Intersects(const Interval<T>& other) const
+{ return boost::icl::intersects(data_, other.data_); }
+
+template<typename T>
+inline bool Interval<T>::IsValid() const
+{ return !boost::icl::is_empty(data_); }
+
+template<typename T>
+inline size_t Interval<T>::Length() const
+{ return boost::icl::length(data_); }
+
+template<typename T>
+inline T Interval<T>::Start() const
+{ return data_.lower(); }
+
+template<typename T>
+inline Interval<T>& Interval<T>::Start(const T& start)
+{
+    data_ = boost::icl::discrete_interval<T>::right_open(start, data_.upper());
+    return *this;
+}
+
+template<typename T>
+inline T Interval<T>::Stop() const
+{ return data_.upper(); }
+
+template<typename T>
+inline Interval<T>& Interval<T>::Stop(const T& stop)
+{
+    data_ = boost::icl::discrete_interval<T>::right_open(data_.lower(), stop);
+    return *this;
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/PbiBasicTypes.inl b/include/pbbam/internal/PbiBasicTypes.inl

new file mode 100644 (file)

index 0000000..3024f6a
--- /dev/null
+++ b/include/pbbam/internal/PbiBasicTypes.inl
@@ -0,0 +1,28 @@
+// File Description
+/// \file PbiBasicTypes.inl
+/// \brief Inline implementations for the basic data structures used in PBI lookups.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiBasicTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline IndexResultBlock::IndexResultBlock(size_t idx, size_t numReads)
+    : firstIndex_{idx}
+    , numReads_{numReads}
+{ }
+
+inline bool IndexResultBlock::operator==(const IndexResultBlock& other) const
+{
+    return firstIndex_ == other.firstIndex_ &&
+           numReads_ == other.numReads_ &&
+           virtualOffset_ == other.virtualOffset_;
+}
+
+inline bool IndexResultBlock::operator!=(const IndexResultBlock& other) const
+{ return !(*this == other); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/PbiFilter.inl b/include/pbbam/internal/PbiFilter.inl

new file mode 100644 (file)

index 0000000..f8f03ee
--- /dev/null
+++ b/include/pbbam/internal/PbiFilter.inl
@@ -0,0 +1,241 @@
+// File Description
+/// \file PbiFilter.inl
+/// \brief Inline implementations for the PbiFilter class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiFilter.h"
+
+#include <algorithm>
+#include <iostream>
+#include <map>
+#include <set>
+#include <vector>
+
+#include "pbbam/MakeUnique.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// \internal
+///
+/// This class wraps a basic PBI filter (whether a property filter or some operator,
+/// e.g. union, intersect, etc.). The wrapper allows PbiFilters to hold heterogeneous,
+/// recursive filter types - without exposing pointers & worrying about memory ownership
+/// issues between client & library.
+///
+/// Filters can be given by value from client code and we will wrap them for composition.
+///
+/// \code{.cpp}
+///    PbiFilter f1(PbiZmwFilter(42));
+///    PbiFilter f2;
+///    f2.Add(PbiQueryLengthFilter(3000, GREATER_THAN_EQUAL));
+///    f2.Add(MyApplicationCustomFilter("foo"));
+///    PbiFilter intersect = PbiFilter::Intersect(f1, f2);
+///    ...
+/// \endcode
+///
+struct FilterWrapper
+{
+public:
+    template<typename T> FilterWrapper(T x);
+
+    FilterWrapper(const FilterWrapper& other);
+    FilterWrapper(FilterWrapper&&) noexcept = default;
+    FilterWrapper& operator=(const FilterWrapper& other);
+    FilterWrapper& operator=(FilterWrapper&&) noexcept = default;
+    ~FilterWrapper() = default;
+
+public:
+    bool Accepts(const PacBio::BAM::PbiRawData& idx, const size_t row) const;
+
+private:
+    struct WrapperInterface
+    {
+        virtual ~WrapperInterface() = default;
+        virtual WrapperInterface* Clone() const =0;
+        virtual bool Accepts(const PacBio::BAM::PbiRawData& idx,
+                             const size_t row) const =0;
+    };
+
+    template<typename T>
+    struct WrapperImpl : public WrapperInterface
+    {
+        WrapperImpl(T x);
+        WrapperImpl(const WrapperImpl& other);
+        WrapperInterface* Clone() const override;
+        bool Accepts(const PacBio::BAM::PbiRawData& idx,
+                     const size_t row) const override;
+        T data_;
+    };
+
+private:
+    std::unique_ptr<WrapperInterface> self_;
+};
+
+// ---------------
+// FilterWrapper
+// ---------------
+
+// Moves the given filter into a freshly allocated WrapperImpl<T>.
+template <typename T>
+inline FilterWrapper::FilterWrapper(T x) : self_{std::make_unique<WrapperImpl<T>>(std::move(x))}
+{
+}
+
+// Copy constructor: deep-copies the wrapped filter through Clone().
+//
+// The defaulted move operations leave a moved-from wrapper with a null self_,
+// so the source is checked before it is dereferenced; copying such a wrapper
+// yields another empty wrapper instead of dereferencing a null pointer.
+inline FilterWrapper::FilterWrapper(const FilterWrapper& other)
+    : self_{other.self_ ? other.self_->Clone() : nullptr}
+{ }
+
+// Copy assignment: replaces the wrapped filter with a clone of other's.
+//
+// The clone is created before the current filter is released, which keeps
+// self-assignment safe. An empty (moved-from) source leaves this wrapper empty.
+inline FilterWrapper& FilterWrapper::operator=(const FilterWrapper& other)
+{
+    self_.reset(other.self_ ? other.self_->Clone() : nullptr);
+    return *this;
+}
+
+// Delegates the decision to the wrapped filter.
+inline bool FilterWrapper::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    return self_->Accepts(idx, row);
+}
+
+// ----------------
+// WrapperImpl<T>
+// ----------------
+
+template<typename T>
+inline FilterWrapper::WrapperImpl<T>::WrapperImpl(T x)
+    : FilterWrapper::WrapperInterface{}
+    , data_(std::move(x))
+{
+    BOOST_CONCEPT_ASSERT((PbiFilterConcept<T>));
+}
+
+template<typename T>
+inline FilterWrapper::WrapperImpl<T>::WrapperImpl(const WrapperImpl& other)
+    : FilterWrapper::WrapperInterface{}
+    , data_(other.data_)
+{ }
+
+template<typename T>
+inline FilterWrapper::WrapperInterface* FilterWrapper::WrapperImpl<T>::Clone() const
+{ return new WrapperImpl(*this); }
+
+template<typename T>
+inline bool FilterWrapper::WrapperImpl<T>::Accepts(const PbiRawData& idx,
+                                                   const size_t row) const
+{ return data_.Accepts(idx, row); }
+
+struct PbiFilterPrivate
+{
+    PbiFilterPrivate(PbiFilter::CompositionType type = PbiFilter::INTERSECT)
+        : type_{type}
+    { }
+
+    template<typename T>
+    void Add(T filter)
+    {
+        filters_.emplace_back(std::move(filter));
+    }
+
+    std::unique_ptr<internal::PbiFilterPrivate> DeepCopy()
+    {
+        auto copy = std::make_unique<PbiFilterPrivate>(type_);
+        copy->filters_ = this->filters_;
+        return copy;
+    }
+
+    bool Accepts(const PbiRawData& idx, const size_t row) const
+    {
+        // no filter -> accepts every record
+        if (filters_.empty())
+            return true;
+
+        // intersection of child filters
+        if (type_ == PbiFilter::INTERSECT) {
+            for (const auto& filter : filters_) {
+                if (!filter.Accepts(idx, row))
+                    return false; // break early on failure
+            }
+            return true; // all passed
+        }
+
+        // union of child filters
+        else if (type_ == PbiFilter::UNION) {
+            for (const auto& filter : filters_) {
+                if (filter.Accepts(idx, row))
+                    return true; // break early on pass
+            }
+            return false; // none passed
+        }
+
+        else
+            //assert(false); // invalid composite filter type
+            throw std::runtime_error{"invalid composite filter type in PbiFilterPrivate::Accepts"};
+    }
+
+    PbiFilter::CompositionType type_;
+    std::vector<FilterWrapper> filters_;
+};
+
+} // namespace internal
+
+// Empty composite filter with the requested composition type.
+inline PbiFilter::PbiFilter(const CompositionType type)
+    : d_{std::make_unique<internal::PbiFilterPrivate>(type)}
+{
+}
+
+// Composite (default composition type) holding a single child filter.
+template <typename T>
+inline PbiFilter::PbiFilter(T filter) : d_{std::make_unique<internal::PbiFilterPrivate>()}
+{
+    Add(std::move(filter));
+}
+
+// Composite (default composition type) holding all of the given filters.
+inline PbiFilter::PbiFilter(std::vector<PbiFilter> filters)
+    : d_{std::make_unique<internal::PbiFilterPrivate>()}
+{
+    Add(std::move(filters));
+}
+
+// Copy constructor: takes a deep copy of other's implementation data.
+inline PbiFilter::PbiFilter(const PbiFilter& other) : d_{other.d_->DeepCopy()}
+{
+}
+
+// Copy assignment: replaces the implementation data with a deep copy of other's.
+inline PbiFilter& PbiFilter::operator=(const PbiFilter& other)
+{
+    d_ = other.d_->DeepCopy();
+    return *this;
+}
+
+inline bool PbiFilter::Accepts(const PacBio::BAM::PbiRawData& idx,
+                               const size_t row) const
+{ return d_->Accepts(idx, row); }
+
+template<typename T>
+inline PbiFilter& PbiFilter::Add(T filter)
+{
+    d_->Add(std::move(filter));
+    return *this;
+}
+
+inline PbiFilter& PbiFilter::Add(PbiFilter filter)
+{
+    d_->Add(std::move(filter));
+    return *this;
+}
+
+inline PbiFilter& PbiFilter::Add(std::vector<PbiFilter> filters)
+{
+    for (auto&& filter : filters)
+        d_->Add(std::move(filter));
+    return *this;
+}
+
+inline bool PbiFilter::IsEmpty() const
+{ return d_->filters_.empty(); }
+
+inline size_t PbiFilter::NumChildren() const
+{ return d_->filters_.size(); }
+
+inline PbiFilter::CompositionType PbiFilter::Type() const
+{ return d_->type_; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/PbiFilterTypes.inl b/include/pbbam/internal/PbiFilterTypes.inl

new file mode 100644 (file)

index 0000000..98ffa33
--- /dev/null
+++ b/include/pbbam/internal/PbiFilterTypes.inl
@@ -0,0 +1,504 @@
+// File Description
+/// \file PbiFilterTypes.inl
+/// \brief Inline implementations for the built-in PBI filters.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiFilterTypes.h"
+#include <cassert>
+#include <stdexcept>
+
+#include <boost/functional/hash/hash.hpp>
+
+namespace PacBio {
+namespace BAM {
+
+namespace internal {
+
+template <typename T>
+inline FilterBase<T>::FilterBase(T value, const Compare::Type cmp)
+    : value_{std::move(value)}
+    , cmp_{cmp}
+{ }
+
+template <typename T>
+inline FilterBase<T>::FilterBase(std::vector<T> values, const Compare::Type cmp)
+    : multiValue_{std::move(values)}
+    , cmp_{cmp}
+{ }
+
+// Dispatches to the single-value comparison when no value list is set,
+// otherwise to the multi-value comparison.
+template <typename T>
+inline bool FilterBase<T>::CompareHelper(const T& lhs) const
+{
+    return (multiValue_ == boost::none) ? CompareSingleHelper(lhs) : CompareMultiHelper(lhs);
+}
+
+template<typename T>
+inline bool FilterBase<T>::CompareMultiHelper(const T& lhs) const
+{
+    if (cmp_ == Compare::EQUAL)
+    {
+        // check provided value against all filter criteria,
+        // return true on any exact match
+        auto iter = multiValue_.get().cbegin();
+        const auto end  = multiValue_.get().cend();
+        for (; iter != end; ++iter) {
+            if (*iter == lhs)
+                return true;
+        }
+        return false; // no matches
+    }
+    else if (cmp_ == Compare::NOT_EQUAL)
+    {
+        // check provided value against all filter criteria,
+        // return true on any exact match
+        auto iter = multiValue_.get().cbegin();
+        const auto end  = multiValue_.get().cend();
+        for (; iter != end; ++iter) {
+            if (*iter == lhs)
+                return false;
+        }
+        return true;
+    }
+    else
+        throw std::runtime_error{"unsupported compare type on multivalue filter"};
+}
+
+// Generic single-value comparison: defers entirely to Compare::Check().
+template <typename T>
+inline bool FilterBase<T>::CompareSingleHelper(const T& lhs) const
+{ return Compare::Check(lhs, value_, cmp_); }
+
+template<>
+inline bool FilterBase<LocalContextFlags>::CompareSingleHelper(const LocalContextFlags& lhs) const
+{
+    switch(cmp_) {
+        case Compare::EQUAL:              return lhs == value_;
+        case Compare::LESS_THAN:          return lhs < value_;
+        case Compare::LESS_THAN_EQUAL:    return lhs <= value_;
+        case Compare::GREATER_THAN:       return lhs > value_;
+        case Compare::GREATER_THAN_EQUAL: return lhs >= value_;
+        case Compare::NOT_EQUAL:          return lhs != value_;
+        case Compare::CONTAINS:           return ((lhs & value_) != 0);
+        case Compare::NOT_CONTAINS:       return ((lhs & value_) == 0);
+
+        default:
+            assert(false);
+            throw std::runtime_error{"unsupported compare type requested"};
+    }
+}
+
+// BarcodeDataFilterBase
+
+template<typename T, PbiFile::BarcodeField field>
+inline BarcodeDataFilterBase<T, field>::BarcodeDataFilterBase(T value, const Compare::Type cmp)
+    : FilterBase<T>{std::move(value), cmp}
+{ }
+
+template<typename T, PbiFile::BarcodeField field>
+inline BarcodeDataFilterBase<T, field>::BarcodeDataFilterBase(std::vector<T> values, const Compare::Type cmp)
+    : FilterBase<T>{std::move(values), cmp}
+{ }
+
+template<typename T, PbiFile::BarcodeField field>
+inline bool BarcodeDataFilterBase<T, field>::BarcodeDataFilterBase::Accepts(const PbiRawData& idx,
+                                           const size_t row) const
+{
+    const PbiRawBarcodeData& barcodeData = idx.BarcodeData();
+    switch (field) {
+        case PbiFile::BarcodeField::BC_FORWARD: return FilterBase<T>::CompareHelper(barcodeData.bcForward_.at(row));
+        case PbiFile::BarcodeField::BC_REVERSE: return FilterBase<T>::CompareHelper(barcodeData.bcReverse_.at(row));
+        case PbiFile::BarcodeField::BC_QUALITY: return FilterBase<T>::CompareHelper(barcodeData.bcQual_.at(row));
+        default:
+            assert(false);
+            throw std::runtime_error{"unsupported BarcodeData field requested"};
+    }
+}
+
+// BasicDataFilterBase
+
+template<typename T, PbiFile::BasicField field>
+inline BasicDataFilterBase<T, field>::BasicDataFilterBase(T value, const Compare::Type cmp)
+    : FilterBase<T>{std::move(value), cmp}
+{ }
+
+template<typename T, PbiFile::BasicField field>
+inline BasicDataFilterBase<T, field>::BasicDataFilterBase(std::vector<T> values, const Compare::Type cmp)
+    : FilterBase<T>{std::move(values), cmp}
+{ }
+
+template<typename T, PbiFile::BasicField field>
+inline bool BasicDataFilterBase<T, field>::BasicDataFilterBase::Accepts(const PbiRawData& idx,
+                                                                        const size_t row) const
+{
+    const PbiRawBasicData& basicData = idx.BasicData();
+    switch (field) {
+        case PbiFile::BasicField::RG_ID:        return FilterBase<T>::CompareHelper(basicData.rgId_.at(row));
+        case PbiFile::BasicField::Q_START:      return FilterBase<T>::CompareHelper(basicData.qStart_.at(row));
+        case PbiFile::BasicField::Q_END:        return FilterBase<T>::CompareHelper(basicData.qEnd_.at(row));
+        case PbiFile::BasicField::ZMW:          return FilterBase<T>::CompareHelper(basicData.holeNumber_.at(row));
+        case PbiFile::BasicField::READ_QUALITY: return FilterBase<T>::CompareHelper(basicData.readQual_.at(row));
+        //   PbiFile::BasicField::CONTEXT_FLAG has its own specialization
+        default:
+            assert(false);
+            throw std::runtime_error{"unsupported BasicData field requested"};
+    }
+}
+
+// this typedef exists purely so that the next method signature isn't 2 screen widths long
+using LocalContextFilter__ = BasicDataFilterBase<LocalContextFlags, PbiFile::BasicField::CONTEXT_FLAG>;
+
+template<>
+inline bool LocalContextFilter__::BasicDataFilterBase::Accepts(const PbiRawData& idx,
+                                                               const size_t row) const
+{
+    const auto& basicData = idx.BasicData();
+    const auto rowFlags = static_cast<LocalContextFlags>(basicData.ctxtFlag_.at(row));
+    return FilterBase<LocalContextFlags>::CompareHelper(rowFlags);
+}
+
+// MappedDataFilterBase
+
+// Single-value mapped-data filter; arguments go straight to FilterBase<T>.
+template <typename T, PbiFile::MappedField field>
+inline MappedDataFilterBase<T, field>::MappedDataFilterBase(T value, const Compare::Type cmp)
+    : FilterBase<T>{std::move(value), cmp}
+{
+}
+
+// Multi-value mapped-data filter; arguments go straight to FilterBase<T>.
+template <typename T, PbiFile::MappedField field>
+inline MappedDataFilterBase<T, field>::MappedDataFilterBase(std::vector<T> values,
+                                                            const Compare::Type cmp)
+    : FilterBase<T>{std::move(values), cmp}
+{
+}
+
+template<>
+inline bool MappedDataFilterBase<Strand, PbiFile::MappedField::STRAND>::MappedDataFilterBase::Accepts(const PbiRawData& idx,
+                                                                                                  const size_t row) const
+{
+    const PbiRawMappedData& mappedData = idx.MappedData();
+    const Strand strand = (mappedData.revStrand_.at(row) == 1 ? Strand::REVERSE : Strand::FORWARD);
+    return FilterBase<Strand>::CompareHelper(strand);
+}
+
+template<typename T, PbiFile::MappedField field>
+inline bool MappedDataFilterBase<T, field>::MappedDataFilterBase::Accepts(const PbiRawData& idx,
+                                                                          const size_t row) const
+{
+    const PbiRawMappedData& mappedData = idx.MappedData();
+    switch (field) {
+        case PbiFile::MappedField::T_ID:        return FilterBase<T>::CompareHelper(mappedData.tId_.at(row));
+        case PbiFile::MappedField::T_START:     return FilterBase<T>::CompareHelper(mappedData.tStart_.at(row));
+        case PbiFile::MappedField::T_END:       return FilterBase<T>::CompareHelper(mappedData.tEnd_.at(row));
+        case PbiFile::MappedField::A_START:     return FilterBase<T>::CompareHelper(mappedData.aStart_.at(row));
+        case PbiFile::MappedField::A_END:       return FilterBase<T>::CompareHelper(mappedData.aEnd_.at(row));
+        case PbiFile::MappedField::N_M:         return FilterBase<T>::CompareHelper(mappedData.nM_.at(row));
+        case PbiFile::MappedField::N_MM:        return FilterBase<T>::CompareHelper(mappedData.nMM_.at(row));
+        case PbiFile::MappedField::N_DEL:       return FilterBase<T>::CompareHelper(mappedData.NumDeletedBasesAt(row));
+        case PbiFile::MappedField::N_INS:       return FilterBase<T>::CompareHelper(mappedData.NumInsertedBasesAt(row));
+        case PbiFile::MappedField::MAP_QUALITY: return FilterBase<T>::CompareHelper(mappedData.mapQV_.at(row));
+        default:
+            assert(false);
+            throw std::runtime_error{"unsupported MappedData field requested"};
+    }
+}
+
+} // namespace internal
+
+// PbiAlignedEndFilter
+
+/// Filter on MappedField::A_END; \p position is the reference value and
+/// \p cmp the comparison operator.
+inline PbiAlignedEndFilter::PbiAlignedEndFilter(const uint32_t position,
+                                                const Compare::Type cmp)
+    : internal::MappedDataFilterBase<uint32_t, PbiFile::MappedField::A_END>{position, cmp}
+{
+}
+
+// PbiAlignedLengthFilter
+
+/// Stores \p length and \p cmp in FilterBase<uint32_t>; the per-row test for
+/// this filter is not defined in this section of the file.
+inline PbiAlignedLengthFilter::PbiAlignedLengthFilter(const uint32_t length,
+                                                      const Compare::Type cmp)
+    : internal::FilterBase<uint32_t>{length, cmp}
+{
+}
+
+// PbiAlignedStartFilter
+
+/// Filter on MappedField::A_START; \p position is the reference value and
+/// \p cmp the comparison operator.
+inline PbiAlignedStartFilter::PbiAlignedStartFilter(const uint32_t position,
+                                                    const Compare::Type cmp)
+    : internal::MappedDataFilterBase<uint32_t, PbiFile::MappedField::A_START>{position, cmp}
+{
+}
+
+// PbiAlignedStrandFilter
+
+/// Builds a strand filter. Only Compare::EQUAL and Compare::NOT_EQUAL are
+/// accepted as comparison operators.
+///
+/// \throws std::runtime_error for any other value of \p cmp
+inline PbiAlignedStrandFilter::PbiAlignedStrandFilter(const Strand strand,
+                                                      const Compare::Type cmp)
+    : internal::MappedDataFilterBase<Strand, PbiFile::MappedField::STRAND>{strand, cmp}
+{
+    const bool isSupported = (cmp == Compare::EQUAL || cmp == Compare::NOT_EQUAL);
+    if (!isSupported) {
+        throw std::runtime_error{"Compare type: " + Compare::TypeToName(cmp) + " not supported for PbiAlignedStrandFilter (use one of Compare::EQUAL or Compare::NOT_EQUAL)."};
+    }
+}
+
+// PbiBarcodeFilter
+
+inline PbiBarcodeFilter::PbiBarcodeFilter(const int16_t barcode, const Compare::Type cmp)
+    : compositeFilter_{ PbiFilter::Union({ PbiBarcodeForwardFilter{barcode,cmp},
+                                           PbiBarcodeReverseFilter{barcode,cmp}
+                                         })
+                      }
+{ }
+
+inline PbiBarcodeFilter::PbiBarcodeFilter(std::vector<int16_t> whitelist, const Compare::Type cmp)
+    : compositeFilter_{ PbiFilter::Union({ PbiBarcodeForwardFilter{std::move(whitelist), cmp},
+                                           PbiBarcodeReverseFilter{std::move(whitelist), cmp}
+                                         })
+                      }
+{ }
+
+/// Delegates the decision for row \p row of \p idx to the composite filter
+/// assembled in the constructor.
+inline bool PbiBarcodeFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    return compositeFilter_.Accepts(idx, row);
+}
+
+// PbiBarcodeForwardFilter
+
+inline PbiBarcodeForwardFilter::PbiBarcodeForwardFilter(const int16_t bcFwdId, const Compare::Type cmp)
+    : internal::BarcodeDataFilterBase<int16_t, PbiFile::BarcodeField::BC_FORWARD>{bcFwdId, cmp}
+{ }
+
+inline PbiBarcodeForwardFilter::PbiBarcodeForwardFilter(std::vector<int16_t> whitelist, const Compare::Type cmp)
+    : internal::BarcodeDataFilterBase<int16_t, PbiFile::BarcodeField::BC_FORWARD>{std::move(whitelist), cmp}
+{ }
+
+// PbiBarcodeQualityFilter
+
+inline PbiBarcodeQualityFilter::PbiBarcodeQualityFilter(const uint8_t bcQuality, const Compare::Type cmp)
+    : internal::BarcodeDataFilterBase<uint8_t, PbiFile::BarcodeField::BC_QUALITY>{bcQuality, cmp}
+{ }
+
+// PbiBarcodeReverseFilter
+
+inline PbiBarcodeReverseFilter::PbiBarcodeReverseFilter(const int16_t bcRevId, const Compare::Type cmp)
+    : internal::BarcodeDataFilterBase<int16_t, PbiFile::BarcodeField::BC_REVERSE>{bcRevId, cmp}
+{ }
+
+inline PbiBarcodeReverseFilter::PbiBarcodeReverseFilter(std::vector<int16_t> whitelist, const Compare::Type cmp)
+    : internal::BarcodeDataFilterBase<int16_t, PbiFile::BarcodeField::BC_REVERSE>{std::move(whitelist), cmp}
+{ }
+
+// PbiBarcodesFilter
+
+inline PbiBarcodesFilter::PbiBarcodesFilter(const std::pair<int16_t, int16_t> barcodes, const Compare::Type cmp)
+    : PbiBarcodesFilter{barcodes.first, barcodes.second, cmp}
+{ }
+
+inline PbiBarcodesFilter::PbiBarcodesFilter(const int16_t bcForward, const int16_t bcReverse, const Compare::Type cmp)
+    : compositeFilter_{ PbiFilter::Intersection({ PbiBarcodeForwardFilter{bcForward,cmp},
+                                                  PbiBarcodeReverseFilter{bcReverse,cmp}
+                                                })
+                      }
+{ }
+
+inline bool PbiBarcodesFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{ return compositeFilter_.Accepts(idx, row); }
+
+// PbiIdentityFilter
+
+inline PbiIdentityFilter::PbiIdentityFilter(const float identity,
+                                            const Compare::Type cmp)
+    : internal::FilterBase<float>{identity, cmp}
+{ }
+
+// PbiLocalContextFilter
+
+inline PbiLocalContextFilter::PbiLocalContextFilter(const LocalContextFlags& flags,
+                                                    const Compare::Type cmp)
+    : internal::BasicDataFilterBase<LocalContextFlags, PbiFile::BasicField::CONTEXT_FLAG>{flags, cmp}
+{ }
+
+// PbiMapQualityFilter
+
+inline PbiMapQualityFilter::PbiMapQualityFilter(const uint8_t mapQual, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<uint8_t, PbiFile::MappedField::MAP_QUALITY>{mapQual, cmp}
+{ }
+
+// PbiMovieNameFilter
+
+/// Asks the composite filter whether row \p row matches, then reports that
+/// answer as-is for Compare::EQUAL or negated for Compare::NOT_EQUAL.
+///
+/// \throws std::runtime_error if cmp_ holds any other comparison operator
+inline bool PbiMovieNameFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    // the composite is consulted before cmp_ is inspected
+    const bool matched = compositeFilter_.Accepts(idx, row);
+
+    if (cmp_ == Compare::EQUAL) {
+        return matched;
+    }
+    if (cmp_ == Compare::NOT_EQUAL) {
+        return !matched;
+    }
+    throw std::runtime_error{"unsupported compare type on movie name filter"};
+}
+
+// PbiNumDeletedBasesFilter
+
+inline PbiNumDeletedBasesFilter::PbiNumDeletedBasesFilter(const size_t numDeletions, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<size_t, PbiFile::MappedField::N_DEL>{numDeletions, cmp}
+{ }
+
+// PbiNumInsertedBasesFilter
+
+inline PbiNumInsertedBasesFilter::PbiNumInsertedBasesFilter(const size_t numInsertions, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<size_t, PbiFile::MappedField::N_INS>{numInsertions, cmp}
+{ }
+
+// PbiNumMatchesFilter
+
+inline PbiNumMatchesFilter::PbiNumMatchesFilter(const size_t numMatchedBases, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<size_t, PbiFile::MappedField::N_M>{numMatchedBases, cmp}
+{ }
+
+// PbiNumMismatchesFilter
+
+inline PbiNumMismatchesFilter::PbiNumMismatchesFilter(const size_t numMismatchedBases, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<size_t, PbiFile::MappedField::N_MM>{numMismatchedBases, cmp}
+{ }
+
+// PbiQueryEndFilter
+
+inline PbiQueryEndFilter::PbiQueryEndFilter(const int32_t position, const Compare::Type cmp)
+    : internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::Q_END>{position, cmp}
+{ }
+
+// PbiQueryLengthFilter
+
+inline PbiQueryLengthFilter::PbiQueryLengthFilter(const int32_t length, const Compare::Type cmp)
+    : internal::FilterBase<int32_t>{length, cmp}
+{ }
+
+// PbiQueryStartFilter
+
+inline PbiQueryStartFilter::PbiQueryStartFilter(const int32_t position, const Compare::Type cmp)
+    : internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::Q_START>{position, cmp}
+{ }
+
+// PbiReadAccuracyFilter
+
+inline PbiReadAccuracyFilter::PbiReadAccuracyFilter(const Accuracy accuracy, const Compare::Type cmp)
+    : internal::BasicDataFilterBase<Accuracy, PbiFile::BasicField::READ_QUALITY>{accuracy, cmp}
+{ }
+
+// PbiReadGroupFilter
+
+inline PbiReadGroupFilter::PbiReadGroupFilter(const int32_t rgId, const Compare::Type cmp)
+    : internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::RG_ID>{rgId, cmp}
+{ }
+
+inline PbiReadGroupFilter::PbiReadGroupFilter(const std::string& rgId, const Compare::Type cmp)
+    : PbiReadGroupFilter{ReadGroupInfo::IdToInt(rgId), cmp}
+{ }
+
+inline PbiReadGroupFilter::PbiReadGroupFilter(const ReadGroupInfo& rg, const Compare::Type cmp)
+    : PbiReadGroupFilter{rg.Id(), cmp}
+{ }
+
+inline PbiReadGroupFilter::PbiReadGroupFilter(std::vector<int32_t> whitelist, const Compare::Type cmp)
+    : internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::RG_ID>{std::move(whitelist), cmp}
+{ }
+
+inline PbiReadGroupFilter::PbiReadGroupFilter(const std::vector<std::string>& whitelist, const Compare::Type cmp)
+    : internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::RG_ID>{std::vector<int32_t>{}, cmp}
+{
+    multiValue_->reserve(whitelist.size());
+    for (const auto& rg : whitelist)
+        multiValue_->push_back(ReadGroupInfo::IdToInt(rg));
+}
+
+inline PbiReadGroupFilter::PbiReadGroupFilter(const std::vector<ReadGroupInfo>& whitelist, const Compare::Type cmp)
+    : internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::RG_ID>{std::vector<int32_t>{}, cmp}
+{
+    multiValue_->reserve(whitelist.size());
+    for (const auto& rg : whitelist)
+        multiValue_->push_back(ReadGroupInfo::IdToInt(rg.Id()));
+}
+
+// PbiReferenceEndFilter
+
+inline PbiReferenceEndFilter::PbiReferenceEndFilter(const uint32_t tEnd, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<uint32_t, PbiFile::MappedField::T_END>{tEnd, cmp}
+{ }
+
+// PbiReferenceIdFilter
+
+inline PbiReferenceIdFilter::PbiReferenceIdFilter(const int32_t tId, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<int32_t, PbiFile::MappedField::T_ID>{tId, cmp}
+{ }
+
+inline PbiReferenceIdFilter::PbiReferenceIdFilter(std::vector<int32_t> whitelist, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<int32_t, PbiFile::MappedField::T_ID>{std::move(whitelist), cmp}
+{ }
+
+// PbiReferenceStartFilter
+
+inline PbiReferenceStartFilter::PbiReferenceStartFilter(const uint32_t tStart, const Compare::Type cmp)
+    : internal::MappedDataFilterBase<uint32_t, PbiFile::MappedField::T_START>{tStart, cmp}
+{ }
+
+// PbiZmwFilter
+
+/// Filter on BasicField::ZMW against the single hole number \p zmw.
+inline PbiZmwFilter::PbiZmwFilter(const int32_t zmw,
+                                  const Compare::Type cmp)
+    : internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::ZMW>{zmw, cmp}
+{
+}
+
+/// Filter on BasicField::ZMW against a list of hole numbers; \p whitelist is
+/// moved into the base class.
+inline PbiZmwFilter::PbiZmwFilter(std::vector<int32_t> whitelist,
+                                  const Compare::Type cmp)
+    : internal::BasicDataFilterBase<int32_t, PbiFile::BasicField::ZMW>{std::move(whitelist), cmp}
+{
+}
+
+// PbiZmwModuloFilter
+
+inline PbiZmwModuloFilter::PbiZmwModuloFilter(
+        const uint32_t denominator,
+        const uint32_t value,
+        const FilterHash hashType,
+        const Compare::Type cmp)
+    : denominator_{denominator}
+    , value_{value}
+    , hash_{hashType}
+    , cmp_{cmp}
+{ }
+
+/// "Hash" that simply reinterprets the signed hole number \p zm as an
+/// unsigned 32-bit value (negative inputs wrap modulo 2^32).
+inline uint32_t UnsignedLongIntCast(const int32_t zm)
+{
+    const auto asUnsigned = static_cast<uint32_t>(zm);
+    return asUnsigned;
+}
+
+/// Hashes \p zm by splitting it into its upper and lower 16 bits and feeding
+/// them, in that order, into boost::hash_combine starting from a zero seed.
+/// The resulting seed is truncated to 32 bits.
+inline uint32_t BoostHashCombine(const int32_t zm)
+{
+    constexpr uint16_t lowWordMask = 0xFFFF;
+
+    const uint16_t highWord = (zm >> 16) & lowWordMask;
+    const uint16_t lowWord = zm & lowWordMask;
+
+    // FIXME: discrepancies with Python API. Will return to nail down.
+
+    size_t combined = 0;
+    boost::hash_combine(combined, highWord);
+    boost::hash_combine(combined, lowWord);
+    return static_cast<uint32_t>(combined);
+}
+
+inline bool PbiZmwModuloFilter::Accepts(const PbiRawData& idx,
+                                        const size_t row) const
+{
+    const auto zm = idx.BasicData().holeNumber_.at(row);
+
+    uint32_t hashValue;
+    switch(hash_)
+    {
+        case FilterHash::UNSIGNED_LONG_CAST :
+        {
+            hashValue = UnsignedLongIntCast(zm);
+            break;
+        }
+
+        case FilterHash::BOOST_HASH_COMBINE :
+        {
+            hashValue = BoostHashCombine(zm);
+            break;
+        }
+
+        default:
+            throw std::runtime_error{"unsupported filter hash type"};
+    }
+
+    const auto modResult = hashValue % denominator_;
+    return Compare::Check(modResult, value_, cmp_);
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/PbiRawData.inl b/include/pbbam/internal/PbiRawData.inl

new file mode 100644 (file)

index 0000000..9e0158d
--- /dev/null
+++ b/include/pbbam/internal/PbiRawData.inl
@@ -0,0 +1,78 @@
+// File Description
+/// \file PbiRawData.inl
+/// \brief Inline implementations for the classes used for working with raw PBI
+///        data.
+//
+// Author: Derek Barnett
+
+#include "pbbam/PbiRawData.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// Read-only access to the barcode section.
+inline const PbiRawBarcodeData& PbiRawData::BarcodeData() const
+{
+    return barcodeData_;
+}
+
+/// Mutable access to the barcode section.
+inline PbiRawBarcodeData& PbiRawData::BarcodeData()
+{
+    return barcodeData_;
+}
+
+/// Read-only access to the basic section.
+inline const PbiRawBasicData& PbiRawData::BasicData() const
+{
+    return basicData_;
+}
+
+/// Mutable access to the basic section.
+inline PbiRawBasicData& PbiRawData::BasicData()
+{
+    return basicData_;
+}
+
+/// Returns a copy of the stored filename.
+inline std::string PbiRawData::Filename() const
+{
+    return filename_;
+}
+
+/// Returns the stored section flags.
+inline PbiFile::Sections PbiRawData::FileSections() const
+{
+    return sections_;
+}
+
+/// Replaces the stored section flags; returns *this for chaining.
+inline PbiRawData& PbiRawData::FileSections(PbiFile::Sections sections)
+{
+    sections_ = sections;
+    return *this;
+}
+
+/// True if the PbiFile::BARCODE flag is set in the section flags.
+inline bool PbiRawData::HasBarcodeData() const
+{
+    return HasSection(PbiFile::BARCODE);
+}
+
+/// True if the PbiFile::MAPPED flag is set in the section flags.
+inline bool PbiRawData::HasMappedData() const
+{
+    return HasSection(PbiFile::MAPPED);
+}
+
+/// True if the PbiFile::REFERENCE flag is set in the section flags.
+inline bool PbiRawData::HasReferenceData() const
+{
+    return HasSection(PbiFile::REFERENCE);
+}
+
+/// Bit test: true if \p section shares any set bit with the stored flags.
+inline bool PbiRawData::HasSection(const PbiFile::Section section) const
+{
+    const auto overlap = (sections_ & section);
+    return overlap != 0;
+}
+
+/// Returns the stored read count.
+inline uint32_t PbiRawData::NumReads() const
+{
+    return numReads_;
+}
+
+/// Replaces the stored read count; returns *this for chaining.
+inline PbiRawData& PbiRawData::NumReads(uint32_t num)
+{
+    numReads_ = num;
+    return *this;
+}
+
+/// Read-only access to the mapped section.
+inline const PbiRawMappedData& PbiRawData::MappedData() const
+{
+    return mappedData_;
+}
+
+/// Mutable access to the mapped section.
+inline PbiRawMappedData& PbiRawData::MappedData()
+{
+    return mappedData_;
+}
+
+/// Read-only access to the reference section.
+inline const PbiRawReferenceData& PbiRawData::ReferenceData() const
+{
+    return referenceData_;
+}
+
+/// Mutable access to the reference section.
+inline PbiRawReferenceData& PbiRawData::ReferenceData()
+{
+    return referenceData_;
+}
+
+/// Returns the stored PBI version.
+inline PbiFile::VersionEnum PbiRawData::Version() const
+{
+    return version_;
+}
+
+/// Replaces the stored PBI version; returns *this for chaining.
+inline PbiRawData& PbiRawData::Version(PbiFile::VersionEnum version)
+{
+    version_ = version;
+    return *this;
+}
+
+/// Two entries are equal when tId_, beginRow_ and endRow_ all match
+/// (checked in that order, stopping at the first mismatch).
+inline bool PbiReferenceEntry::operator==(const PbiReferenceEntry& other) const
+{
+    const bool sameReference = (tId_ == other.tId_);
+    return sameReference && (beginRow_ == other.beginRow_) && (endRow_ == other.endRow_);
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/ProgramInfo.inl b/include/pbbam/internal/ProgramInfo.inl

new file mode 100644 (file)

index 0000000..048e7a4
--- /dev/null
+++ b/include/pbbam/internal/ProgramInfo.inl
@@ -0,0 +1,62 @@
+// File Description
+/// \file ProgramInfo.inl
+/// \brief Inline implementations for the ProgramInfo class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/ProgramInfo.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline std::string ProgramInfo::CommandLine() const
+{ return commandLine_; }
+
+inline ProgramInfo& ProgramInfo::CommandLine(std::string cmd)
+{ commandLine_ = std::move(cmd); return *this; }
+
+inline std::map<std::string, std::string> ProgramInfo::CustomTags() const
+{ return custom_; }
+
+inline ProgramInfo& ProgramInfo::CustomTags(std::map<std::string,
+                                            std::string> custom)
+{ custom_ = std::move(custom); return *this; }
+
+inline std::string ProgramInfo::Description() const
+{ return description_; }
+
+inline ProgramInfo& ProgramInfo::Description(std::string description)
+{ description_ = std::move(description); return *this; }
+
+inline std::string ProgramInfo::Id() const
+{ return id_; }
+
+inline ProgramInfo& ProgramInfo::Id(std::string id)
+{ id_ = std::move(id); return *this; }
+
+inline bool ProgramInfo::IsValid() const
+{ return !id_.empty(); }
+
+inline std::string ProgramInfo::Name() const
+{ return name_; }
+
+inline ProgramInfo& ProgramInfo::Name(std::string name)
+{ name_ = std::move(name); return *this; }
+
+inline std::string ProgramInfo::PreviousProgramId() const
+{ return previousProgramId_; }
+
+inline ProgramInfo& ProgramInfo::PreviousProgramId(std::string id)
+{ previousProgramId_ = std::move(id); return *this; }
+
+inline std::string ProgramInfo::ToSam(const ProgramInfo& prog)
+{ return prog.ToSam(); }
+
+inline std::string ProgramInfo::Version() const
+{ return version_; }
+
+inline ProgramInfo& ProgramInfo::Version(std::string version)
+{ version_ = std::move(version); return *this; }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/QualityValue.inl b/include/pbbam/internal/QualityValue.inl

new file mode 100644 (file)

index 0000000..caf9c35
--- /dev/null
+++ b/include/pbbam/internal/QualityValue.inl
@@ -0,0 +1,30 @@
+// File Description
+/// \file QualityValue.inl
+/// \brief Inline implementations for the QualityValue class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/QualityValue.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// Stores \p value, capped at QualityValue::MAX.
+inline QualityValue::QualityValue(const uint8_t value) : value_{value}
+{
+    // anything above the maximum is clamped down to it
+    if (QualityValue::MAX < value_) {
+        value_ = QualityValue::MAX;
+    }
+}
+
+/// FASTQ character for this value: the stored number shifted up by 33.
+inline char QualityValue::Fastq() const
+{
+    const auto shifted = value_ + 33;
+    return static_cast<char>(shifted);
+}
+
+/// Conversion to the raw stored number.
+inline QualityValue::operator uint8_t() const
+{
+    return value_;
+}
+
+/// Inverse of Fastq(): subtracts 33 from \p c, narrows the result to uint8_t
+/// and builds a QualityValue from it (so the constructor's clamp applies).
+inline QualityValue QualityValue::FromFastq(const char c)
+{
+    const auto raw = static_cast<uint8_t>(c - 33);
+    return QualityValue{raw};
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/QualityValues.inl b/include/pbbam/internal/QualityValues.inl

new file mode 100644 (file)

index 0000000..a9c7019
--- /dev/null
+++ b/include/pbbam/internal/QualityValues.inl
@@ -0,0 +1,83 @@
+// File Description
+/// \file QualityValues.inl
+/// \brief Inline implementations for the QualityValues class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/QualityValues.h"
+#include <algorithm>
+
+namespace PacBio {
+namespace BAM {
+
+inline QualityValues::QualityValues(const std::string& fastqString)
+    : std::vector<QualityValue>{}
+{
+    resize(fastqString.size());
+    std::transform(fastqString.cbegin(), fastqString.cend(),
+                   begin(), QualityValue::FromFastq);
+}
+
+inline QualityValues::QualityValues(std::vector<QualityValue> quals)
+    : std::vector<QualityValue>{std::move(quals)}
+{ }
+
+inline QualityValues::QualityValues(const std::vector<uint8_t>& quals)
+    : std::vector<QualityValue>(quals.size())
+{
+    std::copy(quals.cbegin(), quals.cend(), begin());
+}
+
+inline QualityValues::QualityValues(const std::vector<uint8_t>::const_iterator first,
+                                    const std::vector<uint8_t>::const_iterator last)
+    : std::vector<QualityValue>(first, last)
+{ }
+
+inline QualityValues::QualityValues(const QualityValues::const_iterator first,
+                                    const QualityValues::const_iterator last)
+    : std::vector<QualityValue>{}
+{
+    assign(first, last);
+}
+
+/// Replaces the contents with \p quals (moved in); returns *this.
+inline QualityValues& QualityValues::operator=(std::vector<QualityValue> quals)
+{
+    // qualified call: invokes the base-class assignment, not this operator
+    std::vector<QualityValue>::operator=(std::move(quals));
+    return *this;
+}
+
+// The iterator accessors below forward straight to the std::vector base.
+
+inline std::vector<QualityValue>::const_iterator QualityValues::cbegin() const
+{
+    return std::vector<QualityValue>::cbegin();
+}
+
+inline std::vector<QualityValue>::const_iterator QualityValues::cend() const
+{
+    return std::vector<QualityValue>::cend();
+}
+
+inline std::vector<QualityValue>::const_iterator QualityValues::begin() const
+{
+    return std::vector<QualityValue>::begin();
+}
+
+inline std::vector<QualityValue>::const_iterator QualityValues::end() const
+{
+    return std::vector<QualityValue>::end();
+}
+
+inline std::vector<QualityValue>::iterator QualityValues::begin()
+{
+    return std::vector<QualityValue>::begin();
+}
+
+inline std::vector<QualityValue>::iterator QualityValues::end()
+{
+    return std::vector<QualityValue>::end();
+}
+
+/// Named constructor: same as QualityValues{fastq}.
+inline QualityValues QualityValues::FromFastq(const std::string& fastq)
+{
+    return QualityValues{fastq};
+}
+
+/// Renders the container as a FASTQ string, one character per element.
+inline std::string QualityValues::Fastq() const
+{
+    std::string encoded;
+    encoded.reserve(size());
+    for (const auto quality : *this) {
+        encoded += quality.Fastq();
+    }
+    return encoded;
+}
+
+/// Compares against the values decoded from the FASTQ string \p fastq.
+inline bool QualityValues::operator==(const std::string& fastq) const
+{
+    const QualityValues decoded(fastq);
+    return *this == decoded;
+}
+
+/// Negated form of the comparison against a FASTQ string.
+inline bool QualityValues::operator!=(const std::string& fastq) const
+{
+    const QualityValues decoded(fastq);
+    return *this != decoded;
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/QueryBase.h b/include/pbbam/internal/QueryBase.h

new file mode 100644 (file)

index 0000000..13b62b6
--- /dev/null
+++ b/include/pbbam/internal/QueryBase.h
@@ -0,0 +1,104 @@
+// Author: Derek Barnett
+
+#ifndef QUERYBASE_H
+#define QUERYBASE_H
+
+#include <cassert>
+#include <memory>
+#include <vector>
+#include "pbbam/BamFile.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/DataSet.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+template <typename T>
+class QueryBase;
+
+/// Shared state and behaviour for the query iterators below.
+///
+/// Holds a pointer to the query being iterated plus the most recently read
+/// record. A null query_ marks the "end" state: that is what a
+/// default-constructed iterator holds, and what ReadNext() switches to once
+/// the query's GetNext() reports that nothing more is available.
+template <typename T>
+class QueryIteratorBase
+{
+public:
+    ~QueryIteratorBase() = default;
+
+    /// Iterators compare equal when they hold the same query pointer; the
+    /// current record is not part of the comparison.
+    bool operator==(const QueryIteratorBase<T>& other) const;
+    bool operator!=(const QueryIteratorBase<T>& other) const;
+
+protected:
+    /// Creates an iterator in the "end" state (query_ == nullptr).
+    QueryIteratorBase() = default;
+    /// Attaches to \p query and immediately reads the first record.
+    QueryIteratorBase(QueryBase<T>& query);
+
+    /// Fetches the next record into record_; clears query_ when GetNext()
+    /// returns false. Asserts that query_ is non-null.
+    void ReadNext();
+
+protected:
+    QueryBase<T>* query_ = nullptr;  ///< query being iterated; nullptr == end
+    T record_;                       ///< storage for the current record
+};
+
+/// Mutable iterator over a QueryBase<T>: dereferencing yields a non-const
+/// reference to the iterator's own current record.
+template <typename T>
+class QueryIterator : public QueryIteratorBase<T>
+{
+public:
+    /// End iterator.
+    QueryIterator() = default;
+    /// Iterator positioned on the first record of \p query.
+    QueryIterator(QueryBase<T>& query);
+
+    T& operator*();
+    T* operator->();
+
+    /// Pre-increment: reads the next record from the query.
+    QueryIterator<T>& operator++();
+    /// Post-increment: returns a copy taken before reading the next record.
+    QueryIterator<T> operator++(int);
+};
+
+/// Read-only iterator over a QueryBase<T>: dereferencing yields a const
+/// reference to the iterator's own current record.
+template <typename T>
+class QueryConstIterator : public QueryIteratorBase<T>
+{
+public:
+    /// End iterator.
+    QueryConstIterator() = default;
+    /// Iterator positioned on the first record of \p query. The
+    /// implementation casts away the constness of \p query, because advancing
+    /// goes through the non-const QueryBase<T>::GetNext().
+    QueryConstIterator(const QueryBase<T>& query);
+
+    const T& operator*() const;
+    const T* operator->() const;
+
+    /// Pre-increment: reads the next record from the query.
+    QueryConstIterator<T>& operator++();
+    /// Post-increment: returns a copy taken before reading the next record.
+    QueryConstIterator<T> operator++(int);
+};
+
+/// Abstract base for queries that hand out records of type T one at a time.
+///
+/// Derived classes implement GetNext(); begin()/end() wrap that in iterators
+/// so a query can be used in a range-based for loop. Each begin() call builds
+/// an iterator that immediately calls GetNext() on this query.
+template <typename T>
+class QueryBase
+{
+
+public:
+    using iterator = QueryIterator<T>;
+    using const_iterator = QueryConstIterator<T>;
+
+public:
+    virtual ~QueryBase() = default;
+
+public:
+    QueryConstIterator<T> begin() const;
+    QueryConstIterator<T> cbegin() const;
+    QueryIterator<T> begin();
+
+    /// The end iterators are default-constructed, i.e. not tied to a query.
+    QueryConstIterator<T> end() const;
+    QueryConstIterator<T> cend() const;
+    QueryIterator<T> end();
+
+public:
+    /// Stores the next record in \p r. The iterators treat a false return
+    /// value as "no more records".
+    virtual bool GetNext(T& r) = 0;
+
+protected:
+    QueryBase() = default;
+};
+
+using IQuery = QueryBase<BamRecord>;
+using IGroupQuery = QueryBase<std::vector<BamRecord>>;
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "pbbam/internal/QueryBase.inl"
+
+#endif  // QUERYBASE_H
diff --git a/include/pbbam/internal/QueryBase.inl b/include/pbbam/internal/QueryBase.inl

new file mode 100644 (file)

index 0000000..49cf860
--- /dev/null
+++ b/include/pbbam/internal/QueryBase.inl
@@ -0,0 +1,122 @@
+// Author: Derek Barnett
+
+#include "pbbam/internal/QueryBase.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// -------------------
+// QueryIteratorBase
+// -------------------
+
+/// Attaches the iterator to \p query and immediately reads the first record.
+template <typename T>
+inline QueryIteratorBase<T>::QueryIteratorBase(QueryBase<T>& query) : query_{&query}
+{
+    ReadNext();
+}
+
+/// Equality is decided by the query pointer alone; the current record does
+/// not take part in the comparison.
+template <typename T>
+inline bool QueryIteratorBase<T>::operator==(const QueryIteratorBase<T>& other) const
+{
+    return query_ == other.query_;
+}
+
+template <typename T>
+inline bool QueryIteratorBase<T>::operator!=(const QueryIteratorBase<T>& other) const
+{
+    return query_ != other.query_;
+}
+
+// -------------------
+// QueryIterator
+// -------------------
+
+/// Iterator positioned on the first record of \p query.
+template <typename T>
+inline QueryIterator<T>::QueryIterator(QueryBase<T>& query) : QueryIteratorBase<T>{query}
+{
+}
+
+/// Reference to the record currently held by this iterator.
+template <typename T>
+inline T& QueryIterator<T>::operator*()
+{
+    return this->record_;
+}
+
+/// Pointer to the record currently held by this iterator.
+template <typename T>
+inline T* QueryIterator<T>::operator->()
+{
+    return &(this->record_);
+}
+
+/// Pre-increment: reads the next record.
+template <typename T>
+inline QueryIterator<T>& QueryIterator<T>::operator++()
+{
+    this->ReadNext();
+    return *this;
+}
+
+/// Post-increment: advances, but returns a copy of the previous state.
+template <typename T>
+inline QueryIterator<T> QueryIterator<T>::operator++(int)
+{
+    QueryIterator<T> previous(*this);
+    operator++();
+    return previous;
+}
+
+// --------------------
+// QueryConstIterator
+// --------------------
+
+/// Iterator positioned on the first record of \p query. Constness is cast
+/// away because advancing calls the non-const GetNext() of the query.
+template <typename T>
+inline QueryConstIterator<T>::QueryConstIterator(const QueryBase<T>& query)
+    : QueryIteratorBase<T>{const_cast<QueryBase<T>&>(query)}
+{
+}
+
+/// Read-only reference to the record currently held by this iterator.
+template <typename T>
+inline const T& QueryConstIterator<T>::operator*() const
+{
+    return this->record_;
+}
+
+/// Read-only pointer to the record currently held by this iterator.
+template <typename T>
+inline const T* QueryConstIterator<T>::operator->() const
+{
+    return &(this->record_);
+}
+
+/// Pre-increment: reads the next record.
+template <typename T>
+inline QueryConstIterator<T>& QueryConstIterator<T>::operator++()
+{
+    this->ReadNext();
+    return *this;
+}
+
+/// Post-increment: advances, but returns a copy of the previous state.
+template <typename T>
+inline QueryConstIterator<T> QueryConstIterator<T>::operator++(int)
+{
+    QueryConstIterator<T> previous(*this);
+    operator++();
+    return previous;
+}
+
+// -----------
+// QueryBase
+// -----------
+
+/// Const begin: the returned iterator has already read the first record.
+template <typename T>
+inline QueryConstIterator<T> QueryBase<T>::begin() const
+{
+    return QueryConstIterator<T>(*this);
+}
+
+/// Mutable begin: the returned iterator has already read the first record.
+template <typename T>
+inline QueryIterator<T> QueryBase<T>::begin()
+{
+    return QueryIterator<T>(*this);
+}
+
+/// Same as the const overload of begin().
+template <typename T>
+inline QueryConstIterator<T> QueryBase<T>::cbegin() const
+{
+    return begin();
+}
+
+/// Same as the const overload of end().
+template <typename T>
+inline QueryConstIterator<T> QueryBase<T>::cend() const
+{
+    return end();
+}
+
+/// Const end: a default-constructed iterator.
+template <typename T>
+inline QueryConstIterator<T> QueryBase<T>::end() const
+{
+    return QueryConstIterator<T>();
+}
+
+/// Mutable end: a default-constructed iterator.
+template <typename T>
+inline QueryIterator<T> QueryBase<T>::end()
+{
+    return QueryIterator<T>();
+}
+
+/// Pulls the next record from the query into record_. When the query reports
+/// that nothing more is available, query_ is reset to nullptr, which makes
+/// this iterator compare equal to a default-constructed (end) iterator.
+template <typename T>
+inline void QueryIteratorBase<T>::ReadNext()
+{
+    assert(query_);
+    const bool gotRecord = query_->GetNext(record_);
+    if (!gotRecord) {
+        query_ = nullptr;
+    }
+}
+
+} // namespace internal
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/ReadGroupInfo.inl b/include/pbbam/internal/ReadGroupInfo.inl

new file mode 100644 (file)

index 0000000..4282081
--- /dev/null
+++ b/include/pbbam/internal/ReadGroupInfo.inl
@@ -0,0 +1,262 @@
+// File Description
+/// \file ReadGroupInfo.inl
+/// \brief Inline implementations for the ReadGroupInfo class.
+//
+// Author: Derek Barnett
+
+#include <stdexcept>
+#include "pbbam/ReadGroupInfo.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline size_t ReadGroupInfo::BarcodeCount() const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error{"barcode count requested but barcode data is missing"};
+    return barcodeCount_;
+}
+
+inline ReadGroupInfo& ReadGroupInfo::BarcodeData(std::string barcodeFile,
+                                                 std::string barcodeHash,
+                                                 size_t barcodeCount,
+                                                 BarcodeModeType barcodeMode,
+                                                 BarcodeQualityType barcodeQuality)
+{
+    barcodeFile_ = std::move(barcodeFile);
+    barcodeHash_ = std::move(barcodeHash);
+    barcodeCount_ = barcodeCount;
+    barcodeMode_ = barcodeMode;
+    barcodeQuality_ = barcodeQuality;
+    hasBarcodeData_ = true;
+    return *this;
+}
+
+inline std::string ReadGroupInfo::BarcodeFile() const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error{"barcode file requested but barcode data is missing"};
+    return barcodeFile_;
+}
+
+inline std::string ReadGroupInfo::BarcodeHash() const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error{"barcode hash requested but barcode data is missing"};
+    return barcodeHash_;
+}
+
+inline BarcodeModeType ReadGroupInfo::BarcodeMode() const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error{"barcode mode requested but barcode data is missing"};
+    return barcodeMode_;
+}
+
+inline BarcodeQualityType ReadGroupInfo::BarcodeQuality() const
+{
+    if (!hasBarcodeData_)
+        throw std::runtime_error{"barcode quality requested but barcode data is missing"};
+    return barcodeQuality_;
+}
+
+inline std::string ReadGroupInfo::BasecallerVersion() const
+{ return basecallerVersion_; }
+
+inline ReadGroupInfo& ReadGroupInfo::BasecallerVersion(std::string versionNumber)
+{
+    if (basecallerVersion_ != versionNumber) { 
+        basecallerVersion_ = std::move(versionNumber);
+        sequencingChemistry_.clear(); // reset cached chemistry name
+    }
+    return *this; 
+}
+
+inline std::string ReadGroupInfo::BaseFeatureTag(BaseFeature feature) const
+{
+    const auto iter = features_.find(feature);
+    if (iter == features_.end())
+        return {};
+    return iter->second;
+}
+
+inline ReadGroupInfo& ReadGroupInfo::BaseFeatureTag(BaseFeature feature,
+                                                    std::string tag)
+{ features_[feature] = std::move(tag); return *this; }
+
+inline std::string ReadGroupInfo::BindingKit() const
+{ return bindingKit_; }
+
+inline ReadGroupInfo& ReadGroupInfo::BindingKit(std::string kitNumber)
+{
+    if (bindingKit_ != kitNumber) { 
+        bindingKit_ = std::move(kitNumber);
+        sequencingChemistry_.clear(); // reset cached chemistry name
+    }
+    return *this; 
+}
+
+/// Marks barcode data as absent and empties the barcode file and hash
+/// strings. The remaining barcode members are left untouched, but the
+/// barcode getters in this file throw while the flag is false. Returns *this.
+inline ReadGroupInfo& ReadGroupInfo::ClearBarcodeData()
+{
+    hasBarcodeData_ = false;
+    barcodeFile_.clear();
+    barcodeHash_.clear();
+    return *this;
+}
+
+/// Removes every registered base feature. Returns *this.
+inline ReadGroupInfo& ReadGroupInfo::ClearBaseFeatures()
+{
+    features_.clear();
+    return *this;
+}
+
+inline bool ReadGroupInfo::Control() const
+{ return control_; }
+
+inline ReadGroupInfo& ReadGroupInfo::Control(bool ctrl)
+{ control_ = ctrl; return *this; }
+
+inline std::map<std::string, std::string> ReadGroupInfo::CustomTags() const
+{ return custom_; }
+
+inline ReadGroupInfo& ReadGroupInfo::CustomTags(std::map<std::string, std::string> custom)
+{ custom_ = std::move(custom); return *this; }
+
+inline std::string ReadGroupInfo::Date() const
+{ return date_; }
+
+inline ReadGroupInfo& ReadGroupInfo::Date(std::string date)
+{ date_ = std::move(date); return *this; }
+
+inline std::string ReadGroupInfo::FlowOrder() const
+{ return flowOrder_; }
+
+inline ReadGroupInfo& ReadGroupInfo::FlowOrder(std::string order)
+{ flowOrder_ = std::move(order); return *this; }
+
+inline std::string ReadGroupInfo::FrameRateHz() const
+{ return frameRateHz_; }
+
+inline ReadGroupInfo& ReadGroupInfo::FrameRateHz(std::string frameRateHz)
+{ frameRateHz_ = std::move(frameRateHz); return *this; }
+
+inline bool ReadGroupInfo::HasBarcodeData() const
+{ return hasBarcodeData_; }
+
+inline bool ReadGroupInfo::HasBaseFeature(BaseFeature feature) const
+{ return features_.find(feature) != features_.end(); }
+
+inline std::string ReadGroupInfo::Id() const
+{ return id_; }
+
+inline ReadGroupInfo& ReadGroupInfo::Id(std::string id)
+{ id_ = std::move(id); return *this; }
+
+inline ReadGroupInfo& ReadGroupInfo::Id(const std::string& movieName,
+                                        const std::string& readType)
+{ id_ = MakeReadGroupId(movieName, readType); return *this; }
+
+/// Parses \p rgId as a hexadecimal number and returns it as a signed 32-bit
+/// value: the parsed number is narrowed to uint32_t first and then
+/// reinterpreted as int32_t.
+///
+/// \throws std::invalid_argument / std::out_of_range as raised by std::stoul
+inline int32_t ReadGroupInfo::IdToInt(const std::string& rgId)
+{
+    const auto parsed = std::stoul(rgId, nullptr, 16);
+    const auto asUnsigned32 = static_cast<uint32_t>(parsed);
+    return static_cast<int32_t>(asUnsigned32);
+}
+
+inline FrameCodec ReadGroupInfo::IpdCodec() const
+{ return ipdCodec_; }
+
+inline bool ReadGroupInfo::IsValid() const
+{ return !id_.empty(); }
+
+inline std::string ReadGroupInfo::KeySequence() const
+{ return keySequence_; }
+
+inline ReadGroupInfo& ReadGroupInfo::KeySequence(std::string sequence)
+{ keySequence_ = std::move(sequence); return *this; }
+
+inline std::string ReadGroupInfo::Library() const
+{ return library_; }
+
+inline ReadGroupInfo& ReadGroupInfo::Library(std::string library)
+{ library_ = std::move(library); return *this; }
+
+inline std::string ReadGroupInfo::MovieName() const
+{ return movieName_; }
+
+inline ReadGroupInfo& ReadGroupInfo::MovieName(std::string movieName)
+{ movieName_ = std::move(movieName); return *this; }
+
+inline std::string ReadGroupInfo::Platform() const
+{ return std::string("PACBIO"); }
+
+inline PlatformModelType ReadGroupInfo::PlatformModel() const
+{ return platformModel_; }
+
+inline ReadGroupInfo& ReadGroupInfo::PlatformModel(PlatformModelType platform)
+{ platformModel_ = platform; return *this; }
+
+inline std::string ReadGroupInfo::PredictedInsertSize() const
+{ return predictedInsertSize_; }
+
+inline ReadGroupInfo& ReadGroupInfo::PredictedInsertSize(std::string size)
+{ predictedInsertSize_ = std::move(size); return *this; }
+
+inline std::string ReadGroupInfo::Programs() const
+{ return programs_; }
+
+inline ReadGroupInfo& ReadGroupInfo::Programs(std::string programs)
+{ programs_ = std::move(programs); return *this; }
+
+inline FrameCodec ReadGroupInfo::PulseWidthCodec() const
+{ return pulseWidthCodec_; }
+
+inline std::string ReadGroupInfo::ReadType() const
+{ return readType_; }
+
+inline ReadGroupInfo& ReadGroupInfo::ReadType(std::string type)
+{ readType_ = std::move(type); return *this; }
+
+/// Unregisters \p feature if it is present; does nothing otherwise.
+/// Returns *this for chaining.
+inline ReadGroupInfo& ReadGroupInfo::RemoveBaseFeature(BaseFeature feature)
+{
+    const auto position = features_.find(feature);
+    if (position == features_.end()) {
+        return *this;
+    }
+    features_.erase(position);
+    return *this;
+}
+
+inline std::string ReadGroupInfo::Sample() const
+{ return sample_; }
+
+inline ReadGroupInfo& ReadGroupInfo::Sample(std::string sample)
+{ sample_ = std::move(sample); return *this; }
+
+inline std::string ReadGroupInfo::SequencingCenter() const
+{ return sequencingCenter_; }
+
+inline ReadGroupInfo& ReadGroupInfo::SequencingCenter(std::string center)
+{ sequencingCenter_ = std::move(center); return *this; }
+
+inline std::string ReadGroupInfo::SequencingChemistry() const
+{
+    if (!sequencingChemistry_.empty()) return sequencingChemistry_;
+    return sequencingChemistry_ = SequencingChemistryFromTriple(BindingKit(),
+                                                                SequencingKit(),
+                                                                BasecallerVersion());
+}
+
+inline std::string ReadGroupInfo::SequencingKit() const
+{ return sequencingKit_; }
+
+inline ReadGroupInfo& ReadGroupInfo::SequencingKit(std::string kitNumber)
+{ 
+    if (sequencingKit_ != kitNumber) {
+        sequencingKit_ = std::move(kitNumber);
+        sequencingChemistry_.clear(); // reset cached chemistry name
+    }
+    return *this; }
+
+inline std::string ReadGroupInfo::ToSam(const ReadGroupInfo& rg)
+{ return rg.ToSam(); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/SequenceInfo.inl b/include/pbbam/internal/SequenceInfo.inl

new file mode 100644 (file)

index 0000000..392e0b6
--- /dev/null
+++ b/include/pbbam/internal/SequenceInfo.inl
@@ -0,0 +1,72 @@
+// File Description
+/// \file SequenceInfo.inl
+/// \brief Inline implementations for the SequenceInfo class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/SequenceInfo.h"
+
+namespace PacBio {
+namespace BAM {
+
+inline bool SequenceInfo::operator==(const SequenceInfo& other) const
+{
+    // Two entries are equal only when every stored field matches. Fields are
+    // compared one pair at a time and the first mismatch ends the comparison;
+    // the custom tag map is checked last.
+    if (assemblyId_ != other.assemblyId_ || checksum_ != other.checksum_) return false;
+    if (length_ != other.length_ || name_ != other.name_) return false;
+    if (species_ != other.species_ || uri_ != other.uri_) return false;
+    return custom_ == other.custom_;
+}
+
+inline bool SequenceInfo::operator!=(const SequenceInfo& other) const
+{ return !operator==(other); }  // defined purely as the negation of operator==
+
+// Returns a copy of the stored assembly id.
+inline std::string SequenceInfo::AssemblyId() const { return assemblyId_; }
+
+inline SequenceInfo& SequenceInfo::AssemblyId(std::string id)
+{ assemblyId_ = std::move(id); return *this; }  // sink parameter; returns *this for chaining
+
+// Returns a copy of the stored checksum string.
+inline std::string SequenceInfo::Checksum() const { return checksum_; }
+
+inline SequenceInfo& SequenceInfo::Checksum(std::string checksum)
+{ checksum_ = std::move(checksum); return *this; }  // sink parameter; chainable
+
+// Returns a copy of the whole custom tag map.
+inline std::map<std::string, std::string> SequenceInfo::CustomTags() const { return custom_; }
+
+inline SequenceInfo& SequenceInfo::CustomTags(std::map<std::string, std::string> custom)
+{ custom_ = std::move(custom); return *this; }  // replaces the entire map; chainable
+
+// Returns a copy of the stored length; note that it is kept as a string.
+inline std::string SequenceInfo::Length() const { return length_; }
+
+inline SequenceInfo& SequenceInfo::Length(std::string length)
+{ length_ = std::move(length); return *this; }  // sink parameter; returns *this for chaining
+
+// Returns a copy of the stored name.
+inline std::string SequenceInfo::Name() const { return name_; }
+
+inline SequenceInfo& SequenceInfo::Name(std::string name)
+{ name_ = std::move(name); return *this; }  // sink parameter; returns *this for chaining
+
+// Returns a copy of the stored species string.
+inline std::string SequenceInfo::Species() const { return species_; }
+
+inline SequenceInfo& SequenceInfo::Species(std::string species)
+{ species_ = std::move(species); return *this; }  // sink parameter; chainable
+
+// Convenience overload taking the sequence entry as an argument; forwards to seq.ToSam().
+inline std::string SequenceInfo::ToSam(const SequenceInfo& seq) { return seq.ToSam(); }
+
+// Returns a copy of the stored URI string.
+inline std::string SequenceInfo::Uri() const { return uri_; }
+
+inline SequenceInfo& SequenceInfo::Uri(std::string uri)
+{ uri_ = std::move(uri); return *this; }  // sink parameter; returns *this for chaining
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/Tag.inl b/include/pbbam/internal/Tag.inl

new file mode 100644 (file)

index 0000000..51e253b
--- /dev/null
+++ b/include/pbbam/internal/Tag.inl
@@ -0,0 +1,417 @@
+// File Description
+/// \file Tag.inl
+/// \brief Inline implementations for the Tag class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Tag.h"
+#include <boost/numeric/conversion/cast.hpp>
+#include <iostream>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+template<typename T>  // printable, non-space ASCII runs from '!' (33) through '~' (126)
+inline bool InAsciiRange(const T x)
+{ return (x >= 33 && x <= 126); }  // 127 is DEL (a control code), so it is rejected
+
+struct AsciiConvertVisitor : public boost::static_visitor<char>
+{
+    // Integral payloads are range-checked by Helper() and then narrowed to char.
+    char operator() (const int8_t& x) const   { return Helper(x); }
+    char operator() (const uint8_t& x) const  { return Helper(x); }
+    char operator() (const int16_t& x) const  { return Helper(x); }
+    char operator() (const uint16_t& x) const { return Helper(x); }
+    char operator() (const int32_t& x) const  { return Helper(x); }
+    char operator() (const uint32_t& x) const { return Helper(x); }
+
+    // Catch-all for every other payload type: the conversion is refused.
+    template<typename T>
+    char operator()(const T&) const
+    { throw std::runtime_error{"conversion not supported"}; }
+
+private:
+    // Throws std::runtime_error unless InAsciiRange(x) holds.
+    template<typename T>
+    static char Helper(const T& x)
+    {
+        if (InAsciiRange(x)) return static_cast<char>(x);
+        throw std::runtime_error{"not valid ASCII"};
+    }
+};
+
+template<typename DesiredType>
+struct NumericConvertVisitor : public boost::static_visitor<DesiredType>
+{
+    // Integral payloads: converted with boost::numeric_cast to DesiredType.
+    DesiredType operator() (const int8_t& x) const   { return boost::numeric_cast<DesiredType>(x); }
+    DesiredType operator() (const uint8_t& x) const  { return boost::numeric_cast<DesiredType>(x); }
+    DesiredType operator() (const int16_t& x) const  { return boost::numeric_cast<DesiredType>(x); }
+    DesiredType operator() (const uint16_t& x) const { return boost::numeric_cast<DesiredType>(x); }
+    DesiredType operator() (const int32_t& x) const  { return boost::numeric_cast<DesiredType>(x); }
+    DesiredType operator() (const uint32_t& x) const { return boost::numeric_cast<DesiredType>(x); }
+
+    // Fallback for every other payload type: always throws, naming both types involved.
+    template<typename T> DesiredType operator()(const T& t) const
+    {
+        std::string msg{"conversion not supported: "};
+        msg += typeid(t).name();
+        msg += " -> ";
+        msg += typeid(DesiredType).name();
+        throw std::runtime_error(msg);
+    }
+};
+
+using ToInt8ConvertVisitor   = NumericConvertVisitor<int8_t>;    // used by Tag::ToInt8()
+using ToUInt8ConvertVisitor  = NumericConvertVisitor<uint8_t>;   // used by Tag::ToUInt8()
+using ToInt16ConvertVisitor  = NumericConvertVisitor<int16_t>;   // used by Tag::ToInt16()
+using ToUInt16ConvertVisitor = NumericConvertVisitor<uint16_t>;  // used by Tag::ToUInt16()
+using ToInt32ConvertVisitor  = NumericConvertVisitor<int32_t>;   // used by Tag::ToInt32()
+using ToUInt32ConvertVisitor = NumericConvertVisitor<uint32_t>;  // used by Tag::ToUInt32()
+
+struct IsEqualVisitor : public boost::static_visitor<bool>
+{
+    // Same alternative on both sides: defer to that type's own operator==.
+    template <typename T>
+    bool operator() (const T& lhs, const T& rhs) const
+    { return lhs == rhs; }
+
+    // Two empty (boost::blank) payloads are always equal.
+    bool operator() (const boost::blank&, const boost::blank&) const
+    { return true; }
+
+    // Mixed alternatives never compare equal: no cross-type conversion is
+    // attempted, so the values themselves are not even inspected.
+    template <typename T, typename U>
+    bool operator() (const T&, const U&) const
+    { return false; }
+};
+
+struct TypenameVisitor : public boost::static_visitor<std::string>
+{   // one overload per variant alternative; each returns a fixed display name
+    std::string operator() (const boost::blank&) const          { return "none"; }
+    std::string operator() (const int8_t&) const                { return "int8_t"; }
+    std::string operator() (const uint8_t&) const               { return "uint8_t"; }
+    std::string operator() (const int16_t&) const               { return "int16_t"; }
+    std::string operator() (const uint16_t&) const              { return "uint16_t"; }
+    std::string operator() (const int32_t&) const               { return "int32_t"; }
+    std::string operator() (const uint32_t&) const              { return "uint32_t"; }
+    std::string operator() (const float&) const                 { return "float"; }
+    std::string operator() (const std::string&) const           { return "string"; }
+    std::string operator() (const std::vector<int8_t>&) const   { return "vector<int8_t>"; }
+    std::string operator() (const std::vector<uint8_t>&) const  { return "vector<uint8_t>"; }
+    std::string operator() (const std::vector<int16_t>&) const  { return "vector<int16_t>"; }
+    std::string operator() (const std::vector<uint16_t>&) const { return "vector<uint16_t>"; }
+    std::string operator() (const std::vector<int32_t>&) const  { return "vector<int32_t>"; }
+    std::string operator() (const std::vector<uint32_t>&) const { return "vector<uint32_t>"; }
+    std::string operator() (const std::vector<float>&) const    { return "vector<float>"; }
+};
+
+} // namespace internal
+
+inline Tag::Tag(int8_t value)                : data_{value} {}  // scalars are stored by copy
+inline Tag::Tag(uint8_t value)               : data_{value} {}
+inline Tag::Tag(int16_t value)               : data_{value} {}
+inline Tag::Tag(uint16_t value)              : data_{value} {}
+inline Tag::Tag(int32_t value)               : data_{value} {}
+inline Tag::Tag(uint32_t value)              : data_{value} {}
+inline Tag::Tag(float value)                 : data_{value} {}
+inline Tag::Tag(std::string value)           : data_{std::move(value)} {}  // sink: moved in
+inline Tag::Tag(std::vector<int8_t> value)   : data_{std::move(value)} {}
+inline Tag::Tag(std::vector<uint8_t> value)  : data_{std::move(value)} {}
+inline Tag::Tag(std::vector<int16_t> value)  : data_{std::move(value)} {}
+inline Tag::Tag(std::vector<uint16_t> value) : data_{std::move(value)} {}
+inline Tag::Tag(std::vector<int32_t> value)  : data_{std::move(value)} {}
+inline Tag::Tag(std::vector<uint32_t> value) : data_{std::move(value)} {}
+inline Tag::Tag(std::vector<float> value)    : data_{std::move(value)} {}
+
+inline Tag::Tag(int8_t value, const TagModifier mod) : data_{value}, modifier_{mod}
+{
+    // HEX_STRING is the only modifier rejected for an int8_t payload.
+    if (TagModifier::HEX_STRING != mod) return;
+    throw std::runtime_error{"HEX_STRING is not a valid tag modifier for int8_t data. "
+                             "It is intended for string-type data only."};
+}
+
+inline Tag::Tag(std::string value, TagModifier mod) : data_{std::move(value)}, modifier_{mod}
+{
+    if (TagModifier::ASCII_CHAR != mod) return;  // every other modifier is accepted as-is
+    throw std::runtime_error{
+        "ASCII_CHAR is not a valid tag modifier for string-type data. "
+        "To construct an ASCII char tag, use a single-quoted value (e.g. 'X' instead of "
+        "\"X\")"};
+}
+
+// Resets the payload to the empty (boost::blank) state; modifier_ is not touched.
+inline Tag& Tag::operator=(boost::blank value)
+{
+    data_ = value; return *this;
+}
+
+// Stores an int8_t payload; modifier_ is not touched.
+inline Tag& Tag::operator=(int8_t value)
+{
+    data_ = value; return *this;
+}
+
+// Stores a uint8_t payload; modifier_ is not touched.
+inline Tag& Tag::operator=(uint8_t value)
+{
+    data_ = value; return *this;
+}
+
+// Stores an int16_t payload; modifier_ is not touched.
+inline Tag& Tag::operator=(int16_t value)
+{
+    data_ = value; return *this;
+}
+
+// Stores a uint16_t payload; modifier_ is not touched.
+inline Tag& Tag::operator=(uint16_t value)
+{
+    data_ = value; return *this;
+}
+
+// Stores an int32_t payload; modifier_ is not touched.
+inline Tag& Tag::operator=(int32_t value)
+{
+    data_ = value; return *this;
+}
+
+// Stores a uint32_t payload; modifier_ is not touched.
+inline Tag& Tag::operator=(uint32_t value)
+{
+    data_ = value; return *this;
+}
+
+// Stores a float payload; modifier_ is not touched.
+inline Tag& Tag::operator=(float value)
+{
+    data_ = value; return *this;
+}
+
+// Stores a string payload (moved from the argument); modifier_ is not touched.
+inline Tag& Tag::operator=(std::string value)
+{
+    data_ = std::move(value); return *this;
+}
+
+// Stores a vector<int8_t> payload (moved from the argument).
+inline Tag& Tag::operator=(std::vector<int8_t> value)
+{
+    data_ = std::move(value); return *this;
+}
+
+// Stores a vector<uint8_t> payload (moved from the argument).
+inline Tag& Tag::operator=(std::vector<uint8_t> value)
+{
+    data_ = std::move(value); return *this;
+}
+
+// Stores a vector<int16_t> payload (moved from the argument).
+inline Tag& Tag::operator=(std::vector<int16_t> value)
+{
+    data_ = std::move(value); return *this;
+}
+
+// Stores a vector<uint16_t> payload (moved from the argument).
+inline Tag& Tag::operator=(std::vector<uint16_t> value)
+{
+    data_ = std::move(value); return *this;
+}
+
+// Stores a vector<int32_t> payload (moved from the argument).
+inline Tag& Tag::operator=(std::vector<int32_t> value)
+{
+    data_ = std::move(value); return *this;
+}
+
+// Stores a vector<uint32_t> payload (moved from the argument).
+inline Tag& Tag::operator=(std::vector<uint32_t> value)
+{
+    data_ = std::move(value); return *this;
+}
+
+// Stores a vector<float> payload (moved from the argument).
+inline Tag& Tag::operator=(std::vector<float> value)
+{
+    data_ = std::move(value); return *this;
+}
+
+inline bool Tag::operator== (const Tag& other) const
+{
+    const bool sameData = boost::apply_visitor(internal::IsEqualVisitor(), data_, other.data_);
+    return sameData && (modifier_ == other.modifier_);  // payload and modifier must both match
+}
+
+inline bool Tag::operator!= (const Tag& other) const
+{ return !operator==(other); }  // defined purely as the negation of operator==
+
+// A tag stores exactly one modifier at a time, so this is a plain equality test.
+inline bool Tag::HasModifier(const TagModifier m) const
+{
+    return m == modifier_;
+}
+
+// True when Type() reports TagDataType::INVALID.
+inline bool Tag::IsNull() const { return Type() == TagDataType::INVALID; }
+
+// True when Type() reports TagDataType::INT8.
+inline bool Tag::IsInt8() const { return Type() == TagDataType::INT8; }
+
+// True when Type() reports TagDataType::UINT8.
+inline bool Tag::IsUInt8() const { return Type() == TagDataType::UINT8; }
+
+// True when Type() reports TagDataType::INT16.
+inline bool Tag::IsInt16() const { return Type() == TagDataType::INT16; }
+
+// True when Type() reports TagDataType::UINT16.
+inline bool Tag::IsUInt16() const { return Type() == TagDataType::UINT16; }
+
+// True when Type() reports TagDataType::INT32.
+inline bool Tag::IsInt32() const { return Type() == TagDataType::INT32; }
+
+// True when Type() reports TagDataType::UINT32.
+inline bool Tag::IsUInt32() const { return Type() == TagDataType::UINT32; }
+
+// True when Type() reports TagDataType::FLOAT.
+inline bool Tag::IsFloat() const { return Type() == TagDataType::FLOAT; }
+
+// True when Type() reports TagDataType::STRING (regardless of the modifier).
+inline bool Tag::IsString() const { return Type() == TagDataType::STRING; }
+
+inline bool Tag::IsHexString() const  // a string payload that also carries HEX_STRING
+{ return IsString() && TagModifier::HEX_STRING == modifier_; }
+
+// True when Type() reports TagDataType::INT8_ARRAY.
+inline bool Tag::IsInt8Array() const { return Type() == TagDataType::INT8_ARRAY; }
+
+// True when Type() reports TagDataType::UINT8_ARRAY.
+inline bool Tag::IsUInt8Array() const { return Type() == TagDataType::UINT8_ARRAY; }
+
+// True when Type() reports TagDataType::INT16_ARRAY.
+inline bool Tag::IsInt16Array() const { return Type() == TagDataType::INT16_ARRAY; }
+
+// True when Type() reports TagDataType::UINT16_ARRAY.
+inline bool Tag::IsUInt16Array() const { return Type() == TagDataType::UINT16_ARRAY; }
+
+// True when Type() reports TagDataType::INT32_ARRAY.
+inline bool Tag::IsInt32Array() const { return Type() == TagDataType::INT32_ARRAY; }
+
+// True when Type() reports TagDataType::UINT32_ARRAY.
+inline bool Tag::IsUInt32Array() const { return Type() == TagDataType::UINT32_ARRAY; }
+
+// True when Type() reports TagDataType::FLOAT_ARRAY.
+inline bool Tag::IsFloatArray() const { return Type() == TagDataType::FLOAT_ARRAY; }
+
+// True for any of the signed scalar integer payloads (int8_t, int16_t, int32_t).
+inline bool Tag::IsSignedInt() const { return IsInt8() || IsInt16() || IsInt32(); }
+
+// True for any of the unsigned scalar integer payloads (uint8_t, uint16_t, uint32_t).
+inline bool Tag::IsUnsignedInt() const { return IsUInt8() || IsUInt16() || IsUInt32(); }
+
+// True for any scalar integer payload, signed or unsigned.
+inline bool Tag::IsIntegral() const { return IsSignedInt() || IsUnsignedInt(); }
+
+// True for any scalar integer payload or a float payload.
+inline bool Tag::IsNumeric() const { return IsIntegral() || IsFloat(); }
+
+inline bool Tag::IsSignedArray() const  // any of the three signed-integer vector payloads
+{ return IsInt8Array() || IsInt16Array() || IsInt32Array(); }
+
+inline bool Tag::IsUnsignedArray() const  // any of the three unsigned-integer vector payloads
+{ return IsUInt8Array() || IsUInt16Array() || IsUInt32Array(); }
+
+// True for any integer vector payload, signed or unsigned.
+inline bool Tag::IsIntegralArray() const { return IsSignedArray() || IsUnsignedArray(); }
+
+// True for any vector payload: integer or float elements.
+inline bool Tag::IsArray() const { return IsIntegralArray() || IsFloatArray(); }
+
+// Returns the modifier currently attached to this tag.
+inline TagModifier Tag::Modifier() const { return modifier_; }
+
+inline Tag& Tag::Modifier(const TagModifier m)
+{ modifier_ = m; return *this; }  // stored as given: no check against the payload type here
+
+inline char Tag::ToAscii() const  // throws std::runtime_error when the payload cannot convert
+{ return boost::apply_visitor(internal::AsciiConvertVisitor{}, data_); }
+
+// Direct read on an exact type match, otherwise a conversion via ToInt8ConvertVisitor.
+inline int8_t Tag::ToInt8() const
+{
+    if (!IsInt8()) return boost::apply_visitor(internal::ToInt8ConvertVisitor(), data_);
+    return boost::get<int8_t>(data_);  // exact type match: no conversion required
+}
+
+// Direct read on an exact type match, otherwise a conversion via ToUInt8ConvertVisitor.
+inline uint8_t Tag::ToUInt8() const
+{
+    if (!IsUInt8()) return boost::apply_visitor(internal::ToUInt8ConvertVisitor(), data_);
+    return boost::get<uint8_t>(data_);  // exact type match: no conversion required
+}
+
+// Direct read on an exact type match, otherwise a conversion via ToInt16ConvertVisitor.
+inline int16_t Tag::ToInt16() const
+{
+    if (!IsInt16()) return boost::apply_visitor(internal::ToInt16ConvertVisitor(), data_);
+    return boost::get<int16_t>(data_);  // exact type match: no conversion required
+}
+
+// Direct read on an exact type match, otherwise a conversion via ToUInt16ConvertVisitor.
+inline uint16_t Tag::ToUInt16() const
+{
+    if (!IsUInt16()) return boost::apply_visitor(internal::ToUInt16ConvertVisitor(), data_);
+    return boost::get<uint16_t>(data_);  // exact type match: no conversion required
+}
+
+// Direct read on an exact type match, otherwise a conversion via ToInt32ConvertVisitor.
+inline int32_t Tag::ToInt32() const
+{
+    if (!IsInt32()) return boost::apply_visitor(internal::ToInt32ConvertVisitor(), data_);
+    return boost::get<int32_t>(data_);  // exact type match: no conversion required
+}
+
+// Direct read on an exact type match, otherwise a conversion via ToUInt32ConvertVisitor.
+inline uint32_t Tag::ToUInt32() const
+{
+    if (!IsUInt32()) return boost::apply_visitor(internal::ToUInt32ConvertVisitor(), data_);
+    return boost::get<uint32_t>(data_);  // exact type match: no conversion required
+}
+
+// Strict read: no numeric conversion is attempted; boost::get requires a float payload.
+inline float Tag::ToFloat() const { return boost::get<float>(data_); }
+
+// Returns a copy of the string payload; boost::get requires a std::string payload.
+inline std::string Tag::ToString() const { return boost::get<std::string>(data_); }
+
+inline std::vector<int8_t> Tag::ToInt8Array() const  // copy of the payload; exact type only
+{ return boost::get<std::vector<int8_t>>(data_); }
+
+inline std::vector<uint8_t> Tag::ToUInt8Array() const  // copy of the payload; exact type only
+{ return boost::get<std::vector<uint8_t>>(data_); }
+
+inline std::vector<int16_t> Tag::ToInt16Array() const  // copy of the payload; exact type only
+{ return boost::get<std::vector<int16_t>>(data_); }
+
+inline std::vector<uint16_t> Tag::ToUInt16Array() const  // copy of the payload; exact type only
+{ return boost::get<std::vector<uint16_t>>(data_); }
+
+inline std::vector<int32_t> Tag::ToInt32Array() const  // copy of the payload; exact type only
+{ return boost::get<std::vector<int32_t>>(data_); }
+
+inline std::vector<uint32_t> Tag::ToUInt32Array() const  // copy of the payload; exact type only
+{ return boost::get<std::vector<uint32_t>>(data_); }
+
+inline std::vector<float> Tag::ToFloatArray() const  // copy of the payload; exact type only
+{ return boost::get<std::vector<float>>(data_); }
+
+// Converts the variant's active alternative index (which()) to a TagDataType value.
+inline TagDataType Tag::Type() const { return static_cast<TagDataType>(data_.which()); }
+
+inline std::string Tag::Typename() const  // e.g. "int8_t", "string", "none"
+{ return boost::apply_visitor(internal::TypenameVisitor{}, data_); }
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/internal/Validator.inl b/include/pbbam/internal/Validator.inl

new file mode 100644 (file)

index 0000000..969f058
--- /dev/null
+++ b/include/pbbam/internal/Validator.inl
@@ -0,0 +1,57 @@
+// File Description
+/// \file Validator.inl
+/// \brief Inline implementations for the Validator class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/Validator.h"
+#include <stdexcept>
+
+namespace PacBio {
+namespace BAM {
+
+// Non-throwing validity check for a BAM file: any std::exception raised by the
+// selected validation call is translated into a 'false' result.
+inline bool Validator::IsValid(const BamFile& file, const bool entireFile)
+{
+    try {
+        if (entireFile)
+            ValidateEntireFile(file, 1);
+        else
+            ValidateFileMetadata(file, 1);
+    } catch (const std::exception&) { return false; }
+    return true;
+}
+
+// Non-throwing wrapper: true iff Validate(header, 1) finishes without a std::exception.
+// NOTE(review): the literal 1 is presumably an error limit - confirm in Validator.h.
+inline bool Validator::IsValid(const BamHeader& header)
+{
+    try {
+        Validate(header, 1);
+    } catch (const std::exception&) { return false; }
+    return true;
+}
+
+// Non-throwing wrapper: true iff Validate(record, 1) finishes without a std::exception.
+// NOTE(review): the literal 1 is presumably an error limit - confirm in Validator.h.
+inline bool Validator::IsValid(const BamRecord& record)
+{
+    try {
+        Validate(record, 1);
+    } catch (const std::exception&) { return false; }
+    return true;
+}
+
+// Non-throwing wrapper: true iff Validate(rg, 1) finishes without a std::exception.
+// NOTE(review): the literal 1 is presumably an error limit - confirm in Validator.h.
+inline bool Validator::IsValid(const ReadGroupInfo& rg)
+{
+    try {
+        Validate(rg, 1);
+    } catch (const std::exception&) { return false; }
+    return true;
+}
+
+} // namespace BAM
+} // namespace PacBio
diff --git a/include/pbbam/vcf/VcfFile.h b/include/pbbam/vcf/VcfFile.h

new file mode 100644 (file)

index 0000000..a8155cc
--- /dev/null
+++ b/include/pbbam/vcf/VcfFile.h
@@ -0,0 +1,39 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_VCF_VCFFILE_H
+#define PBBAM_VCF_VCFFILE_H
+
+#include <string>
+
+#include <pbbam/vcf/VcfHeader.h>
+
+namespace PacBio {
+namespace VCF {
+
+class VcfFile
+{
+public:
+    explicit VcfFile(std::string fn);
+
+    // No default construction; copy, move and destruction are compiler-generated.
+    VcfFile() = delete;
+    ~VcfFile() = default;
+    VcfFile(const VcfFile&) = default;
+    VcfFile& operator=(const VcfFile&) = default;
+    VcfFile(VcfFile&&) = default;
+    VcfFile& operator=(VcfFile&&) = default;
+    // read-only accessors (return references to the stored members' types)
+    const std::string& Filename() const;
+    const VcfHeader& Header() const;
+
+private:
+    std::string filename_;
+    VcfHeader header_;
+};
+
+}  // namespace VCF
+}  // namespace PacBio
+
+#include "pbbam/vcf/internal/VcfFile.inl"
+
+#endif  // PBBAM_VCF_VCFFILE_H
diff --git a/include/pbbam/vcf/VcfFormat.h b/include/pbbam/vcf/VcfFormat.h

new file mode 100644 (file)

index 0000000..590cfb4
--- /dev/null
+++ b/include/pbbam/vcf/VcfFormat.h
@@ -0,0 +1,101 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_VCF_VCFFORMAT_H
+#define PBBAM_VCF_VCFFORMAT_H
+
+#include <iosfwd>
+#include <string>
+
+#include <pbbam/vcf/VcfHeader.h>
+#include <pbbam/vcf/VcfVariant.h>
+
+namespace PacBio {
+namespace VCF {
+
+struct VcfFormat
+{
+    // All members are static functions; struct members are public by default.
+    /// \name General format info
+    /// \{
+
+    static const char* CurrentVersion();
+
+    /// \}
+
+    // -- header-level parsing & formatting --
+    /// \name Header format
+    /// \{
+
+    static VcfHeader ParsedHeader(const std::string& text);
+
+    static std::string FormattedHeader(const VcfHeader& header);
+
+    static VcfHeader HeaderFromFile(const std::string& fn);
+
+    static VcfHeader HeaderFromStream(std::istream& in);
+
+    /// \}
+
+    // -- variant-level parsing & formatting --
+    /// \name Variant format
+    /// \{
+
+    static VcfVariant ParsedVariant(const std::string& line);
+
+    static std::string FormattedVariant(const VcfVariant& var);
+
+    /// \}
+
+    // ---------------------------------------------------------------------- //
+    // The following methods are mostly internal helpers, exposed here for    //
+    // testing. Client code should rarely need them, but they remain          //
+    // available here if required.                                            //
+    // ---------------------------------------------------------------------- //
+
+    // -- helpers for individual header definition lines --
+    /// \internal
+    /// \name Header format helpers
+    /// \{
+
+    static ContigDefinition ParsedContigDefinition(std::string line);
+
+    static FilterDefinition ParsedFilterDefinition(std::string line);
+
+    static FormatDefinition ParsedFormatDefinition(std::string line);
+
+    static GeneralDefinition ParsedGeneralDefinition(const std::string& line);
+
+    static InfoDefinition ParsedInfoDefinition(std::string line);
+
+    static std::string FormattedContigDefinition(const ContigDefinition& def);
+
+    static std::string FormattedFilterDefinition(const FilterDefinition& def);
+
+    static std::string FormattedFormatDefinition(const FormatDefinition& def);
+
+    static std::string FormattedGeneralDefinition(const GeneralDefinition& def);
+
+    static std::string FormattedInfoDefinition(const InfoDefinition& def);
+
+    /// \}
+
+    // -- helpers for INFO fields of a variant --
+    /// \internal
+    /// \name Variant format helpers
+    /// \{
+
+    static std::string FormattedInfoField(const InfoField& field);
+
+    static std::string FormattedInfoFields(const std::vector<InfoField>& fields);
+
+    static InfoField ParsedInfoField(const std::string& text);
+
+    static std::vector<InfoField> ParsedInfoFields(const std::string& text);
+
+    /// \}
+};
+
+}  // namespace VCF
+}  // namespace PacBio
+
+#endif  // PBBAM_VCF_VCFFORMAT_H
diff --git a/include/pbbam/vcf/VcfHeader.h b/include/pbbam/vcf/VcfHeader.h

new file mode 100644 (file)

index 0000000..6d36873
--- /dev/null
+++ b/include/pbbam/vcf/VcfHeader.h
@@ -0,0 +1,121 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_VCF_VCFHEADER_H
+#define PBBAM_VCF_VCFHEADER_H
+
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <boost/optional.hpp>
+
+#include <pbbam/vcf/VcfHeaderTypes.h>
+
+namespace PacBio {
+namespace VCF {
+
+class VcfHeader
+{
+public:
+    VcfHeader();
+
+    explicit VcfHeader(const std::string& hdrText);
+
+    ~VcfHeader() = default;
+    VcfHeader(const VcfHeader&) = default;
+    VcfHeader& operator=(const VcfHeader&) = default;
+    VcfHeader(VcfHeader&&) = default;
+    VcfHeader& operator=(VcfHeader&&) = default;
+
+    // ---- const accessors ----
+    // general lines
+
+    size_t NumLines() const;
+
+    const std::string& FileDate() const;
+    const std::string& Version() const;
+
+    const std::vector<PacBio::VCF::GeneralDefinition>& GeneralDefinitions() const;
+    const PacBio::VCF::GeneralDefinition& GeneralDefinition(const std::string& id) const;
+
+    // ##contig
+    const std::vector<PacBio::VCF::ContigDefinition>& ContigDefinitions() const;
+    const PacBio::VCF::ContigDefinition& ContigDefinition(const std::string& id) const;
+
+    // INFO
+
+    const std::vector<PacBio::VCF::InfoDefinition>& InfoDefinitions() const;
+    const PacBio::VCF::InfoDefinition& InfoDefinition(const std::string& id) const;
+
+    // FILTER
+
+    const std::vector<PacBio::VCF::FilterDefinition>& FilterDefinitions() const;
+    const PacBio::VCF::FilterDefinition& FilterDefinition(const std::string& id) const;
+
+    // FORMAT
+
+    const std::vector<PacBio::VCF::FormatDefinition>& FormatDefinitions() const;
+    const PacBio::VCF::FormatDefinition& FormatDefinition(const std::string& id) const;
+
+    // samples
+
+    size_t IndexOfSample(const Sample& sample) const;
+    const Sample& SampleAt(size_t index) const;
+    const std::vector<Sample>& Samples() const;
+
+    // ---- mutators (each returns VcfHeader&) ----
+    // general lines
+
+    VcfHeader& FileDate(std::string fileDate);
+    VcfHeader& Version(std::string version);
+
+    VcfHeader& AddGeneralDefinition(PacBio::VCF::GeneralDefinition def);
+    VcfHeader& GeneralDefinitions(std::vector<PacBio::VCF::GeneralDefinition> defs);
+
+    // ##contig
+    VcfHeader& AddContigDefinition(PacBio::VCF::ContigDefinition def);
+    VcfHeader& ContigDefinitions(std::vector<PacBio::VCF::ContigDefinition> defs);
+
+    // INFO
+
+    VcfHeader& AddInfoDefinition(PacBio::VCF::InfoDefinition info);
+    VcfHeader& InfoDefinitions(std::vector<PacBio::VCF::InfoDefinition> defs);
+
+    // FILTER
+
+    VcfHeader& AddFilterDefinition(PacBio::VCF::FilterDefinition filter);
+    VcfHeader& FilterDefinitions(std::vector<PacBio::VCF::FilterDefinition> defs);
+
+    // FORMAT
+
+    VcfHeader& AddFormatDefinition(PacBio::VCF::FormatDefinition format);
+    VcfHeader& FormatDefinitions(std::vector<PacBio::VCF::FormatDefinition> defs);
+
+    // samples
+
+    VcfHeader& AddSample(std::string sample);
+    VcfHeader& Samples(std::vector<std::string> names);
+
+private:
+    std::vector<PacBio::VCF::GeneralDefinition> generalDefinitions_;
+    std::vector<PacBio::VCF::ContigDefinition> contigDefinitions_;
+    std::vector<PacBio::VCF::InfoDefinition> infoDefinitions_;
+    std::vector<PacBio::VCF::FilterDefinition> filterDefinitions_;
+    std::vector<PacBio::VCF::FormatDefinition> formatDefinitions_;
+    std::vector<PacBio::VCF::Sample> samples_;
+    // presumably id -> index into the matching vector above (TODO confirm in VcfHeader.inl)
+    std::unordered_map<std::string, size_t> generalLookup_;
+    std::unordered_map<std::string, size_t> contigLookup_;
+    std::unordered_map<std::string, size_t> infoLookup_;
+    std::unordered_map<std::string, size_t> filterLookup_;
+    std::unordered_map<std::string, size_t> formatLookup_;
+    std::unordered_map<std::string, size_t> sampleLookup_;
+};
+
+}  // namespace VCF
+}  // namespace PacBio
+
+#include "pbbam/vcf/internal/VcfHeader.inl"
+
+#endif  // PBBAM_VCF_VCFHEADER_H
diff --git a/include/pbbam/vcf/VcfHeaderTypes.h b/include/pbbam/vcf/VcfHeaderTypes.h

new file mode 100644 (file)

index 0000000..8fa143f
--- /dev/null
+++ b/include/pbbam/vcf/VcfHeaderTypes.h
@@ -0,0 +1,161 @@
+
+#ifndef PBBAM_VCF_VCFHEADERTYPES_H
+#define PBBAM_VCF_VCFHEADERTYPES_H
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <boost/optional.hpp>
+
+namespace PacBio {
+namespace VCF {
+
+using Sample = std::string;  // a sample is represented as a plain string
+
+class ContigDefinition
+{
+public:
+    explicit ContigDefinition(std::string id);
+    ContigDefinition(std::string id, std::vector<std::pair<std::string, std::string>> attributes);
+
+    ContigDefinition() = delete;
+    ~ContigDefinition() = default;
+    ContigDefinition(const ContigDefinition&) = default;
+    ContigDefinition& operator=(const ContigDefinition&) = default;
+    ContigDefinition(ContigDefinition&&) = default;
+    ContigDefinition& operator=(ContigDefinition&&) = default;
+
+    // accessors first, then the attribute mutators (which return ContigDefinition&)
+    const std::string& Id() const;
+    const std::vector<std::pair<std::string, std::string>>& Attributes() const;
+
+    ContigDefinition& AddAttribute(std::string id, std::string value);
+    ContigDefinition& AddAttribute(std::pair<std::string, std::string> attribute);
+    ContigDefinition& Attributes(std::vector<std::pair<std::string, std::string>> attributes);
+
+private:
+    std::string id_;
+    std::vector<std::pair<std::string, std::string>> attributes_;
+};
+
+/// \brief FilterDefinition: an (id, description) pair of strings.
+/// Both values are supplied to the constructor; this header declares only
+/// read accessors for them.
+class FilterDefinition
+{
+public:
+    FilterDefinition(std::string id, std::string description);
+
+    FilterDefinition() = delete;
+    ~FilterDefinition() = default;
+    FilterDefinition(const FilterDefinition&) = default;
+    FilterDefinition& operator=(const FilterDefinition&) = default;
+    FilterDefinition(FilterDefinition&&) = default;
+    FilterDefinition& operator=(FilterDefinition&&) = default;
+
+    const std::string& Id() const;
+    const std::string& Description() const;
+
+private:
+    // required fields (both are constructor arguments)
+    std::string id_;
+    std::string description_;
+};
+
+/// \brief FormatDefinition: id, number, type and description strings.
+/// All four values are supplied to the constructor; this header declares only
+/// read accessors for them.
+class FormatDefinition
+{
+public:
+    FormatDefinition(std::string id, std::string number, std::string type, std::string description);
+
+    FormatDefinition() = delete;
+    ~FormatDefinition() = default;
+    FormatDefinition(const FormatDefinition&) = default;
+    FormatDefinition& operator=(const FormatDefinition&) = default;
+    FormatDefinition(FormatDefinition&&) = default;
+    FormatDefinition& operator=(FormatDefinition&&) = default;
+
+    const std::string& Id() const;
+    const std::string& Number() const;
+    const std::string& Type() const;
+    const std::string& Description() const;
+
+private:
+    std::string id_;
+    std::string number_;  // TODO: enum
+    std::string type_;    // TODO: enum
+    std::string description_;
+};
+
+/// \brief GeneralDefinition: an (id, text) pair of strings.
+/// Both values are supplied to the constructor; this header declares only
+/// read accessors for them.
+class GeneralDefinition
+{
+public:
+    GeneralDefinition(std::string id, std::string text);
+
+    GeneralDefinition() = delete;
+    ~GeneralDefinition() = default;
+    GeneralDefinition(const GeneralDefinition&) = default;
+    GeneralDefinition& operator=(const GeneralDefinition&) = default;
+    GeneralDefinition(GeneralDefinition&&) = default;
+    GeneralDefinition& operator=(GeneralDefinition&&) = default;
+
+    const std::string& Id() const;
+    const std::string& Text() const;
+
+private:
+    // required fields (both are constructor arguments)
+    std::string id_;
+    std::string text_;
+};
+
+/// \brief InfoDefinition: four required strings plus optional source/version.
+/// The required values have read accessors only; Source and Version also
+/// have setters that return InfoDefinition&.
+class InfoDefinition
+{
+public:
+    InfoDefinition(std::string id, std::string number, std::string type, std::string description,
+                   std::string source = std::string{}, std::string version = std::string{});
+
+    InfoDefinition() = delete;
+    ~InfoDefinition() = default;
+    InfoDefinition(const InfoDefinition&) = default;
+    InfoDefinition& operator=(const InfoDefinition&) = default;
+    InfoDefinition(InfoDefinition&&) = default;
+    InfoDefinition& operator=(InfoDefinition&&) = default;
+
+    const std::string& Id() const;
+    const std::string& Number() const;
+    const std::string& Type() const;
+    const std::string& Description() const;
+    const boost::optional<std::string>& Source() const;
+    const boost::optional<std::string>& Version() const;
+
+    InfoDefinition& Source(std::string s);
+    InfoDefinition& Version(std::string v);
+
+private:
+    // required fields
+    // (treated as const after construction; left non-const so moves still work)
+    std::string id_;
+    std::string number_;  // TODO: enum
+    std::string type_;    // TODO: enum
+    std::string description_;
+
+    // optional fields - settable after ctor
+    boost::optional<std::string> source_;
+    boost::optional<std::string> version_;
+};
+
+}  // namespace VCF
+}  // namespace PacBio
+
+#include "pbbam/vcf/internal/VcfHeaderTypes.inl"
+
+#endif  // PBBAM_VCF_VCFHEADERTYPES_H
diff --git a/include/pbbam/vcf/VcfQuery.h b/include/pbbam/vcf/VcfQuery.h

new file mode 100644 (file)

index 0000000..3c39af1
--- /dev/null
+++ b/include/pbbam/vcf/VcfQuery.h
@@ -0,0 +1,45 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_VCF_VCFQUERY_H
+#define PBBAM_VCF_VCFQUERY_H
+
+#include <string>
+
+#include <pbbam/internal/QueryBase.h>
+
+#include <pbbam/vcf/VcfFile.h>
+#include <pbbam/vcf/VcfReader.h>
+#include <pbbam/vcf/VcfVariant.h>
+
+namespace PacBio {
+namespace VCF {
+
+class VcfQuery : public PacBio::BAM::internal::QueryBase<VcfVariant>
+{
+public:
+    explicit VcfQuery(std::string fn);
+    explicit VcfQuery(const VcfFile& file);
+    // reader_ (VcfReader) cannot be default-constructed, so neither can a VcfQuery.
+    VcfQuery() = delete;
+    VcfQuery(const VcfQuery&) = delete;
+    VcfQuery(VcfQuery&&) = default;
+    VcfQuery& operator=(const VcfQuery&) = delete;
+    VcfQuery& operator=(VcfQuery&&) = default;
+    ~VcfQuery() = default;
+
+public:
+    /// \brief Main iteration point for record access.
+    ///
+    /// Most client code should not need to use this method directly. Use
+    /// iterators instead.
+    ///
+    bool GetNext(VcfVariant& var) override;
+
+private:
+    VcfReader reader_;
+};
+
+}  // namespace VCF
+}  // namespace PacBio
+
+#endif  // PBBAM_VCF_VCFQUERY_H
diff --git a/include/pbbam/vcf/VcfReader.h b/include/pbbam/vcf/VcfReader.h

new file mode 100644 (file)

index 0000000..2d2f78a
--- /dev/null
+++ b/include/pbbam/vcf/VcfReader.h
@@ -0,0 +1,50 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_VCF_VCFREADER_H
+#define PBBAM_VCF_VCFREADER_H
+
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#include <pbbam/vcf/VcfFile.h>
+#include <pbbam/vcf/VcfHeader.h>
+#include <pbbam/vcf/VcfVariant.h>
+
+namespace PacBio {
+namespace VCF {
+
+/// \brief VcfReader: move-only reader built from a filename or a VcfFile.
+/// Exposes the header via Header() and hands out variants one at a time
+/// through GetNext().
+class VcfReader
+{
+public:
+    explicit VcfReader(std::string fn);
+    explicit VcfReader(const VcfFile& file);
+
+    VcfReader() = delete;
+    ~VcfReader() = default;
+    VcfReader(const VcfReader&) = delete;
+    VcfReader& operator=(const VcfReader&) = delete;
+    VcfReader(VcfReader&&) = default;
+    VcfReader& operator=(VcfReader&&) = default;
+
+    // reading interface
+    const VcfHeader& Header() const;
+
+    bool GetNext(VcfVariant& var);
+
+private:
+    void FetchNext();
+
+    // state: input stream, header, and a line buffer
+    std::ifstream in_;
+    VcfHeader header_;
+    std::string line_;
+};
+
+}  // namespace VCF
+}  // namespace PacBio
+
+#endif  // PBBAM_VCF_VCFREADER_H
diff --git a/include/pbbam/vcf/VcfSort.h b/include/pbbam/vcf/VcfSort.h

new file mode 100644 (file)

index 0000000..4166732
--- /dev/null
+++ b/include/pbbam/vcf/VcfSort.h
@@ -0,0 +1,34 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_VCF_VCFSORT_H
+#define PBBAM_VCF_VCFSORT_H
+
+#include <string>
+
+#include "pbbam/Config.h"
+#include "pbbam/vcf/VcfFile.h"
+
+namespace PacBio {
+namespace VCF {
+
+///
+/// \brief SortFile overload taking an already constructed VcfFile.
+///
+/// NOTE(review): the definition is not in this header; presumably it writes a
+/// sorted copy of \p file to \p outputFilename - confirm against the
+/// implementation.
+///
+/// \param file            input VCF file object
+/// \param outputFilename  name of the output destination
+///
+void SortFile(const VcfFile& file, const std::string& outputFilename);
+
+///
+/// \brief Convenience overload of SortFile that starts from a filename.
+///
+/// Wraps \p inputFilename in a VcfFile and forwards to
+/// SortFile(const VcfFile&, const std::string&).
+///
+/// \param inputFilename   name used to construct the VcfFile
+/// \param outputFilename  forwarded unchanged
+///
+inline void SortFile(const std::string& inputFilename, const std::string& outputFilename)
+{
+    const VcfFile inputFile{inputFilename};
+    SortFile(inputFile, outputFilename);
+}
+
+}  // namespace VCF
+}  // namespace PacBio
+
+#endif  // PBBAM_VCF_VCFSORT_H
diff --git a/include/pbbam/vcf/VcfVariant.h b/include/pbbam/vcf/VcfVariant.h

new file mode 100644 (file)

index 0000000..3ea244a
--- /dev/null
+++ b/include/pbbam/vcf/VcfVariant.h
@@ -0,0 +1,148 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_VCF_VARIANT_H
+#define PBBAM_VCF_VARIANT_H
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <boost/optional.hpp>
+
+#include <pbbam/Position.h>
+
+namespace PacBio {
+namespace VCF {
+
+/// \brief One INFO entry of a variant: an ID plus optional payload.
+///
+/// NOTE(review): presumably at most one of value / values is set (single
+/// value vs. list), and neither for flag-style entries - confirm.
+///
+struct InfoField
+{
+    std::string id;                                    // INFO key
+    boost::optional<std::string> value;                // single value, if any
+    boost::optional<std::vector<std::string>> values;  // list of values, if any
+};
+
+/// \brief Payload of one genotype entry for one sample.
+///
+/// NOTE(review): presumably at most one of value / values is set - confirm.
+///
+struct GenotypeData
+{
+    boost::optional<std::string> value;                // single value, if any
+    boost::optional<std::vector<std::string>> values;  // list of values, if any
+};
+
+/// \brief All genotype entries of a single sample.
+///
+/// VcfVariant indexes into data using positions taken from its genotype ID
+/// lookup (see VcfVariant::GenotypeValue).
+///
+struct GenotypeField
+{
+    std::vector<GenotypeData> data;  // one element per genotype ID
+};
+
+/// \brief A single VCF record: fixed columns, INFO fields and per-sample
+///        genotype data.
+///
+/// Setters return *this so calls can be chained. Copy and move operations are
+/// all defaulted.
+///
+class VcfVariant
+{
+public:
+    /// Default record; the inline definition sets an unmapped position, a NaN
+    /// quality and the filter "PASS".
+    VcfVariant();
+
+    /// Builds a record from text.
+    /// NOTE(review): defined outside this header; presumably parses one VCF
+    /// line - confirm.
+    explicit VcfVariant(const std::string& text);
+
+    /// Builds a record from its core columns. Note the parameter order
+    /// (id before chrom).
+    VcfVariant(std::string id, std::string chrom, PacBio::BAM::Position pos, std::string refAllele,
+               std::string altAllele);
+
+    VcfVariant(const VcfVariant&) = default;
+    VcfVariant(VcfVariant&&) = default;
+    VcfVariant& operator=(const VcfVariant&) = default;
+    VcfVariant& operator=(VcfVariant&&) = default;
+    ~VcfVariant() = default;
+
+public:
+    // core fields
+
+    const std::string& Chrom() const;
+    VcfVariant& Chrom(std::string chrom);
+
+    PacBio::BAM::Position Position() const;
+    VcfVariant& Position(PacBio::BAM::Position pos);
+
+    const std::string& Id() const;
+    VcfVariant& Id(std::string id);
+
+    const std::string& RefAllele() const;
+    VcfVariant& RefAllele(std::string refAllele);
+
+    const std::string& AltAllele() const;
+    VcfVariant& AltAllele(std::string altAllele);
+
+    float Quality() const;
+    VcfVariant& Quality(float qual);
+
+    const std::string& Filter() const;
+    VcfVariant& Filter(std::string filter);
+
+    // convenience methods
+    bool IsDeletion() const;        // ref allele longer than alt allele
+    bool IsInsertion() const;       // ref allele shorter than alt allele
+    bool IsQualityMissing() const;  // quality is NaN
+    bool IsSnp() const;             // both alleles have length 1 and differ
+
+public:
+    // info fields
+
+    /// Adds \p field, or replaces the stored field that has the same id.
+    VcfVariant& AddInfoField(InfoField field);
+    /// Removes the field with \p id; no-op if there is none.
+    VcfVariant& RemoveInfoField(const std::string& id);
+
+    const std::vector<InfoField>& InfoFields() const;
+    VcfVariant& InfoFields(std::vector<InfoField> fields);
+
+    bool HasInfoField(const std::string& id) const;
+
+    // The Info* accessors below look the id up with at(), so an unknown id throws.
+    const boost::optional<std::string> InfoValue(const std::string& id) const;
+    VcfVariant& InfoValue(const std::string& id, boost::optional<std::string> value);
+
+    const boost::optional<std::vector<std::string>> InfoValues(const std::string& id) const;
+    VcfVariant& InfoValues(const std::string& id, boost::optional<std::vector<std::string>> values);
+
+public:
+    // sample genotypes
+
+    // NOTE: if you want to look up by sample name, get the index from header
+
+    // Both getters below return copies.
+    std::vector<std::string> GenotypeIds() const;
+    VcfVariant& GenotypeIds(std::vector<std::string> ids);
+
+    std::vector<GenotypeField> Genotypes() const;
+    VcfVariant& Genotypes(std::vector<GenotypeField> genotypes);
+
+    // The Genotype* accessors below use at() for the sample index and the id,
+    // so an unknown sample or id throws.
+    const boost::optional<std::string>& GenotypeValue(const size_t sampleIndex,
+                                                      const std::string& id) const;
+    VcfVariant& GenotypeValue(const size_t sampleIndex, const std::string& id,
+                              boost::optional<std::string> value);
+
+    const boost::optional<std::vector<std::string>>& GenotypeValues(const size_t sampleIndex,
+                                                                    const std::string& id) const;
+    VcfVariant& GenotypeValues(const size_t sampleIndex, const std::string& id,
+                               boost::optional<std::vector<std::string>> values);
+
+    // Both read the sample's "GT" genotype value.
+    bool IsSampleHeterozygous(const size_t sampleIndex) const;
+    bool IsSamplePhased(const size_t sampleIndex) const;
+
+private:
+    // FIXED data
+    std::string chrom_;
+    PacBio::BAM::Position pos_;
+    std::string id_;
+    std::string refAllele_;
+    std::string altAllele_;  // multiple? KISS, only add if needed
+    float qual_;
+    std::string filter_;
+
+    // INFO data
+    std::vector<InfoField> infoFields_;
+    std::unordered_map<std::string, size_t> infoLookup_;  // INFO id -> infoFields_ index
+
+    // SAMPLE GENOTYPE data
+    std::vector<std::string> format_;  // order matches FORMAT string
+    std::unordered_map<std::string, size_t>
+        genotypeDataLookup_;                      // genotype ID -> genotypeField.data index
+    std::vector<GenotypeField> sampleGenotypes_;  // index matches sample order
+};
+
+}  // namespace VCF
+}  // namespace PacBio
+
+#include "pbbam/vcf/internal/VcfVariant.inl"
+
+#endif  // PBBAM_VCF_VARIANT_H
diff --git a/include/pbbam/vcf/VcfWriter.h b/include/pbbam/vcf/VcfWriter.h

new file mode 100644 (file)

index 0000000..96dd849
--- /dev/null
+++ b/include/pbbam/vcf/VcfWriter.h
@@ -0,0 +1,38 @@
+// Author: Derek Barnett
+
+#ifndef PBBAM_VCF_VCFWRITER_H
+#define PBBAM_VCF_VCFWRITER_H
+
+#include <memory>
+#include <string>
+
+namespace PacBio {
+namespace VCF {
+
+class VcfHeader;
+class VcfVariant;
+
+/// \brief Move-only writer built from an output filename and a VcfHeader.
+///
+/// All state lives behind a private implementation pointer (d_); the
+/// destructor is declared here and defined elsewhere.
+///
+/// NOTE(review): the move operations are defaulted in this header while
+/// VcfWriterPrivate is still an incomplete type. Code that actually moves a
+/// VcfWriter may need the complete type for std::unique_ptr's deleter -
+/// confirm this compiles in every translation unit that moves a writer.
+///
+class VcfWriter
+{
+public:
+    /// \param filename  output filename (taken by value)
+    /// \param header    header associated with the output
+    VcfWriter(std::string filename, const VcfHeader& header);
+
+    VcfWriter() = delete;
+    VcfWriter(const VcfWriter&) = delete;
+    VcfWriter(VcfWriter&&) = default;
+    VcfWriter& operator=(const VcfWriter&) = delete;
+    VcfWriter& operator=(VcfWriter&&) = default;
+    ~VcfWriter();
+
+public:
+    /// Writes one record.
+    /// \returns a status flag; presumably true on success - TODO confirm
+    bool Write(const VcfVariant& var);
+
+private:
+    struct VcfWriterPrivate;               // defined in the implementation file
+    std::unique_ptr<VcfWriterPrivate> d_;  // owning pointer to the implementation
+};
+
+}  // namespace VCF
+}  // namespace PacBio
+
+#endif  // PBBAM_VCF_VCFWRITER_H
diff --git a/include/pbbam/vcf/internal/VcfFile.inl b/include/pbbam/vcf/internal/VcfFile.inl

new file mode 100644 (file)

index 0000000..cd7c71e
--- /dev/null
+++ b/include/pbbam/vcf/internal/VcfFile.inl
@@ -0,0 +1,22 @@
+#ifndef PBBAM_VCF_VCFFILE_INL
+#define PBBAM_VCF_VCFFILE_INL
+
+#include <pbbam/vcf/VcfFile.h>
+#include <pbbam/vcf/VcfFormat.h>
+
+namespace PacBio {
+namespace VCF {
+
+inline VcfFile::VcfFile(std::string fn)
+    : filename_{std::move(fn)}
+    , header_{VcfFormat::HeaderFromFile(filename_)}
+{ }
+
+/// \returns the filename this object was constructed with
+inline const std::string& VcfFile::Filename() const
+{
+    return filename_;
+}
+
+/// \returns the header loaded at construction
+inline const VcfHeader& VcfFile::Header() const
+{
+    return header_;
+}
+
+} // namespace VCF
+} // namespace PacBio
+
+#endif // PBBAM_VCF_VCFFILE_INL
diff --git a/include/pbbam/vcf/internal/VcfHeader.inl b/include/pbbam/vcf/internal/VcfHeader.inl

new file mode 100644 (file)

index 0000000..2c92bb7
--- /dev/null
+++ b/include/pbbam/vcf/internal/VcfHeader.inl
@@ -0,0 +1,241 @@
+#ifndef PBBAM_VCF_VCFHEADER_INL
+#define PBBAM_VCF_VCFHEADER_INL
+
+#include <pbbam/vcf/VcfHeader.h>
+
+namespace PacBio {
+namespace VCF {
+
+/// Adds \p contig, or overwrites the stored definition that has the same ID.
+/// \returns *this
+inline VcfHeader& VcfHeader::AddContigDefinition(PacBio::VCF::ContigDefinition contig)
+{
+    const auto existing = contigLookup_.find(contig.Id());
+    if (existing != contigLookup_.cend()) {
+        // known ID: replace in place, index stays the same
+        contigDefinitions_.at(existing->second) = std::move(contig);
+        return *this;
+    }
+
+    // new ID: remember the slot it is about to occupy, then append
+    contigLookup_.insert({contig.Id(), contigDefinitions_.size()});
+    contigDefinitions_.push_back(std::move(contig));
+    return *this;
+}
+
+/// Adds \p filter, or overwrites the stored definition that has the same ID.
+/// \returns *this
+inline VcfHeader& VcfHeader::AddFilterDefinition(PacBio::VCF::FilterDefinition filter)
+{
+    const auto existing = filterLookup_.find(filter.Id());
+    if (existing != filterLookup_.cend()) {
+        // known ID: replace in place, index stays the same
+        filterDefinitions_.at(existing->second) = std::move(filter);
+        return *this;
+    }
+
+    // new ID: remember the slot it is about to occupy, then append
+    filterLookup_.insert({filter.Id(), filterDefinitions_.size()});
+    filterDefinitions_.push_back(std::move(filter));
+    return *this;
+}
+
+/// Adds \p format, or overwrites the stored definition that has the same ID.
+/// \returns *this
+inline VcfHeader& VcfHeader::AddFormatDefinition(PacBio::VCF::FormatDefinition format)
+{
+    const auto existing = formatLookup_.find(format.Id());
+    if (existing != formatLookup_.cend()) {
+        // known ID: replace in place, index stays the same
+        formatDefinitions_.at(existing->second) = std::move(format);
+        return *this;
+    }
+
+    // new ID: remember the slot it is about to occupy, then append
+    formatLookup_.insert({format.Id(), formatDefinitions_.size()});
+    formatDefinitions_.push_back(std::move(format));
+    return *this;
+}
+
+/// Adds \p def, or overwrites the stored definition that has the same ID.
+/// \returns *this
+inline VcfHeader& VcfHeader::AddGeneralDefinition(PacBio::VCF::GeneralDefinition def)
+{
+    const auto existing = generalLookup_.find(def.Id());
+    if (existing != generalLookup_.cend()) {
+        // known ID: replace in place, index stays the same
+        generalDefinitions_.at(existing->second) = std::move(def);
+        return *this;
+    }
+
+    // new ID: remember the slot it is about to occupy, then append
+    generalLookup_.insert({def.Id(), generalDefinitions_.size()});
+    generalDefinitions_.push_back(std::move(def));
+    return *this;
+}
+
+inline VcfHeader& VcfHeader::AddInfoDefinition(PacBio::VCF::InfoDefinition info)
+{
+    const auto found = infoLookup_.find(info.Id());
+    if (found == infoLookup_.cend()) {
+        infoLookup_.insert({info.Id()
+                            , infoDefinitions_.size()});
+        infoDefinitions_.push_back(std::move(info));
+    }
+    else
+        infoDefinitions_.at(found->second) = std::move(info);
+    return *this;
+}
+
+/// Adds \p sample, or re-assigns the stored entry if the name is already
+/// present.
+/// \returns *this
+inline VcfHeader& VcfHeader::AddSample(std::string sample)
+{
+    const auto existing = sampleLookup_.find(sample);
+    if (existing != sampleLookup_.cend()) {
+        // known name: overwrite the stored entry, index stays the same
+        samples_.at(existing->second) = std::move(sample);
+        return *this;
+    }
+
+    // new name: remember the slot it is about to occupy, then append
+    sampleLookup_.insert({sample, samples_.size()});
+    samples_.push_back(std::move(sample));
+    return *this;
+}
+
+/// \returns all contig definitions, in stored order
+inline const std::vector<PacBio::VCF::ContigDefinition>& VcfHeader::ContigDefinitions() const
+{ return contigDefinitions_; }
+
+/// \returns the contig definition stored under \p id; the lookup uses at(),
+///          so an unknown id throws
+inline const PacBio::VCF::ContigDefinition& VcfHeader::ContigDefinition(const std::string& id) const
+{
+    const auto slot = contigLookup_.at(id);
+    return contigDefinitions_.at(slot);
+}
+
+/// Replaces all contig definitions with \p defs (routed through
+/// AddContigDefinition, so a repeated ID keeps only the last entry).
+/// \returns *this
+inline VcfHeader& VcfHeader::ContigDefinitions(std::vector<PacBio::VCF::ContigDefinition> defs)
+{
+    contigLookup_.clear();
+    contigDefinitions_.clear();
+    for (auto& contig : defs) {
+        AddContigDefinition(std::move(contig));
+    }
+    return *this;
+}
+
+/// \returns the text of the general definition stored under "fileDate";
+///          the lookup uses at(), so it throws if that entry is absent
+inline const std::string& VcfHeader::FileDate() const
+{
+    const auto slot = generalLookup_.at("fileDate");
+    return generalDefinitions_.at(slot).Text();
+}
+
+/// Stores \p fileDate as the general definition "fileDate".
+/// \returns *this
+inline VcfHeader& VcfHeader::FileDate(std::string fileDate)
+{
+    return AddGeneralDefinition({"fileDate", std::move(fileDate)});
+}
+
+/// \returns all filter definitions, in stored order
+inline const std::vector<PacBio::VCF::FilterDefinition>& VcfHeader::FilterDefinitions() const
+{ return filterDefinitions_; }
+
+/// \returns the filter definition stored under \p id; the lookup uses at(),
+///          so an unknown id throws
+inline const PacBio::VCF::FilterDefinition& VcfHeader::FilterDefinition(const std::string& id) const
+{
+    const auto slot = filterLookup_.at(id);
+    return filterDefinitions_.at(slot);
+}
+
+/// Replaces all filter definitions with \p defs (routed through
+/// AddFilterDefinition, so a repeated ID keeps only the last entry).
+/// \returns *this
+inline VcfHeader& VcfHeader::FilterDefinitions(std::vector<PacBio::VCF::FilterDefinition> defs)
+{
+    filterLookup_.clear();
+    filterDefinitions_.clear();
+    for (auto& filter : defs) {
+        AddFilterDefinition(std::move(filter));
+    }
+    return *this;
+}
+
+/// \returns all format definitions, in stored order
+inline const std::vector<PacBio::VCF::FormatDefinition>& VcfHeader::FormatDefinitions() const
+{ return formatDefinitions_; }
+
+/// \returns the format definition stored under \p id; the lookup uses at(),
+///          so an unknown id throws
+inline const PacBio::VCF::FormatDefinition& VcfHeader::FormatDefinition(const std::string& id) const
+{
+    const auto slot = formatLookup_.at(id);
+    return formatDefinitions_.at(slot);
+}
+
+/// Replaces all format definitions with \p defs (routed through
+/// AddFormatDefinition, so a repeated ID keeps only the last entry).
+/// \returns *this
+inline VcfHeader& VcfHeader::FormatDefinitions(std::vector<PacBio::VCF::FormatDefinition> defs)
+{
+    formatLookup_.clear();
+    formatDefinitions_.clear();
+    for (auto& format : defs) {
+        AddFormatDefinition(std::move(format));
+    }
+    return *this;
+}
+
+/// \returns all general definitions, in stored order
+inline const std::vector<PacBio::VCF::GeneralDefinition>& VcfHeader::GeneralDefinitions() const
+{ return generalDefinitions_; }
+
+/// \returns the general definition stored under \p id; the lookup uses at(),
+///          so an unknown id throws
+inline const PacBio::VCF::GeneralDefinition& VcfHeader::GeneralDefinition(const std::string& id) const
+{
+    const auto slot = generalLookup_.at(id);
+    return generalDefinitions_.at(slot);
+}
+
+/// Replaces all general definitions with \p defs (routed through
+/// AddGeneralDefinition, so a repeated ID keeps only the last entry).
+/// \returns *this
+inline VcfHeader& VcfHeader::GeneralDefinitions(std::vector<PacBio::VCF::GeneralDefinition> defs)
+{
+    generalLookup_.clear();
+    generalDefinitions_.clear();
+    for (auto& general : defs) {
+        AddGeneralDefinition(std::move(general));
+    }
+    return *this;
+}
+
+/// \returns all info definitions, in stored order
+inline const std::vector<PacBio::VCF::InfoDefinition>& VcfHeader::InfoDefinitions() const
+{ return infoDefinitions_; }
+
+/// \returns the info definition stored under \p id; the lookup uses at(),
+///          so an unknown id throws
+inline const PacBio::VCF::InfoDefinition& VcfHeader::InfoDefinition(const std::string& id) const
+{
+    const auto slot = infoLookup_.at(id);
+    return infoDefinitions_.at(slot);
+}
+
+/// Replaces all info definitions with \p defs (routed through
+/// AddInfoDefinition, so a repeated ID keeps only the last entry).
+/// \returns *this
+inline VcfHeader& VcfHeader::InfoDefinitions(std::vector<PacBio::VCF::InfoDefinition> defs)
+{
+    infoLookup_.clear();
+    infoDefinitions_.clear();
+    for (auto& info : defs) {
+        AddInfoDefinition(std::move(info));
+    }
+    return *this;
+}
+
+/// \returns the number of general, contig, info, filter and format
+///          definitions, plus one for the #CHROM line
+inline size_t VcfHeader::NumLines() const
+{
+    size_t total = 1;  // the #CHROM line
+    total += generalDefinitions_.size();
+    total += contigDefinitions_.size();
+    total += infoDefinitions_.size();
+    total += filterDefinitions_.size();
+    total += formatDefinitions_.size();
+    return total;
+}
+
+/// \returns the sample stored at \p index (bounds-checked via at())
+inline const Sample& VcfHeader::SampleAt(size_t index) const { return samples_.at(index); }
+
+/// \returns the index recorded for \p sample; the lookup uses at(), so an
+///          unknown sample throws
+inline size_t VcfHeader::IndexOfSample(const Sample &sample) const { return sampleLookup_.at(sample); }
+
+/// \returns all samples, in stored order
+inline const std::vector<Sample>& VcfHeader::Samples() const { return samples_; }
+
+/// Replaces all samples with \p names (routed through AddSample).
+/// \returns *this
+inline VcfHeader& VcfHeader::Samples(std::vector<Sample> names)
+{
+    sampleLookup_.clear();
+    samples_.clear();
+    for (auto& sampleName : names) {
+        AddSample(std::move(sampleName));
+    }
+    return *this;
+}
+
+/// \returns the text of the general definition stored under "fileformat";
+///          the lookup uses at(), so it throws if that entry is absent
+inline const std::string& VcfHeader::Version() const
+{
+    const auto slot = generalLookup_.at("fileformat");
+    return generalDefinitions_.at(slot).Text();
+}
+
+/// Stores \p version as the general definition "fileformat".
+/// \returns *this
+inline VcfHeader& VcfHeader::Version(std::string version)
+{
+    return AddGeneralDefinition({"fileformat", std::move(version)});
+}
+
+} // namespace VCF
+} // namespace PacBio
+
+#endif // PBBAM_VCF_VCFHEADER_INL
diff --git a/include/pbbam/vcf/internal/VcfHeaderTypes.inl b/include/pbbam/vcf/internal/VcfHeaderTypes.inl

new file mode 100644 (file)

index 0000000..610de51
--- /dev/null
+++ b/include/pbbam/vcf/internal/VcfHeaderTypes.inl
@@ -0,0 +1,187 @@
+
+#ifndef PBBAM_VCF_VCFHEADERTYPES_INL
+#define PBBAM_VCF_VCFHEADERTYPES_INL
+
+#include <stdexcept>
+
+#include <pbbam/vcf/VcfHeaderTypes.h>
+
+namespace PacBio {
+namespace VCF {
+
+// -------------------
+// ContigDefinition
+// -------------------
+
+inline ContigDefinition::ContigDefinition(std::string id)
+    : ContigDefinition(std::move(id), {})
+{ }
+
+inline ContigDefinition::ContigDefinition(std::string id,
+                                          std::vector<std::pair<std::string, std::string>> attributes)
+    : id_{std::move(id)}
+    , attributes_{std::move(attributes)}
+{
+    if (id_.empty())
+        throw std::runtime_error{"VCF format error: ##contig definition has empty ID field"};
+}
+
+/// Appends the attribute (\p id, \p value); forwards to the pair overload.
+/// \returns *this
+inline ContigDefinition& ContigDefinition::AddAttribute(std::string id, std::string value)
+{
+    auto attribute = std::make_pair(std::move(id), std::move(value));
+    return AddAttribute(std::move(attribute));
+}
+
+/// Appends \p attribute to the end of the attribute list.
+/// \returns *this
+inline ContigDefinition& ContigDefinition::AddAttribute(std::pair<std::string, std::string> attribute)
+{
+    attributes_.push_back(std::move(attribute)); return *this;
+}
+
+/// \returns the attribute list, in insertion order
+inline const std::vector<std::pair<std::string, std::string>>& ContigDefinition::Attributes() const
+{ return attributes_; }
+
+/// Replaces the whole attribute list with \p attributes.
+/// \returns *this
+inline ContigDefinition& ContigDefinition::Attributes(std::vector<std::pair<std::string, std::string>> attributes)
+{
+    attributes_ = std::move(attributes); return *this;
+}
+
+/// \returns the contig ID
+inline const std::string& ContigDefinition::Id() const { return id_; }
+
+// -------------------
+// FilterDefinition
+// -------------------
+
+inline FilterDefinition::FilterDefinition(std::string id, std::string description)
+    : id_{std::move(id)}
+    , description_{std::move(description)}
+{
+    if (id_.empty())
+        throw std::runtime_error{"VCF format error: FILTER definition has empty ID field"};
+
+    if (description_.empty())
+        throw std::runtime_error{"VCF format error: FILTER definition has empty Description field"};
+}
+
+/// \returns the Description text
+inline const std::string& FilterDefinition::Description() const
+{
+    return description_;
+}
+
+/// \returns the filter ID
+inline const std::string& FilterDefinition::Id() const
+{
+    return id_;
+}
+
+// -------------------
+// FormatDefinition
+// -------------------
+
+inline FormatDefinition::FormatDefinition(std::string id,
+               std::string number,
+               std::string type,
+               std::string description)
+    : id_{std::move(id)}
+    , number_{std::move(number)}
+    , type_{std::move(type)}
+    , description_{std::move(description)}
+{
+    if (id_.empty())
+        throw std::runtime_error{"VCF format error: FORMAT definition has empty ID field"};
+
+    if (number_.empty())
+        throw std::runtime_error{"VCF format error: FORMAT definition has empty Number field"};
+
+    if (type_.empty())
+        throw std::runtime_error{"VCF format error: FORMAT definition has empty Type field"};
+
+    if (description_.empty())
+        throw std::runtime_error{"VCF format error: FORMAT definition has empty Description field"};
+}
+
+/// \returns the Description text
+inline const std::string& FormatDefinition::Description() const
+{
+    return description_;
+}
+
+/// \returns the format ID
+inline const std::string& FormatDefinition::Id() const
+{
+    return id_;
+}
+
+/// \returns the Number field, as text
+inline const std::string& FormatDefinition::Number() const
+{
+    return number_;
+}
+
+/// \returns the Type field, as text
+inline const std::string& FormatDefinition::Type() const
+{
+    return type_;
+}
+
+// -------------------
+// GeneralDefinition
+// -------------------
+
+inline GeneralDefinition::GeneralDefinition(std::string id, std::string text)
+    : id_{std::move(id)}
+    , text_{std::move(text)}
+{
+    if (id_.empty())
+        throw std::runtime_error{"VCF format error: general metadata definition has empty label"};
+
+    if (text_.empty())
+        throw std::runtime_error{"VCF format error: general metadata definition has empty value"};
+}
+
+/// \returns the label of this definition
+inline const std::string& GeneralDefinition::Id() const
+{
+    return id_;
+}
+
+/// \returns the value text of this definition
+inline const std::string& GeneralDefinition::Text() const
+{
+    return text_;
+}
+
+// -------------------
+// InfoDefinition
+// -------------------
+
+inline InfoDefinition::InfoDefinition(std::string id,
+               std::string number,
+               std::string type,
+               std::string description,
+               std::string source,
+               std::string version)
+    : id_{std::move(id)}
+    , number_{std::move(number)}
+    , type_{std::move(type)}
+    , description_{std::move(description)}
+{
+    // verify required fields
+    if (id_.empty())
+        throw std::runtime_error{"VCF format error: INFO definition has empty ID field"};
+
+    if (number_.empty())
+        throw std::runtime_error{"VCF format error: INFO definition has empty Number field"};
+
+    if (type_.empty())
+        throw std::runtime_error{"VCF format error: INFO definition has empty Type field"};
+
+    if (description_.empty())
+        throw std::runtime_error{"VCF format error: INFO definition has empty Description field"};
+
+    if (!source.empty()) source_ = std::move(source);
+    if (!version.empty()) version_ = std::move(version);
+}
+
+inline const std::string& InfoDefinition::Description() const { return description_; }
+
+inline const std::string& InfoDefinition::Id() const { return id_; }
+
+inline const std::string& InfoDefinition::Number() const { return number_; }
+
+inline const boost::optional<std::string>& InfoDefinition::Source() const { return source_; }
+
+inline InfoDefinition& InfoDefinition::Source(std::string s)
+{
+    source_ = std::move(s); return *this;
+}
+
+inline const std::string& InfoDefinition::Type() const { return type_; }
+
+inline const boost::optional<std::string>& InfoDefinition::Version() const { return version_; }
+
+inline InfoDefinition& InfoDefinition::Version(std::string v)
+{
+    version_ = std::move(v); return *this;
+}
+
+} // namespace VCF
+} // namespace PacBio
+
+#include "pbbam/vcf/internal/VcfHeaderTypes.inl"
+
+#endif // PBBAM_VCF_VCFHEADERTYPES_INL
diff --git a/include/pbbam/vcf/internal/VcfVariant.inl b/include/pbbam/vcf/internal/VcfVariant.inl

new file mode 100644 (file)

index 0000000..84aee2a
--- /dev/null
+++ b/include/pbbam/vcf/internal/VcfVariant.inl
@@ -0,0 +1,274 @@
+#ifndef PBBAM_VCF_VCFVcfVariant_INL
+#define PBBAM_VCF_VCFVcfVariant_INL
+
+#include <pbbam/vcf/VcfVariant.h>
+
+#include <cmath>
+
+#include <pbbam/StringUtilities.h>
+
+namespace PacBio {
+namespace VCF {
+
+inline  VcfVariant::VcfVariant()
+    : pos_{PacBio::BAM::UnmappedPosition}
+    , qual_{NAN}
+    , filter_{"PASS"}
+{
+}
+
+inline VcfVariant::VcfVariant(
+        std::string id, std::string chrom, PacBio::BAM::Position pos,
+        std::string refAllele, std::string altAllele)
+    : chrom_{std::move(chrom)}
+    , pos_{pos}
+    , id_{std::move(id)}
+    , refAllele_{std::move(refAllele)}
+    , altAllele_{std::move(altAllele)}
+    , qual_{NAN}
+    , filter_{"PASS"}
+{
+}
+
+/// Adds \p field, or overwrites the stored field that has the same id.
+/// \returns *this
+inline VcfVariant& VcfVariant::AddInfoField(InfoField field)
+{
+    const auto existing = infoLookup_.find(field.id);
+    if (existing != infoLookup_.cend())
+    {
+        // known id: replace in place, index stays the same
+        infoFields_.at(existing->second) = std::move(field);
+        return *this;
+    }
+
+    // new id: remember the slot it is about to occupy, then append
+    infoLookup_.insert({field.id, infoFields_.size()});
+    infoFields_.push_back(std::move(field));
+    return *this;
+}
+
+/// \returns the alternate allele
+inline const std::string& VcfVariant::AltAllele() const
+{
+    return altAllele_;
+}
+
+/// Sets the alternate allele.
+/// \returns *this
+inline VcfVariant& VcfVariant::AltAllele(std::string altAllele)
+{
+    altAllele_ = std::move(altAllele); return *this;
+}
+
+/// \returns the chromosome name
+inline const std::string& VcfVariant::Chrom() const
+{
+    return chrom_;
+}
+
+/// Sets the chromosome name.
+/// \returns *this
+inline VcfVariant& VcfVariant::Chrom(std::string chrom)
+{
+    chrom_ = std::move(chrom); return *this;
+}
+
+/// \returns the filter string
+inline const std::string& VcfVariant::Filter() const
+{
+    return filter_;
+}
+
+/// Sets the filter string.
+/// \returns *this
+inline VcfVariant& VcfVariant::Filter(std::string filter)
+{
+    filter_ = std::move(filter); return *this;
+}
+
+/// \returns a copy of the genotype IDs, in stored order
+inline std::vector<std::string> VcfVariant::GenotypeIds() const { return format_; }
+
+/// Replaces the genotype IDs and rebuilds the ID -> position lookup.
+/// A repeated ID keeps the position of its first occurrence (insert() does
+/// not overwrite).
+/// \returns *this
+inline VcfVariant& VcfVariant::GenotypeIds(std::vector<std::string> ids)
+{
+    format_ = std::move(ids);
+    genotypeDataLookup_.clear();
+
+    size_t column = 0;
+    for (const auto& formatId : format_) {
+        genotypeDataLookup_.insert({formatId, column});
+        ++column;
+    }
+    return *this;
+}
+
+/// \returns a copy of the per-sample genotype fields
+inline std::vector<GenotypeField> VcfVariant::Genotypes() const { return sampleGenotypes_; }
+
+/// Replaces the per-sample genotype fields.
+/// \returns *this
+inline VcfVariant& VcfVariant::Genotypes(std::vector<GenotypeField> genotypes)
+{
+    sampleGenotypes_ = std::move(genotypes); return *this;
+}
+
+inline const boost::optional<std::string>& VcfVariant::GenotypeValue(
+        const size_t sampleIndex, const std::string& id) const
+{
+    const auto& genotypeField = sampleGenotypes_.at(sampleIndex);
+    const auto genotypeDataIndex = genotypeDataLookup_.at(id);
+    const auto& genotypeData = genotypeField.data.at(genotypeDataIndex);
+    return genotypeData.value;
+}
+
+inline VcfVariant& VcfVariant::GenotypeValue(
+        const size_t sampleIndex, const std::string& id, boost::optional<std::string> value)
+{
+    auto& genotypeField = sampleGenotypes_.at(sampleIndex);
+    const auto genotypeDataIndex = genotypeDataLookup_.at(id);
+    auto& genotypeData = genotypeField.data.at(genotypeDataIndex);
+    genotypeData.value = std::move(value);
+    return *this;
+}
+
+inline const boost::optional<std::vector<std::string>>& VcfVariant::GenotypeValues(
+        const size_t sampleIndex, const std::string& id) const
+{
+    const auto& genotypeField = sampleGenotypes_.at(sampleIndex);
+    const auto genotypeDataIndex = genotypeDataLookup_.at(id);
+    const auto& genotypeData = genotypeField.data.at(genotypeDataIndex);
+    return genotypeData.values;
+}
+
+inline VcfVariant& VcfVariant::GenotypeValues(
+        const size_t sampleIndex, const std::string& id,
+        boost::optional<std::vector<std::string>> values)
+{
+    auto& genotypeField = sampleGenotypes_.at(sampleIndex);
+    const auto genotypeDataIndex = genotypeDataLookup_.at(id);
+    auto& genotypeData = genotypeField.data.at(genotypeDataIndex);
+    genotypeData.values = std::move(values);
+    return *this;
+}
+
+/// \returns true if an INFO field with \p id is stored
+inline bool VcfVariant::HasInfoField(const std::string& id) const
+{
+    return infoLookup_.count(id) != 0;
+}
+
+/// \returns the record ID
+inline const std::string& VcfVariant::Id() const
+{
+    return id_;
+}
+
+/// Sets the record ID.
+/// \returns *this
+inline VcfVariant& VcfVariant::Id(std::string id)
+{
+    id_ = std::move(id); return *this;
+}
+
+/// \returns all INFO fields, in stored order
+inline const std::vector<InfoField>& VcfVariant::InfoFields() const { return infoFields_; }
+
+/// Replaces all INFO fields with \p fields (routed through AddInfoField, so a
+/// repeated id keeps only the last entry).
+/// \returns *this
+inline VcfVariant& VcfVariant::InfoFields(std::vector<InfoField> fields)
+{
+    infoLookup_.clear();
+    infoFields_.clear();
+    for (auto& infoField : fields) {
+        AddInfoField(std::move(infoField));
+    }
+    return *this;
+}
+
+/// \returns a copy of the single-value slot of INFO field \p id; the lookup
+///          uses at(), so an unknown id throws
+inline const boost::optional<std::string> VcfVariant::InfoValue(const std::string& id) const
+{
+    const auto slot = infoLookup_.at(id);
+    return infoFields_.at(slot).value;
+}
+
+/// Sets the single-value slot of INFO field \p id (unknown id throws via at()).
+/// \returns *this
+inline VcfVariant& VcfVariant::InfoValue(const std::string& id, boost::optional<std::string> value)
+{
+    const auto slot = infoLookup_.at(id);
+    infoFields_.at(slot).value = std::move(value);
+    return *this;
+}
+
+/// \returns a copy of the multi-value slot of INFO field \p id; the lookup
+///          uses at(), so an unknown id throws
+inline const boost::optional<std::vector<std::string>> VcfVariant::InfoValues(const std::string& id) const
+{
+    const auto slot = infoLookup_.at(id);
+    return infoFields_.at(slot).values;
+}
+
+/// Sets the multi-value slot of INFO field \p id (unknown id throws via at()).
+/// \returns *this
+inline VcfVariant& VcfVariant::InfoValues(const std::string& id, boost::optional<std::vector<std::string>> values)
+{
+    const auto slot = infoLookup_.at(id);
+    infoFields_.at(slot).values = std::move(values);
+    return *this;
+}
+
+/// \returns true if the alt allele is shorter than the ref allele
+inline bool VcfVariant::IsDeletion() const { return altAllele_.size() < refAllele_.size(); }
+
+/// \returns true if the alt allele is longer than the ref allele
+inline bool VcfVariant::IsInsertion() const { return altAllele_.size() > refAllele_.size(); }
+
+/// \returns true if the stored quality is NaN (the value both inline
+///          constructors start with)
+inline bool VcfVariant::IsQualityMissing() const { return std::isnan(qual_); }
+
+inline bool VcfVariant::IsSampleHeterozygous(const size_t sampleIndex) const
+{
+    const auto data = GenotypeValue(sampleIndex, "GT");
+    auto fields = PacBio::BAM::Split(data.get(), '/');
+    if (fields.size() == 1)
+        fields = PacBio::BAM::Split(data.get(), '|');
+
+    if (fields.size() != 2)
+        throw std::runtime_error{"VCF format error: malformatted GT field: " + data.get()};
+
+    return fields.at(0) != fields.at(1);
+}
+
+inline bool VcfVariant::IsSamplePhased(const size_t sampleIndex) const
+{
+    const auto data = GenotypeValue(sampleIndex, "GT");
+    const auto phaseFound = data.get().find('|') != std::string::npos;
+    if (phaseFound) assert(data.get().find('/') == std::string::npos);
+    return phaseFound;
+}
+
+/// \returns true if ref and alt alleles are both exactly one character long
+///          and those characters differ
+inline bool VcfVariant::IsSnp() const
+{
+    if (refAllele_.size() != 1) return false;
+    if (altAllele_.size() != 1) return false;
+    return refAllele_[0] != altAllele_[0];
+}
+
+/// \returns the stored position
+inline PacBio::BAM::Position VcfVariant::Position() const
+{
+    return pos_;
+}
+
+/// Sets the position.
+/// \returns *this
+inline VcfVariant& VcfVariant::Position(PacBio::BAM::Position pos)
+{
+    pos_ = pos; return *this;
+}
+
+/// \returns the stored quality (NaN when missing, see IsQualityMissing())
+inline float VcfVariant::Quality() const
+{
+    return qual_;
+}
+
+/// Sets the quality.
+/// \returns *this
+inline VcfVariant& VcfVariant::Quality(float qual)
+{
+    qual_ = qual; return *this;
+}
+
+/// \returns the reference allele
+inline const std::string& VcfVariant::RefAllele() const
+{
+    return refAllele_;
+}
+
+/// Sets the reference allele.
+/// \returns *this
+inline VcfVariant& VcfVariant::RefAllele(std::string refAllele)
+{
+    refAllele_ = std::move(refAllele); return *this;
+}
+
+inline VcfVariant& VcfVariant::RemoveInfoField(const std::string& id)
+{
+    const auto found = infoLookup_.find(id);
+    if (found == infoLookup_.cend())
+        return *this;
+
+    const auto currentFields = InfoFields();
+
+    infoFields_.clear();
+    infoLookup_.clear();
+
+    for (auto&& field : currentFields)
+    {
+        if (field.id != id)
+            AddInfoField(std::move(field));
+    }
+
+    return *this;
+}
+
+} // namespace VCF
+} // namespace PacBio
+
+#endif // PBBAM_VCF_VCFVcfVariant_INL
diff --git a/include/pbbam/virtual/VirtualPolymeraseBamRecord.h b/include/pbbam/virtual/VirtualPolymeraseBamRecord.h

new file mode 100644 (file)

index 0000000..f51b6ca
--- /dev/null
+++ b/include/pbbam/virtual/VirtualPolymeraseBamRecord.h
@@ -0,0 +1,21 @@
+// File Description
+/// \file VirtualPolymeraseBamRecord.h
+/// \brief Defines the VirtualPolymeraseBamRecord class.
+//
+// Author: Armin Töpfer
+
+#ifndef VIRTUALPOLYMERASEBAMRECORD_H
+#define VIRTUALPOLYMERASEBAMRECORD_H
+
+#include "pbbam/virtual/VirtualZmwBamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \deprecated Legacy alias kept for existing code; use VirtualZmwBamRecord instead.
+using VirtualPolymeraseBamRecord = VirtualZmwBamRecord;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VIRTUALPOLYMERASEBAMRECORD_H
diff --git a/include/pbbam/virtual/VirtualPolymeraseCompositeReader.h b/include/pbbam/virtual/VirtualPolymeraseCompositeReader.h

new file mode 100644 (file)

index 0000000..5ee14fc
--- /dev/null
+++ b/include/pbbam/virtual/VirtualPolymeraseCompositeReader.h
@@ -0,0 +1,22 @@
+// File Description
+/// \file VirtualPolymeraseCompositeReader.h
+/// \brief Defines the VirtualPolymeraseCompositeReader class.
+//
+// Author: Derek Barnett
+
+#ifndef VIRTUALPOLYMERASECOMPOSITEREADER_H
+#define VIRTUALPOLYMERASECOMPOSITEREADER_H
+
+#include "pbbam/virtual/VirtualPolymeraseBamRecord.h"
+#include "pbbam/virtual/ZmwReadStitcher.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \deprecated Legacy alias kept for existing code; use ZmwReadStitcher instead.
+using VirtualPolymeraseCompositeReader = ZmwReadStitcher;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VIRTUALPOLYMERASECOMPOSITEREADER_H
diff --git a/include/pbbam/virtual/VirtualPolymeraseReader.h b/include/pbbam/virtual/VirtualPolymeraseReader.h

new file mode 100644 (file)

index 0000000..25b179c
--- /dev/null
+++ b/include/pbbam/virtual/VirtualPolymeraseReader.h
@@ -0,0 +1,22 @@
+// File Description
+/// \file VirtualPolymeraseReader.h
+/// \brief Defines the VirtualPolymeraseReader class.
+//
+// Author: Armin Töpfer
+
+#ifndef VIRTUALPOLYMERASEREADER_H
+#define VIRTUALPOLYMERASEREADER_H
+
+#include "pbbam/virtual/VirtualPolymeraseBamRecord.h"
+#include "pbbam/virtual/ZmwReadStitcher.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \deprecated Legacy alias kept for existing code; use ZmwReadStitcher instead.
+using VirtualPolymeraseReader = ZmwReadStitcher;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VIRTUALPOLYMERASEREADER_H
diff --git a/include/pbbam/virtual/VirtualRegion.h b/include/pbbam/virtual/VirtualRegion.h

new file mode 100644 (file)

index 0000000..b792b2c
--- /dev/null
+++ b/include/pbbam/virtual/VirtualRegion.h
@@ -0,0 +1,80 @@
+// File Description
+/// \file VirtualRegion.h
+/// \brief Defines the VirtualRegion class.
+//
+// Author: Armin Töpfer
+
+#ifndef VIRTUALREGION_H
+#define VIRTUALREGION_H
+
+#include "pbbam/Config.h"
+#include "pbbam/LocalContextFlags.h"
+#include "pbbam/virtual/VirtualRegionType.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The VirtualRegion represents an annotation of a polymerase region.
+///
+struct VirtualRegion
+{
+public:
+    VirtualRegionType type;  ///< Kind of annotation (no default member initializer).
+    int beginPos;            ///< Begin position of the region (no default member initializer).
+    int endPos;              ///< End position of the region (no default member initializer).
+    LocalContextFlags cxTag = LocalContextFlags::NO_LOCAL_CONTEXT;  ///< Local context flags.
+    int barcodeLeft = -1;    ///< Left barcode; -1 by default (presumably "not set").
+    int barcodeRight = -1;   ///< Right barcode; -1 by default (presumably "not set").
+    int score = 0;           ///< Score attached to the region; 0 by default.
+
+public:
+    /// \brief Creates a virtual region with basic type & position info.
+    ///
+    /// \param[in] type_      kind of annotation
+    /// \param[in] beginPos_  begin position of the region
+    /// \param[in] endPos_    end position of the region
+    /// \param[in] score_     score attached to the region (defaults to 0)
+    ///
+    VirtualRegion(const VirtualRegionType type_, const int beginPos_, const int endPos_,
+                  const int score_ = 0);
+
+    /// \brief Creates a virtual region with type/position info, as well as context & barcode.
+    ///
+    /// \param[in] type_          kind of annotation
+    /// \param[in] beginPos_      begin position of the region
+    /// \param[in] endPos_        end position of the region
+    /// \param[in] cxTag_         local context flags
+    /// \param[in] barcodeLeft_   left barcode
+    /// \param[in] barcodeRight_  right barcode
+    /// \param[in] score_         score attached to the region (defaults to 0)
+    ///
+    VirtualRegion(const VirtualRegionType type_, const int beginPos_, const int endPos_,
+                  const LocalContextFlags cxTag_, const int barcodeLeft_, const int barcodeRight_,
+                  const int score_ = 0);
+
+    // NOTE: a default-initialized region (`VirtualRegion r;`) leaves type, beginPos and
+    //       endPos without a defined value, since those members have no default initializer.
+    VirtualRegion() = default;
+    VirtualRegion(const VirtualRegion&) = default;
+    VirtualRegion(VirtualRegion&&) = default;
+    VirtualRegion& operator=(const VirtualRegion&) = default;  // un-"delete"-ed for SWIG
+    VirtualRegion& operator=(VirtualRegion&&) = default;
+    ~VirtualRegion() = default;
+
+    /// \returns true if \p v1 is equal to this region (see the inline definition below
+    ///          for the members taking part in the comparison).
+    bool operator==(const VirtualRegion& v1) const;
+};
+
+/// Basic constructor: sets type, positions and score from the arguments.
+///
+/// cxTag, barcodeLeft and barcodeRight are intentionally absent from the initializer
+/// list, so that they take their in-class default member initializers
+/// (LocalContextFlags::NO_LOCAL_CONTEXT, -1 and -1). An explicit `cxTag{}` here would
+/// value-initialize the flag instead and silently bypass that declared default.
+inline VirtualRegion::VirtualRegion(const VirtualRegionType type_, const int beginPos_,
+                                    const int endPos_, const int score_)
+    : type{type_}, beginPos{beginPos_}, endPos{endPos_}, score{score_}
+{
+}
+
+/// Full constructor: every data member is set from its corresponding argument, so none
+/// of the in-class default member initializers is used.
+inline VirtualRegion::VirtualRegion(const VirtualRegionType type_, const int beginPos_,
+                                    const int endPos_, const LocalContextFlags cxTag_,
+                                    const int barcodeLeft_, const int barcodeRight_,
+                                    const int score_)
+    : type{type_}
+    , beginPos{beginPos_}
+    , endPos{endPos_}
+    , cxTag{cxTag_}
+    , barcodeLeft{barcodeLeft_}
+    , barcodeRight{barcodeRight_}
+    , score{score_}
+{
+}
+
+/// Two regions compare equal when their type, begin position and end position match.
+///
+/// \note cxTag, barcodeLeft, barcodeRight and score do not take part in the comparison.
+inline bool VirtualRegion::operator==(const VirtualRegion& v1) const
+{
+    if (type != v1.type) return false;
+    if (beginPos != v1.beginPos) return false;
+    return endPos == v1.endPos;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VIRTUALREGION_H
diff --git a/include/pbbam/virtual/VirtualRegionType.h b/include/pbbam/virtual/VirtualRegionType.h

new file mode 100644 (file)

index 0000000..c23d3ee
--- /dev/null
+++ b/include/pbbam/virtual/VirtualRegionType.h
@@ -0,0 +1,30 @@
+// File Description
+/// \file VirtualRegionType.h
+/// \brief Defines the VirtualRegionType enum.
+//
+// Author: Derek Barnett
+
+// Include guard named after this header (VirtualRegionType.h), like the other headers
+// in this directory, so it cannot clash with a guard belonging to some "RegionType.h".
+#ifndef VIRTUALREGIONTYPE_H
+#define VIRTUALREGIONTYPE_H
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief This enum defines the types of annotated region.
+///
+/// Each enumerator's value is the ASCII code of the single-character region code
+/// quoted in its description (e.g. ADAPTER == 0x41 == 'A').
+///
+enum class VirtualRegionType  // : char
+{
+    ADAPTER = 0x41,   ///< Adapter region ('A')
+    BARCODE = 0x42,   ///< Barcode region ('B')
+    FILTERED = 0x46,  ///< Filtered subread ('F')
+    SUBREAD = 0x53,   ///< Subread ('S')
+    HQREGION = 0x48,  ///< High-quality region ('H')
+    LQREGION = 0x4C   ///< Low-quality region ('L'), i.e. outside the HQ region
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VIRTUALREGIONTYPE_H
diff --git a/include/pbbam/virtual/VirtualRegionTypeMap.h b/include/pbbam/virtual/VirtualRegionTypeMap.h

new file mode 100644 (file)

index 0000000..6f5c7e0
--- /dev/null
+++ b/include/pbbam/virtual/VirtualRegionTypeMap.h
@@ -0,0 +1,30 @@
+// File Description
+/// \file VirtualRegionTypeMap.h
+/// \brief Defines the VirtualRegionTypeMap class.
+//
+// Author: Derek Barnett
+
+#ifndef VIRTUALREGIONTYPEMAP_H
+#define VIRTUALREGIONTYPEMAP_H
+
+#include <map>
+
+#include "pbbam/Config.h"
+#include "pbbam/virtual/VirtualRegionType.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The VirtualRegionTypeMap class provides mapping between char codes and
+///        VirtualRegionType enum keys.
+///
+class VirtualRegionTypeMap
+{
+public:
+    /// Lookup table from a region's char code to the matching VirtualRegionType.
+    ///
+    /// \note This is a public, non-const static member, i.e. shared and modifiable
+    ///       state. It is only declared here; its definition (and contents) are not
+    ///       part of this header.
+    static std::map<char, VirtualRegionType> ParseChar;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VIRTUALREGIONTYPEMAP_H
diff --git a/include/pbbam/virtual/VirtualZmwBamRecord.h b/include/pbbam/virtual/VirtualZmwBamRecord.h

new file mode 100644 (file)

index 0000000..368f4a9
--- /dev/null
+++ b/include/pbbam/virtual/VirtualZmwBamRecord.h
@@ -0,0 +1,86 @@
+// File Description
+/// \file VirtualZmwBamRecord.h
+/// \brief Defines the VirtualZmwBamRecord class.
+//
+// Author: Armin Töpfer
+
+#ifndef VirtualZmwBAMRECORD_H
+#define VirtualZmwBAMRECORD_H
+
+#include <sstream>
+#include <vector>
+
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/Config.h"
+#include "pbbam/virtual/VirtualRegion.h"
+#include "pbbam/virtual/VirtualRegionType.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \brief The VirtualZmwBamRecord class represents a ZMW read stitched
+///        on-the-fly from subreads|hqregion + scraps.
+///
+class VirtualZmwBamRecord : public BamRecord
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a "virtual" ZMW %BAM record, by re-stitching its
+    ///        constituent segments.
+    ///
+    /// \param[in] unorderedSources source data (subreads, scraps, etc.)
+    /// \param[in] header           %BAM header to associate with the new record
+    ///
+    /// \throws std::runtime_error on failure to stitch virtual record
+    ///
+    VirtualZmwBamRecord(std::vector<BamRecord> unorderedSources, const BamHeader& header);
+
+    VirtualZmwBamRecord() = delete;
+    VirtualZmwBamRecord(const VirtualZmwBamRecord&) = default;
+    VirtualZmwBamRecord(VirtualZmwBamRecord&&) = default;
+    VirtualZmwBamRecord& operator=(const VirtualZmwBamRecord&) = default;
+    VirtualZmwBamRecord& operator=(VirtualZmwBamRecord&&) = default;
+    virtual ~VirtualZmwBamRecord() = default;
+
+    /// \}
+
+public:
+    /// \name Virtual Record Attributes
+    ///
+
+    /// \returns true if requested VirtualRegionType has been annotated.
+    ///
+    bool HasVirtualRegionType(const VirtualRegionType regionType) const;
+
+    /// \returns IPD frame data
+    ///
+    Frames IPDV1Frames(Orientation orientation = Orientation::NATIVE) const;
+
+    /// \brief Provides all annotations of the polymerase read as a map (type => regions)
+    ///
+    std::map<VirtualRegionType, std::vector<VirtualRegion>> VirtualRegionsMap() const;
+
+    /// \brief Provides annotations of the polymerase read for a given VirtualRegionType.
+    ///
+    /// \param[in] regionType  requested region type
+    /// \returns regions that match the requested type (empty vector if none found).
+    ///
+    std::vector<VirtualRegion> VirtualRegionsTable(const VirtualRegionType regionType) const;
+
+    /// \}
+
+private:
+    std::vector<BamRecord> sources_;
+    std::map<VirtualRegionType, std::vector<VirtualRegion>> virtualRegionsMap_;
+
+private:
+    void StitchSources();
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VirtualZmwBAMRECORD_H
diff --git a/include/pbbam/virtual/WhitelistedZmwReadStitcher.h b/include/pbbam/virtual/WhitelistedZmwReadStitcher.h

new file mode 100644 (file)

index 0000000..32c19ba
--- /dev/null
+++ b/include/pbbam/virtual/WhitelistedZmwReadStitcher.h
@@ -0,0 +1,108 @@
+// File Description
+/// \file WhitelistedZmwReadStitcher.h
+/// \brief Defines the WhitelistedZmwReadStitcher class.
+//
+// Author: Derek Barnett
+
+#ifndef WHITELISTEDZMWREADSTITCHER_H
+#define WHITELISTEDZMWREADSTITCHER_H
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+#include "pbbam/Config.h"
+#include "pbbam/virtual/VirtualZmwBamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+
+class DataSet;
+class PbiFilter;
+
+/// \brief The WhitelistedZmwReadStitcher class provides an interface for
+///        re-stitching "virtual" ZMW reads from their constituent parts,
+///        limiting results to only those reads originating from a 'whitelist'
+///         of ZMW hole numbers.
+///
+/// Whitelisted ZMWs that are not present in both primary and scraps BAMs
+/// will be "pre-removed." This ensures that, given client code like this:
+///
+/// \include code/WhitelistedZmwReadStitcher.txt
+///
+/// each iteration will always provide valid data - either a valid virtual
+/// record from Next() or a non-empty vector from NextRaw().
+///
+/// \note This reader requires that both input %BAM files also have associated
+///       PBI files available for query. See BamFile::EnsurePacBioIndexExists .
+///
+class PBBAM_EXPORT WhitelistedZmwReadStitcher
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a reader that will operate on a primary %BAM file (e.g. subread data)
+    ///        and a scraps file, using a ZMW whitelist to filter the input.
+    ///
+    /// \param[in] zmwWhitelist         list of ZMWs to restrict iteration over
+    /// \param[in] primaryBamFilePath   hqregion.bam or subreads.bam file path
+    /// \param[in] scrapsBamFilePath    scraps.bam file path
+    ///
+    /// \note This reader requires that both input %BAM files also have associated PBI
+    ///       files available for query. See BamFile::EnsurePacBioIndexExists .
+    ///
+    /// \throws std::runtime_error if any files (*.bam and/or *.pbi) were not available for reading, or
+    ///         if malformed data encountered
+    ///
+    WhitelistedZmwReadStitcher(const std::vector<int32_t>& zmwWhitelist,
+                               const std::string& primaryBamFilePath,
+                               const std::string& scrapsBamFilePath);
+
+    // The reader is neither default-constructible, copyable nor movable.
+    WhitelistedZmwReadStitcher() = delete;
+    WhitelistedZmwReadStitcher(const WhitelistedZmwReadStitcher&) = delete;
+    WhitelistedZmwReadStitcher(WhitelistedZmwReadStitcher&&) = delete;
+    WhitelistedZmwReadStitcher& operator=(const WhitelistedZmwReadStitcher&) = delete;
+    WhitelistedZmwReadStitcher& operator=(WhitelistedZmwReadStitcher&&) = delete;
+
+    // Declared only: the private implementation type is incomplete in this header, so
+    // the destructor has to be defined where that type is complete.
+    ~WhitelistedZmwReadStitcher();
+
+    /// \}
+
+public:
+    /// \name Stitched Record Reading
+    /// \{
+
+    /// \returns true if more ZMWs are available for reading.
+    bool HasNext() const;
+
+    /// \returns the re-stitched polymerase read from the next ZMW in the whitelist
+    VirtualZmwBamRecord Next();
+
+    /// \returns the set of reads that belong to the next ZMW in the whitelist.
+    ///          This enables stitching records in a distinct thread.
+    ///
+    std::vector<BamRecord> NextRaw();
+
+    /// \}
+
+public:
+    /// \name File Headers
+    /// \{
+
+    /// \returns the BamHeader associated with this reader's "primary" %BAM file
+    BamHeader PrimaryHeader() const;
+
+    /// \returns the BamHeader associated with this reader's "scraps" %BAM file
+    BamHeader ScrapsHeader() const;
+
+    /// \}
+
+private:
+    // Opaque implementation, defined outside this header.
+    struct WhitelistedZmwReadStitcherPrivate;
+    std::unique_ptr<WhitelistedZmwReadStitcherPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // WHITELISTEDZMWREADSTITCHER_H
diff --git a/include/pbbam/virtual/ZmwReadStitcher.h b/include/pbbam/virtual/ZmwReadStitcher.h

new file mode 100644 (file)

index 0000000..60b20aa
--- /dev/null
+++ b/include/pbbam/virtual/ZmwReadStitcher.h
@@ -0,0 +1,91 @@
+// File Description
+/// \file ZmwReadStitcher.h
+/// \brief Defines the ZmwReadStitcher class.
+//
+// Author: Derek Barnett
+
+#ifndef ZMWREADSTITCHER_H
+#define ZMWREADSTITCHER_H
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "pbbam/BamRecord.h"
+#include "pbbam/Config.h"
+#include "pbbam/virtual/VirtualZmwBamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+
+class DataSet;
+class PbiFilter;
+
+/// \brief The ZmwReadStitcher class provides an interface for re-stitching
+///        "virtual" polymerase reads from their constituent parts.
+///
+/// \note This reader requires that any input %BAM files also have associated PBI
+///       files available for query. See BamFile::EnsurePacBioIndexExists .
+///
+class PBBAM_EXPORT ZmwReadStitcher
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Creates a reader over the entire contents of a pair of %BAM files.
+    ///
+    /// \param[in] primaryBamFilePath   path to the "primary" %BAM file
+    /// \param[in] scrapsBamFilePath    path to the "scraps" %BAM file
+    ///
+    ZmwReadStitcher(std::string primaryBamFilePath, std::string scrapsBamFilePath);
+
+    /// \brief Creates a reader over a pair of %BAM files, with a filter applied to the input.
+    ///
+    /// \param[in] primaryBamFilePath   path to the "primary" %BAM file
+    /// \param[in] scrapsBamFilePath    path to the "scraps" %BAM file
+    /// \param[in] filter               filter used to restrict the input
+    ///
+    ZmwReadStitcher(std::string primaryBamFilePath, std::string scrapsBamFilePath,
+                    PbiFilter filter);
+
+    /// \brief Creates a reader from DataSet input (maybe filtered).
+    ///
+    /// \note This single-argument constructor is not marked explicit, so a DataSet
+    ///       converts implicitly to a ZmwReadStitcher.
+    ///
+    /// \param[in] dataset  input dataset
+    ///
+    ZmwReadStitcher(const DataSet& dataset);
+
+    // The reader is neither default-constructible, copyable nor movable.
+    ZmwReadStitcher() = delete;
+    ZmwReadStitcher(const ZmwReadStitcher&) = delete;
+    ZmwReadStitcher(ZmwReadStitcher&&) = delete;
+    ZmwReadStitcher& operator=(const ZmwReadStitcher&) = delete;
+    ZmwReadStitcher& operator=(ZmwReadStitcher&&) = delete;
+
+    // Declared only: the private implementation type is incomplete in this header, so
+    // the destructor has to be defined where that type is complete.
+    ~ZmwReadStitcher();
+
+    /// \}
+
+public:
+    /// \name File Headers
+    /// \{
+
+    /// \returns the BamHeader associated with this reader's "primary" %BAM file
+    BamHeader PrimaryHeader() const;
+
+    /// \returns the BamHeader associated with this reader's "scraps" %BAM file
+    BamHeader ScrapsHeader() const;
+
+    /// \}
+
+public:
+    /// \name Stitched Record Reading
+    /// \{
+
+    /// \returns true if more ZMWs are available for reading.
+    ///
+    /// \note Unlike the accessors above, this method is not declared const.
+    bool HasNext();
+
+    /// \returns the next stitched polymerase read
+    VirtualZmwBamRecord Next();
+
+    /// \returns the next set of reads that belong to one ZMW.
+    ///          This enables stitching records in a distinct thread.
+    ///
+    std::vector<BamRecord> NextRaw();
+
+    /// \}
+
+private:
+    // Opaque implementation, defined outside this header.
+    struct ZmwReadStitcherPrivate;
+    std::unique_ptr<ZmwReadStitcherPrivate> d_;
+};
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ZMWREADSTITCHER_H
diff --git a/include/pbbam/virtual/ZmwWhitelistVirtualReader.h b/include/pbbam/virtual/ZmwWhitelistVirtualReader.h

new file mode 100644 (file)

index 0000000..24eaa8e
--- /dev/null
+++ b/include/pbbam/virtual/ZmwWhitelistVirtualReader.h
@@ -0,0 +1,21 @@
+// File Description
+/// \file ZmwWhitelistVirtualReader.h
+/// \brief Defines the ZmwWhitelistVirtualReader class.
+//
+// Author: Derek Barnett
+
+#ifndef ZMWWHITELISTVIRTUALREADER_H
+#define ZMWWHITELISTVIRTUALREADER_H
+
+#include "pbbam/virtual/WhitelistedZmwReadStitcher.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// \deprecated Use WhitelistedZmwReadStitcher instead.
+typedef WhitelistedZmwReadStitcher ZmwWhitelistVirtualReader;
+
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ZMWWHITELISTVIRTUALREADER_H
diff --git a/meson.build b/meson.build

new file mode 100644 (file)

index 0000000..af75f02
--- /dev/null
+++ b/meson.build
@@ -0,0 +1,129 @@
+project(
+  'PacBioBAM',
+  'cpp',
+  version : '0.19.0',
+  default_options : [
+    'buildtype=release',
+    'warning_level=3',
+    'cpp_std=c++11',
+    'b_ndebug=if-release'],
+  license : 'BSD-3',
+  meson_version : '>= 0.46.0')
+
+############
+# CXXFLAGS #
+############
+
+pbbam_warning_flags = []
+cpp = meson.get_compiler('cpp')
+foreach cflag: [
+  '-Wduplicated-cond',
+  '-Wduplicated-branches',
+  '-Wlogical-op',
+  '-Wrestrict',
+  '-Wnull-dereference',
+  '-Wuseless-cast',
+  '-Wdouble-promotion',
+  '-Wshadow',
+  '-Wformat=1']
+    if cpp.has_argument(cflag)
+      pbbam_warning_flags += cflag
+    endif
+endforeach
+
+pbbam_macros = []
+if get_option('permissive-cigar')
+  pbbam_macros += ['-DPBBAM_PERMISSIVE_CIGAR']
+  warning('**********************************************')
+  warning('* You have enabled allowing "M" in BAM files *')
+  warning('*    This is an unsupported combination!     *')
+  warning('**********************************************')
+endif
+
+################
+# dependencies #
+################
+
+# threads
+pbbam_thread_dep = dependency('threads', required : true)
+
+# boost
+pbbam_boost_dep = dependency('boost', required : true)
+
+# TODO(dseifert): Add test for Winsock on Windows
+# Winsock for htslib on Windows
+#if(WIN32)
+#    set(SOCKET_LIBRARIES "ws2_32")
+#endif()
+
+# zlib
+pbbam_zlib_dep = dependency('zlib', required : true)
+
+# htslib
+pbbam_htslib_dep = dependency('htslib', required : true, version : '>=1.4')
+
+###########
+# headers #
+###########
+
+subdir('include')
+
+#####################
+# sources + library #
+#####################
+
+subdir('src')
+
+#########
+# tests #
+#########
+
+if not meson.is_subproject()
+  if get_option('build-tools') or get_option('tests')
+    pbbam_python = find_program('python')
+
+    if get_option('tests')
+      pbbam_clang_formatter = find_program('tools/check-formatting')
+      subdir('tests')
+    endif
+  endif
+endif
+
+#########
+# tools #
+#########
+
+if not meson.is_subproject()
+  if get_option('build-tools') or get_option('tests')
+    subdir('tools')
+  endif
+endif
+
+#################
+# documentation #
+#################
+
+if get_option('build-docs')
+  subdir('docs')
+endif
+
+###################
+# dependency info #
+###################
+
+if not meson.is_subproject()
+  import('pkgconfig').generate(
+    libraries : pbbam_lib,
+    version : meson.project_version(),
+    name : 'pbbam',
+    requires : 'htslib',
+    filebase : 'pbbam',
+    description : 'Library for accessing PacBio-compatible BAM files')
+endif
+
+pbbam_dep = declare_dependency(
+  include_directories : pbbam_include_directories,
+  link_with : pbbam_lib,
+  dependencies : pbbam_htslib_dep,
+  version : meson.project_version(),
+  compile_args : pbbam_macros)
diff --git a/meson_options.txt b/meson_options.txt

new file mode 100644 (file)

index 0000000..56dbe36
--- /dev/null
+++ b/meson_options.txt
@@ -0,0 +1,24 @@
+option('build-tools',
+    type : 'boolean',
+    value : true,
+    description : 'Build PacBioBAM command line utilities (e.g. pbindex)')
+
+option('build-docs',
+    type : 'boolean',
+    value : false,
+    description : 'Build PacBioBAM\'s API documentation')
+
+option('auto-validate',
+    type : 'boolean',
+    value : false,
+    description : 'Build PacBioBAM with BAM validation')
+
+option('tests',
+    type : 'boolean',
+    value : true,
+    description : 'Enable dependencies required for testing')
+
+option('permissive-cigar',
+    type : 'boolean',
+    value : false,
+    description : 'Allows loading BAM records which contain "M" operations in CIGAR strings.')
diff --git a/scripts/ci/build.sh b/scripts/ci/build.sh

new file mode 100755 (executable)

index 0000000..5c402ec
--- /dev/null
+++ b/scripts/ci/build.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+set -vex
+
+#########
+# BUILD #
+#########
+
+# configure
+# '--wrap-mode nofallback' prevents meson from downloading
+# stuff from the internet or using subprojects.
+meson \
+  --werror \
+  --default-library shared \
+  --libdir lib \
+  --unity "${ENABLED_UNITY_BUILD:-off}" \
+  --wrap-mode nofallback \
+  --prefix "${PREFIX_ARG:-/usr/local}" \
+  -Dtests="${ENABLED_TESTS:-false}" \
+  "${CURRENT_BUILD_DIR:-build}" .
+
+# build
+ninja -C "${CURRENT_BUILD_DIR:-build}" -v
diff --git a/scripts/ci/install.sh b/scripts/ci/install.sh

new file mode 100755 (executable)

index 0000000..e155347
--- /dev/null
+++ b/scripts/ci/install.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+set -vex
+
+###########
+# INSTALL #
+###########
+
+if [[ ${PREFIX_ARG} ]]; then
+  ## Cleaning out old installation from /mnt/software
+  rm -rf "${PREFIX_ARG}"/*
+fi
+
+DESTDIR="${DESTDIR:-/}" ninja -C "${CURRENT_BUILD_DIR:-build}" -v install
diff --git a/scripts/ci/test.sh b/scripts/ci/test.sh

new file mode 100755 (executable)

index 0000000..92d3f93
--- /dev/null
+++ b/scripts/ci/test.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+set -vex
+
+########
+# TEST #
+########
+
+type module >& /dev/null || . /mnt/software/Modules/current/init/bash
+
+# Note: htslib v1.7 added native long CIGAR support. pbbam "spoofs" it 
+#       when running <1.7. So we'll always check the default htslib for 
+#       general test success/fail, and then check pre-/post-v1.7 explicitly
+#       to ensure we pass in either context (detectable at runtime).
+
+# default htslib
+ninja -C "${CURRENT_BUILD_DIR:-build}" -v test
+
+# explicit htslib v1.6
+module unload htslib
+module load htslib/1.6
+ninja -C "${CURRENT_BUILD_DIR:-build}" -v test
+
+# explicit htslib v1.7
+module unload htslib
+module load htslib/1.7
+ninja -C "${CURRENT_BUILD_DIR:-build}" -v test\
+
+# restore default
+module unload htslib
+module load htslib
diff --git a/src/Accuracy.cpp b/src/Accuracy.cpp

new file mode 100644 (file)

index 0000000..33816e8
--- /dev/null
+++ b/src/Accuracy.cpp
@@ -0,0 +1,18 @@
+// File Description
+/// \file Accuracy.cpp
+/// \brief Implements the Accuracy class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/Accuracy.h"
+
+namespace PacBio {
+namespace BAM {
+
+const float Accuracy::MIN = 0.0f;
+const float Accuracy::MAX = 1.0f;
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/AlignmentPrinter.cpp b/src/AlignmentPrinter.cpp

new file mode 100644 (file)

index 0000000..d0b096c
--- /dev/null
+++ b/src/AlignmentPrinter.cpp
@@ -0,0 +1,119 @@
+// File Description
+/// \file AlignmentPrinter.cpp
+/// \brief Implements the AlignmentPrinter class.
+//
+// Author: Armin Töpfer
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/AlignmentPrinter.h"
+
+#include <cmath>
+#include <cstddef>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+
+#include "pbbam/MakeUnique.h"
+
+namespace PacBio {
+namespace BAM {
+
+AlignmentPrinter::AlignmentPrinter(const IndexedFastaReader& ifr)
+    : ifr_{std::make_unique<IndexedFastaReader>(ifr)}
+{
+}
+
+std::string AlignmentPrinter::Print(const BamRecord& record, const Orientation orientation)
+{
+    const std::string seq{record.Sequence(orientation, true, true)};
+    const std::string ref{ifr_->ReferenceSubsequence(record, orientation, true, true)};
+
+    if (seq.size() != ref.size())
+        throw std::runtime_error{"Sequence and reference parts are of different size"};
+
+    int seqLength = 0;
+    float matches = 0;
+    std::string pretty;
+    Position refCoord = record.ReferenceStart();
+    Position seqCoord = record.QueryStart();
+
+    for (size_t i = 0; i < seq.size();) {
+        auto refCoordStr = std::to_string(refCoord);
+        auto seqCoordStr = std::to_string(seqCoord);
+
+        size_t maxCoordLength = std::max(refCoordStr.size(), seqCoordStr.size());
+        while (refCoordStr.size() < maxCoordLength)
+            refCoordStr = " " + refCoordStr;
+        while (seqCoordStr.size() < maxCoordLength)
+            seqCoordStr = " " + seqCoordStr;
+
+        std::string seqWrap{seqCoordStr + " : "};
+        std::string refWrap{refCoordStr + " : "};
+        std::string prettyWrap(maxCoordLength + 3, ' ');
+        prettyWrap.reserve(seq.size());
+
+        for (int j = 0; i < seq.size() && j < 40; ++i, ++j) {
+            refWrap += ref[i];
+
+            if (seq[i] == ref[i]) {
+                ++matches;
+                if (refCoord == 0 || refCoord % 10)
+                    prettyWrap += '|';
+                else {
+                    prettyWrap += "\033[1m\x1b[31m";
+                    prettyWrap += '|';
+                    prettyWrap += "\033[0m\x1b[39;49m";
+                }
+                seqWrap += seq[i];
+            } else if (seq[i] == '-' || ref[i] == '-') {
+                prettyWrap += ' ';
+                seqWrap += seq[i];
+            } else {
+                prettyWrap += '.';
+                seqWrap += "\033[1m\x1b[31m";
+                seqWrap += seq[i];
+                seqWrap += "\033[0m\x1b[39;49m";
+            }
+            if (seq[i] != '-') {
+                ++seqLength;
+                ++seqCoord;
+            }
+            if (ref[i] != '-') {
+                ++refCoord;
+            }
+        }
+
+        refCoordStr = std::to_string(refCoord);
+        seqCoordStr = std::to_string(seqCoord);
+
+        maxCoordLength = std::max(refCoordStr.size(), seqCoordStr.size());
+        while (refCoordStr.size() < maxCoordLength)
+            refCoordStr = " " + refCoordStr;
+        while (seqCoordStr.size() < maxCoordLength)
+            seqCoordStr = " " + seqCoordStr;
+
+        seqWrap += " : " + seqCoordStr;
+        refWrap += " : " + refCoordStr;
+
+        pretty += refWrap + '\n' + prettyWrap + '\n' + seqWrap + "\n\n";
+    }
+    const float similarity = matches / seq.size();
+
+    std::stringstream output;
+
+    output << "Read        : " << record.FullName() << std::endl;
+    output << "Reference   : " << record.ReferenceName() << std::endl;
+    output << std::endl;
+    output << "Read-length : " << seqLength << std::endl;
+    output << "Concordance : " << std::setprecision(3) << (similarity);
+    output << std::endl;
+    output << std::endl;
+    output << pretty;
+
+    return output.str();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/Autovalidate.h b/src/Autovalidate.h

new file mode 100644 (file)

index 0000000..c683519
--- /dev/null
+++ b/src/Autovalidate.h
@@ -0,0 +1,22 @@
+// File Description
+/// \file Autovalidate.h
+/// \brief Sets the default macro for the autovalidation mode.
+//
+// Author: Derek Barnett
+
+#ifndef AUTOVALIDATE_H
+#define AUTOVALIDATE_H
+
+// \brief Auto-validation
+//
+// To validate BAM components (header, records, etc.) you can either use the
+// Validator API provided, or enable auto-validation. To compile pbbam for
+// auto-validation, add the -DPacBioBAM_auto_validate=ON option to your cmake
+// invocation.
+//
+//
+#ifndef PBBAM_AUTOVALIDATE
+#define PBBAM_AUTOVALIDATE 0
+#endif
+
+#endif  // AUTOVALIDATE_H
diff --git a/src/BaiIndexedBamReader.cpp b/src/BaiIndexedBamReader.cpp

new file mode 100644 (file)

index 0000000..911701d
--- /dev/null
+++ b/src/BaiIndexedBamReader.cpp
@@ -0,0 +1,96 @@
+// File Description
+/// \file BaiIndexedBamReader.cpp
+/// \brief Implements the BaiIndexedBamReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BaiIndexedBamReader.h"
+
+#include <cstddef>
+
+#include "MemoryUtils.h"
+#include "pbbam/MakeUnique.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+struct BaiIndexedBamReaderPrivate
+{
+public:
+    BaiIndexedBamReaderPrivate(const BamFile& file, const GenomicInterval& interval)
+    {
+        LoadIndex(file.Filename());
+        Interval(file.Header(), interval);
+    }
+
+    void Interval(const BamHeader& header, const GenomicInterval& interval)
+    {
+        htsIterator_.reset();
+
+        if (header.HasSequence(interval.Name())) {
+            auto id = header.SequenceId(interval.Name());
+            if (id >= 0 && static_cast<size_t>(id) < header.NumSequences()) {
+                htsIterator_.reset(
+                    bam_itr_queryi(htsIndex_.get(), id, interval.Start(), interval.Stop()));
+            }
+        }
+
+        if (!htsIterator_)
+            throw std::runtime_error{"could not create iterator for requested region"};
+    }
+
+    void LoadIndex(const std::string& fn)
+    {
+        htsIndex_.reset(bam_index_load(fn.c_str()));
+        if (!htsIndex_) throw std::runtime_error{"could not load BAI index data"};
+    }
+
+    int ReadRawData(BGZF* bgzf, bam1_t* b)
+    {
+        assert(htsIterator_.get());
+        return hts_itr_next(bgzf, htsIterator_.get(), b, nullptr);
+    }
+
+public:
+    GenomicInterval interval_;
+    std::unique_ptr<hts_idx_t, internal::HtslibIndexDeleter> htsIndex_;
+    std::unique_ptr<hts_itr_t, internal::HtslibIteratorDeleter> htsIterator_;
+};
+
+}  // namespace internal
+
+BaiIndexedBamReader::BaiIndexedBamReader(const GenomicInterval& interval, std::string filename)
+    : BaiIndexedBamReader{interval, BamFile{std::move(filename)}}
+{
+}
+
+BaiIndexedBamReader::BaiIndexedBamReader(const GenomicInterval& interval, BamFile bamFile)
+    : BamReader{std::move(bamFile)}
+    , d_{std::make_unique<internal::BaiIndexedBamReaderPrivate>(File(), interval)}
+{
+}
+
+const GenomicInterval& BaiIndexedBamReader::Interval() const
+{
+    assert(d_);
+    return d_->interval_;
+}
+
+int BaiIndexedBamReader::ReadRawData(BGZF* bgzf, bam1_t* b)
+{
+    assert(d_);
+    return d_->ReadRawData(bgzf, b);
+}
+
+BaiIndexedBamReader& BaiIndexedBamReader::Interval(const GenomicInterval& interval)
+{
+    assert(d_);
+    d_->Interval(Header(), interval);
+    return *this;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamFile.cpp b/src/BamFile.cpp

new file mode 100644 (file)

index 0000000..9141f0e
--- /dev/null
+++ b/src/BamFile.cpp
@@ -0,0 +1,207 @@
+// File Description
+/// \file BamFile.cpp
+/// \brief Implements the BamFile class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamFile.h"
+
+#include <sys/stat.h>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <sstream>
+
+#include <htslib/sam.h>
+
+#include "Autovalidate.h"
+#include "FileUtils.h"
+#include "MemoryUtils.h"
+#include "pbbam/MakeUnique.h"
+#include "pbbam/PbiFile.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+class BamFilePrivate
+{
+public:
+    BamFilePrivate(std::string fn) : filename_{std::move(fn)}, firstAlignmentOffset_{-1}
+    {
+        // ensure we've updated htslib verbosity with requested verbosity here
+        hts_verbose = (PacBio::BAM::HtslibVerbosity == -1 ? 0 : PacBio::BAM::HtslibVerbosity);
+
+        // attempt open
+        auto f = RawOpen();
+
+#if !defined(PBBAM_NO_CHECK_EOF) || PBBAM_AUTOVALIDATE
+        // sanity check on file
+        const auto eofCheck = bgzf_check_EOF(f->fp.bgzf);
+        if (eofCheck <= 0) {
+            // 1:  EOF present & correct
+            // 2:  not seekable (e.g. reading from stdin)
+            // 0:  EOF absent
+            // -1: some other error
+            std::ostringstream e;
+            if (eofCheck == 0)
+                e << fn << " : is missing EOF block\n";
+            else
+                e << fn << " : unknown error while checking EOF block\n";
+            throw std::runtime_error{e.str()};
+        }
+#endif
+
+        // attempt fetch header
+        std::unique_ptr<bam_hdr_t, internal::HtslibHeaderDeleter> hdr(sam_hdr_read(f.get()));
+        header_ = internal::BamHeaderMemory::FromRawData(hdr.get());
+
+        // cache first alignment offset
+        firstAlignmentOffset_ = bgzf_tell(f->fp.bgzf);
+    }
+
+    std::unique_ptr<BamFilePrivate> DeepCopy()
+    {
+        return std::make_unique<BamFilePrivate>(filename_);
+    }
+
+    bool HasEOF() const
+    {
+        // streamed input is unknown, since it's not random-accessible
+        if (filename_ == "-") return false;
+
+        // attempt open
+        auto f = RawOpen();
+        return RawEOFCheck(f) == 1;
+    }
+
+    int RawEOFCheck(const std::unique_ptr<samFile, internal::HtslibFileDeleter>& f) const
+    {
+        assert(f);
+        assert(f->fp.bgzf);
+        return bgzf_check_EOF(f->fp.bgzf);
+    }
+
+    std::unique_ptr<samFile, internal::HtslibFileDeleter> RawOpen() const
+    {
+        std::unique_ptr<samFile, internal::HtslibFileDeleter> f(sam_open(filename_.c_str(), "rb"));
+        if (!f || !f->fp.bgzf) throw std::runtime_error{"could not open BAM file: " + filename_};
+        if (f->format.format != bam) throw std::runtime_error{"expected BAM, unknown format"};
+        return f;
+    }
+
+public:
+    std::string filename_;
+    BamHeader header_;
+    int64_t firstAlignmentOffset_;
+};
+
+}  // namespace internal
+
+// ------------------------
+// BamFile implementation
+// ------------------------
+
+BamFile::BamFile(std::string filename)
+    : d_{std::make_unique<internal::BamFilePrivate>(std::move(filename))}
+{
+}
+
+BamFile::BamFile(const BamFile& other) : d_{other.d_->DeepCopy()} {}
+
+BamFile::BamFile(BamFile&& other) : d_{std::move(other.d_)} {}
+
+BamFile& BamFile::operator=(const BamFile& other)
+{
+    if (this != &other) {
+        d_ = other.d_->DeepCopy();
+    }
+    return *this;
+}
+
+BamFile& BamFile::operator=(BamFile&& other)
+{
+    if (this != &other) {
+        d_ = std::move(other.d_);
+    }
+    return *this;
+}
+
+BamFile::~BamFile() {}
+
+void BamFile::CreatePacBioIndex() const { PbiFile::CreateFrom(*this); }
+
+void BamFile::CreateStandardIndex() const
+{
+    if (bam_index_build(d_->filename_.c_str(), 0) != 0)
+        throw std::runtime_error{"could not build BAI index"};
+}
+
+void BamFile::EnsurePacBioIndexExists() const
+{
+    if (!PacBioIndexExists()) CreatePacBioIndex();
+}
+
+void BamFile::EnsureStandardIndexExists() const
+{
+    if (!StandardIndexExists()) CreateStandardIndex();
+}
+
+const std::string& BamFile::Filename() const { return d_->filename_; }
+
+int64_t BamFile::FirstAlignmentOffset() const { return d_->firstAlignmentOffset_; }
+
+bool BamFile::HasEOF() const { return d_->HasEOF(); }
+
+bool BamFile::HasReference(const std::string& name) const { return d_->header_.HasSequence(name); }
+
+const BamHeader& BamFile::Header() const { return d_->header_; }
+
+bool BamFile::IsPacBioBAM() const { return !d_->header_.PacBioBamVersion().empty(); }
+
+bool BamFile::PacBioIndexExists() const
+{
+    return internal::FileUtils::Exists(PacBioIndexFilename());
+}
+
+std::string BamFile::PacBioIndexFilename() const { return d_->filename_ + ".pbi"; }
+
+bool BamFile::PacBioIndexIsNewer() const
+{
+    const auto bamTimestamp = internal::FileUtils::LastModified(Filename());
+    const auto pbiTimestamp = internal::FileUtils::LastModified(PacBioIndexFilename());
+    return bamTimestamp <= pbiTimestamp;
+}
+
+int BamFile::ReferenceId(const std::string& name) const { return d_->header_.SequenceId(name); }
+
+uint32_t BamFile::ReferenceLength(const std::string& name) const
+{
+    return ReferenceLength(ReferenceId(name));
+}
+
+uint32_t BamFile::ReferenceLength(const int id) const
+{
+    return std::stoul(d_->header_.SequenceLength(id));
+}
+
+std::string BamFile::ReferenceName(const int id) const { return d_->header_.SequenceName(id); }
+
+bool BamFile::StandardIndexExists() const
+{
+    return internal::FileUtils::Exists(StandardIndexFilename());
+}
+
+std::string BamFile::StandardIndexFilename() const { return d_->filename_ + ".bai"; }
+
+bool BamFile::StandardIndexIsNewer() const
+{
+    const auto bamTimestamp = internal::FileUtils::LastModified(Filename());
+    const auto baiTimestamp = internal::FileUtils::LastModified(StandardIndexFilename());
+    return bamTimestamp <= baiTimestamp;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamHeader.cpp b/src/BamHeader.cpp

new file mode 100644 (file)

index 0000000..e93b571
--- /dev/null
+++ b/src/BamHeader.cpp
@@ -0,0 +1,333 @@
+// File Description
+/// \file BamHeader.cpp
+/// \brief Implements the BamHeader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamHeader.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <set>
+#include <sstream>
+
+#include <htslib/hts.h>
+
+#include "StringUtils.h"
+#include "Version.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Three-character prefixes used to classify SAM header lines
+// (compared against the first three characters of each line).
+static const std::string BamHeaderPrefixHD{"@HD"};
+static const std::string BamHeaderPrefixSQ{"@SQ"};
+static const std::string BamHeaderPrefixRG{"@RG"};
+static const std::string BamHeaderPrefixPG{"@PG"};
+static const std::string BamHeaderPrefixCO{"@CO"};
+
+// Two-character tag names of the @HD fields read and written in this file.
+static const std::string BamHeaderTokenVN{"VN"};
+static const std::string BamHeaderTokenSO{"SO"};
+static const std::string BamHeaderTokenpb{"pb"};
+
+// Two sort orders are compatible only when the strings are identical.
+static inline bool CheckSortOrder(const std::string& lhs, const std::string& rhs)
+{
+    const bool identical = (lhs.compare(rhs) == 0);
+    return identical;
+}
+
+// Both version strings must parse to a Version that is at least
+// Version::Minimum. The right-hand string is only parsed when the
+// left-hand one passes.
+static inline bool CheckPbVersion(const std::string& lhs, const std::string& rhs)
+{
+    const bool lhsSupported = (Version{lhs} >= Version::Minimum);
+    if (!lhsSupported) return false;
+    return Version{rhs} >= Version::Minimum;
+}
+
+static inline bool CheckSequences(const std::string& sortOrder,
+                                  const std::vector<SequenceInfo>& lhs,
+                                  const std::vector<SequenceInfo>& rhs)
+{
+    return ((sortOrder == "coordinate") ? lhs == rhs : true);
+}
+
+static void EnsureCanMerge(const BamHeader& lhs, const BamHeader& rhs)
+{
+    // check compatibility
+    const auto sortOrderOk = CheckSortOrder(lhs.SortOrder(), rhs.SortOrder());
+    const auto pbVersionOk = CheckPbVersion(lhs.PacBioBamVersion(), rhs.PacBioBamVersion());
+    const auto sequencesOk = CheckSequences(lhs.SortOrder(), lhs.Sequences(), rhs.Sequences());
+    if (sortOrderOk && pbVersionOk && sequencesOk) return;
+
+    // if any checks failed, format error message & throw
+    std::ostringstream e;
+    e << "could not merge BAM headers:\n";
+
+    if (!sortOrderOk) {
+        e << "  mismatched sort orders (@HD:SO) : (" << lhs.SortOrder() << ", " << rhs.SortOrder()
+          << ")\n";
+    }
+
+    if (!pbVersionOk) {
+        e << "  incompatible PacBio BAM versions (@HD:pb) : (" << lhs.PacBioBamVersion() << ", "
+          << rhs.PacBioBamVersion() << ")\n";
+    }
+
+    if (!sequencesOk) e << "  mismatched sequence lists (@SQ entries)\n";
+
+    throw std::runtime_error{e.str()};
+}
+
+}  // namespace internal
+
+BamHeader::BamHeader(const std::string& samHeaderText)
+    : d_{std::make_shared<internal::BamHeaderPrivate>()}
+{
+    std::istringstream s{samHeaderText};
+    std::string line;
+    std::string firstToken;
+    while (std::getline(s, line)) {
+
+        // skip if line is not long enough to contain true values
+        if (line.length() < 5) continue;
+
+        // determine token at beginning of line
+        firstToken = line.substr(0, 3);
+
+        if (firstToken == internal::BamHeaderPrefixHD) {
+
+            // pop off '@HD\t', then split HD lines into tokens
+            const auto tokens = internal::Split(line.substr(4), '\t');
+            for (const auto& token : tokens) {
+                const auto tokenTag = token.substr(0, 2);
+                const auto tokenValue = token.substr(3);
+
+                // set header contents
+                if (tokenTag == internal::BamHeaderTokenVN)
+                    Version(tokenValue);
+                else if (tokenTag == internal::BamHeaderTokenSO)
+                    SortOrder(tokenValue);
+                else if (tokenTag == internal::BamHeaderTokenpb)
+                    PacBioBamVersion(tokenValue);
+            }
+
+            // check for required tags
+            if (Version().empty()) Version(std::string{hts_version()});
+        }
+
+        else if (firstToken == internal::BamHeaderPrefixSQ)
+            AddSequence(SequenceInfo::FromSam(line));
+
+        else if (firstToken == internal::BamHeaderPrefixRG)
+            AddReadGroup(ReadGroupInfo::FromSam(line));
+
+        else if (firstToken == internal::BamHeaderPrefixPG)
+            AddProgram(ProgramInfo::FromSam(line));
+
+        else if (firstToken == internal::BamHeaderPrefixCO)
+            AddComment(line.substr(4));
+    }
+}
+
+BamHeader& BamHeader::operator+=(const BamHeader& other)
+{
+    internal::EnsureCanMerge(*this, other);
+
+    // merge read groups
+    for (const auto& rg : other.ReadGroups()) {
+        if (!HasReadGroup(rg.Id())) AddReadGroup(rg);
+    }
+
+    // merge programs
+    for (const auto& pg : other.Programs()) {
+        if (!HasProgram(pg.Id())) AddProgram(pg);
+    }
+
+    // merge comments
+    for (const auto& comment : other.Comments())
+        AddComment(comment);
+
+    return *this;
+}
+
+// Appends 'sequence' to the sequence list and records its index in the
+// name-to-index lookup (an existing lookup entry for the same name is
+// overwritten).
+BamHeader& BamHeader::AddSequence(SequenceInfo sequence)
+{
+    // grab the name before 'sequence' is moved into storage
+    const std::string sequenceName = sequence.Name();
+
+    auto& storedSequences = d_->sequences_;
+    storedSequences.push_back(std::move(sequence));
+
+    const auto newIndex = storedSequences.size() - 1;
+    d_->sequenceIdLookup_[sequenceName] = newIndex;
+    return *this;
+}
+
+// Empties both the sequence list and its name-to-index lookup.
+BamHeader& BamHeader::ClearSequences()
+{
+    auto& priv = *d_;
+    priv.sequences_.clear();
+    priv.sequenceIdLookup_.clear();
+    return *this;
+}
+
+// Returns a new BamHeader whose private data is a field-by-field copy of this
+// header's data: the result gets its own private object, so the copied fields
+// are not shared with this header.
+BamHeader BamHeader::DeepCopy() const
+{
+    BamHeader result;
+    result.d_->version_ = d_->version_;
+    result.d_->pacbioBamVersion_ = d_->pacbioBamVersion_;
+    result.d_->sortOrder_ = d_->sortOrder_;
+    result.d_->headerLineCustom_ = d_->headerLineCustom_;
+    result.d_->readGroups_ = d_->readGroups_;
+    result.d_->programs_ = d_->programs_;
+    result.d_->comments_ = d_->comments_;
+    result.d_->sequences_ = d_->sequences_;
+    result.d_->sequenceIdLookup_ = d_->sequenceIdLookup_;
+    return result;
+}
+
+// Sets the PacBio BAM version string.
+//
+// Throws std::runtime_error if the parsed version is older than
+// internal::Version::Minimum.
+//
+// NOTE(review): the string is stored before it is validated, so the new value
+// remains set on this header even when the exception is thrown.
+BamHeader& BamHeader::PacBioBamVersion(const std::string& version)
+{
+    d_->pacbioBamVersion_ = version;
+    const internal::Version fileVersion{version};
+    if (fileVersion < internal::Version::Minimum) {
+        throw std::runtime_error{"invalid PacBio BAM version number (" + fileVersion.ToString() +
+                                 ") is older than the minimum supported version (" +
+                                 internal::Version::Minimum.ToString() + ")"};
+    }
+    return *this;
+}
+
+// Returns a copy of the program entry stored under 'id'.
+// Throws std::runtime_error if no such entry exists.
+ProgramInfo BamHeader::Program(const std::string& id) const
+{
+    const auto& programs = d_->programs_;
+    const auto found = programs.find(id);
+    if (found != programs.cend()) return found->second;
+    throw std::runtime_error{"program ID not found"};
+}
+
+// Collects the keys (IDs) of all stored program entries, in container order.
+std::vector<std::string> BamHeader::ProgramIds() const
+{
+    const auto& programs = d_->programs_;
+
+    std::vector<std::string> ids;
+    ids.reserve(programs.size());
+    for (const auto& entry : programs) {
+        ids.push_back(entry.first);
+    }
+    return ids;
+}
+
+// Collects copies of all stored program entries, in container order.
+std::vector<ProgramInfo> BamHeader::Programs() const
+{
+    const auto& programs = d_->programs_;
+
+    std::vector<ProgramInfo> infos;
+    infos.reserve(programs.size());
+    for (const auto& entry : programs) {
+        infos.push_back(entry.second);
+    }
+    return infos;
+}
+
+// Replaces all stored program entries with 'programs', keyed by each entry's
+// Id(). If several entries share an ID, the last one wins.
+BamHeader& BamHeader::Programs(std::vector<ProgramInfo> programs)
+{
+    d_->programs_.clear();
+
+    // 'programs' is our own by-value copy, so its elements can be moved out.
+    // Iterating by non-const reference makes std::move() an actual move
+    // (through a const reference it would silently copy), matching
+    // ReadGroups(std::vector<ReadGroupInfo>).
+    for (auto&& pg : programs)
+        d_->programs_[pg.Id()] = std::move(pg);
+    return *this;
+}
+
+// Returns a copy of the read group entry stored under 'id'.
+// Throws std::runtime_error if no such entry exists.
+ReadGroupInfo BamHeader::ReadGroup(const std::string& id) const
+{
+    const auto& readGroups = d_->readGroups_;
+    const auto found = readGroups.find(id);
+    if (found != readGroups.cend()) return found->second;
+    throw std::runtime_error{"read group ID not found"};
+}
+
+// Collects the keys (IDs) of all stored read groups, in container order.
+std::vector<std::string> BamHeader::ReadGroupIds() const
+{
+    const auto& readGroups = d_->readGroups_;
+
+    std::vector<std::string> ids;
+    ids.reserve(readGroups.size());
+    for (const auto& entry : readGroups) {
+        ids.push_back(entry.first);
+    }
+    return ids;
+}
+
+// Collects copies of all stored read groups, in container order.
+std::vector<ReadGroupInfo> BamHeader::ReadGroups() const
+{
+    const auto& readGroups = d_->readGroups_;
+
+    std::vector<ReadGroupInfo> infos;
+    infos.reserve(readGroups.size());
+    for (const auto& entry : readGroups) {
+        infos.push_back(entry.second);
+    }
+    return infos;
+}
+
+// Replaces all stored read groups with 'readGroups', keyed by each entry's
+// Id(). If several entries share an ID, the last one wins.
+BamHeader& BamHeader::ReadGroups(std::vector<ReadGroupInfo> readGroups)
+{
+    auto& stored = d_->readGroups_;
+    stored.clear();
+
+    // the vector is a by-value parameter, so its elements may be moved out
+    for (auto& readGroup : readGroups) {
+        stored[readGroup.Id()] = std::move(readGroup);
+    }
+    return *this;
+}
+
+// Returns a copy of the sequence entry named 'name', or a default-constructed
+// SequenceInfo when the name is unknown.
+SequenceInfo BamHeader::Sequence(const std::string& name) const
+{
+    // TODO: SequenceId(name) throws if not found, should we do so here as well?
+
+    const auto& lookup = d_->sequenceIdLookup_;
+    const auto found = lookup.find(name);
+    if (found == lookup.cend()) return SequenceInfo();
+
+    // the lookup stores the entry's position within sequences_
+    const auto position = found->second;
+    assert(position >= 0 && static_cast<size_t>(position) < d_->sequences_.size());
+    return d_->sequences_.at(position);
+}
+
+// Returns the index recorded for the sequence named 'name'.
+// Throws std::runtime_error if the name is unknown.
+int32_t BamHeader::SequenceId(const std::string& name) const
+{
+    const auto& lookup = d_->sequenceIdLookup_;
+    const auto found = lookup.find(name);
+    if (found != lookup.cend()) return found->second;
+    throw std::runtime_error{"sequence not found"};
+}
+
+// Collects the names of all stored sequences, in stored order.
+std::vector<std::string> BamHeader::SequenceNames() const
+{
+    const auto& sequences = d_->sequences_;
+
+    std::vector<std::string> names;
+    names.reserve(sequences.size());
+    for (const auto& sequence : sequences) {
+        names.push_back(sequence.Name());
+    }
+    return names;
+}
+
+// Replaces all stored sequences with 'sequences', in the given order.
+BamHeader& BamHeader::Sequences(std::vector<SequenceInfo> sequences)
+{
+    // Clear the name-to-index lookup together with the list itself. Clearing
+    // only the list would leave stale lookup entries for names that are not
+    // part of the new list, pointing at indices that no longer match.
+    ClearSequences();
+
+    // AddSequence() appends to the list and re-registers each name
+    for (auto&& seq : sequences)
+        AddSequence(std::move(seq));
+    return *this;
+}
+
+std::string BamHeader::ToSam() const
+{
+    // init stream
+    std::ostringstream out;
+
+    // @HD
+    const auto outputVersion = (d_->version_.empty() ? std::string{hts_version()} : d_->version_);
+    const auto outputSortOrder = (d_->sortOrder_.empty() ? std::string{"unknown"} : d_->sortOrder_);
+    const auto outputPbBamVersion =
+        (d_->pacbioBamVersion_.empty() ? internal::Version::Current.ToString()
+                                       : d_->pacbioBamVersion_);
+
+    out << internal::BamHeaderPrefixHD
+        << internal::MakeSamTag(internal::BamHeaderTokenVN, outputVersion)
+        << internal::MakeSamTag(internal::BamHeaderTokenSO, outputSortOrder)
+        << internal::MakeSamTag(internal::BamHeaderTokenpb, outputPbBamVersion) << '\n';
+
+    // @SQ
+    for (const auto& seq : d_->sequences_)
+        out << seq.ToSam() << '\n';
+
+    // @RG
+    for (const auto& rgIter : d_->readGroups_)
+        out << rgIter.second.ToSam() << '\n';
+
+    // @PG
+    for (const auto& progIter : d_->programs_)
+        out << progIter.second.ToSam() << '\n';
+
+    // @CO
+    for (const auto& comment : d_->comments_)
+        out << internal::BamHeaderPrefixCO << '\t' << comment << '\n';
+
+    // return result
+    return out.str();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamReader.cpp b/src/BamReader.cpp

new file mode 100644 (file)

index 0000000..344774c
--- /dev/null
+++ b/src/BamReader.cpp
@@ -0,0 +1,136 @@
+// File Description
+/// \file BamReader.cpp
+/// \brief Implements the BamReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamReader.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <sstream>
+
+#include <htslib/bgzf.h>
+#include <htslib/hfile.h>
+#include <htslib/hts.h>
+
+#include "Autovalidate.h"
+#include "MemoryUtils.h"
+#include "pbbam/MakeUnique.h"
+#include "pbbam/Validator.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Private state of BamReader: the BamFile being read and the htslib file
+// handle opened on it.
+struct BamReaderPrivate
+{
+    // Takes ownership of 'bamFile' and opens it right away.
+    // 'explicit' prevents an accidental implicit BamFile -> BamReaderPrivate
+    // conversion.
+    // Throws std::runtime_error if the file cannot be opened.
+    explicit BamReaderPrivate(BamFile bamFile) : bamFile_{std::move(bamFile)} { DoOpen(); }
+
+    // Opens bamFile_'s filename with sam_open() in "rb" mode and stores the
+    // handle in htsFile_ (replacing any previous handle).
+    // Throws std::runtime_error if sam_open() returns null.
+    void DoOpen()
+    {
+        // fetch file pointer
+        htsFile_.reset(sam_open(bamFile_.Filename().c_str(), "rb"));
+        if (!htsFile_) throw std::runtime_error{"could not open BAM file for reading"};
+    }
+
+    // htslib file handle; released through HtslibFileDeleter
+    std::unique_ptr<samFile, internal::HtslibFileDeleter> htsFile_;
+    BamFile bamFile_;
+};
+
+}  // namespace internal
+
+// Delegating constructor: wraps the filename in a BamFile and forwards to
+// BamReader(BamFile).
+BamReader::BamReader(std::string fn) : BamReader{BamFile{std::move(fn)}} {}
+
+// Creates the private implementation (which opens the file in its
+// constructor), then seeks to the file's first alignment offset so that
+// reading starts after the header.
+// Exceptions from opening or seeking propagate to the caller.
+BamReader::BamReader(BamFile bamFile)
+    : d_{std::make_unique<internal::BamReaderPrivate>(std::move(bamFile))}
+{
+    // skip header
+    VirtualSeek(d_->bamFile_.FirstAlignmentOffset());
+}
+
+// Defined in this file, where internal::BamReaderPrivate is a complete type.
+BamReader::~BamReader() = default;
+
+// Returns the BGZF handle of the open htslib file.
+// The private implementation, the htslib file handle, and the BGZF pointer
+// are asserted to be non-null.
+BGZF* BamReader::Bgzf() const
+{
+    assert(d_);
+    assert(d_->htsFile_);
+    assert(d_->htsFile_->fp.bgzf);
+    return d_->htsFile_->fp.bgzf;
+}
+
+// Returns the BamFile this reader was constructed with.
+const BamFile& BamReader::File() const
+{
+    assert(d_);
+    return d_->bamFile_;
+}
+
+// Returns the filename of the underlying BamFile.
+const std::string& BamReader::Filename() const
+{
+    assert(d_);
+    return d_->bamFile_.Filename();
+}
+
+// Returns the header of the underlying BamFile.
+const BamHeader& BamReader::Header() const
+{
+    assert(d_);
+    return d_->bamFile_.Header();
+}
+
+bool BamReader::GetNext(BamRecord& record)
+{
+    assert(Bgzf());
+    assert(internal::BamRecordMemory::GetRawData(record).get());
+
+    const auto result = ReadRawData(Bgzf(), internal::BamRecordMemory::GetRawData(record).get());
+
+    // success
+    if (result >= 0) {
+        internal::BamRecordMemory::UpdateRecordTags(record);
+        record.header_ = Header();
+        record.ResetCachedPositions();
+
+#if PBBAM_AUTOVALIDATE
+        Validator::Validate(record);
+#endif
+        return true;
+    }
+
+    // EOF or end-of-data range (not an error)
+    else if (result == -1)
+        return false;
+
+    // error corrupted file
+    else {
+        std::ostringstream msg;
+        msg << "corrupted BAM file: ";
+        if (result == -2)
+            msg << "probably truncated";
+        else if (result == -3)
+            msg << "could not read BAM record's' core data";
+        else if (result == -4)
+            msg << "could not read BAM record's' variable-length data";
+        else
+            msg << "unknown reason " + std::to_string(result);
+        msg << " (" << Filename() << ')';
+        throw std::runtime_error{msg.str()};
+    }
+}
+
+// Reads one raw record via htslib's bam_read1() and returns its result code
+// (interpreted by GetNext()).
+int BamReader::ReadRawData(BGZF* bgzf, bam1_t* b) { return bam_read1(bgzf, b); }
+
+// Seeks the BGZF handle to 'virtualOffset' (bgzf_seek with SEEK_SET).
+// Throws std::runtime_error if bgzf_seek() returns non-zero.
+void BamReader::VirtualSeek(int64_t virtualOffset)
+{
+    const auto seekStatus = bgzf_seek(Bgzf(), virtualOffset, SEEK_SET);
+    if (seekStatus == 0) return;
+    throw std::runtime_error{"Failed to seek in BAM file"};
+}
+
+// Returns the current position of the BGZF handle, as reported by bgzf_tell().
+int64_t BamReader::VirtualTell() const { return bgzf_tell(Bgzf()); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamRecord.cpp b/src/BamRecord.cpp

new file mode 100644 (file)

index 0000000..0f37281
--- /dev/null
+++ b/src/BamRecord.cpp
@@ -0,0 +1,2162 @@
+// File Description
+/// \file BamRecord.cpp
+/// \brief Implements the BamRecord class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamRecord.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <stdexcept>
+
+#include <htslib/sam.h>
+#include <boost/numeric/conversion/cast.hpp>
+
+#include "BamRecordTags.h"
+#include "MemoryUtils.h"
+#include "Pulse2BaseCache.h"
+#include "SequenceUtils.h"
+#include "pbbam/MakeUnique.h"
+#include "pbbam/ZmwTypeMap.h"
+#include "pbbam/virtual/VirtualRegionTypeMap.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// record type names
+// (string labels for record types; NameToType() in this file compares against
+// all of them except recordTypeName_Unknown)
+static const std::string recordTypeName_ZMW{"ZMW"};
+static const std::string recordTypeName_Polymerase{"POLYMERASE"};
+static const std::string recordTypeName_HqRegion{"HQREGION"};
+static const std::string recordTypeName_Subread{"SUBREAD"};
+static const std::string recordTypeName_CCS{"CCS"};
+static const std::string recordTypeName_Scrap{"SCRAP"};
+static const std::string recordTypeName_Transcript{"TRANSCRIPT"};
+static const std::string recordTypeName_Unknown{"UNKNOWN"};
+
+// Extracts the hole number from a '/'-separated record name.
+//
+// Accepted shapes:
+//   "transcript/<holeNumber>"            (exactly 2 tokens)
+//   "<token>/<holeNumber>/<token>"       (exactly 3 tokens)
+// The hole number is always the second token, parsed with std::stoi.
+//
+// Throws std::runtime_error for names that yield no tokens or the wrong
+// number of tokens; std::stoi's own exceptions propagate for a non-numeric
+// hole number.
+static int32_t HoleNumberFromName(const std::string& fullName)
+{
+    const auto mainTokens = Split(fullName, '/');
+
+    // Guard the first-token access below: a name that splits into nothing is
+    // reported as malformed, like every other bad name, rather than surfacing
+    // as std::out_of_range from at(0).
+    if (mainTokens.empty()) throw std::runtime_error{"malformed record name"};
+
+    if (mainTokens.front() == "transcript") {
+        if (mainTokens.size() != 2) throw std::runtime_error{"malformed transcript record name"};
+    } else {
+        if (mainTokens.size() != 3) throw std::runtime_error{"malformed record name"};
+    }
+
+    // both name shapes carry the hole number in the second token
+    return std::stoi(mainTokens.at(1));
+}
+
+// Extracts the query end from a record name with three '/'-separated tokens,
+// whose third token has two '_'-separated parts; the end is the second part.
+// Throws std::runtime_error if either split has the wrong number of parts.
+static Position QueryEndFromName(const std::string& fullName)
+{
+    const auto nameParts = Split(fullName, '/');
+    if (nameParts.size() != 3) throw std::runtime_error{"malformed record name"};
+
+    const auto& queryRange = nameParts.at(2);
+    const auto rangeParts = Split(queryRange, '_');
+    if (rangeParts.size() != 2) throw std::runtime_error{"malformed record name"};
+
+    return std::stoi(rangeParts.at(1));
+}
+
+// Extracts the query start from a record name with three '/'-separated
+// tokens, whose third token has two '_'-separated parts; the start is the
+// first part.
+// Throws std::runtime_error if either split has the wrong number of parts.
+static Position QueryStartFromName(const std::string& fullName)
+{
+    const auto nameParts = Split(fullName, '/');
+    if (nameParts.size() != 3) throw std::runtime_error{"malformed record name"};
+
+    const auto& queryRange = nameParts.at(2);
+    const auto rangeParts = Split(queryRange, '_');
+    if (rangeParts.size() != 2) throw std::runtime_error{"malformed record name"};
+
+    return std::stoi(rangeParts.at(0));
+}
+
+// Shorthand for BamRecordTags::LabelFor(tag).
+static inline std::string Label(const BamRecordTag tag) { return BamRecordTags::LabelFor(tag); }
+
+// Stores 'value' under 'tag' on 'impl': adds the tag when absent, edits it
+// when already present. Returns 'impl'.
+static BamRecordImpl* CreateOrEdit(const BamRecordTag tag, const Tag& value, BamRecordImpl* impl)
+{
+    const bool alreadyPresent = impl->HasTag(tag);
+    if (!alreadyPresent) {
+        impl->AddTag(tag, value);
+    } else {
+        impl->EditTag(tag, value);
+    }
+    return impl;
+}
+
+// Computes {startOffset, endOffset} for a record from the clipping operations
+// at the two ends of its CIGAR.
+//
+// startOffset begins at 0 and grows by the length of each leading soft clip;
+// endOffset begins at seqLength and shrinks by the length of each trailing
+// soft clip. Scanning from either end stops at the first operation that is
+// neither a hard clip nor a soft clip.
+//
+// An offset is set to -1 when a hard clip is met after that offset has
+// already been moved to a value other than 0 or seqLength (callers treat -1
+// as "could not determine"; see CalculateAlignedPositions()).
+//
+// With no CIGAR operations the result is {0, seqLength}.
+static std::pair<int32_t, int32_t> AlignedOffsets(const BamRecord& record, const int seqLength)
+{
+    int32_t startOffset = 0;
+    int32_t endOffset = seqLength;
+
+    const auto b = internal::BamRecordMemory::GetRawData(record);
+    uint32_t* cigarData = bam_get_cigar(b.get());
+    const size_t numCigarOps = b->core.n_cigar;
+    if (numCigarOps > 0) {
+
+        // start offset
+        for (size_t i = 0; i < numCigarOps; ++i) {
+            const auto type = static_cast<CigarOperationType>(bam_cigar_op(cigarData[i]));
+            if (type == CigarOperationType::HARD_CLIP) {
+                // a hard clip is skipped while the offset is untouched (0) or
+                // equals seqLength; otherwise it marks the offset invalid
+                if (startOffset != 0 && startOffset != seqLength) {
+                    startOffset = -1;
+                    break;
+                }
+            } else if (type == CigarOperationType::SOFT_CLIP)
+                startOffset += bam_cigar_oplen(cigarData[i]);
+            else
+                break;
+        }
+
+        // end offset
+        // (signed index, so the loop can terminate after processing op 0)
+        for (int i = numCigarOps - 1; i >= 0; --i) {
+            const auto type = static_cast<CigarOperationType>(bam_cigar_op(cigarData[i]));
+            if (type == CigarOperationType::HARD_CLIP) {
+                if (endOffset != 0 && endOffset != seqLength) {
+                    endOffset = -1;
+                    break;
+                }
+            } else if (type == CigarOperationType::SOFT_CLIP)
+                endOffset -= bam_cigar_oplen(cigarData[i]);
+            else
+                break;
+        }
+
+        // an end offset that was reduced all the way to 0 is reset to seqLength
+        if (endOffset == 0) endOffset = seqLength;
+    }
+    return {startOffset, endOffset};
+}
+
+// Returns a copy of the 'len' elements of 'input' starting at index 'pos'.
+// An empty input yields an empty result; otherwise no bounds checking is
+// performed on 'pos' and 'len'.
+template <typename T>
+T Clip(const T& input, const size_t pos, const size_t len)
+{
+    if (input.empty()) return {};
+
+    const auto first = input.cbegin() + pos;
+    const auto last = first + len;
+    return T{first, last};
+}
+
+// Pulse-aware variant of Clip(): selects a sub-range of 'input' whose
+// boundaries are found by stepping through 'p2bCache' rather than by plain
+// element index.
+//
+// 'pos' and 'len' count steps of p2bCache->FindNext(); presumably each step
+// advances to the next basecalled pulse - TODO confirm against
+// Pulse2BaseCache.
+//
+// An empty input yields an empty result. 'p2bCache' must not be null
+// (asserted).
+template <typename T>
+T ClipPulse(const T& input, internal::Pulse2BaseCache* p2bCache, const size_t pos, const size_t len)
+{
+    assert(p2bCache);
+    if (input.empty()) return {};
+
+    // find start
+    // (begin at FindFirst(), then advance 'pos' times)
+    size_t start = p2bCache->FindFirst();
+    size_t basesSeen = 0;
+    while (basesSeen < pos) {
+        start = p2bCache->FindNext(start);
+        ++basesSeen;
+    }
+
+    // find end
+    // ('start' already counts as the first element, so advance len-1 times)
+    // NOTE(review): with len == 0 no advance happens and the returned range
+    // still holds one element - confirm whether callers can pass len == 0.
+    size_t end = start;
+    size_t seen = 1;
+    while (seen < len) {
+        end = p2bCache->FindNext(end);
+        ++seen;
+    }
+
+    // return clipped
+    // ('end' is inclusive, hence the + 1)
+    return {input.cbegin() + start, input.cbegin() + end + 1};
+}
+
+// Moves 'count' elements starting at 'first' into the range beginning at
+// 'result'. Returns the iterator one past the last element written.
+template <class InputIt, class Size, class OutputIt>
+OutputIt Move_N(InputIt first, Size count, OutputIt result)
+{
+    const InputIt last = first + count;
+    return std::move(first, last, result);
+}
+
+// Rewrites '*seq' in place according to the record's CIGAR, optionally
+// removing soft-clipped positions and/or inserting gap values.
+//
+// Nothing happens unless the record is mapped and at least one of 'aligned'
+// or 'exciseSoftClips' is set.
+//
+// \param impl               record supplying IsMapped() and CigarData()
+// \param aligned            if true, deletion and padding operations are
+//                           written into the output as runs of null values
+// \param exciseSoftClips    if true, soft-clipped positions are dropped
+// \param seq                container to rewrite (must not be null)
+// \param paddingNullValue   value written for each padding position
+// \param deletionNullValue  value written for each deletion position
+template <typename F, typename N>
+static void ClipAndGapify(const BamRecordImpl& impl, const bool aligned, const bool exciseSoftClips,
+                          F* seq, N paddingNullValue, N deletionNullValue)
+{
+    assert(seq);
+
+    const bool clipOrGapRequested = aligned || exciseSoftClips;
+    if (impl.IsMapped() && clipOrGapRequested) {
+        // determine final container length
+        // (the predicate mirrors the per-operation handling in the loop below:
+        // it is true exactly for the operations that write to the output)
+        auto incrementsOutputLength = [](const CigarOperationType type, const bool isAligned,
+                                         const bool exciseSoftClipsFromAln) {
+            if (type == CigarOperationType::HARD_CLIP ||
+                type == CigarOperationType::REFERENCE_SKIP) {
+                return false;
+            } else if (type == CigarOperationType::SOFT_CLIP && exciseSoftClipsFromAln) {
+                return false;
+            } else if (!isAligned && (type == CigarOperationType::DELETION ||
+                                      type == CigarOperationType::PADDING)) {
+                return false;
+            } else
+                return true;
+        };
+
+        size_t outputLength = 0;
+        const auto cigar = impl.CigarData();
+        for (const CigarOperation& op : cigar) {
+            if (incrementsOutputLength(op.Type(), aligned, exciseSoftClips))
+                outputLength += op.Length();
+        }
+
+        // move original data to temp, prep output container size
+        F originalSeq = std::move(*seq);
+        seq->resize(outputLength);
+
+        // apply CIGAR ops
+        // srcIndex walks the original data, dstIndex walks the output
+        size_t srcIndex = 0;
+        size_t dstIndex = 0;
+        for (const CigarOperation& op : cigar) {
+            const auto opType = op.Type();
+            const auto opLength = op.Length();
+
+            // nothing to do for hard-clipped & ref-skipped positions
+            if (opType == CigarOperationType::HARD_CLIP ||
+                opType == CigarOperationType::REFERENCE_SKIP) {
+                continue;
+            }
+
+            // maybe skip soft-clipped positions
+            else if (opType == CigarOperationType::SOFT_CLIP) {
+                if (exciseSoftClips)
+                    srcIndex += opLength;
+                else {
+                    Move_N(originalSeq.begin() + srcIndex, opLength, seq->begin() + dstIndex);
+                    srcIndex += opLength;
+                    dstIndex += opLength;
+                }
+            }
+
+            // maybe add deletion/padding values
+            // (these consume output positions only; srcIndex is untouched)
+            else if (aligned && opType == CigarOperationType::DELETION) {
+                for (size_t i = 0; i < opLength; ++i)
+                    (*seq)[dstIndex++] = deletionNullValue;
+            } else if (aligned && opType == CigarOperationType::PADDING) {
+                for (size_t i = 0; i < opLength; ++i)
+                    (*seq)[dstIndex++] = paddingNullValue;
+            }
+
+            // all other CIGAR ops
+            // NOTE(review): when 'aligned' is false, deletion and padding ops
+            // also reach this branch and advance both indices, although they
+            // were not counted in outputLength - confirm this is intended.
+            else {
+                Move_N(originalSeq.begin() + srcIndex, opLength, seq->begin() + dstIndex);
+                srcIndex += opLength;
+                dstIndex += opLength;
+            }
+        }
+    }
+}
+
+// ClipAndGapify() for base strings: padding positions become '*', deletion
+// positions become '-'.
+static inline void ClipAndGapifyBases(const BamRecordImpl& impl, const bool aligned,
+                                      const bool exciseSoftClips, std::string* seq)
+{
+    ClipAndGapify<std::string, char>(impl, aligned, exciseSoftClips, seq, '*', '-');
+}
+
+static inline void ClipAndGapifyFrames(const BamRecordImpl& impl, const bool aligned,
+                                       const bool exciseSoftClips, Frames* frames)
+{
+    assert(frames);
+    std::vector<uint16_t> data{std::move(frames->Data())};
+    ClipAndGapify<std::vector<uint16_t>, uint16_t>(impl, aligned, exciseSoftClips, &data, 0, 0);
+    frames->Data(data);
+}
+
+// ClipAndGapify() for float data: padding and deletion positions become 0.0.
+static inline void ClipAndGapifyPhotons(const BamRecordImpl& impl, const bool aligned,
+                                        const bool exciseSoftClips, std::vector<float>* data)
+{
+    ClipAndGapify<std::vector<float>, float>(impl, aligned, exciseSoftClips, data, 0.0, 0.0);
+}
+
+// ClipAndGapify() for quality values: padding and deletion positions become
+// QualityValue(0).
+static inline void ClipAndGapifyQualities(const BamRecordImpl& impl, const bool aligned,
+                                          const bool exciseSoftClips, QualityValues* quals)
+{
+    ClipAndGapify<QualityValues, QualityValue>(impl, aligned, exciseSoftClips, quals,
+                                               QualityValue(0), QualityValue(0));
+}
+
+// ClipAndGapify() for uint32_t data: padding and deletion positions become 0.
+static inline void ClipAndGapifyUInts(const BamRecordImpl& impl, const bool aligned,
+                                      const bool exciseSoftClips, std::vector<uint32_t>* data)
+{
+    ClipAndGapify<std::vector<uint32_t>, uint32_t>(impl, aligned, exciseSoftClips, data, 0, 0);
+}
+
+// ClipAndGapify() for uint8_t data: padding and deletion positions become 0.
+static inline void ClipAndGapifyUInt8s(const BamRecordImpl& impl, const bool aligned,
+                                       const bool exciseSoftClips, std::vector<uint8_t>* data)
+{
+    ClipAndGapify<std::vector<uint8_t>, uint8_t>(impl, aligned, exciseSoftClips, data, 0, 0);
+}
+
+static RecordType NameToType(const std::string& name)
+{
+    if (name == recordTypeName_Subread) return RecordType::SUBREAD;
+    if (name == recordTypeName_ZMW || name == recordTypeName_Polymerase) return RecordType::ZMW;
+    if (name == recordTypeName_HqRegion) return RecordType::HQREGION;
+    if (name == recordTypeName_CCS) return RecordType::CCS;
+    if (name == recordTypeName_Scrap) return RecordType::SCRAP;
+    if (name == recordTypeName_Transcript) return RecordType::TRANSCRIPT;
+    return RecordType::UNKNOWN;
+}
+
+static void OrientBasesAsRequested(std::string* bases, Orientation current, Orientation requested,
+                                   bool isReverseStrand, bool isPulse)
+{
+    assert(bases);
+    if (current != requested && isReverseStrand) {
+        if (isPulse)
+            internal::ReverseComplementCaseSens(*bases);
+        else
+            internal::ReverseComplement(*bases);
+    }
+}
+
+// Reverses '*data' in place when the requested orientation differs from the
+// current one and the record is on the reverse strand.
+// 'data' must not be null (asserted).
+template <typename Container>
+inline void OrientTagDataAsRequested(Container* data, Orientation current, Orientation requested,
+                                     bool isReverseStrand)
+{
+    assert(data);
+
+    const bool needsFlip = (current != requested) && isReverseStrand;
+    if (!needsFlip) return;
+
+    std::reverse(data->begin(), data->end());
+}
+
+// True when bit 0x1 of htslib's bam_cigar_type() is set for this operation.
+static inline bool ConsumesQuery(const CigarOperationType type)
+{
+    return (bam_cigar_type(static_cast<int>(type)) & 0x1) != 0;
+}
+
+// True when bit 0x2 of htslib's bam_cigar_type() is set for this operation.
+static inline bool ConsumesReference(const CigarOperationType type)
+{
+    return (bam_cigar_type(static_cast<int>(type)) & 0x2) != 0;
+}
+
+}  // namespace internal
+
+// Definition of the static member BamRecord::photonFactor.
+// NOTE(review): presumably the scale factor between stored and reported
+// photon values - confirm against its uses.
+const float BamRecord::photonFactor = 10.0;
+
+// Default constructor: both cached aligned positions start out as
+// UnmappedPosition (i.e. "not yet calculated").
+BamRecord::BamRecord()
+    : alignedStart_{PacBio::BAM::UnmappedPosition}, alignedEnd_{PacBio::BAM::UnmappedPosition}
+{
+}
+
+// Constructs a record associated with 'header'; cached aligned positions
+// start out as UnmappedPosition.
+BamRecord::BamRecord(BamHeader header)
+    : header_{std::move(header)}
+    , alignedStart_{PacBio::BAM::UnmappedPosition}
+    , alignedEnd_{PacBio::BAM::UnmappedPosition}
+{
+}
+
+// Constructs a record around an existing BamRecordImpl; cached aligned
+// positions start out as UnmappedPosition.
+BamRecord::BamRecord(BamRecordImpl impl)
+    : impl_{std::move(impl)}
+    , alignedStart_{PacBio::BAM::UnmappedPosition}
+    , alignedEnd_{PacBio::BAM::UnmappedPosition}
+{
+}
+
+// Copy constructor: copies the impl, header, and cached aligned positions.
+// The pulse-to-base cache (p2bCache_) is not copied; the new record starts
+// without one.
+BamRecord::BamRecord(const BamRecord& other)
+    : impl_{other.impl_}
+    , header_{other.header_}
+    , alignedStart_{other.alignedStart_}
+    , alignedEnd_{other.alignedEnd_}
+{
+}
+
+// Move constructor: takes over all members of 'other', including its
+// pulse-to-base cache.
+BamRecord::BamRecord(BamRecord&& other)
+    : impl_{std::move(other.impl_)}
+    , header_{std::move(other.header_)}
+    , alignedStart_{std::move(other.alignedStart_)}
+    , alignedEnd_{std::move(other.alignedEnd_)}
+    , p2bCache_{std::move(other.p2bCache_)}
+{
+}
+
+// Copy assignment: copies the impl, header, and cached aligned positions.
+// This record's pulse-to-base cache is discarded rather than copied.
+// Self-assignment is a no-op.
+BamRecord& BamRecord::operator=(const BamRecord& other)
+{
+    if (this != &other) {
+        impl_ = other.impl_;
+        header_ = other.header_;
+        alignedStart_ = other.alignedStart_;
+        alignedEnd_ = other.alignedEnd_;
+        p2bCache_.reset();  // just reset, for now at least
+    }
+    return *this;
+}
+
+// Move assignment: takes over all members of 'other', including its
+// pulse-to-base cache. Self-assignment is a no-op.
+BamRecord& BamRecord::operator=(BamRecord&& other)
+{
+    if (this != &other) {
+        impl_ = std::move(other.impl_);
+        header_ = std::move(other.header_);
+        alignedStart_ = std::move(other.alignedStart_);
+        alignedEnd_ = std::move(other.alignedEnd_);
+        p2bCache_ = std::move(other.p2bCache_);
+    }
+    return *this;
+}
+
+// Out-of-line destructor; members clean up after themselves.
+BamRecord::~BamRecord() = default;
+
+// Returns the cached aligned end, calculating aligned positions first when
+// the cache still holds UnmappedPosition.
+Position BamRecord::AlignedEnd() const
+{
+    const bool notYetCalculated = (alignedEnd_ == PacBio::BAM::UnmappedPosition);
+    if (notYetCalculated) {
+        CalculateAlignedPositions();
+    }
+    return alignedEnd_;
+}
+
+// Returns the cached aligned start, calculating aligned positions first when
+// the cache still holds UnmappedPosition.
+Position BamRecord::AlignedStart() const
+{
+    const bool notYetCalculated = (alignedStart_ == PacBio::BAM::UnmappedPosition);
+    if (notYetCalculated) {
+        CalculateAlignedPositions();
+    }
+    return alignedStart_;
+}
+
+// Reports the strand from the impl's reverse-strand flag.
+Strand BamRecord::AlignedStrand() const
+{
+    if (impl_.IsReverseStrand()) return Strand::REVERSE;
+    return Strand::FORWARD;
+}
+
+// Fetches the ALT_LABEL_QV tag as quality values; all options are passed
+// through unchanged to FetchQualities().
+QualityValues BamRecord::AltLabelQV(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                    PulseBehavior pulseBehavior) const
+{
+    return FetchQualities(BamRecordTag::ALT_LABEL_QV, orientation, aligned, exciseSoftClips,
+                          pulseBehavior);
+}
+
+// Stores 'altLabelQVs', in its Fastq() string form, under the ALT_LABEL_QV
+// tag (added if absent, edited if present).
+BamRecord& BamRecord::AltLabelQV(const QualityValues& altLabelQVs)
+{
+    internal::CreateOrEdit(BamRecordTag::ALT_LABEL_QV, altLabelQVs.Fastq(), &impl_);
+    return *this;
+}
+
+// Fetches the ALT_LABEL_TAG tag as a base string; all options are passed
+// through unchanged to FetchBases().
+std::string BamRecord::AltLabelTag(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                   PulseBehavior pulseBehavior) const
+{
+    return FetchBases(BamRecordTag::ALT_LABEL_TAG, orientation, aligned, exciseSoftClips,
+                      pulseBehavior);
+}
+
+// Stores 'tags' under the ALT_LABEL_TAG tag (added if absent, edited if
+// present).
+BamRecord& BamRecord::AltLabelTag(const std::string& tags)
+{
+    internal::CreateOrEdit(BamRecordTag::ALT_LABEL_TAG, tags, &impl_);
+    return *this;
+}
+
+// First element of Barcodes(); exceptions thrown by Barcodes() propagate.
+int16_t BamRecord::BarcodeForward() const { return Barcodes().first; }
+
+// Second element of Barcodes(); exceptions thrown by Barcodes() propagate.
+int16_t BamRecord::BarcodeReverse() const { return Barcodes().second; }
+
+// Returns the BARCODE_QUALITY tag value as uint8_t, or 0 when the tag is
+// absent.
+uint8_t BamRecord::BarcodeQuality() const
+{
+    const auto label = internal::BamRecordTags::LabelFor(BamRecordTag::BARCODE_QUALITY);
+    const auto qualityTag = impl_.TagValue(label);
+
+    // ?? "missing" value for tags ?? should we consider boost::optional<T> for these kind of guys ??
+    if (qualityTag.IsNull()) {
+        return 0;
+    }
+    return qualityTag.ToUInt8();
+}
+
+// Stores 'quality' under the BARCODE_QUALITY tag (added if absent, edited if
+// present).
+BamRecord& BamRecord::BarcodeQuality(const uint8_t quality)
+{
+    internal::CreateOrEdit(BamRecordTag::BARCODE_QUALITY, quality, &impl_);
+    return *this;
+}
+
+std::pair<int16_t, int16_t> BamRecord::Barcodes() const
+{
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::BARCODES);
+    const Tag bc = impl_.TagValue(tagName);
+    if (bc.IsNull()) throw std::runtime_error{"barcode tag (bc) was requested but is missing"};
+
+    // NOTE: barcodes are still stored, per the spec, as uint16, even though
+    // we're now using them as int16_t in the API (bug 31511)
+    //
+    if (!bc.IsUInt16Array())
+        throw std::runtime_error{
+            "barcode tag (bc) is malformed: should be a uint16_t array of size==2."};
+    const auto bcArray = bc.ToUInt16Array();
+    if (bcArray.size() != 2)
+        throw std::runtime_error{
+            "barcode tag (bc) is malformed: should be a uint16_t array of size==2."};
+
+    return {boost::numeric_cast<int16_t>(bcArray[0]), boost::numeric_cast<int16_t>(bcArray[1])};
+}
+
+// Stores the two barcode IDs under the BARCODES tag as a two-element
+// uint16_t array (added if absent, edited if present).
+// The int16_t -> uint16_t conversion goes through boost::numeric_cast, which
+// throws when a value does not fit (e.g. a negative ID).
+BamRecord& BamRecord::Barcodes(const std::pair<int16_t, int16_t>& barcodeIds)
+{
+    const std::vector<uint16_t> data{boost::numeric_cast<uint16_t>(barcodeIds.first),
+                                     boost::numeric_cast<uint16_t>(barcodeIds.second)};
+    internal::CreateOrEdit(BamRecordTag::BARCODES, data, &impl_);
+    return *this;
+}
+
+// Recomputes the cached alignedStart_/alignedEnd_ values.
+//
+// The cache is reset first, and is left in that reset state when the record
+// is unmapped, when the query start or end is UnmappedPosition, or when
+// AlignedOffsets() reports -1 for either offset.
+//
+// For CCS/transcript records the query range is taken as [0, seqLength);
+// for all others it comes from QueryStart()/QueryEnd().
+void BamRecord::CalculateAlignedPositions() const
+{
+    // reset
+    ResetCachedPositions();
+
+    // skip if unmapped, or has no queryStart/End
+    if (!impl_.IsMapped()) return;
+
+    // get the query start/end
+    const auto seqLength = static_cast<int>(impl_.SequenceLength());
+    const bool isCcsOrTranscript = IsCcsOrTranscript(Type());
+    const Position qStart = isCcsOrTranscript ? 0 : QueryStart();
+    const Position qEnd = isCcsOrTranscript ? seqLength : QueryEnd();
+
+    if (qStart == PacBio::BAM::UnmappedPosition || qEnd == PacBio::BAM::UnmappedPosition) return;
+
+    // determine clipped end ranges
+    const auto alignedOffsets = internal::AlignedOffsets(*this, seqLength);
+    const auto startOffset = alignedOffsets.first;
+    const auto endOffset = alignedOffsets.second;
+    if (endOffset == -1 || startOffset == -1) return;  // TODO: handle error more??
+
+    // store aligned positions (polymerase read coordinates)
+    // On the reverse strand the two clip amounts swap roles: the trailing
+    // clip (seqLength - endOffset) is added to the query start, and the
+    // leading clip (startOffset) is subtracted from the query end.
+    if (impl_.IsReverseStrand()) {
+        alignedStart_ = qStart + (seqLength - endOffset);
+        alignedEnd_ = qEnd - startOffset;
+    } else {
+        alignedStart_ = qStart + startOffset;
+        alignedEnd_ = qEnd - (seqLength - endOffset);
+    }
+}
+
+void BamRecord::CalculatePulse2BaseCache() const
+{
+    // the cache is built at most once per record; later calls are no-ops
+    if (p2bCache_ != nullptr) return;
+
+    // the pulse-to-base mapping is derived from the pulse calls, so they are mandatory
+    const bool pulseCallsAvailable = HasPulseCall();
+    if (!pulseCallsAvailable)
+        throw std::runtime_error{"BamRecord cannot calculate pulse2base mapping without 'pc' tag."};
+
+    // fetch every pulse (native orientation, no gapping/soft-clip handling) and build the cache
+    const auto allPulseCalls =
+        FetchBases(BamRecordTag::PULSE_CALL, Orientation::NATIVE, false, false, PulseBehavior::ALL);
+    p2bCache_ = std::make_unique<internal::Pulse2BaseCache>(allPulseCalls);
+}
+
+Cigar BamRecord::CigarData(bool exciseAllClips) const
+{
+    auto isClippingOp = [](const CigarOperation& op) {
+        const auto type = op.Type();
+        return type == CigarOperationType::SOFT_CLIP || type == CigarOperationType::HARD_CLIP;
+    };
+
+    auto cigar = impl_.CigarData();
+    if (exciseAllClips) {
+        cigar.erase(std::remove_if(cigar.begin(), cigar.end(), isClippingOp), cigar.end());
+    }
+    return cigar;
+}
+
+BamRecord& BamRecord::Clip(const ClipType clipType, const Position start, const Position end)
+{
+    switch (clipType) {
+        case ClipType::CLIP_NONE:
+            return *this;
+        case ClipType::CLIP_TO_QUERY:
+            return ClipToQuery(start, end);
+        case ClipType::CLIP_TO_REFERENCE:
+            return ClipToReference(start, end);
+        default:
+            throw std::runtime_error{"unsupported clip type requested"};
+    }
+}
+
+// Clips every per-base and per-pulse tag present on this record to the window
+// described by clipFrom/clipLength, then writes the updated tag collection back.
+//
+// clipFrom   - offset of the first retained element, forwarded to internal::Clip/ClipPulse
+// clipLength - number of retained elements, forwarded to internal::Clip/ClipPulse
+//
+// All tag data are fetched in native orientation before clipping. Tags that are
+// absent from the record are left absent.
+//
+void BamRecord::ClipTags(const size_t clipFrom, const size_t clipLength)
+{
+    // frame codecs declared by the read group decide how IPD/PulseWidth are re-stored
+    const auto ipdCodec = ReadGroup().IpdCodec();
+    const auto pwCodec = ReadGroup().PulseWidthCodec();
+
+    // update BAM tags
+    // (work on a copy of the tag collection; it is committed at the end via impl_.Tags(tags))
+    TagCollection tags = impl_.Tags();
+    if (HasDeletionQV())
+        tags[internal::Label(BamRecordTag::DELETION_QV)] =
+            internal::Clip(DeletionQV(Orientation::NATIVE), clipFrom, clipLength).Fastq();
+    if (HasInsertionQV())
+        tags[internal::Label(BamRecordTag::INSERTION_QV)] =
+            internal::Clip(InsertionQV(Orientation::NATIVE), clipFrom, clipLength).Fastq();
+    if (HasMergeQV())
+        tags[internal::Label(BamRecordTag::MERGE_QV)] =
+            internal::Clip(MergeQV(Orientation::NATIVE), clipFrom, clipLength).Fastq();
+    if (HasSubstitutionQV())
+        tags[internal::Label(BamRecordTag::SUBSTITUTION_QV)] =
+            internal::Clip(SubstitutionQV(Orientation::NATIVE), clipFrom, clipLength).Fastq();
+    // IPD: RAW codec stores the frame data as-is, V1 stores the encoded form;
+    // with any other codec value the existing tag is left unmodified
+    if (HasIPD()) {
+        if (ipdCodec == FrameCodec::RAW)
+            tags[internal::Label(BamRecordTag::IPD)] =
+                internal::Clip(IPD(Orientation::NATIVE).Data(), clipFrom, clipLength);
+        else if (ipdCodec == FrameCodec::V1)
+            tags[internal::Label(BamRecordTag::IPD)] =
+                internal::Clip(IPD(Orientation::NATIVE).Encode(), clipFrom, clipLength);
+    }
+    // PulseWidth: same codec handling as IPD above
+    if (HasPulseWidth()) {
+        if (pwCodec == FrameCodec::RAW)
+            tags[internal::Label(BamRecordTag::PULSE_WIDTH)] =
+                internal::Clip(PulseWidth(Orientation::NATIVE).Data(), clipFrom, clipLength);
+        else if (pwCodec == FrameCodec::V1)
+            tags[internal::Label(BamRecordTag::PULSE_WIDTH)] =
+                internal::Clip(PulseWidth(Orientation::NATIVE).Encode(), clipFrom, clipLength);
+    }
+    if (HasDeletionTag())
+        tags[internal::Label(BamRecordTag::DELETION_TAG)] =
+            internal::Clip(DeletionTag(Orientation::NATIVE), clipFrom, clipLength);
+    if (HasSubstitutionTag())
+        tags[internal::Label(BamRecordTag::SUBSTITUTION_TAG)] =
+            internal::Clip(SubstitutionTag(Orientation::NATIVE), clipFrom, clipLength);
+
+    // internal BAM tags
+    // (pulse-level data; only handled when pulse calls are present, since the
+    //  pulse-to-base cache used by ClipPulse is built from them)
+    if (HasPulseCall()) {
+
+        // ensure p2bCache initialized
+        // NOTE(review): the cache is built from the pre-clip pulse calls; confirm it is
+        // invalidated elsewhere once the clipped tags have been written back.
+        CalculatePulse2BaseCache();
+        internal::Pulse2BaseCache* p2bCache = p2bCache_.get();
+
+        if (HasAltLabelQV())
+            tags[internal::Label(BamRecordTag::ALT_LABEL_QV)] =
+                internal::ClipPulse(AltLabelQV(Orientation::NATIVE), p2bCache, clipFrom, clipLength)
+                    .Fastq();
+        if (HasLabelQV())
+            tags[internal::Label(BamRecordTag::LABEL_QV)] =
+                internal::ClipPulse(LabelQV(Orientation::NATIVE), p2bCache, clipFrom, clipLength)
+                    .Fastq();
+        if (HasPulseMergeQV())
+            tags[internal::Label(BamRecordTag::PULSE_MERGE_QV)] =
+                internal::ClipPulse(PulseMergeQV(Orientation::NATIVE), p2bCache, clipFrom,
+                                    clipLength)
+                    .Fastq();
+        if (HasAltLabelTag())
+            tags[internal::Label(BamRecordTag::ALT_LABEL_TAG)] = internal::ClipPulse(
+                AltLabelTag(Orientation::NATIVE), p2bCache, clipFrom, clipLength);
+        // this check repeats the enclosing HasPulseCall() condition, so it is always true here
+        if (HasPulseCall())
+            tags[internal::Label(BamRecordTag::PULSE_CALL)] =
+                internal::ClipPulse(PulseCall(Orientation::NATIVE), p2bCache, clipFrom, clipLength);
+        // photon values are clipped as floats, then re-encoded for storage via EncodePhotons
+        if (HasPkmean())
+            tags[internal::Label(BamRecordTag::PKMEAN)] = EncodePhotons(
+                internal::ClipPulse(Pkmean(Orientation::NATIVE), p2bCache, clipFrom, clipLength));
+        if (HasPkmid())
+            tags[internal::Label(BamRecordTag::PKMID)] = EncodePhotons(
+                internal::ClipPulse(Pkmid(Orientation::NATIVE), p2bCache, clipFrom, clipLength));
+        if (HasPkmean2())
+            tags[internal::Label(BamRecordTag::PKMEAN_2)] = EncodePhotons(
+                internal::ClipPulse(Pkmean2(Orientation::NATIVE), p2bCache, clipFrom, clipLength));
+        if (HasPkmid2())
+            tags[internal::Label(BamRecordTag::PKMID_2)] = EncodePhotons(
+                internal::ClipPulse(Pkmid2(Orientation::NATIVE), p2bCache, clipFrom, clipLength));
+        // pulse-level frame tags are always stored from Data(), with no codec branch
+        if (HasPrePulseFrames())
+            tags[internal::Label(BamRecordTag::PRE_PULSE_FRAMES)] = internal::ClipPulse(
+                PrePulseFrames(Orientation::NATIVE).Data(), p2bCache, clipFrom, clipLength);
+        if (HasPulseCallWidth())
+            tags[internal::Label(BamRecordTag::PULSE_CALL_WIDTH)] = internal::ClipPulse(
+                PulseCallWidth(Orientation::NATIVE).Data(), p2bCache, clipFrom, clipLength);
+        if (HasStartFrame())
+            tags[internal::Label(BamRecordTag::START_FRAME)] = internal::ClipPulse(
+                StartFrame(Orientation::NATIVE), p2bCache, clipFrom, clipLength);
+    }
+
+    // commit the updated tag collection to the underlying record
+    impl_.Tags(tags);
+}
+
+void BamRecord::ClipFields(const size_t clipFrom, const size_t clipLength)
+{
+    // strand is sampled up front, before any record data is modified
+    const bool isReverseStrand = !(AlignedStrand() == Strand::FORWARD);
+
+    // clip SEQ and QUAL in native orientation
+    std::string clippedSeq{internal::Clip(Sequence(Orientation::NATIVE), clipFrom, clipLength)};
+    QualityValues clippedQuals{
+        internal::Clip(Qualities(Orientation::NATIVE), clipFrom, clipLength)};
+
+    // reverse-strand data are flipped before being handed back to the record
+    if (isReverseStrand) {
+        internal::ReverseComplement(clippedSeq);
+        internal::Reverse(clippedQuals);
+    }
+    impl_.SetSequenceAndQualities(clippedSeq, clippedQuals.Fastq());
+
+    // finally clip all per-base / per-pulse tags to the same window
+    ClipTags(clipFrom, clipLength);
+}
+
+BamRecord& BamRecord::ClipToQuery(const Position start, const Position end)
+{
+    // cache original coords, skip out if clip not needed
+    const auto seqLength = static_cast<int>(impl_.SequenceLength());
+    const bool isCcsOrTranscript = IsCcsOrTranscript(Type());
+    const Position origQStart = isCcsOrTranscript ? 0 : QueryStart();
+    const Position origQEnd = isCcsOrTranscript ? seqLength : QueryEnd();
+    if (start <= origQStart && end >= origQEnd) return *this;
+
+    // determine new offsets into data
+    const size_t startOffset = start - origQStart;
+    const size_t endOffset = origQEnd - end;
+
+    // maybe update CIGAR & aligned position
+    if (IsMapped()) {
+
+        // fetch a 'working copy' of CIGAR data
+        Cigar cigar = impl_.CigarData();
+
+        // clip leading CIGAR ops
+        size_t referencePositionOffset = 0;
+        size_t remaining = startOffset;
+        while (remaining > 0 && !cigar.empty()) {
+            CigarOperation& firstOp = cigar.front();
+            const auto firstOpLength = firstOp.Length();
+            const bool consumesQuery = internal::ConsumesQuery(firstOp.Type());
+            const bool consumesRef = internal::ConsumesReference(firstOp.Type());
+
+            // if (!consumesQuery)
+            //    just pop (e.g. deletion) ?
+            // else {
+            //    check bounds, like clip to reference ?
+            // }
+
+            // CIGAR op ends at or before clip
+            if (firstOpLength <= remaining) {
+                cigar.erase(cigar.begin());
+                if (consumesQuery) remaining -= firstOpLength;
+                if (consumesRef) referencePositionOffset += firstOpLength;
+            }
+
+            // CIGAR op straddles clip
+            else {
+                firstOp.Length(firstOpLength - remaining);
+                if (consumesRef) referencePositionOffset += remaining;
+                remaining = 0;
+            }
+        }
+
+        // clip trailing CIGAR ops
+        remaining = endOffset;
+        while (remaining > 0 && !cigar.empty()) {
+            CigarOperation& lastOp = cigar.back();
+            const auto lastOpLength = lastOp.Length();
+            const bool consumesQuery = internal::ConsumesQuery(lastOp.Type());
+
+            // CIGAR op ends at or after clip
+            if (lastOpLength <= remaining) {
+                cigar.pop_back();
+                if (consumesQuery) remaining -= lastOpLength;
+            }
+
+            // CIGAR op straddles clip
+            else {
+                lastOp.Length(lastOpLength - remaining);
+                remaining = 0;
+            }
+        }
+
+        // update CIGAR & position
+        impl_.CigarData(cigar);
+        impl_.Position(impl_.Position() + referencePositionOffset);
+    }
+
+    // clip SEQ, QUAL, & tags
+    const size_t clipFrom = startOffset;
+    const size_t clipLength = (end - start);
+    ClipFields(clipFrom, clipLength);
+
+    // update query start/end
+    // TODO: update name to reflect new QS/QE ???
+    internal::CreateOrEdit(BamRecordTag::QUERY_START, start, &impl_);
+    internal::CreateOrEdit(BamRecordTag::QUERY_END, end, &impl_);
+    //    UpdateName();
+
+    // reset any cached aligned start/end
+    ResetCachedPositions();
+    return *this;
+}
+
+BamRecord& BamRecord::ClipToReference(const Position start, const Position end)
+{
+    // reference clipping is meaningless for unmapped records, so leave them untouched
+    // (or should we even consider throwing here?)
+    if (!IsMapped()) return *this;
+
+    // hand off to the strand-specific implementation
+    if (AlignedStrand() == Strand::FORWARD) return ClipToReferenceForward(start, end);
+    return ClipToReferenceReverse(start, end);
+}
+
+// Clips a mapped, forward-strand record to the reference interval [start, end).
+//
+// The requested interval is intersected with the record's current reference
+// span. CIGAR operations are trimmed from both ends until the alignment fits;
+// the number of query bases dropped at each end is tracked so that SEQ, QUAL,
+// tags and the query start/end tags can be clipped to match.
+//
+// Preconditions (checked with assert only): the record is mapped and aligned
+// to the forward strand.
+//
+// Returns a reference to this record.
+//
+BamRecord& BamRecord::ClipToReferenceForward(const PacBio::BAM::Position start,
+                                             const PacBio::BAM::Position end)
+{
+    assert(IsMapped());
+    assert(AlignedStrand() == Strand::FORWARD);
+
+    // cache original coords
+    const int seqLength = static_cast<int>(impl_.SequenceLength());
+    const bool isCcsOrTranscript = IsCcsOrTranscript(Type());
+    const Position origQStart = isCcsOrTranscript ? 0 : QueryStart();
+    const Position origQEnd = isCcsOrTranscript ? seqLength : QueryEnd();
+    const Position origTStart = ReferenceStart();
+    const Position origTEnd = ReferenceEnd();
+    assert(AlignedStart() >= origQStart);
+    assert(AlignedEnd() <= origQEnd);
+
+    // skip if already within requested clip range
+    if (start <= origTStart && end >= origTEnd) return *this;
+
+    // intersect the request with the current reference span
+    const Position newTStart = std::max(origTStart, start);
+    const Position newTEnd = std::min(origTEnd, end);
+
+    // fetch a 'working copy' of CIGAR data
+    Cigar cigar = impl_.CigarData();
+
+    // we're going to skip query sequence outside aligned region
+    // (forward strand: front of CIGAR <-> front of query, back <-> back)
+    size_t queryPosRemovedFront = 0;
+    size_t queryPosRemovedBack = 0;
+
+    // ------------------------
+    // clip leading CIGAR ops
+    // ------------------------
+
+    // 'remaining' counts reference bases still to be removed from the front
+    size_t remaining = newTStart - origTStart;
+    while (remaining > 0 && !cigar.empty()) {
+        CigarOperation& firstOp = cigar.front();
+        const auto firstOpLength = firstOp.Length();
+        const bool consumesQuery = internal::ConsumesQuery(firstOp.Type());
+        const bool consumesRef = internal::ConsumesReference(firstOp.Type());
+
+        if (!consumesRef) {
+
+            // e.g. softclip - just pop it completely
+            // (does not reduce 'remaining', but any query bases it covers are dropped)
+            cigar.erase(cigar.begin());
+            if (consumesQuery) queryPosRemovedFront += firstOpLength;
+
+        } else {
+            assert(consumesRef);
+
+            // CIGAR ends at or before clip
+            if (firstOpLength <= remaining) {
+                cigar.erase(cigar.begin());
+                if (consumesQuery) queryPosRemovedFront += firstOpLength;
+                if (consumesRef) remaining -= firstOpLength;
+            }
+
+            // CIGAR straddles clip
+            // (shorten the op in place; the clip point lies inside it)
+            else {
+                assert(firstOpLength > remaining);
+                firstOp.Length(firstOpLength - remaining);
+                if (consumesQuery) queryPosRemovedFront += remaining;
+                remaining = 0;
+            }
+        }
+    }
+
+    // -------------------------
+    // clip trailing CIGAR ops
+    // -------------------------
+
+    // same procedure from the back, counting reference bases past the new end
+    remaining = origTEnd - newTEnd;
+    while (remaining > 0 && !cigar.empty()) {
+        CigarOperation& lastOp = cigar.back();
+        const auto lastOpLength = lastOp.Length();
+        const bool consumesQuery = internal::ConsumesQuery(lastOp.Type());
+        const bool consumesRef = internal::ConsumesReference(lastOp.Type());
+
+        if (!consumesRef) {
+
+            // e.g. softclip - just pop it completely
+            cigar.pop_back();
+            if (consumesQuery) queryPosRemovedBack += lastOpLength;
+
+        } else {
+            assert(consumesRef);
+
+            // CIGAR ends at or after clip
+            if (lastOpLength <= remaining) {
+                cigar.pop_back();
+                if (consumesQuery) queryPosRemovedBack += lastOpLength;
+                if (consumesRef) remaining -= lastOpLength;
+            }
+
+            // CIGAR straddles clip
+            else {
+                assert(lastOpLength > remaining);
+                lastOp.Length(lastOpLength - remaining);
+                if (consumesQuery) queryPosRemovedBack += remaining;
+                remaining = 0;
+            }
+        }
+    }
+
+    // update CIGAR and position
+    impl_.CigarData(cigar);
+    impl_.Position(newTStart);
+
+    // clip SEQ, QUAL, tags
+    // (the retained query window starts after the bases removed at the front)
+    const Position qStart = origQStart + queryPosRemovedFront;
+    const Position qEnd = origQEnd - queryPosRemovedBack;
+    const size_t clipFrom = queryPosRemovedFront;
+    const size_t clipLength = qEnd - qStart;
+    ClipFields(clipFrom, clipLength);
+
+    // update query start/end
+    internal::CreateOrEdit(BamRecordTag::QUERY_START, qStart, &impl_);
+    internal::CreateOrEdit(BamRecordTag::QUERY_END, qEnd, &impl_);
+    //    UpdateName();
+
+    // reset any cached aligned start/end
+    ResetCachedPositions();
+    return *this;
+}
+
+// Clips a mapped, reverse-strand record to the reference interval [start, end).
+//
+// Works like the forward-strand variant, except that the bookkeeping of removed
+// query bases is mirrored: operations removed from the FRONT of the CIGAR are
+// counted as removed from the BACK of the query, and vice versa.
+//
+// Preconditions (checked with assert only): the record is mapped and aligned
+// to the reverse strand.
+//
+// Returns a reference to this record.
+//
+BamRecord& BamRecord::ClipToReferenceReverse(const PacBio::BAM::Position start,
+                                             const PacBio::BAM::Position end)
+{
+    assert(IsMapped());
+    assert(AlignedStrand() == Strand::REVERSE);
+
+    // cache original coords
+    const int seqLength = static_cast<int>(impl_.SequenceLength());
+    const bool isCcsOrTranscript = IsCcsOrTranscript(Type());
+    const Position origQStart = isCcsOrTranscript ? 0 : QueryStart();
+    const Position origQEnd = isCcsOrTranscript ? seqLength : QueryEnd();
+    const Position origTStart = ReferenceStart();
+    const Position origTEnd = ReferenceEnd();
+
+    // skip if already within requested clip range
+    if (start <= origTStart && end >= origTEnd) return *this;
+    assert(AlignedStart() >= origQStart);
+    assert(AlignedEnd() <= origQEnd);
+
+    // intersect the request with the current reference span
+    const Position newTStart = std::max(origTStart, start);
+    const Position newTEnd = std::min(origTEnd, end);
+
+    // working copy of the CIGAR
+    Cigar cigar = impl_.CigarData();
+
+    // query bases dropped at each end of the query (see mirroring note above)
+    size_t queryPosRemovedFront = 0;
+    size_t queryPosRemovedBack = 0;
+
+    // update CIGAR - clip front ops, then clip back ops
+    // 'remaining' counts reference bases still to be removed from the front
+    size_t remaining = newTStart - origTStart;
+    while (remaining > 0 && !cigar.empty()) {
+        CigarOperation& firstOp = cigar.front();
+        const auto firstOpType = firstOp.Type();
+        const auto firstOpLength = firstOp.Length();
+        const bool consumesQuery = internal::ConsumesQuery(firstOpType);
+        const bool consumesRef = internal::ConsumesReference(firstOpType);
+
+        if (!consumesRef) {
+
+            // e.g. softclip - just pop it completely
+            // (front of CIGAR -> counted against the back of the query)
+            cigar.erase(cigar.begin());
+            if (consumesQuery) queryPosRemovedBack += firstOpLength;
+
+        } else {
+            assert(consumesRef);
+
+            // CIGAR ends at or before clip
+            if (firstOpLength <= remaining) {
+                cigar.erase(cigar.begin());
+                if (consumesQuery) queryPosRemovedBack += firstOpLength;
+                if (consumesRef) remaining -= firstOpLength;
+            }
+
+            // CIGAR straddles clip
+            else {
+                assert(firstOpLength > remaining);
+                firstOp.Length(firstOpLength - remaining);
+                if (consumesQuery) queryPosRemovedBack += remaining;
+                remaining = 0;
+            }
+        }
+    }
+
+    // same procedure from the back of the CIGAR, counted against the front of the query
+    remaining = origTEnd - newTEnd;
+    while (remaining > 0 && !cigar.empty()) {
+        CigarOperation& lastOp = cigar.back();
+        const auto lastOpType = lastOp.Type();
+        const auto lastOpLength = lastOp.Length();
+        const bool consumesQuery = internal::ConsumesQuery(lastOpType);
+        const bool consumesRef = internal::ConsumesReference(lastOpType);
+
+        if (!consumesRef) {
+
+            // e.g. softclip - just pop it completely
+            cigar.pop_back();
+            if (consumesQuery) queryPosRemovedFront += lastOpLength;
+
+        } else {
+            assert(consumesRef);
+
+            // CIGAR ends at or before clip
+            if (lastOpLength <= remaining) {
+                cigar.pop_back();
+                if (consumesQuery) queryPosRemovedFront += lastOpLength;
+                if (consumesRef) remaining -= lastOpLength;
+            }
+
+            // CIGAR straddles clip
+            else {
+                assert(lastOpLength > remaining);
+                lastOp.Length(lastOpLength - remaining);
+                if (consumesQuery) queryPosRemovedFront += remaining;
+                remaining = 0;
+            }
+        }
+    }
+    impl_.CigarData(cigar);
+
+    // update aligned reference position
+    impl_.Position(newTStart);
+
+    // clip SEQ, QUAL, tags
+    const Position qStart = origQStart + queryPosRemovedFront;
+    const Position qEnd = origQEnd - queryPosRemovedBack;
+    const size_t clipFrom = queryPosRemovedFront;
+    const size_t clipLength = qEnd - qStart;
+    ClipFields(clipFrom, clipLength);
+
+    // update query start/end
+    internal::CreateOrEdit(BamRecordTag::QUERY_START, qStart, &impl_);
+    internal::CreateOrEdit(BamRecordTag::QUERY_END, qEnd, &impl_);
+    //    UpdateName();
+
+    // reset any cached aligned start/end
+    ResetCachedPositions();
+    return *this;
+}
+
+// Returns the DELETION_QV tag data as quality values, in the requested
+// orientation; 'aligned' and 'exciseSoftClips' are forwarded to FetchQualities.
+QualityValues BamRecord::DeletionQV(Orientation orientation, bool aligned,
+                                    bool exciseSoftClips) const
+{
+    return FetchQualities(BamRecordTag::DELETION_QV, orientation, aligned, exciseSoftClips);
+}
+
+// Stores the given quality values (as a FASTQ string) under the DELETION_QV tag,
+// creating the tag or editing an existing one. Returns a reference to this record.
+BamRecord& BamRecord::DeletionQV(const QualityValues& deletionQVs)
+{
+    internal::CreateOrEdit(BamRecordTag::DELETION_QV, deletionQVs.Fastq(), &impl_);
+    return *this;
+}
+
+// Returns the DELETION_TAG tag data as a base string, in the requested
+// orientation; 'aligned' and 'exciseSoftClips' are forwarded to FetchBases.
+std::string BamRecord::DeletionTag(Orientation orientation, bool aligned,
+                                   bool exciseSoftClips) const
+{
+    return FetchBases(BamRecordTag::DELETION_TAG, orientation, aligned, exciseSoftClips);
+}
+
+// Stores the given string under the DELETION_TAG tag, creating the tag or
+// editing an existing one. Returns a reference to this record.
+BamRecord& BamRecord::DeletionTag(const std::string& tags)
+{
+    internal::CreateOrEdit(BamRecordTag::DELETION_TAG, tags, &impl_);
+    return *this;
+}
+
+// Converts photon values to their uint16_t storage form: each value is scaled
+// by photonFactor and truncated toward zero.
+//
+// Scaled values outside the uint16_t range are saturated (negative or NaN -> 0,
+// too large -> 65535). Converting an out-of-range floating-point value to an
+// integer type is undefined behavior, so the range check has to come first.
+// Results for in-range values are unchanged.
+//
+std::vector<uint16_t> BamRecord::EncodePhotons(const std::vector<float>& data)
+{
+    std::vector<uint16_t> encoded;
+    encoded.reserve(data.size());
+    for (const auto& d : data) {
+        // keep the product's own type so truncation matches the direct conversion
+        const auto scaled = d * photonFactor;
+
+        uint16_t value = 0;  // also the result for NaN and non-positive input
+        if (scaled >= 65535.0f)
+            value = 65535;
+        else if (scaled > 0.0f)
+            value = static_cast<uint16_t>(scaled);
+        encoded.emplace_back(value);
+    }
+    return encoded;
+}
+
+std::string BamRecord::FetchBasesRaw(const BamRecordTag tag) const
+{
+    // look the tag up and hand back its string form, with no reorientation or clipping
+    return impl_.TagValue(tag).ToString();
+}
+
+std::string BamRecord::FetchBases(const BamRecordTag tag, const Orientation orientation,
+                                  const bool aligned, const bool exciseSoftClips,
+                                  const PulseBehavior pulseBehavior) const
+{
+    const bool isBamSeq = (tag == BamRecordTag::SEQ);
+    const bool isPulse = internal::BamRecordTags::IsPulse(tag);
+
+    // fetch raw
+    std::string bases;
+    Orientation current;
+    if (isBamSeq) {  // SEQ stored in genomic orientation
+        bases = impl_.Sequence();
+        current = Orientation::GENOMIC;
+    } else {  // all tags stored in native orientation
+        bases = FetchBasesRaw(tag);
+        current = Orientation::NATIVE;
+    }
+
+    // maybe strip 'squashed' pulse loci
+    if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) {
+        CalculatePulse2BaseCache();
+        bases = p2bCache_->RemoveSquashedPulses(bases);
+    }
+
+    // if we need to touch CIGAR
+    if (aligned || exciseSoftClips) {
+
+        if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY)
+            throw std::runtime_error{
+                "Cannot return data at all pulses when gapping and/or soft-clipping are requested. "
+                "Use PulseBehavior::BASECALLS_ONLY instead."};
+
+        // force into genomic orientation
+        internal::OrientBasesAsRequested(&bases, current, Orientation::GENOMIC,
+                                         impl_.IsReverseStrand(), isPulse);
+        current = Orientation::GENOMIC;
+
+        // clip & gapify as requested
+        internal::ClipAndGapifyBases(impl_, aligned, exciseSoftClips, &bases);
+    }
+
+    // return in the orientation requested
+    internal::OrientBasesAsRequested(&bases, current, orientation, impl_.IsReverseStrand(),
+                                     isPulse);
+    return bases;
+}
+
+// Returns the frame data stored under 'tag', with no reorientation or clipping.
+//
+// - missing/null tag: returns empty Frames
+// - uint8_t array:    lossy frame codes, decoded via Frames::Decode
+// - uint16_t array:   lossless frame data, used as-is
+//
+// Throws std::runtime_error for any other tag type. The stored type comes from
+// the input file, so it is validated unconditionally rather than with a
+// debug-only assert, matching the other Fetch*Raw helpers.
+//
+Frames BamRecord::FetchFramesRaw(const BamRecordTag tag) const
+{
+    const Tag frameTag = impl_.TagValue(tag);
+    if (frameTag.IsNull()) return {};  // throw ?
+
+    // lossy frame codes
+    if (frameTag.IsUInt8Array()) {
+        const auto codes = frameTag.ToUInt8Array();
+        return Frames::Decode(codes);
+    }
+
+    // lossless frame data
+    if (!frameTag.IsUInt16Array())
+        throw std::runtime_error{"Frame data are not a uint8_t or uint16_t array, tag " +
+                                 internal::BamRecordTags::LabelFor(tag)};
+    return Frames{frameTag.ToUInt16Array()};
+}
+
+Frames BamRecord::FetchFrames(const BamRecordTag tag, const Orientation orientation,
+                              const bool aligned, const bool exciseSoftClips,
+                              const PulseBehavior pulseBehavior) const
+{
+    const bool isPulse = internal::BamRecordTags::IsPulse(tag);
+
+    // fetch raw
+    Frames frames = FetchFramesRaw(tag);
+    Orientation current = Orientation::NATIVE;
+
+    // maybe strip 'squashed' pulse loci
+    if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) {
+        CalculatePulse2BaseCache();
+        frames.DataRaw() = p2bCache_->RemoveSquashedPulses(frames.Data());
+    }
+
+    // if we need to touch the CIGAR
+    if (aligned || exciseSoftClips) {
+
+        if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY)
+            throw std::runtime_error{
+                "Cannot return data at all pulses when gapping and/or soft-clipping are requested. "
+                "Use PulseBehavior::BASECALLS_ONLY instead."};
+
+        // force into genomic orientation
+        internal::OrientTagDataAsRequested(&frames, current, Orientation::GENOMIC,
+                                           impl_.IsReverseStrand());
+        current = Orientation::GENOMIC;
+
+        // clip & gapify as requested
+        internal::ClipAndGapifyFrames(impl_, aligned, exciseSoftClips, &frames);
+    }
+
+    // return in the orientation requested
+    internal::OrientTagDataAsRequested(&frames, current, orientation, impl_.IsReverseStrand());
+    return frames;
+}
+
+std::vector<float> BamRecord::FetchPhotonsRaw(const BamRecordTag tag) const
+{
+    const Tag frameTag = impl_.TagValue(tag);
+    if (frameTag.IsNull()) return {};
+    if (!frameTag.IsUInt16Array())
+        throw std::runtime_error{"Photons are not a uint16_t array, tag " +
+                                 internal::BamRecordTags::LabelFor(tag)};
+
+    const auto data = frameTag.ToUInt16Array();
+    std::vector<float> photons;
+    photons.reserve(data.size());
+    for (const auto& d : data)
+        photons.emplace_back(d / photonFactor);
+    return photons;
+}
+
+std::vector<float> BamRecord::FetchPhotons(const BamRecordTag tag, const Orientation orientation,
+                                           const bool aligned, const bool exciseSoftClips,
+                                           const PulseBehavior pulseBehavior) const
+{
+    const bool isPulse = internal::BamRecordTags::IsPulse(tag);
+
+    // fetch raw
+    auto data = FetchPhotonsRaw(tag);
+    Orientation current = Orientation::NATIVE;
+
+    if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) {
+        // strip 'squashed' pulse loci
+        CalculatePulse2BaseCache();
+        data = p2bCache_->RemoveSquashedPulses(data);
+    }
+
+    if (aligned || exciseSoftClips) {
+
+        if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY)
+            throw std::runtime_error{
+                "Cannot return data at all pulses when gapping and/or soft-clipping are requested. "
+                "Use PulseBehavior::BASECALLS_ONLY instead."};
+
+        // force into genomic orientation
+        internal::OrientTagDataAsRequested(&data, current, Orientation::GENOMIC,
+                                           impl_.IsReverseStrand());
+        current = Orientation::GENOMIC;
+
+        // clip & gapify as requested
+        internal::ClipAndGapifyPhotons(impl_, aligned, exciseSoftClips, &data);
+    }
+
+    // return in the orientation requested
+    internal::OrientTagDataAsRequested(&data, current, orientation, impl_.IsReverseStrand());
+    return data;
+}
+
+QualityValues BamRecord::FetchQualitiesRaw(const BamRecordTag tag) const
+{
+    // the tag holds a FASTQ-encoded string; parse it into quality values as stored
+    const std::string fastq = impl_.TagValue(tag).ToString();
+    return QualityValues::FromFastq(fastq);
+}
+
+QualityValues BamRecord::FetchQualities(const BamRecordTag tag, const Orientation orientation,
+                                        const bool aligned, const bool exciseSoftClips,
+                                        const PulseBehavior pulseBehavior) const
+{
+    // requested data info
+    const bool isBamQual = (tag == BamRecordTag::QUAL);
+    const bool isPulse = internal::BamRecordTags::IsPulse(tag);
+
+    // fetch raw
+    QualityValues quals;
+    Orientation current;
+    if (isBamQual) {  // QUAL stored in genomic orientation
+        quals = impl_.Qualities();
+        current = Orientation::GENOMIC;
+    } else {  // all tags stored in native orientation
+        quals = FetchQualitiesRaw(tag);
+        current = Orientation::NATIVE;
+    }
+
+    if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) {
+        // strip 'squashed' pulse loci
+        CalculatePulse2BaseCache();
+        quals = p2bCache_->RemoveSquashedPulses(quals);
+    }
+
+    // if we need to touch CIGAR
+    if (aligned || exciseSoftClips) {
+
+        if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY)
+            throw std::runtime_error{
+                "Cannot return data at all pulses when gapping and/or soft-clipping are requested. "
+                "Use PulseBehavior::BASECALLS_ONLY instead."};
+
+        // force into genomic orientation
+        internal::OrientTagDataAsRequested(&quals, current, Orientation::GENOMIC,
+                                           impl_.IsReverseStrand());
+        current = Orientation::GENOMIC;
+
+        // clip & gapify as requested
+        internal::ClipAndGapifyQualities(impl_, aligned, exciseSoftClips, &quals);
+    }
+
+    // return in the orientation requested
+    internal::OrientTagDataAsRequested(&quals, current, orientation, impl_.IsReverseStrand());
+    return quals;
+}
+
+std::vector<uint32_t> BamRecord::FetchUInt32sRaw(const BamRecordTag tag) const
+{
+    const Tag storedTag = impl_.TagValue(tag);
+
+    // a missing tag yields no data
+    if (storedTag.IsNull()) return {};
+
+    // a present tag is returned as-is when it has the expected array type
+    if (storedTag.IsUInt32Array()) return storedTag.ToUInt32Array();
+
+    throw std::runtime_error{"Tag data are not a uint32_t array, tag " +
+                             internal::BamRecordTags::LabelFor(tag)};
+}
+
+std::vector<uint32_t> BamRecord::FetchUInt32s(const BamRecordTag tag, const Orientation orientation,
+                                              const bool aligned, const bool exciseSoftClips,
+                                              const PulseBehavior pulseBehavior) const
+{
+    const bool isPulse = internal::BamRecordTags::IsPulse(tag);
+
+    // fetch raw
+    auto arr = FetchUInt32sRaw(tag);
+    Orientation current = Orientation::NATIVE;
+
+    if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) {
+        // strip 'squashed' pulse loci
+        CalculatePulse2BaseCache();
+        arr = p2bCache_->RemoveSquashedPulses(arr);
+    }
+
+    if (aligned || exciseSoftClips) {
+
+        if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY)
+            throw std::runtime_error{
+                "Cannot return data at all pulses when gapping and/or soft-clipping are requested. "
+                "Use PulseBehavior::BASECALLS_ONLY instead."};
+
+        // force into genomic orientation
+        internal::OrientTagDataAsRequested(&arr, current, Orientation::GENOMIC,
+                                           impl_.IsReverseStrand());
+        current = Orientation::GENOMIC;
+
+        // clip & gapify as requested
+        internal::ClipAndGapifyUInts(impl_, aligned, exciseSoftClips, &arr);
+    }
+
+    // return in the orientation requested
+    internal::OrientTagDataAsRequested(&arr, current, orientation, impl_.IsReverseStrand());
+    return arr;
+}
+
+std::vector<uint8_t> BamRecord::FetchUInt8sRaw(const BamRecordTag tag) const
+{
+    const Tag storedTag = impl_.TagValue(tag);
+
+    // a missing tag yields no data
+    if (storedTag.IsNull()) return {};
+
+    // a present tag is returned as-is when it has the expected array type
+    if (storedTag.IsUInt8Array()) return storedTag.ToUInt8Array();
+
+    throw std::runtime_error{"Tag data are not a uint8_t array, tag " +
+                             internal::BamRecordTags::LabelFor(tag)};
+}
+
+std::vector<uint8_t> BamRecord::FetchUInt8s(const BamRecordTag tag, const Orientation orientation,
+                                            const bool aligned, const bool exciseSoftClips,
+                                            const PulseBehavior pulseBehavior) const
+{
+    const bool isPulse = internal::BamRecordTags::IsPulse(tag);
+
+    // fetch raw
+    auto arr = FetchUInt8sRaw(tag);
+    Orientation current = Orientation::NATIVE;
+
+    if (isPulse && pulseBehavior == PulseBehavior::BASECALLS_ONLY) {
+        // strip 'squashed' pulse loci
+        CalculatePulse2BaseCache();
+        arr = p2bCache_->RemoveSquashedPulses(arr);
+    }
+
+    if (aligned || exciseSoftClips) {
+
+        if (isPulse && pulseBehavior != PulseBehavior::BASECALLS_ONLY)
+            throw std::runtime_error{
+                "Cannot return data at all pulses when gapping and/or soft-clipping are requested. "
+                "Use PulseBehavior::BASECALLS_ONLY instead."};
+
+        // force into genomic orientation
+        internal::OrientTagDataAsRequested(&arr, current, Orientation::GENOMIC,
+                                           impl_.IsReverseStrand());
+        current = Orientation::GENOMIC;
+
+        // clip & gapify as requested
+        internal::ClipAndGapifyUInt8s(impl_, aligned, exciseSoftClips, &arr);
+    }
+
+    // return in the orientation requested
+    internal::OrientTagDataAsRequested(&arr, current, orientation, impl_.IsReverseStrand());
+    return arr;
+}
+
+// Returns the record name as held by impl_.
+std::string BamRecord::FullName() const
+{
+    return impl_.Name();
+}
+
+// Reports whether impl_ has the ALT_LABEL_QV tag.
+bool BamRecord::HasAltLabelQV() const
+{
+    return impl_.HasTag(BamRecordTag::ALT_LABEL_QV);
+}
+
+// Reports whether impl_ has the ALT_LABEL_TAG tag.
+bool BamRecord::HasAltLabelTag() const
+{
+    return impl_.HasTag(BamRecordTag::ALT_LABEL_TAG);
+}
+
+// Reports whether impl_ has the BARCODES tag.
+bool BamRecord::HasBarcodes() const
+{
+    return impl_.HasTag(BamRecordTag::BARCODES);
+}
+
+// Reports whether impl_ has the BARCODE_QUALITY tag.
+bool BamRecord::HasBarcodeQuality() const
+{
+    return impl_.HasTag(BamRecordTag::BARCODE_QUALITY);
+}
+
+// Reports whether impl_ has the LABEL_QV tag.
+bool BamRecord::HasLabelQV() const
+{
+    return impl_.HasTag(BamRecordTag::LABEL_QV);
+}
+
+// Reports whether impl_ has the DELETION_QV tag.
+bool BamRecord::HasDeletionQV() const
+{
+    return impl_.HasTag(BamRecordTag::DELETION_QV);
+}
+
+// Reports whether impl_ has the DELETION_TAG tag.
+bool BamRecord::HasDeletionTag() const
+{
+    return impl_.HasTag(BamRecordTag::DELETION_TAG);
+}
+
+// True only when the HOLE_NUMBER tag is present and its value is non-null.
+bool BamRecord::HasHoleNumber() const
+{
+    if (!impl_.HasTag(BamRecordTag::HOLE_NUMBER)) return false;
+    return !impl_.TagValue(BamRecordTag::HOLE_NUMBER).IsNull();
+}
+
+// Reports whether impl_ has the INSERTION_QV tag.
+bool BamRecord::HasInsertionQV() const
+{
+    return impl_.HasTag(BamRecordTag::INSERTION_QV);
+}
+
+// Reports whether impl_ has the NUM_PASSES tag.
+bool BamRecord::HasNumPasses() const
+{
+    return impl_.HasTag(BamRecordTag::NUM_PASSES);
+}
+
+// Alias for HasIPD().
+bool BamRecord::HasPreBaseFrames() const
+{
+    return HasIPD();
+}
+
+// Reports whether impl_ has the IPD tag.
+bool BamRecord::HasIPD() const
+{
+    return impl_.HasTag(BamRecordTag::IPD);
+}
+
+// Reports whether impl_ has the CONTEXT_FLAGS tag.
+bool BamRecord::HasLocalContextFlags() const
+{
+    return impl_.HasTag(BamRecordTag::CONTEXT_FLAGS);
+}
+
+// Reports whether impl_ has the MERGE_QV tag.
+bool BamRecord::HasMergeQV() const
+{
+    return impl_.HasTag(BamRecordTag::MERGE_QV);
+}
+
+// Reports whether impl_ has the PULSE_MERGE_QV tag.
+bool BamRecord::HasPulseMergeQV() const
+{
+    return impl_.HasTag(BamRecordTag::PULSE_MERGE_QV);
+}
+
+// Reports whether impl_ has the PKMEAN tag.
+bool BamRecord::HasPkmean() const
+{
+    return impl_.HasTag(BamRecordTag::PKMEAN);
+}
+
+// Reports whether impl_ has the PKMEAN_2 tag.
+bool BamRecord::HasPkmean2() const
+{
+    return impl_.HasTag(BamRecordTag::PKMEAN_2);
+}
+
+// Reports whether impl_ has the PKMID tag.
+bool BamRecord::HasPkmid() const
+{
+    return impl_.HasTag(BamRecordTag::PKMID);
+}
+
+// Reports whether impl_ has the PKMID_2 tag.
+bool BamRecord::HasPkmid2() const
+{
+    return impl_.HasTag(BamRecordTag::PKMID_2);
+}
+
+// Reports whether impl_ has the PRE_PULSE_FRAMES tag.
+bool BamRecord::HasPrePulseFrames() const
+{
+    return impl_.HasTag(BamRecordTag::PRE_PULSE_FRAMES);
+}
+
+// True only when the PULSE_CALL tag is present and its value is non-null.
+bool BamRecord::HasPulseCall() const
+{
+    if (!impl_.HasTag(BamRecordTag::PULSE_CALL)) return false;
+    return !impl_.TagValue(BamRecordTag::PULSE_CALL).IsNull();
+}
+
+// Reports whether impl_ has the PULSE_EXCLUSION tag.
+bool BamRecord::HasPulseExclusion(void) const
+{
+    const bool tagPresent = impl_.HasTag(BamRecordTag::PULSE_EXCLUSION);
+    return tagPresent;
+}
+
+// Reports whether impl_ has the PULSE_CALL_WIDTH tag.
+bool BamRecord::HasPulseCallWidth(void) const
+{
+    const bool tagPresent = impl_.HasTag(BamRecordTag::PULSE_CALL_WIDTH);
+    return tagPresent;
+}
+
+// Reports whether impl_ has the PULSE_WIDTH tag.
+bool BamRecord::HasPulseWidth() const
+{
+    return impl_.HasTag(BamRecordTag::PULSE_WIDTH);
+}
+
+// Reports whether impl_ has the QUERY_END tag.
+bool BamRecord::HasQueryEnd() const
+{
+    return impl_.HasTag(BamRecordTag::QUERY_END);
+}
+
+// Reports whether impl_ has the QUERY_START tag.
+bool BamRecord::HasQueryStart() const
+{
+    return impl_.HasTag(BamRecordTag::QUERY_START);
+}
+
+// True only when the READ_ACCURACY tag is present and its value is non-null.
+bool BamRecord::HasReadAccuracy() const
+{
+    if (!impl_.HasTag(BamRecordTag::READ_ACCURACY)) return false;
+    return !impl_.TagValue(BamRecordTag::READ_ACCURACY).IsNull();
+}
+
+// True only when the SCRAP_REGION_TYPE tag is present and its value is non-null.
+bool BamRecord::HasScrapRegionType() const
+{
+    if (!impl_.HasTag(BamRecordTag::SCRAP_REGION_TYPE)) return false;
+    return !impl_.TagValue(BamRecordTag::SCRAP_REGION_TYPE).IsNull();
+}
+
+// True only when the SCRAP_ZMW_TYPE tag is present and its value is non-null.
+bool BamRecord::HasScrapZmwType() const
+{
+    if (!impl_.HasTag(BamRecordTag::SCRAP_ZMW_TYPE)) return false;
+    return !impl_.TagValue(BamRecordTag::SCRAP_ZMW_TYPE).IsNull();
+}
+
+// Reports whether impl_ has the START_FRAME tag.
+bool BamRecord::HasStartFrame() const
+{
+    return impl_.HasTag(BamRecordTag::START_FRAME);
+}
+
+// Reports whether impl_ has the SNR tag.
+bool BamRecord::HasSignalToNoise() const
+{
+    return impl_.HasTag(BamRecordTag::SNR);
+}
+
+// Reports whether impl_ has the SUBSTITUTION_QV tag.
+bool BamRecord::HasSubstitutionQV() const
+{
+    return impl_.HasTag(BamRecordTag::SUBSTITUTION_QV);
+}
+
+// Reports whether impl_ has the SUBSTITUTION_TAG tag.
+bool BamRecord::HasSubstitutionTag() const
+{
+    return impl_.HasTag(BamRecordTag::SUBSTITUTION_TAG);
+}
+
+// Returns the header associated with this record (header_), by value.
+BamHeader BamRecord::Header() const
+{
+    return header_;
+}
+
+// Returns the hole number from the HOLE_NUMBER tag; when that tag value is null, the number
+// is derived from the record name via internal::HoleNumberFromName.
+int32_t BamRecord::HoleNumber() const
+{
+    const Tag zmTag = impl_.TagValue(BamRecordTag::HOLE_NUMBER);
+
+    // missing zm tag - try to pull from name
+    if (zmTag.IsNull()) return internal::HoleNumberFromName(FullName());
+
+    return zmTag.ToInt32();
+}
+
+// Writes holeNumber to the HOLE_NUMBER tag via internal::CreateOrEdit. Returns *this.
+BamRecord& BamRecord::HoleNumber(const int32_t holeNumber)
+{
+    internal::CreateOrEdit(BamRecordTag::HOLE_NUMBER, holeNumber, &impl_);
+
+    return *this;
+}
+
+// Mutable access to the underlying BamRecordImpl.
+BamRecordImpl& BamRecord::Impl()
+{
+    return impl_;
+}
+
+// Read-only access to the underlying BamRecordImpl.
+const BamRecordImpl& BamRecord::Impl() const
+{
+    return impl_;
+}
+
+// Returns the INSERTION_QV tag data; orientation/alignment options are forwarded unchanged
+// to FetchQualities.
+QualityValues BamRecord::InsertionQV(Orientation orientation, bool aligned,
+                                     bool exciseSoftClips) const
+{
+    return FetchQualities(BamRecordTag::INSERTION_QV,
+                          orientation,
+                          aligned,
+                          exciseSoftClips);
+}
+
+// Writes insertionQVs.Fastq() to the INSERTION_QV tag via internal::CreateOrEdit.
+// Returns *this.
+BamRecord& BamRecord::InsertionQV(const QualityValues& insertionQVs)
+{
+    internal::CreateOrEdit(BamRecordTag::INSERTION_QV,
+                           insertionQVs.Fastq(),
+                           &impl_);
+    return *this;
+}
+
+// Returns the IPD tag data; all options are forwarded unchanged to FetchFrames.
+Frames BamRecord::IPD(Orientation orientation, bool aligned, bool exciseSoftClips) const
+{
+    return FetchFrames(BamRecordTag::IPD,
+                       orientation,
+                       aligned,
+                       exciseSoftClips);
+}
+
+// Writes frame data to the IPD tag via internal::CreateOrEdit: frames.Encode() for
+// FrameEncodingType::LOSSY, frames.Data() for any other encoding. Returns *this.
+BamRecord& BamRecord::IPD(const Frames& frames, const FrameEncodingType encoding)
+{
+    if (encoding != FrameEncodingType::LOSSY) {
+        internal::CreateOrEdit(BamRecordTag::IPD, frames.Data(), &impl_);
+    } else {
+        internal::CreateOrEdit(BamRecordTag::IPD, frames.Encode(), &impl_);
+    }
+    return *this;
+}
+
+Frames BamRecord::IPDRaw(Orientation orientation) const
+{
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::IPD);
+    const Tag frameTag = impl_.TagValue(tagName);
+    if (frameTag.IsNull()) return {};
+
+    Frames frames;
+
+    // lossy frame codes
+    if (frameTag.IsUInt8Array()) {
+        const auto codes = frameTag.ToUInt8Array();
+        const std::vector<uint16_t> codes16(codes.begin(), codes.end());
+        frames.Data(std::move(codes16));
+    }
+
+    // lossless frame data
+    else {
+        assert(frameTag.IsUInt16Array());
+        frames.Data(frameTag.ToUInt16Array());
+    }
+
+    // return in requested orientation
+    internal::OrientTagDataAsRequested(&frames,
+                                       Orientation::NATIVE,  // current
+                                       orientation,          // requested
+                                       impl_.IsReverseStrand());
+    return frames;
+}
+
+// Forwards impl_.IsMapped().
+bool BamRecord::IsMapped() const
+{
+    return impl_.IsMapped();
+}
+
+// Returns the LABEL_QV tag data; all options are forwarded unchanged to FetchQualities.
+QualityValues BamRecord::LabelQV(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior) const
+{
+    return FetchQualities(BamRecordTag::LABEL_QV,
+                          orientation,
+                          aligned,
+                          exciseSoftClips,
+                          pulseBehavior);
+}
+
+// Writes labelQVs.Fastq() to the LABEL_QV tag via internal::CreateOrEdit. Returns *this.
+BamRecord& BamRecord::LabelQV(const QualityValues& labelQVs)
+{
+    internal::CreateOrEdit(BamRecordTag::LABEL_QV,
+                           labelQVs.Fastq(),
+                           &impl_);
+    return *this;
+}
+
+// Reads the CONTEXT_FLAGS tag as a uint8_t and casts it to PacBio::BAM::LocalContextFlags.
+// No null check is made on the tag value here.
+LocalContextFlags BamRecord::LocalContextFlags() const
+{
+    const auto label = internal::BamRecordTags::LabelFor(BamRecordTag::CONTEXT_FLAGS);
+    const Tag flagsTag = impl_.TagValue(label);
+
+    return static_cast<PacBio::BAM::LocalContextFlags>(flagsTag.ToUInt8());
+}
+
+// Writes flags, cast to uint8_t, to the CONTEXT_FLAGS tag via internal::CreateOrEdit.
+// Returns *this.
+BamRecord& BamRecord::LocalContextFlags(const PacBio::BAM::LocalContextFlags flags)
+{
+    internal::CreateOrEdit(BamRecordTag::CONTEXT_FLAGS,
+                           static_cast<uint8_t>(flags),
+                           &impl_);
+    return *this;
+}
+
+BamRecord& BamRecord::Map(const int32_t referenceId, const Position refStart, const Strand strand,
+                          const Cigar& cigar, const uint8_t mappingQuality)
+{
+    impl_.Position(refStart);
+    impl_.ReferenceId(referenceId);
+    impl_.CigarData(cigar);
+    impl_.MapQuality(mappingQuality);
+    impl_.SetMapped(true);
+
+    if (strand == Strand::FORWARD)
+        impl_.SetReverseStrand(false);
+
+    else {
+        assert(strand == Strand::REVERSE);
+        impl_.SetReverseStrand(true);
+
+        // switch seq & qual
+        std::string sequence = impl_.Sequence();
+        QualityValues qualities = impl_.Qualities();
+
+        internal::ReverseComplement(sequence);
+        internal::Reverse(qualities);
+
+        impl_.SetSequenceAndQualities(sequence, qualities.Fastq());
+    }
+
+    // reset any cached aligned start/end
+    alignedStart_ = PacBio::BAM::UnmappedPosition;
+    alignedEnd_ = PacBio::BAM::UnmappedPosition;
+
+    return *this;
+}
+
+// Forwards impl_.MapQuality().
+uint8_t BamRecord::MapQuality() const
+{
+    return impl_.MapQuality();
+}
+
+// Returns the MERGE_QV tag data; all options are forwarded unchanged to FetchQualities.
+QualityValues BamRecord::MergeQV(Orientation orientation, bool aligned, bool exciseSoftClips) const
+{
+    return FetchQualities(BamRecordTag::MERGE_QV,
+                          orientation,
+                          aligned,
+                          exciseSoftClips);
+}
+
+// Writes mergeQVs.Fastq() to the MERGE_QV tag via internal::CreateOrEdit. Returns *this.
+BamRecord& BamRecord::MergeQV(const QualityValues& mergeQVs)
+{
+    internal::CreateOrEdit(BamRecordTag::MERGE_QV,
+                           mergeQVs.Fastq(),
+                           &impl_);
+    return *this;
+}
+
+// Returns the movie name of this record's read group (ReadGroup().MovieName()).
+std::string BamRecord::MovieName() const
+{
+    return ReadGroup().MovieName();
+}
+
+// Computes (ReferenceEnd - ReferenceStart) minus the CIGAR match and mismatch counts.
+size_t BamRecord::NumDeletedBases() const
+{
+    const auto refEnd = ReferenceEnd();
+    const auto refStart = ReferenceStart();
+    const auto counts = NumMatchesAndMismatches();  // {matches, mismatches}
+
+    return (refEnd - refStart - counts.first - counts.second);
+}
+
+// Computes (AlignedEnd - AlignedStart) minus the CIGAR match and mismatch counts.
+size_t BamRecord::NumInsertedBases() const
+{
+    const auto queryEnd = AlignedEnd();
+    const auto queryStart = AlignedStart();
+    const auto counts = NumMatchesAndMismatches();  // {matches, mismatches}
+
+    return (queryEnd - queryStart - counts.first - counts.second);
+}
+
+// First element (match count) of NumMatchesAndMismatches().
+size_t BamRecord::NumMatches() const
+{
+    return NumMatchesAndMismatches().first;
+}
+
+std::pair<size_t, size_t> BamRecord::NumMatchesAndMismatches() const
+{
+    std::pair<size_t, size_t> result = std::make_pair(0, 0);
+
+    auto b = internal::BamRecordMemory::GetRawData(this);
+    uint32_t* cigarData = bam_get_cigar(b.get());
+    for (uint32_t i = 0; i < b->core.n_cigar; ++i) {
+        const auto type = static_cast<CigarOperationType>(bam_cigar_op(cigarData[i]));
+        if (type == CigarOperationType::SEQUENCE_MATCH)
+            result.first += bam_cigar_oplen(cigarData[i]);
+        else if (type == CigarOperationType::SEQUENCE_MISMATCH)
+            result.second += bam_cigar_oplen(cigarData[i]);
+    }
+    return result;
+}
+
+// Second element (mismatch count) of NumMatchesAndMismatches().
+size_t BamRecord::NumMismatches() const
+{
+    return NumMatchesAndMismatches().second;
+}
+
+// Reads the NUM_PASSES tag and returns it as int32_t. No null check is made here.
+int32_t BamRecord::NumPasses() const
+{
+    const auto label = internal::BamRecordTags::LabelFor(BamRecordTag::NUM_PASSES);
+    const Tag passesTag = impl_.TagValue(label);
+
+    return passesTag.ToInt32();
+}
+
+// Writes numPasses to the NUM_PASSES tag via internal::CreateOrEdit. Returns *this.
+BamRecord& BamRecord::NumPasses(const int32_t numPasses)
+{
+    internal::CreateOrEdit(BamRecordTag::NUM_PASSES, numPasses, &impl_);
+
+    return *this;
+}
+
+// Returns the PKMEAN tag data; all options are forwarded unchanged to FetchPhotons.
+std::vector<float> BamRecord::Pkmean(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                     PulseBehavior pulseBehavior) const
+{
+    return FetchPhotons(BamRecordTag::PKMEAN,
+                        orientation,
+                        aligned,
+                        exciseSoftClips,
+                        pulseBehavior);
+}
+
+// Passes photons through EncodePhotons and stores the result via the encoded-photon
+// Pkmean overload. Returns *this.
+BamRecord& BamRecord::Pkmean(const std::vector<float>& photons)
+{
+    return Pkmean(EncodePhotons(photons));
+}
+
+// Writes encodedPhotons to the PKMEAN tag via internal::CreateOrEdit. Returns *this.
+BamRecord& BamRecord::Pkmean(const std::vector<uint16_t>& encodedPhotons)
+{
+    internal::CreateOrEdit(BamRecordTag::PKMEAN, encodedPhotons, &impl_);
+
+    return *this;
+}
+
+// Returns the PKMID tag data; all options are forwarded unchanged to FetchPhotons.
+std::vector<float> BamRecord::Pkmid(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                    PulseBehavior pulseBehavior) const
+{
+    return FetchPhotons(BamRecordTag::PKMID,
+                        orientation,
+                        aligned,
+                        exciseSoftClips,
+                        pulseBehavior);
+}
+
+// Passes photons through EncodePhotons and stores the result via the encoded-photon
+// Pkmid overload. Returns *this.
+BamRecord& BamRecord::Pkmid(const std::vector<float>& photons)
+{
+    return Pkmid(EncodePhotons(photons));
+}
+
+// Writes encodedPhotons to the PKMID tag via internal::CreateOrEdit. Returns *this.
+BamRecord& BamRecord::Pkmid(const std::vector<uint16_t>& encodedPhotons)
+{
+    internal::CreateOrEdit(BamRecordTag::PKMID, encodedPhotons, &impl_);
+
+    return *this;
+}
+
+// Returns the PKMEAN_2 tag data; all options are forwarded unchanged to FetchPhotons.
+std::vector<float> BamRecord::Pkmean2(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                      PulseBehavior pulseBehavior) const
+{
+    return FetchPhotons(BamRecordTag::PKMEAN_2,
+                        orientation,
+                        aligned,
+                        exciseSoftClips,
+                        pulseBehavior);
+}
+
+// Passes photons through EncodePhotons and stores the result via the encoded-photon
+// Pkmean2 overload. Returns *this.
+BamRecord& BamRecord::Pkmean2(const std::vector<float>& photons)
+{
+    return Pkmean2(EncodePhotons(photons));
+}
+
+// Writes encodedPhotons to the PKMEAN_2 tag via internal::CreateOrEdit. Returns *this.
+BamRecord& BamRecord::Pkmean2(const std::vector<uint16_t>& encodedPhotons)
+{
+    internal::CreateOrEdit(BamRecordTag::PKMEAN_2, encodedPhotons, &impl_);
+
+    return *this;
+}
+
+// Returns the PKMID_2 tag data; all options are forwarded unchanged to FetchPhotons.
+std::vector<float> BamRecord::Pkmid2(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                     PulseBehavior pulseBehavior) const
+{
+    return FetchPhotons(BamRecordTag::PKMID_2,
+                        orientation,
+                        aligned,
+                        exciseSoftClips,
+                        pulseBehavior);
+}
+
+// Passes photons through EncodePhotons and stores the result via the encoded-photon
+// Pkmid2 overload. Returns *this.
+BamRecord& BamRecord::Pkmid2(const std::vector<float>& photons)
+{
+    return Pkmid2(EncodePhotons(photons));
+}
+
+// Writes encodedPhotons to the PKMID_2 tag via internal::CreateOrEdit. Returns *this.
+BamRecord& BamRecord::Pkmid2(const std::vector<uint16_t>& encodedPhotons)
+{
+    internal::CreateOrEdit(BamRecordTag::PKMID_2, encodedPhotons, &impl_);
+
+    return *this;
+}
+
+// Alias for the IPD getter; all arguments are forwarded unchanged.
+Frames BamRecord::PreBaseFrames(Orientation orientation, bool aligned, bool exciseSoftClips) const
+{
+    return IPD(orientation,
+               aligned,
+               exciseSoftClips);
+}
+
+// Alias for the IPD setter; both arguments are forwarded unchanged.
+BamRecord& BamRecord::PreBaseFrames(const Frames& frames, const FrameEncodingType encoding)
+{
+    return IPD(frames,
+               encoding);
+}
+
+// Returns the PRE_PULSE_FRAMES tag data; all options are forwarded unchanged to FetchFrames.
+Frames BamRecord::PrePulseFrames(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior) const
+{
+    return FetchFrames(BamRecordTag::PRE_PULSE_FRAMES,
+                       orientation,
+                       aligned,
+                       exciseSoftClips,
+                       pulseBehavior);
+}
+
+// Writes frame data to the PRE_PULSE_FRAMES tag via internal::CreateOrEdit: frames.Encode()
+// for FrameEncodingType::LOSSY, frames.Data() for any other encoding. Returns *this.
+BamRecord& BamRecord::PrePulseFrames(const Frames& frames, const FrameEncodingType encoding)
+{
+    if (encoding != FrameEncodingType::LOSSY)
+        internal::CreateOrEdit(BamRecordTag::PRE_PULSE_FRAMES, frames.Data(), &impl_);
+    else
+        internal::CreateOrEdit(BamRecordTag::PRE_PULSE_FRAMES, frames.Encode(), &impl_);
+
+    return *this;
+}
+
+Frames BamRecord::PulseWidthRaw(Orientation orientation, bool /* aligned */,
+                                bool /* exciseSoftClips */) const
+{
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::PULSE_WIDTH);
+    const Tag frameTag = impl_.TagValue(tagName);
+    if (frameTag.IsNull()) return {};
+
+    Frames frames;
+
+    // lossy frame codes
+    if (frameTag.IsUInt8Array()) {
+        const auto codes = frameTag.ToUInt8Array();
+        const std::vector<uint16_t> codes16(codes.begin(), codes.end());
+        frames.Data(std::move(codes16));
+    }
+
+    // lossless frame data
+    else {
+        assert(frameTag.IsUInt16Array());
+        frames.Data(frameTag.ToUInt16Array());
+    }
+
+    // return in requested orientation
+    internal::OrientTagDataAsRequested(&frames,
+                                       Orientation::NATIVE,  // current
+                                       orientation,          // requested
+                                       impl_.IsReverseStrand());
+    return frames;
+}
+
+// Returns the PULSE_MERGE_QV tag data; all options are forwarded unchanged to
+// FetchQualities.
+QualityValues BamRecord::PulseMergeQV(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                      PulseBehavior pulseBehavior) const
+{
+    return FetchQualities(BamRecordTag::PULSE_MERGE_QV,
+                          orientation,
+                          aligned,
+                          exciseSoftClips,
+                          pulseBehavior);
+}
+
+// Writes mergeQVs.Fastq() to the PULSE_MERGE_QV tag via internal::CreateOrEdit.
+// Returns *this.
+BamRecord& BamRecord::PulseMergeQV(const QualityValues& mergeQVs)
+{
+    internal::CreateOrEdit(BamRecordTag::PULSE_MERGE_QV,
+                           mergeQVs.Fastq(),
+                           &impl_);
+    return *this;
+}
+
+// Returns the PULSE_CALL tag data; all options are forwarded unchanged to FetchBases.
+std::string BamRecord::PulseCall(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior) const
+{
+    return FetchBases(BamRecordTag::PULSE_CALL,
+                      orientation,
+                      aligned,
+                      exciseSoftClips,
+                      pulseBehavior);
+}
+
+// Writes tags to the PULSE_CALL tag via internal::CreateOrEdit. Returns *this.
+BamRecord& BamRecord::PulseCall(const std::string& tags)
+{
+    internal::CreateOrEdit(BamRecordTag::PULSE_CALL, tags, &impl_);
+
+    return *this;
+}
+
+// Returns the PULSE_CALL_WIDTH tag data; all options are forwarded unchanged to FetchFrames.
+Frames BamRecord::PulseCallWidth(Orientation orientation, bool aligned, bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior) const
+{
+    return FetchFrames(BamRecordTag::PULSE_CALL_WIDTH,
+                       orientation,
+                       aligned,
+                       exciseSoftClips,
+                       pulseBehavior);
+}
+
+// Writes frame data to the PULSE_CALL_WIDTH tag via internal::CreateOrEdit: frames.Encode()
+// for FrameEncodingType::LOSSY, frames.Data() for any other encoding. Returns *this.
+BamRecord& BamRecord::PulseCallWidth(const Frames& frames, const FrameEncodingType encoding)
+{
+    if (encoding != FrameEncodingType::LOSSY)
+        internal::CreateOrEdit(BamRecordTag::PULSE_CALL_WIDTH, frames.Data(), &impl_);
+    else
+        internal::CreateOrEdit(BamRecordTag::PULSE_CALL_WIDTH, frames.Encode(), &impl_);
+
+    return *this;
+}
+
+std::vector<PacBio::BAM::PulseExclusionReason> BamRecord::PulseExclusionReason(
+    Orientation orientation, bool aligned, bool exciseSoftClips, PulseBehavior pulseBehavior) const
+{
+    std::vector<PacBio::BAM::PulseExclusionReason> reasons;
+
+    const auto reasonNums = FetchUInt8s(BamRecordTag::PULSE_EXCLUSION, orientation, aligned,
+                                        exciseSoftClips, pulseBehavior);
+
+    std::transform(
+        reasonNums.cbegin(), reasonNums.cend(), std::back_inserter(reasons),
+        [](const uint8_t num) { return static_cast<PacBio::BAM::PulseExclusionReason>(num); });
+
+    return reasons;
+}
+
+BamRecord& BamRecord::PulseExclusionReason(
+    const std::vector<PacBio::BAM::PulseExclusionReason>& reasons)
+{
+    std::vector<uint8_t> reasonNums;
+    std::transform(reasons.cbegin(), reasons.cend(), std::back_inserter(reasonNums),
+                   [](const PacBio::BAM::PulseExclusionReason& reason) {
+                       return static_cast<uint8_t>(reason);
+                   });
+
+    internal::CreateOrEdit(BamRecordTag::PULSE_EXCLUSION, reasonNums, &impl_);
+    return *this;
+}
+
+// Returns the PULSE_WIDTH tag data via FetchFrames, always passing PulseBehavior::ALL.
+Frames BamRecord::PulseWidth(Orientation orientation, bool aligned, bool exciseSoftClips) const
+{
+    return FetchFrames(BamRecordTag::PULSE_WIDTH,
+                       orientation,
+                       aligned,
+                       exciseSoftClips,
+                       PulseBehavior::ALL);
+}
+
+// Writes frame data to the PULSE_WIDTH tag via internal::CreateOrEdit: frames.Encode() for
+// FrameEncodingType::LOSSY, frames.Data() for any other encoding. Returns *this.
+BamRecord& BamRecord::PulseWidth(const Frames& frames, const FrameEncodingType encoding)
+{
+    if (encoding != FrameEncodingType::LOSSY)
+        internal::CreateOrEdit(BamRecordTag::PULSE_WIDTH, frames.Data(), &impl_);
+    else
+        internal::CreateOrEdit(BamRecordTag::PULSE_WIDTH, frames.Encode(), &impl_);
+
+    return *this;
+}
+
+// Returns the QUAL data; all options are forwarded unchanged to FetchQualities.
+QualityValues BamRecord::Qualities(Orientation orientation, bool aligned,
+                                   bool exciseSoftClips) const
+{
+    return FetchQualities(BamRecordTag::QUAL,
+                          orientation,
+                          aligned,
+                          exciseSoftClips);
+}
+
+Position BamRecord::QueryEnd() const
+{
+    // try 'qe' tag
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::QUERY_END);
+    const Tag qe = impl_.TagValue(tagName);
+    if (!qe.IsNull()) return qe.ToInt32();
+
+    // tag missing, need to check movie name (fallback for non-PB BAMs, but ignore for CCS reads)
+    RecordType type;
+    try {
+        type = Type();
+    } catch (std::exception&) {
+        return 0;
+    }
+    if (type == RecordType::CCS) throw std::runtime_error{"no query end for CCS read type"};
+    if (type == RecordType::TRANSCRIPT)
+        throw std::runtime_error{"no query end for transcript read type"};
+
+    // PacBio BAM, non-CCS/transcript
+    try {
+        return internal::QueryEndFromName(FullName());
+    } catch (std::exception&) {
+        // return fallback position
+        return 0;
+    }
+}
+
+// Writes pos (cast to int32_t) to the QUERY_END tag via internal::CreateOrEdit, then
+// regenerates the record name with UpdateName(). Returns *this.
+BamRecord& BamRecord::QueryEnd(const Position pos)
+{
+    internal::CreateOrEdit(BamRecordTag::QUERY_END,
+                           static_cast<int32_t>(pos),
+                           &impl_);
+    UpdateName();
+
+    return *this;
+}
+
+Position BamRecord::QueryStart() const
+{
+    // try 'qs' tag
+    const auto tagName = internal::BamRecordTags::LabelFor(BamRecordTag::QUERY_START);
+    const Tag qs = impl_.TagValue(tagName);
+    if (!qs.IsNull()) return qs.ToInt32();
+
+    // tag missing, need to check movie name (fallback for non-PB BAMs, but ignore for CCS reads)
+    RecordType type;
+    try {
+        type = Type();
+    } catch (std::exception&) {
+        return 0;
+    }
+    if (type == RecordType::CCS) throw std::runtime_error{"no query start for CCS read type"};
+    if (type == RecordType::TRANSCRIPT)
+        throw std::runtime_error{"no query start for transcript read type"};
+
+    // PacBio BAM, non-CCS/transcript
+    try {
+        return internal::QueryStartFromName(FullName());
+    } catch (std::exception&) {
+        // return fallback position
+        return 0;
+    }
+}
+
+// Writes pos (cast to int32_t) to the QUERY_START tag via internal::CreateOrEdit, then
+// regenerates the record name with UpdateName(). Returns *this.
+BamRecord& BamRecord::QueryStart(const Position pos)
+{
+    internal::CreateOrEdit(BamRecordTag::QUERY_START,
+                           static_cast<int32_t>(pos),
+                           &impl_);
+    UpdateName();
+
+    return *this;
+}
+
+// Reads the READ_ACCURACY tag as a float and returns it as an Accuracy. No null check is
+// made here.
+Accuracy BamRecord::ReadAccuracy() const
+{
+    const auto label = internal::BamRecordTags::LabelFor(BamRecordTag::READ_ACCURACY);
+    const Tag accuracyTag = impl_.TagValue(label);
+
+    return {accuracyTag.ToFloat()};
+}
+
+// Writes accuracy (cast to float) to the READ_ACCURACY tag via internal::CreateOrEdit.
+// Returns *this.
+BamRecord& BamRecord::ReadAccuracy(const Accuracy& accuracy)
+{
+    internal::CreateOrEdit(BamRecordTag::READ_ACCURACY,
+                           static_cast<float>(accuracy),
+                           &impl_);
+    return *this;
+}
+
+// Looks up this record's read group in header_ using ReadGroupId().
+ReadGroupInfo BamRecord::ReadGroup() const
+{
+    return header_.ReadGroup(ReadGroupId());
+}
+
+// Writes rg.Id() to the READ_GROUP tag via internal::CreateOrEdit, then regenerates the
+// record name with UpdateName(). Returns *this.
+BamRecord& BamRecord::ReadGroup(const ReadGroupInfo& rg)
+{
+    internal::CreateOrEdit(BamRecordTag::READ_GROUP,
+                           rg.Id(),
+                           &impl_);
+    UpdateName();
+
+    return *this;
+}
+
+// Returns the READ_GROUP tag value as a string, or an empty string when the tag value is
+// null.
+std::string BamRecord::ReadGroupId() const
+{
+    const auto label = internal::BamRecordTags::LabelFor(BamRecordTag::READ_GROUP);
+    const Tag idTag = impl_.TagValue(label);
+
+    if (!idTag.IsNull()) return idTag.ToString();
+    return {};
+}
+
+// Writes id to the READ_GROUP tag via internal::CreateOrEdit, then regenerates the record
+// name with UpdateName(). Returns *this.
+BamRecord& BamRecord::ReadGroupId(const std::string& id)
+{
+    internal::CreateOrEdit(BamRecordTag::READ_GROUP, id, &impl_);
+
+    UpdateName();
+
+    return *this;
+}
+
+// Converts ReadGroupId() to an integer with ReadGroupInfo::IdToInt.
+int32_t BamRecord::ReadGroupNumericId() const
+{
+    return ReadGroupInfo::IdToInt(ReadGroupId());
+}
+
+// Returns bam_endpos() of the raw record. Yields PacBio::BAM::UnmappedPosition when the
+// record is not mapped or the raw data handle is null.
+Position BamRecord::ReferenceEnd() const
+{
+    if (impl_.IsMapped()) {
+        const auto rawRecord = internal::BamRecordMemory::GetRawData(impl_);
+        if (rawRecord) return bam_endpos(rawRecord.get());
+    }
+    return PacBio::BAM::UnmappedPosition;
+}
+
+// Forwards impl_.ReferenceId().
+int32_t BamRecord::ReferenceId() const
+{
+    return impl_.ReferenceId();
+}
+
+// Returns the header's sequence name for ReferenceId(). Throws std::runtime_error when the
+// record is not mapped.
+std::string BamRecord::ReferenceName() const
+{
+    if (!IsMapped())
+        throw std::runtime_error{"unmapped record has no associated reference name"};
+
+    return Header().SequenceName(ReferenceId());
+}
+
+// Forwards impl_.Position().
+Position BamRecord::ReferenceStart() const
+{
+    return impl_.Position();
+}
+
+// Const overload: sets both cached aligned positions back to PacBio::BAM::UnmappedPosition.
+void BamRecord::ResetCachedPositions() const
+{
+    alignedStart_ = PacBio::BAM::UnmappedPosition;
+    alignedEnd_ = PacBio::BAM::UnmappedPosition;
+}
+
+// Non-const overload: sets both cached aligned positions back to
+// PacBio::BAM::UnmappedPosition.
+void BamRecord::ResetCachedPositions()
+{
+    alignedStart_ = PacBio::BAM::UnmappedPosition;
+    alignedEnd_ = PacBio::BAM::UnmappedPosition;
+}
+
+// Reads the SCRAP_REGION_TYPE tag as a uint8_t and maps it through
+// VirtualRegionTypeMap::ParseChar. No null check is made here.
+VirtualRegionType BamRecord::ScrapRegionType() const
+{
+    const auto label = internal::BamRecordTags::LabelFor(BamRecordTag::SCRAP_REGION_TYPE);
+    const Tag regionTag = impl_.TagValue(label);
+
+    return VirtualRegionTypeMap::ParseChar[regionTag.ToUInt8()];
+}
+
+// Writes type (cast to uint8_t) to the SCRAP_REGION_TYPE tag via internal::CreateOrEdit.
+// Returns *this.
+BamRecord& BamRecord::ScrapRegionType(const VirtualRegionType type)
+{
+    internal::CreateOrEdit(BamRecordTag::SCRAP_REGION_TYPE,
+                           static_cast<uint8_t>(type),
+                           &impl_);
+    return *this;
+}
+
+// Writes the char type as-is to the SCRAP_REGION_TYPE tag via internal::CreateOrEdit.
+// Returns *this.
+BamRecord& BamRecord::ScrapRegionType(const char type)
+{
+    internal::CreateOrEdit(BamRecordTag::SCRAP_REGION_TYPE, type, &impl_);
+
+    return *this;
+}
+
+// Reads the SCRAP_ZMW_TYPE tag as a uint8_t and maps it through ZmwTypeMap::ParseChar.
+// No null check is made here.
+ZmwType BamRecord::ScrapZmwType() const
+{
+    const auto label = internal::BamRecordTags::LabelFor(BamRecordTag::SCRAP_ZMW_TYPE);
+    const Tag zmwTag = impl_.TagValue(label);
+
+    return ZmwTypeMap::ParseChar[zmwTag.ToUInt8()];
+}
+
+// Writes type (cast to uint8_t) to the SCRAP_ZMW_TYPE tag via internal::CreateOrEdit.
+// Returns *this.
+BamRecord& BamRecord::ScrapZmwType(const ZmwType type)
+{
+    internal::CreateOrEdit(BamRecordTag::SCRAP_ZMW_TYPE,
+                           static_cast<uint8_t>(type),
+                           &impl_);
+    return *this;
+}
+
+// Writes the char type as-is to the SCRAP_ZMW_TYPE tag via internal::CreateOrEdit.
+// Returns *this.
+BamRecord& BamRecord::ScrapZmwType(const char type)
+{
+    internal::CreateOrEdit(BamRecordTag::SCRAP_ZMW_TYPE, type, &impl_);
+
+    return *this;
+}
+
+// Returns the SEQ data; all options are forwarded unchanged to FetchBases.
+std::string BamRecord::Sequence(const Orientation orientation, bool aligned,
+                                bool exciseSoftClips) const
+{
+    return FetchBases(BamRecordTag::SEQ,
+                      orientation,
+                      aligned,
+                      exciseSoftClips);
+}
+
+// Reads the SNR tag and returns it as a float array. No null check is made here.
+std::vector<float> BamRecord::SignalToNoise() const
+{
+    const auto label = internal::BamRecordTags::LabelFor(BamRecordTag::SNR);
+    const Tag snrTag = impl_.TagValue(label);
+
+    return snrTag.ToFloatArray();
+}
+
+// Writes snr to the SNR tag via internal::CreateOrEdit. Returns *this.
+BamRecord& BamRecord::SignalToNoise(const std::vector<float>& snr)
+{
+    internal::CreateOrEdit(BamRecordTag::SNR, snr, &impl_);
+
+    return *this;
+}
+
+// Returns the START_FRAME tag data; all options are forwarded unchanged to FetchUInt32s.
+std::vector<uint32_t> BamRecord::StartFrame(Orientation orientation, bool aligned,
+                                            bool exciseSoftClips, PulseBehavior pulseBehavior) const
+{
+    return FetchUInt32s(BamRecordTag::START_FRAME,
+                        orientation,
+                        aligned,
+                        exciseSoftClips,
+                        pulseBehavior);
+}
+
+// Writes startFrame to the START_FRAME tag via internal::CreateOrEdit. Returns *this.
+BamRecord& BamRecord::StartFrame(const std::vector<uint32_t>& startFrame)
+{
+    internal::CreateOrEdit(BamRecordTag::START_FRAME, startFrame, &impl_);
+
+    return *this;
+}
+
+// Returns the SUBSTITUTION_QV tag data; all options are forwarded unchanged to
+// FetchQualities.
+QualityValues BamRecord::SubstitutionQV(Orientation orientation, bool aligned,
+                                        bool exciseSoftClips) const
+{
+    return FetchQualities(BamRecordTag::SUBSTITUTION_QV,
+                          orientation,
+                          aligned,
+                          exciseSoftClips);
+}
+
+// Writes substitutionQVs.Fastq() to the SUBSTITUTION_QV tag via internal::CreateOrEdit.
+// Returns *this.
+BamRecord& BamRecord::SubstitutionQV(const QualityValues& substitutionQVs)
+{
+    internal::CreateOrEdit(BamRecordTag::SUBSTITUTION_QV,
+                           substitutionQVs.Fastq(),
+                           &impl_);
+    return *this;
+}
+
+// Returns the SUBSTITUTION_TAG tag data; all options are forwarded unchanged to FetchBases.
+std::string BamRecord::SubstitutionTag(Orientation orientation, bool aligned,
+                                       bool exciseSoftClips) const
+{
+    return FetchBases(BamRecordTag::SUBSTITUTION_TAG,
+                      orientation,
+                      aligned,
+                      exciseSoftClips);
+}
+
+// Writes tags to the SUBSTITUTION_TAG tag via internal::CreateOrEdit. Returns *this.
+BamRecord& BamRecord::SubstitutionTag(const std::string& tags)
+{
+    internal::CreateOrEdit(BamRecordTag::SUBSTITUTION_TAG, tags, &impl_);
+
+    return *this;
+}
+
+RecordType BamRecord::Type() const
+{
+    try {
+        const auto typeName = ReadGroup().ReadType();
+        return internal::NameToType(typeName);
+    } catch (std::exception&) {
+
+        // read group not found, peek at name to see if we're possibly
+        // CCS or TRANSCRIPT
+        //
+        const auto name = FullName();
+        if (name.find("transcript") == 0)
+            return RecordType::TRANSCRIPT;
+        else if (name.find("/ccs") != std::string::npos)
+            return RecordType::CCS;
+        else
+            return RecordType::UNKNOWN;
+    }
+}
+
+void BamRecord::UpdateName()
+{
+    std::string newName;
+    newName.reserve(100);
+
+    const auto holeNumber = (HasHoleNumber() ? std::to_string(HoleNumber()) : "?");
+    if (Type() == RecordType::TRANSCRIPT) {
+        newName = "transcript/" + holeNumber;
+    } else {
+        newName += MovieName();
+        newName += "/";
+        newName += holeNumber;
+        newName += "/";
+
+        if (Type() == RecordType::CCS)
+            newName += "ccs";
+
+        else {
+            if (HasQueryStart())
+                newName += std::to_string(QueryStart());
+            else
+                newName += "?";
+
+            newName += '_';
+
+            if (HasQueryEnd())
+                newName += std::to_string(QueryEnd());
+            else
+                newName += "?";
+        }
+    }
+    impl_.Name(newName);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamRecordBuilder.cpp b/src/BamRecordBuilder.cpp

new file mode 100644 (file)

index 0000000..5699bbc
--- /dev/null
+++ b/src/BamRecordBuilder.cpp
@@ -0,0 +1,314 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamRecordBuilder.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+
+#include <htslib/sam.h>
+
+#include "MemoryUtils.h"
+#include "pbbam/BamTagCodec.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Creates a builder in a clean state (see Reset()) with storage reserved up
+// front for the name, sequence, qualities and CIGAR.
+BamRecordBuilder::BamRecordBuilder()
+{
+    // start from a clean slate
+    Reset();
+
+    // initialize with some space for data
+    constexpr size_t smallCapacity = 256;
+    constexpr size_t largeCapacity = 2096;
+    name_.reserve(smallCapacity);
+    sequence_.reserve(largeCapacity);
+    qualities_.reserve(largeCapacity);
+    cigar_.reserve(smallCapacity);
+}
+
+// Creates a builder that uses \p header, in a clean state (see Reset()) with
+// storage reserved up front for the name, sequence, qualities and CIGAR.
+BamRecordBuilder::BamRecordBuilder(BamHeader header) : header_{std::move(header)}
+{
+    // start from a clean slate
+    Reset();
+
+    // initialize with some space for data
+    constexpr size_t smallCapacity = 256;
+    constexpr size_t largeCapacity = 2096;
+    name_.reserve(smallCapacity);
+    sequence_.reserve(largeCapacity);
+    qualities_.reserve(largeCapacity);
+    cigar_.reserve(smallCapacity);
+}
+
+// Creates a builder whose header and record data are taken from \p prototype
+// (see Reset(BamRecord)).
+BamRecordBuilder::BamRecordBuilder(const BamRecord& prototype) : header_{prototype.Header()}
+{
+    this->Reset(prototype);
+}
+
+// Creates a new record using the builder's header, fills it from the
+// builder's current state via BuildInPlace(), and returns it.
+BamRecord BamRecordBuilder::Build() const
+{
+    BamRecord record{header_};
+    BuildInPlace(record);
+    return record;
+}
+
+bool BamRecordBuilder::BuildInPlace(BamRecord& record) const
+{
+    // initialize with basic 'core data'
+    auto recordRawData = internal::BamRecordMemory::GetRawData(record);
+    if (!recordRawData || !recordRawData->data)
+        throw std::runtime_error{"BamRecord memory in invalid state"};
+    recordRawData->core = core_;
+
+    // setup variable length data
+    const auto encodedTags = BamTagCodec::Encode(tags_);
+
+    const size_t nameLength = name_.size() + 1;
+    const size_t numCigarOps = cigar_.size();
+    const size_t cigarLength = numCigarOps * sizeof(uint32_t);
+    const size_t seqLength = sequence_.size();
+    const size_t qualLength = seqLength;
+    const size_t tagLength = encodedTags.size();
+    const size_t dataLength = nameLength + cigarLength + seqLength + qualLength + tagLength;
+
+    // realloc if necessary
+    uint8_t* varLengthDataBlock = recordRawData->data;
+    if (!varLengthDataBlock) throw std::runtime_error{"BamRecord memory in invalid state"};
+
+    size_t allocatedDataLength = recordRawData->m_data;
+    if (allocatedDataLength < dataLength) {
+        allocatedDataLength = dataLength;
+        kroundup32(allocatedDataLength);
+        varLengthDataBlock =
+            static_cast<uint8_t*>(realloc(varLengthDataBlock, allocatedDataLength));
+    }
+    recordRawData->data = varLengthDataBlock;
+    recordRawData->l_data = dataLength;
+    recordRawData->m_data = allocatedDataLength;
+
+    size_t index = 0;
+
+    // name
+    memcpy(&varLengthDataBlock[index], name_.c_str(), nameLength);
+    index += nameLength;
+
+    // cigar
+    if (cigarLength > 0) {
+        std::vector<uint32_t> encodedCigar(numCigarOps);
+        for (size_t i = 0; i < numCigarOps; ++i) {
+            const auto& op = cigar_.at(i);
+            encodedCigar[i] = op.Length() << BAM_CIGAR_SHIFT;
+            const auto type = static_cast<uint8_t>(op.Type());
+            if (type >= 8)
+                throw std::runtime_error{"invalid CIGAR op type: " + std::to_string(type)};
+            encodedCigar[i] |= type;
+        }
+        memcpy(&varLengthDataBlock[index], &encodedCigar[0], cigarLength);
+        index += cigarLength;
+
+        // update bin after we've calculated cigar info
+        const int32_t endPosition = bam_cigar2rlen(recordRawData->core.n_cigar, &encodedCigar[0]);
+        recordRawData->core.bin = hts_reg2bin(core_.pos, endPosition, 14, 5);
+    }
+
+    // seq & qual
+    if (seqLength > 0) {
+
+        uint8_t* s = &varLengthDataBlock[index];
+        for (size_t i = 0; i < seqLength; ++i)
+            s[i >> 1] |= (seq_nt16_table[static_cast<int>(sequence_.at(i))] << ((~i & 1) << 2));
+        index += seqLength;
+
+        uint8_t* q = &varLengthDataBlock[index];
+        if (!qualities_.empty())
+            memset(q, 0xFF, seqLength);
+        else {
+            for (size_t i = 0; i < seqLength; ++i)
+                q[i] = qualities_.at(i) - 33;
+        }
+        index += seqLength;
+    }
+
+    // tags
+    if (tagLength > 0) {
+        if (encodedTags.empty()) throw std::runtime_error{"expected tags but none are encoded"};
+        memcpy(&varLengthDataBlock[index], &encodedTags[0], tagLength);
+        index += tagLength;
+    }
+
+    // sanity check
+    if (index != dataLength) {
+        throw std::runtime_error{"BAM encoding error: expected to write " +
+                                 std::to_string(dataLength) + " bytes but wrote " +
+                                 std::to_string(index) + " bytes instead"};
+    }
+    return true;
+}
+
+// Takes ownership of \p cigar and records its operation count in the core
+// data. Returns this builder.
+BamRecordBuilder& BamRecordBuilder::Cigar(PacBio::BAM::Cigar cigar)
+{
+    cigar_ = std::move(cigar);
+    core_.n_cigar = cigar_.size();
+    return *this;
+}
+
+// Takes ownership of \p name and records its length (including the
+// NULL-terminator) in the core data. Returns this builder.
+BamRecordBuilder& BamRecordBuilder::Name(std::string name)
+{
+    name_ = std::move(name);
+    core_.l_qname = name_.size() + 1;  // (NULL-term)
+    return *this;
+}
+
+// Returns the builder to an empty state: all variable-length fields are
+// cleared and the core data is zeroed, except for the name length which
+// accounts for the NULL-terminator. The header is left untouched.
+void BamRecordBuilder::Reset()
+{
+    // reset variable-length data
+    tags_.clear();
+    cigar_.clear();
+    qualities_.clear();
+    sequence_.clear();
+    name_.clear();
+
+    // zeroize fixed-length data
+    memset(&core_, 0, sizeof(core_));
+    core_.l_qname = 1;  // always has a NULL-term
+}
+
+void BamRecordBuilder::Reset(BamRecord prototype)
+{
+    // ensure clean slate
+    Reset();
+    header_ = prototype.Header();
+
+    // reset variable-length data
+    const BamRecordImpl& impl = internal::BamRecordMemory::GetImpl(prototype);
+    name_ = impl.Name();
+    sequence_ = impl.Sequence();
+    qualities_ = impl.Qualities().Fastq();
+    cigar_ = impl.CigarData();
+    tags_ = impl.Tags();
+
+    // reset core data
+    const auto rawData =
+        internal::BamRecordMemory::GetRawData(prototype);  //  prototype.impl_.RawData().get();
+    if (!rawData) throw std::runtime_error{"BamRecord memory in invalid state"};
+    core_ = std::move(rawData->core);
+}
+
+// Takes ownership of \p sequence and records its length in the core data.
+// Returns this builder.
+BamRecordBuilder& BamRecordBuilder::Sequence(std::string sequence)
+{
+    sequence_ = std::move(sequence);
+    core_.l_qseq = sequence_.size();
+    return *this;
+}
+
+// Sets (ok == true) or clears (ok == false) the DUPLICATE flag bit.
+BamRecordBuilder& BamRecordBuilder::SetDuplicate(bool ok)
+{
+    if (!ok) {
+        core_.flag &= ~BamRecordImpl::DUPLICATE;
+        return *this;
+    }
+    core_.flag |= BamRecordImpl::DUPLICATE;
+    return *this;
+}
+
+// Sets (ok == true) or clears (ok == false) the FAILED_QC flag bit.
+BamRecordBuilder& BamRecordBuilder::SetFailedQC(bool ok)
+{
+    if (!ok) {
+        core_.flag &= ~BamRecordImpl::FAILED_QC;
+        return *this;
+    }
+    core_.flag |= BamRecordImpl::FAILED_QC;
+    return *this;
+}
+
+// Sets (ok == true) or clears (ok == false) the MATE_1 flag bit.
+BamRecordBuilder& BamRecordBuilder::SetFirstMate(bool ok)
+{
+    if (!ok) {
+        core_.flag &= ~BamRecordImpl::MATE_1;
+        return *this;
+    }
+    core_.flag |= BamRecordImpl::MATE_1;
+    return *this;
+}
+
+// Marks the record as mapped (ok == true clears the UNMAPPED flag bit) or
+// unmapped (ok == false sets it).
+BamRecordBuilder& BamRecordBuilder::SetMapped(bool ok)
+{
+    if (!ok) {
+        core_.flag |= BamRecordImpl::UNMAPPED;
+        return *this;
+    }
+    core_.flag &= ~BamRecordImpl::UNMAPPED;
+    return *this;
+}
+
+// Marks the mate as mapped (ok == true clears the MATE_UNMAPPED flag bit) or
+// unmapped (ok == false sets it).
+BamRecordBuilder& BamRecordBuilder::SetMateMapped(bool ok)
+{
+    if (!ok) {
+        core_.flag |= BamRecordImpl::MATE_UNMAPPED;
+        return *this;
+    }
+    core_.flag &= ~BamRecordImpl::MATE_UNMAPPED;
+    return *this;
+}
+
+// Sets (ok == true) or clears (ok == false) the MATE_REVERSE_STRAND flag bit.
+BamRecordBuilder& BamRecordBuilder::SetMateReverseStrand(bool ok)
+{
+    if (!ok) {
+        core_.flag &= ~BamRecordImpl::MATE_REVERSE_STRAND;
+        return *this;
+    }
+    core_.flag |= BamRecordImpl::MATE_REVERSE_STRAND;
+    return *this;
+}
+
+// Sets (ok == true) or clears (ok == false) the PAIRED flag bit.
+BamRecordBuilder& BamRecordBuilder::SetPaired(bool ok)
+{
+    if (!ok) {
+        core_.flag &= ~BamRecordImpl::PAIRED;
+        return *this;
+    }
+    core_.flag |= BamRecordImpl::PAIRED;
+    return *this;
+}
+
+// Marks the record as a primary alignment (ok == true clears the SECONDARY
+// flag bit) or as a secondary one (ok == false sets it).
+BamRecordBuilder& BamRecordBuilder::SetPrimaryAlignment(bool ok)
+{
+    if (!ok) {
+        core_.flag |= BamRecordImpl::SECONDARY;
+        return *this;
+    }
+    core_.flag &= ~BamRecordImpl::SECONDARY;
+    return *this;
+}
+
+// Sets (ok == true) or clears (ok == false) the PROPER_PAIR flag bit.
+BamRecordBuilder& BamRecordBuilder::SetProperPair(bool ok)
+{
+    if (!ok) {
+        core_.flag &= ~BamRecordImpl::PROPER_PAIR;
+        return *this;
+    }
+    core_.flag |= BamRecordImpl::PROPER_PAIR;
+    return *this;
+}
+
+// Sets (ok == true) or clears (ok == false) the REVERSE_STRAND flag bit.
+BamRecordBuilder& BamRecordBuilder::SetReverseStrand(bool ok)
+{
+    if (!ok) {
+        core_.flag &= ~BamRecordImpl::REVERSE_STRAND;
+        return *this;
+    }
+    core_.flag |= BamRecordImpl::REVERSE_STRAND;
+    return *this;
+}
+
+// Sets (ok == true) or clears (ok == false) the MATE_2 flag bit.
+BamRecordBuilder& BamRecordBuilder::SetSecondMate(bool ok)
+{
+    if (!ok) {
+        core_.flag &= ~BamRecordImpl::MATE_2;
+        return *this;
+    }
+    core_.flag |= BamRecordImpl::MATE_2;
+    return *this;
+}
+
+// Sets (ok == true) or clears (ok == false) the SUPPLEMENTARY flag bit.
+BamRecordBuilder& BamRecordBuilder::SetSupplementaryAlignment(bool ok)
+{
+    if (!ok) {
+        core_.flag &= ~BamRecordImpl::SUPPLEMENTARY;
+        return *this;
+    }
+    core_.flag |= BamRecordImpl::SUPPLEMENTARY;
+    return *this;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamRecordImpl.cpp b/src/BamRecordImpl.cpp

new file mode 100644 (file)

index 0000000..0681eb1
--- /dev/null
+++ b/src/BamRecordImpl.cpp
@@ -0,0 +1,644 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamRecordImpl.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <tuple>
+#include <utility>
+
+#include <htslib/hts_endian.h>
+
+#include "pbbam/BamTagCodec.h"
+
+#include "BamRecordTags.h"
+#include "MemoryUtils.h"
+#include "StringUtils.h"
+
+namespace PacBio {
+namespace BAM {
+
+namespace {
+
+static bool DoesHtslibSupportLongCigar()
+{
+    const std::string htsVersion = hts_version();
+
+    // remove any "-<blah>" for non-release versions
+    const auto versionBase = PacBio::BAM::Split(htsVersion, '-');
+    if (versionBase.empty())
+        throw std::runtime_error{"invalid htslib version format: " + htsVersion};
+
+    // grab major/minor version numbers
+    const auto versionParts = PacBio::BAM::Split(versionBase[0], '.');
+    if (versionParts.size() < 2)
+        throw std::runtime_error{"invalid htslib version format: " + htsVersion};
+
+    // check against v1.7
+    const int versionMajor = std::stoi(versionParts[0]);
+    const int versionMinor = std::stoi(versionParts[1]);
+    static constexpr const int v17_major = 1;
+    static constexpr const int v17_minor = 7;
+    return std::tie(versionMajor, versionMinor) >= std::tie(v17_major, v17_minor);
+}
+
+// Evaluated once during static initialization: true if the htslib in use
+// reports version 1.7 or newer (see DoesHtslibSupportLongCigar()). When
+// false, the CigarData accessors fall back to handling long CIGARs through
+// the "CG" tag themselves.
+static const bool has_native_long_cigar_support = DoesHtslibSupportLongCigar();
+
+// Decodes \p len packed CIGAR values starting at \p src (each holding an
+// operation type and a length) into a Cigar.
+Cigar FetchRawCigar(const uint32_t* const src, const uint32_t len)
+{
+    Cigar ops;
+    ops.reserve(len);
+    for (uint32_t idx = 0; idx != len; ++idx) {
+        const uint32_t packedOp = src[idx];
+        const auto opType = static_cast<CigarOperationType>(bam_cigar_op(packedOp));
+        const uint32_t opLength = bam_cigar_oplen(packedOp);
+        ops.emplace_back(opType, opLength);
+    }
+    return ops;
+}
+
+bool HasLongCigar(const bam1_t* const b)
+{
+    auto* c = &b->core;
+
+    // if empty CIGAR or unmapped
+    if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return false;
+
+    // if existing CIGAR doesn't look like a 'fake CIGAR'
+    const auto firstCigarOp = *(bam_get_cigar(b));
+    if (bam_cigar_op(firstCigarOp) != static_cast<uint32_t>(CigarOperationType::SOFT_CLIP) ||
+        static_cast<int32_t>(bam_cigar_oplen(firstCigarOp)) != c->l_qseq) {
+        return false;
+    }
+
+    // if CG tag missing, not expected type
+    const uint8_t* const CG = bam_aux_get(b, "CG");
+    if (CG == nullptr) return false;
+    if (CG[0] != 'B' || CG[1] != 'I') return false;
+
+    // if CG tag data is empty
+    uint32_t numElements = 0;
+    memcpy(&numElements, &CG[2], sizeof(uint32_t));
+    if (numElements == 0) return false;
+
+    // we've found long CIGAR data in the CG tag
+    return true;
+}
+
+}  // namespace anonymous
+
+// Creates a record in the default state set up by InitializeData().
+BamRecordImpl::BamRecordImpl() : d_{nullptr}
+{
+    this->InitializeData();
+    assert(d_);
+}
+
+// Copy constructor: duplicates the other record's raw htslib data (via
+// bam_dup1) and copies its tag offset map.
+BamRecordImpl::BamRecordImpl(const BamRecordImpl& other)
+    : d_(bam_dup1(other.d_.get()), internal::HtslibRecordDeleter()), tagOffsets_(other.tagOffsets_)
+{
+    assert(d_);
+}
+
+// Move constructor: takes over the other record's raw data and tag offset
+// map, leaving \p other without raw data.
+BamRecordImpl::BamRecordImpl(BamRecordImpl&& other) : tagOffsets_{std::move(other.tagOffsets_)}
+{
+    d_ = std::move(other.d_);
+    assert(d_);
+}
+
+// Copy assignment: copies the other record's raw data (via bam_copy1) into
+// this record's buffer, creating that buffer first if this record currently
+// has none, and copies the tag offset map. Self-assignment is a no-op.
+BamRecordImpl& BamRecordImpl::operator=(const BamRecordImpl& other)
+{
+    if (this == &other) {
+        assert(d_);
+        return *this;
+    }
+
+    if (d_ == nullptr) InitializeData();
+    bam_copy1(d_.get(), other.d_.get());
+    tagOffsets_ = other.tagOffsets_;
+
+    assert(d_);
+    return *this;
+}
+
+// Move assignment: takes over the other record's raw data and tag offset map,
+// leaving \p other without raw data. Self-assignment is a no-op.
+BamRecordImpl& BamRecordImpl::operator=(BamRecordImpl&& other)
+{
+    if (this == &other) {
+        assert(d_);
+        return *this;
+    }
+
+    d_ = std::move(other.d_);
+    tagOffsets_ = std::move(other.tagOffsets_);
+
+    assert(d_);
+    return *this;
+}
+
+// Adds a new tag with no additional modifier. See the three-argument overload
+// for the return value.
+bool BamRecordImpl::AddTag(const std::string& tagName, const Tag& value)
+{
+    const auto noModifier = TagModifier::NONE;
+    return AddTag(tagName, value, noModifier);
+}
+
+// Adds a new tag, identified by enum, with no additional modifier. The enum
+// is translated to its two-character label first.
+bool BamRecordImpl::AddTag(const BamRecordTag tag, const Tag& value)
+{
+    const std::string label = internal::BamRecordTags::LabelFor(tag);
+    return AddTag(label, value, TagModifier::NONE);
+}
+
+// Adds a new tag to the record.
+//
+// Returns false (leaving the record untouched) if \p tagName is not exactly
+// two characters, if a tag with that name already exists, or if the value
+// could not be appended. On success the tag offset map is refreshed.
+bool BamRecordImpl::AddTag(const std::string& tagName, const Tag& value,
+                           const TagModifier additionalModifier)
+{
+    if (tagName.size() != 2) return false;
+    if (HasTag(tagName)) return false;
+
+    if (!AddTagImpl(tagName, value, additionalModifier)) return false;
+    UpdateTagMap();
+    return true;
+}
+
+// Adds a new tag, identified by enum, with the given modifier. The enum is
+// translated to its two-character label first.
+bool BamRecordImpl::AddTag(const BamRecordTag tag, const Tag& value,
+                           const TagModifier additionalModifier)
+{
+    const std::string label = internal::BamRecordTags::LabelFor(tag);
+    return AddTag(label, value, additionalModifier);
+}
+
+bool BamRecordImpl::AddTagImpl(const std::string& tagName, const Tag& value,
+                               const TagModifier additionalModifier)
+{
+    const auto rawData = BamTagCodec::ToRawData(value, additionalModifier);
+    if (rawData.empty()) return false;
+
+    bam_aux_append(d_.get(), tagName.c_str(), BamTagCodec::TagTypeCode(value, additionalModifier),
+                   rawData.size(), const_cast<uint8_t*>(rawData.data()));
+    return true;
+}
+
+Cigar BamRecordImpl::CigarData() const
+{
+    const auto* b = d_.get();
+    if (!has_native_long_cigar_support && HasLongCigar(b)) {
+        // fetch long CIGAR from tag
+        const auto cigarTag = TagValue("CG");
+        const auto cigarTagValue = cigarTag.ToUInt32Array();
+        return FetchRawCigar(cigarTagValue.data(), cigarTagValue.size());
+    } else {
+        // fetch CIGAR from the standard location
+        return FetchRawCigar(bam_get_cigar(b), b->core.n_cigar);
+    }
+}
+
+BamRecordImpl& BamRecordImpl::CigarData(const Cigar& cigar)
+{
+    // if long CIGAR, using htslib version < 1.7, set it "manually"
+    if (!has_native_long_cigar_support && cigar.size() >= 65536) {
+        // Add the 'fake' CIGAR in normal place.
+        Cigar fake;
+        fake.emplace_back(CigarOperationType::SOFT_CLIP, SequenceLength());
+        const uint32_t alignedLength =
+            static_cast<uint32_t>(bam_cigar2rlen(d_->core.n_cigar, bam_get_cigar(d_.get())));
+        fake.emplace_back(CigarOperationType::REFERENCE_SKIP, alignedLength);
+        SetCigarData(fake);
+
+        // Add raw CIGAR data to CG tag.
+        std::vector<uint32_t> cigarData(cigar.size());
+        cigarData.reserve(cigar.size());
+        for (size_t i = 0; i < cigar.size(); ++i) {
+            const CigarOperation& op = cigar.at(i);
+            cigarData[i] = bam_cigar_gen(op.Length(), static_cast<int>(op.Type()));
+        }
+        if (HasTag("CG"))
+            EditTag("CG", Tag{cigarData});
+        else
+            AddTag("CG", Tag{cigarData});
+    }
+
+    // otherwise (v1.7+ or short CIGAR), use standard APIs
+    else {
+        if (HasTag("CG")) RemoveTag("CG");
+        SetCigarData(cigar);
+    }
+
+    return *this;
+}
+
+// Parses \p cigarString into a Cigar and stores it (see CigarData(const Cigar&)).
+BamRecordImpl& BamRecordImpl::CigarData(const std::string& cigarString)
+{
+    const auto parsed = Cigar::FromStdString(cigarString);
+    return CigarData(parsed);
+}
+
+// Replaces an existing tag's value, with no additional modifier. See the
+// three-argument overload for the return value.
+bool BamRecordImpl::EditTag(const std::string& tagName, const Tag& newValue)
+{
+    const auto noModifier = TagModifier::NONE;
+    return EditTag(tagName, newValue, noModifier);
+}
+
+// Replaces an existing tag's value, identified by enum, with no additional
+// modifier. The enum is translated to its two-character label first.
+bool BamRecordImpl::EditTag(const BamRecordTag tag, const Tag& newValue)
+{
+    const std::string label = internal::BamRecordTags::LabelFor(tag);
+    return EditTag(label, newValue, TagModifier::NONE);
+}
+
+// Replaces the value of an existing tag.
+//
+// Returns false if the tag could not be removed (e.g. it does not exist or
+// the name is not two characters), or if the new value could not be added.
+// In the latter case the old value is already gone from the record.
+bool BamRecordImpl::EditTag(const std::string& tagName, const Tag& newValue,
+                            const TagModifier additionalModifier)
+{
+    // try remove old value (with delayed tag map update)
+    if (!RemoveTagImpl(tagName)) return false;
+
+    // old value removed, add new value
+    const bool added = AddTagImpl(tagName, newValue, additionalModifier);
+
+    // The raw tag data changed as soon as the old value was removed, so the
+    // offset map must be rebuilt even when the new value could not be added;
+    // otherwise lookups would keep using offsets into the old layout.
+    UpdateTagMap();
+    return added;
+}
+
+// Replaces an existing tag's value, identified by enum, with the given
+// modifier. The enum is translated to its two-character label first.
+bool BamRecordImpl::EditTag(const BamRecordTag tag, const Tag& newValue,
+                            const TagModifier additionalModifier)
+{
+    const std::string label = internal::BamRecordTags::LabelFor(tag);
+    return EditTag(label, newValue, additionalModifier);
+}
+
+// Creates a record holding a copy (via bam_copy1) of the given raw htslib data.
+BamRecordImpl BamRecordImpl::FromRawData(const std::shared_ptr<bam1_t>& rawData)
+{
+    BamRecordImpl copy;
+    bam1_t* const destination = copy.d_.get();
+    bam_copy1(destination, rawData.get());
+    return copy;
+}
+
+// Returns true if \p tagName is exactly two characters long and the tag
+// offset map holds a valid offset for it.
+bool BamRecordImpl::HasTag(const std::string& tagName) const
+{
+    // the size check must come first: TagOffset() throws on other lengths
+    return tagName.size() == 2 && TagOffset(tagName) != -1;
+}
+
+// Returns true if the tag identified by \p tag is present. The enum is
+// translated to its two-character label first.
+bool BamRecordImpl::HasTag(const BamRecordTag tag) const
+{
+    const std::string label = internal::BamRecordTags::LabelFor(tag);
+    return HasTag(label);
+}
+
+// Allocates a fresh htslib record for d_ and puts it into the default state
+// of a newly constructed BamRecordImpl: 0x800 bytes of zeroed data capacity,
+// unmapped position / reference ID for the record and its mate, the UNMAPPED
+// flag set, map quality 255, and an empty name padded to 4 bytes.
+//
+// NOTE(review): the calloc() result is not checked before it is used.
+void BamRecordImpl::InitializeData()
+{
+    d_.reset(bam_init1(), internal::HtslibRecordDeleter());
+    d_->data = static_cast<uint8_t*>(
+        calloc(0x800, sizeof(uint8_t)));  // maybe make this value tune-able later?
+    d_->m_data = 0x800;
+
+    // init unmapped
+    Position(PacBio::BAM::UnmappedPosition);
+    MatePosition(PacBio::BAM::UnmappedPosition);
+    ReferenceId(-1);
+    MateReferenceId(-1);
+    SetMapped(false);
+    MapQuality(255);
+
+    // initialized with empty qname (null term + 3 'extra nulls' for alignment)
+    d_->core.l_extranul = 3;
+    d_->core.l_qname = 4;
+    d_->l_data = 4;
+}
+
+// Grows the record's data buffer, if needed, so that it can hold l_data bytes.
+// Callers update l_data first and then call this before writing.
+//
+// Throws std::runtime_error if the buffer cannot be grown; in that case the
+// existing buffer and its recorded capacity are left as they were.
+void BamRecordImpl::MaybeReallocData()
+{
+    // about to grow data contents to l_data size, but m_data is our current max.
+    // so we may need to grow. if so, use kroundup to double to next power of 2
+    //
+    // from sam.h:
+    //   decltype(m_data) = uint32_t
+    //   decltype(l_data) = int
+    if (d_->m_data < static_cast<uint32_t>(d_->l_data)) {
+        auto newCapacity = static_cast<uint32_t>(d_->l_data);
+        kroundup32(newCapacity);
+
+        // realloc() returns nullptr on failure and leaves the old block intact,
+        // so only commit the new pointer / capacity once it has succeeded
+        auto* grownData = static_cast<uint8_t*>(realloc(d_->data, newCapacity));
+        if (grownData == nullptr)
+            throw std::runtime_error{"could not allocate BamRecord memory"};
+        d_->data = grownData;
+        d_->m_data = newCapacity;
+    }
+}
+
+// Returns a copy of the record's NULL-terminated name.
+std::string BamRecordImpl::Name() const
+{
+    const char* const rawName = bam_get_qname(d_);
+    return std::string(rawName);
+}
+
+// Replaces the record's name with \p name and returns this record.
+//
+// The name is stored NULL-terminated and followed by extra NULL bytes so that
+// its total size is a multiple of 4. The data that follows the name (cigar,
+// seq, qual, tags) is shifted to sit directly after the new name.
+BamRecordImpl& BamRecordImpl::Name(const std::string& name)
+{
+    // determine change in memory needed
+    // diffNumBytes: pos -> growing, neg -> shrinking
+    const size_t numChars = name.size() + 1;  // +1 for NULL-term
+    // NOTE: yields 4 (not 0) extra NULLs when numChars is already a multiple of 4
+    const size_t numExtraNulls = 4 - (numChars % 4);
+    const size_t totalNameSize = numChars + numExtraNulls;
+
+    const int diffNumBytes = totalNameSize - d_->core.l_qname;
+    const int oldLengthData = d_->l_data;
+    d_->l_data += diffNumBytes;
+    MaybeReallocData();
+
+    // shift trailing data (cigar, seq, qual, tags) as needed
+    // (l_qname still holds the old name size here, so this is the old start)
+    const uint32_t* oldCigarStart = bam_get_cigar(d_);
+    const size_t trailingDataLength =
+        oldLengthData - (reinterpret_cast<const unsigned char*>(oldCigarStart) -
+                         reinterpret_cast<const unsigned char*>(d_->data));
+    d_->core.l_qname = totalNameSize;
+    d_->core.l_extranul = numExtraNulls;
+    // with l_qname updated, the same macro now yields the new start
+    uint32_t* newCigarStart = bam_get_cigar(d_);
+    memmove(newCigarStart, oldCigarStart, trailingDataLength);
+
+    // fill in new name
+    memcpy(d_->data, name.c_str(), numChars);
+    memset(d_->data + numChars, '\0', numExtraNulls);
+    return *this;
+}
+
+// Returns the record's per-base quality values. The result is empty if the
+// record has no sequence or if the first stored quality byte is 0xff.
+QualityValues BamRecordImpl::Qualities() const
+{
+    QualityValues quals;
+    if (d_->core.l_qseq == 0) return quals;
+
+    const uint8_t* const rawQuals = bam_get_qual(d_);
+    if (rawQuals[0] == 0xff) return quals;
+
+    const size_t numQuals = d_->core.l_qseq;
+    quals.reserve(numQuals);
+    for (size_t pos = 0; pos != numQuals; ++pos) {
+        const QualityValue qv(rawQuals[pos]);
+        quals.push_back(qv);
+    }
+    return quals;
+}
+
+// Removes the named tag and, if something was removed, refreshes the tag
+// offset map. Returns whether a tag was removed.
+bool BamRecordImpl::RemoveTag(const std::string& tagName)
+{
+    if (!RemoveTagImpl(tagName)) return false;
+    UpdateTagMap();
+    return true;
+}
+
+// Removes the tag identified by \p tag. The enum is translated to its
+// two-character label first.
+bool BamRecordImpl::RemoveTag(const BamRecordTag tag)
+{
+    const std::string label = internal::BamRecordTags::LabelFor(tag);
+    return RemoveTag(label);
+}
+
+// Removes the named tag from the raw record without touching the tag offset
+// map (callers refresh it). Returns false if the name is not two characters,
+// the tag is not found, or bam_aux_del() reports failure.
+bool BamRecordImpl::RemoveTagImpl(const std::string& tagName)
+{
+    if (tagName.size() != 2) return false;
+
+    bam1_t* const raw = d_.get();
+    uint8_t* const tagData = bam_aux_get(raw, tagName.c_str());
+    if (tagData == nullptr) return false;
+
+    return bam_aux_del(raw, tagData) == 0;
+}
+
+std::string BamRecordImpl::Sequence() const
+{
+    std::string result(d_->core.l_qseq, '\0');
+    static const constexpr std::array<char, 16> DnaLookup{
+        {'=', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'}};
+    const uint8_t* seqData = bam_get_seq(d_);
+    for (int i = 0; i < d_->core.l_qseq; ++i)
+        result[i] = DnaLookup[bam_seqi(seqData, i)];
+    return result;
+}
+
+// Returns the sequence length stored in the record's core data (l_qseq).
+size_t BamRecordImpl::SequenceLength() const
+{
+    return static_cast<size_t>(d_->core.l_qseq);
+}
+
+// Writes \p cigar into the record's standard CIGAR location, resizing the
+// data buffer and shifting the data that follows (seq, qual, tags) so it sits
+// directly after the new CIGAR. Does not touch any "CG" tag.
+void BamRecordImpl::SetCigarData(const Cigar& cigar)
+{
+    // determine change in memory needed
+    // diffNumBytes: pos -> growing, neg -> shrinking
+    const size_t numCigarOps = cigar.size();
+    const int diffNumCigars = numCigarOps - d_->core.n_cigar;
+    const int diffNumBytes = diffNumCigars * sizeof(uint32_t);
+    const int oldLengthData = d_->l_data;
+    d_->l_data += diffNumBytes;
+    MaybeReallocData();
+
+    // shift trailing data (seq, qual, tags) as needed
+    // (n_cigar still holds the old count here, so this is the old start)
+    const uint8_t* oldSequenceStart = bam_get_seq(d_);
+    const size_t trailingDataLength = oldLengthData - (oldSequenceStart - d_->data);
+    d_->core.n_cigar = numCigarOps;
+    // with n_cigar updated, the same macro now yields the new start
+    uint8_t* newSequenceStart = bam_get_seq(d_);
+    memmove(newSequenceStart, oldSequenceStart, trailingDataLength);
+
+    // fill in new CIGAR data
+    uint32_t* cigarDataStart = bam_get_cigar(d_);
+    for (size_t i = 0; i < numCigarOps; ++i) {
+        const CigarOperation& cigarOp = cigar.at(i);
+        cigarDataStart[i] = bam_cigar_gen(cigarOp.Length(), static_cast<int>(cigarOp.Type()));
+    }
+}
+
+BamRecordImpl& BamRecordImpl::SetSequenceAndQualities(const std::string& sequence,
+                                                      const std::string& qualities)
+{
+    if (!qualities.empty() && (sequence.size() != qualities.size()))
+        throw std::runtime_error{"If QUAL provided, must be of the same length as SEQ"};
+
+    return SetSequenceAndQualitiesInternal(sequence.c_str(), sequence.size(), qualities.c_str(),
+                                           false);
+}
+
+BamRecordImpl& BamRecordImpl::SetSequenceAndQualities(const char* sequence,
+                                                      const size_t sequenceLength,
+                                                      const char* qualities)
+{
+    return SetSequenceAndQualitiesInternal(sequence, sequenceLength, qualities, false);
+}
+
+BamRecordImpl& BamRecordImpl::SetPreencodedSequenceAndQualities(const char* encodedSequence,
+                                                                const size_t rawSequenceLength,
+                                                                const char* qualities)
+{
+    return SetSequenceAndQualitiesInternal(encodedSequence, rawSequenceLength, qualities, true);
+}
+
+// Shared implementation for the sequence/quality setters.
+//
+// \param sequence        base characters, or 4-bit packed bases if
+//                        \p isPreencoded is true
+// \param sequenceLength  number of bases (unpacked)
+// \param qualities       FASTQ-encoded qualities; nullptr or "" means none,
+//                        in which case every quality byte is set to 0xff
+// \param isPreencoded    whether \p sequence is already 4-bit packed
+//
+// Resizes the data buffer and shifts the tags so they sit directly after the
+// new sequence and qualities. Returns this record.
+//
+// NOTE(review): when \p qualities is non-empty, sequenceLength bytes are read
+// from it; this function does not verify that it is that long.
+BamRecordImpl& BamRecordImpl::SetSequenceAndQualitiesInternal(const char* sequence,
+                                                              const size_t sequenceLength,
+                                                              const char* qualities,
+                                                              bool isPreencoded)
+{
+    // determine change in memory needed
+    // diffNumBytes: pos -> growing, neg -> shrinking
+    const auto encodedSequenceLength = static_cast<int>((sequenceLength + 1) / 2);
+    const int oldSeqAndQualLength =
+        ((d_->core.l_qseq + 1) / 2) + d_->core.l_qseq;                       // encoded seq + qual
+    const int newSeqAndQualLength = encodedSequenceLength + sequenceLength;  // encoded seq + qual
+    const int diffNumBytes = newSeqAndQualLength - oldSeqAndQualLength;
+    const int oldLengthData = d_->l_data;
+    d_->l_data += diffNumBytes;
+    MaybeReallocData();
+
+    // shift trailing data (tags) as needed
+    // (l_qseq still holds the old length here, so this is the old tag start)
+    const unsigned char* oldTagStart = bam_get_aux(d_);
+    const size_t trailingDataLength =
+        oldLengthData - (oldTagStart - reinterpret_cast<const unsigned char*>(d_->data));
+    d_->core.l_qseq = sequenceLength;
+    // with l_qseq updated, the same macro now yields the new tag start
+    uint8_t* newTagStart = bam_get_aux(d_);
+    memmove(newTagStart, oldTagStart, trailingDataLength);
+
+    // fill in new sequence
+    uint8_t* pEncodedSequence = bam_get_seq(d_);
+    if (isPreencoded) {
+        memcpy(pEncodedSequence, sequence, encodedSequenceLength);
+    } else {
+        // bases are OR-ed in one nibble at a time, so zero the bytes first;
+        // even indices land in the high nibble, odd indices in the low nibble
+        memset(pEncodedSequence, 0, encodedSequenceLength);
+        for (size_t i = 0; i < sequenceLength; ++i)
+            pEncodedSequence[i >> 1] |= seq_nt16_table[static_cast<int>(sequence[i])]
+                                        << ((~i & 1) << 2);
+    }
+
+    // fill in quality values
+    uint8_t* encodedQualities = bam_get_qual(d_);
+    if ((qualities == nullptr) || (strlen(qualities) == 0))
+        memset(encodedQualities, 0xff, sequenceLength);
+    else {
+        for (size_t i = 0; i < sequenceLength; ++i)
+            encodedQualities[i] = qualities[i] - 33;  // FASTQ ASCII -> int conversion
+    }
+    return *this;
+}
+
+// Returns the offset recorded in the tag offset map for \p tagName, or -1 if
+// the map has no entry for it. The map is built first if it is still empty.
+// Throws std::runtime_error if \p tagName is not exactly two characters.
+int BamRecordImpl::TagOffset(const std::string& tagName) const
+{
+    if (tagName.size() != 2) throw std::runtime_error{"invalid tag name size"};
+
+    if (tagOffsets_.empty()) UpdateTagMap();
+
+    // 16-bit lookup key: first character in the high byte, second in the low
+    const auto highByte = static_cast<uint8_t>(tagName[0]);
+    const auto lowByte = static_cast<uint8_t>(tagName[1]);
+    const uint16_t tagCode = (highByte << 8) | lowByte;
+
+    const auto entry = tagOffsets_.find(tagCode);
+    if (entry == tagOffsets_.cend()) return -1;
+    return entry->second;
+}
+
+BamRecordImpl& BamRecordImpl::Tags(const TagCollection& tags)
+{
+    // convert tags to binary
+    const std::vector<uint8_t> tagData = BamTagCodec::Encode(tags);
+    const size_t numBytes = tagData.size();
+    const uint8_t* data = tagData.data();
+
+    // determine change in memory needed
+    uint8_t* tagStart = bam_get_aux(d_);
+    const size_t oldNumBytes = d_->l_data - (tagStart - d_->data);
+    const int diffNumBytes = numBytes - oldNumBytes;
+    d_->l_data += diffNumBytes;
+    MaybeReallocData();
+    tagStart = bam_get_aux(d_);
+
+    // fill in new tag data
+    memcpy(static_cast<void*>(tagStart), data, numBytes);
+
+    // update tag info
+    UpdateTagMap();
+    return *this;
+}
+
+// Decodes and returns all tags stored in the record.
+TagCollection BamRecordImpl::Tags() const
+{
+    const uint8_t* const tagBegin = bam_get_aux(d_);
+    const size_t numBytes = d_->l_data - (tagBegin - d_->data);
+    const uint8_t* const tagEnd = tagBegin + numBytes;
+
+    const std::vector<uint8_t> rawTags(tagBegin, tagEnd);
+    return BamTagCodec::Decode(rawTags);
+}
+
+// Returns the value of the named tag. An empty Tag is returned if the name is
+// not two characters, the tag is not in the offset map, or its recorded
+// offset is not smaller than the record's data length.
+Tag BamRecordImpl::TagValue(const std::string& tagName) const
+{
+    if (tagName.size() != 2) return {};
+
+    const int offset = TagOffset(tagName);
+    if (offset == -1) return {};
+
+    bam1_t* const raw = d_.get();
+    assert(bam_get_aux(raw));
+    if (offset >= raw->l_data) return {};
+
+    // the stored offset already points past the two-character tag name
+    uint8_t* const tagData = bam_get_aux(raw) + offset;
+    return BamTagCodec::FromRawData(tagData);
+}
+
+// Returns the value of the tag identified by \p tag. The enum is translated
+// to its two-character label first.
+Tag BamRecordImpl::TagValue(const BamRecordTag tag) const
+{
+    const std::string label = internal::BamRecordTags::LabelFor(tag);
+    return TagValue(label);
+}
+
+void BamRecordImpl::UpdateTagMap() const
+{
+    // clear out offsets, leave map structure basically intact
+    for (auto& tag : tagOffsets_)
+        tag.second = -1;
+
+    const uint8_t* tagStart = bam_get_aux(d_);
+    if (tagStart == nullptr) return;
+    const ptrdiff_t numBytes = d_->l_data - (tagStart - d_->data);
+
+    // NOTE: using a 16-bit 'code' for tag name here instead of string, to avoid
+    // a lot of string constructions & comparisons. All valid tags will be 2 chars
+    // anyway, so this should be a nice lookup mechanism.
+    //
+    uint16_t tagNameCode;
+    int64_t i = 0;
+    while (i < numBytes) {
+
+        // store (tag name code -> start offset into tag data)
+        tagNameCode = static_cast<char>(tagStart[i]) << 8 | static_cast<char>(tagStart[i + 1]);
+        i += 2;
+        tagOffsets_[tagNameCode] = i;
+
+        // skip tag contents
+        const auto tagType = static_cast<char>(tagStart[i++]);
+        switch (tagType) {
+            case 'A':
+            case 'a':
+            case 'c':
+            case 'C': {
+                i += 1;
+                break;
+            }
+            case 's':
+            case 'S': {
+                i += 2;
+                break;
+            }
+            case 'i':
+            case 'I':
+            case 'f': {
+                i += 4;
+                break;
+            }
+
+            case 'Z':
+            case 'H': {
+                // null-terminated string
+                i += strlen(reinterpret_cast<const char*>(&tagStart[i])) + 1;
+                break;
+            }
+
+            case 'B': {
+                const char subTagType = tagStart[i++];
+                size_t elementSize = 0;
+                switch (subTagType) {
+                    case 'c':
+                    case 'C':
+                        elementSize = 1;
+                        break;
+                    case 's':
+                    case 'S':
+                        elementSize = 2;
+                        break;
+                    case 'i':
+                    case 'I':
+                    case 'f':
+                        elementSize = 4;
+                        break;
+
+                    // unknown subTagType
+                    default:
+                        throw std::runtime_error{"unsupported array-tag-type encountered: " +
+                                                 std::string{1, subTagType}};
+                }
+
+                uint32_t numElements = 0;
+                memcpy(&numElements, &tagStart[i], sizeof(uint32_t));
+                i += (4 + (elementSize * numElements));
+                break;
+            }
+
+            // unknown tagType
+            default:
+                throw std::runtime_error{"unsupported tag-type encountered: " +
+                                         std::string{1, tagType}};
+        }
+    }
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamRecordTags.cpp b/src/BamRecordTags.cpp

new file mode 100644 (file)

index 0000000..cffa3d1
--- /dev/null
+++ b/src/BamRecordTags.cpp
@@ -0,0 +1,68 @@
+// File Description
+/// \file BamRecordTags.cpp
+/// \brief Implements the BamRecordTags utility class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "BamRecordTags.h"
+
+#include <cassert>
+#include <unordered_map>
+
+#include "EnumClassHash.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// clang-format off
+// Lookup table consulted by BamRecordTags::IsPulse() and BamRecordTags::LabelFor().
+// Every entry maps a BamRecordTag enumerator to { label, isPulse flag }.
+const BamRecordTags::TagLookupType BamRecordTags::tagLookup = {
+    {BamRecordTag::ALT_LABEL_QV, {"pv", true}},
+    {BamRecordTag::ALT_LABEL_TAG, {"pt", true}},
+    {BamRecordTag::BARCODE_QUALITY, {"bq", false}},
+    {BamRecordTag::BARCODES, {"bc", false}},
+    {BamRecordTag::CONTEXT_FLAGS, {"cx", false}},
+    {BamRecordTag::DELETION_QV, {"dq", false}},
+    {BamRecordTag::DELETION_TAG, {"dt", false}},
+    {BamRecordTag::HOLE_NUMBER, {"zm", false}},
+    {BamRecordTag::INSERTION_QV, {"iq", false}},
+    {BamRecordTag::IPD, {"ip", false}},
+    {BamRecordTag::LABEL_QV, {"pq", true}},
+    {BamRecordTag::LONG_CIGAR, {"CG", false}},
+    {BamRecordTag::MERGE_QV, {"mq", false}},
+    {BamRecordTag::NUM_PASSES, {"np", false}},
+    {BamRecordTag::PKMEAN, {"pa", true}},
+    {BamRecordTag::PKMEAN_2, {"ps", true}},
+    {BamRecordTag::PKMID, {"pm", true}},
+    {BamRecordTag::PKMID_2, {"pi", true}},
+    {BamRecordTag::PRE_PULSE_FRAMES, {"pd", true}},
+    {BamRecordTag::PULSE_CALL, {"pc", true}},
+    {BamRecordTag::PULSE_CALL_WIDTH, {"px", true}},
+    {BamRecordTag::PULSE_EXCLUSION, {"pe", true}},
+    {BamRecordTag::PULSE_MERGE_QV, {"pg", true}},
+
+    // despite 'pulse' in its name, this one is stored per-base, not per-pulse
+    {BamRecordTag::PULSE_WIDTH, {"pw", false}},
+
+    {BamRecordTag::QUERY_END, {"qe", false}},
+    {BamRecordTag::QUERY_START, {"qs", false}},
+    {BamRecordTag::READ_ACCURACY, {"rq", false}},
+    {BamRecordTag::READ_GROUP, {"RG", false}},
+    {BamRecordTag::SCRAP_REGION_TYPE, {"sc", false}},
+    {BamRecordTag::SCRAP_ZMW_TYPE, {"sz", false}},
+    {BamRecordTag::SNR, {"sn", false}},
+    {BamRecordTag::START_FRAME, {"sf", true}},
+    {BamRecordTag::SUBSTITUTION_QV, {"sq", false}},
+    {BamRecordTag::SUBSTITUTION_TAG, {"st", false}},
+
+    // faux tags (label is two blanks)
+    {BamRecordTag::SEQ, {"  ", false}},
+    {BamRecordTag::QUAL, {"  ", false}},
+};
+// clang-format on
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamRecordTags.h b/src/BamRecordTags.h

new file mode 100644 (file)

index 0000000..c955b07
--- /dev/null
+++ b/src/BamRecordTags.h
@@ -0,0 +1,57 @@
+// File Description
+/// \file BamRecordTags.h
+/// \brief Defines the BamRecordTags utility class.
+//
+// Author: Derek Barnett
+
+#ifndef BAMRECORDTAGS_H
+#define BAMRECORDTAGS_H
+
+#include <cassert>
+#include <string>
+#include <unordered_map>
+
+#include "EnumClassHash.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/BamRecordImpl.h"
+#include "pbbam/BamRecordTag.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Static-only helper that answers questions about BamRecordTag enumerators by
+// consulting a single lookup table (tagLookup).
+class BamRecordTags
+{
+public:
+    // Returns the isPulse_ flag stored for 'tag'.
+    static inline bool IsPulse(const BamRecordTag tag);
+
+    // Returns a copy of the label stored for 'tag'.
+    static inline std::string LabelFor(const BamRecordTag tag);
+
+private:
+    // Payload stored per tag in the lookup table.
+    struct BamRecordTagData
+    {
+        const std::string label_;  // 2-char tag label
+        const bool isPulse_;       // value reported by IsPulse()
+    };
+    using TagLookupType = std::unordered_map<BamRecordTag, BamRecordTagData, EnumClassHash>;
+
+    static const TagLookupType tagLookup;
+};
+
+// Returns the isPulse_ flag recorded for 'tag' in tagLookup.
+// Debug builds assert that the tag is present; at() is kept so that a missing
+// tag still throws std::out_of_range when assertions are compiled out.
+inline bool BamRecordTags::IsPulse(const BamRecordTag tag)
+{
+    assert(tagLookup.count(tag) != 0);
+    const BamRecordTagData& entry = tagLookup.at(tag);
+    return entry.isPulse_;
+}
+
+// Returns a copy of the label recorded for 'tag' in tagLookup.
+// Debug builds assert that the tag is present; at() is kept so that a missing
+// tag still throws std::out_of_range when assertions are compiled out.
+inline std::string BamRecordTags::LabelFor(const BamRecordTag tag)
+{
+    assert(tagLookup.count(tag) != 0);
+    const BamRecordTagData& entry = tagLookup.at(tag);
+    return entry.label_;
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // BAMRECORDTAGS_H
diff --git a/src/BamTagCodec.cpp b/src/BamTagCodec.cpp

new file mode 100644 (file)

index 0000000..40bbb19
--- /dev/null
+++ b/src/BamTagCodec.cpp
@@ -0,0 +1,554 @@
+// File Description
+/// \file BamTagCodec.cpp
+/// \brief Implements the BamTagCodec class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamTagCodec.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include <htslib/kstring.h>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Appends the raw bytes of 'value' (sizeof(T) of them) to the htslib string 'str'.
+template <typename T>
+inline void appendBamValue(const T& value, kstring_t* str)
+{
+    const char* bytes = reinterpret_cast<const char*>(&value);
+    kputsn_(bytes, sizeof(T), str);
+}
+
+// Appends an array payload to 'str': a 32-bit element count followed by the
+// elements' raw bytes.
+template <typename T>
+inline void appendBamMultiValue(const std::vector<T>& container, kstring_t* str)
+{
+    // the on-disk count field is 32 bits wide; make the narrowing explicit
+    const auto n = static_cast<uint32_t>(container.size());
+    kputsn_(&n, sizeof(n), str);
+
+    // An empty vector has no element 0 to take the address of (and data() may be
+    // null), so skip the zero-length payload write instead of touching it.
+    if (n == 0) return;
+    kputsn_(container.data(), n * sizeof(T), str);
+}
+
+// Copies sizeof(T) bytes starting at src[offset] into a T, advances 'offset'
+// past them, and returns the value. No bounds checking is done here.
+template <typename T>
+inline T readBamValue(const uint8_t* src, size_t& offset)
+{
+    T result;
+    memcpy(&result, src + offset, sizeof(T));
+    offset += sizeof(T);
+    return result;
+}
+
+// Reads an array payload starting at src[offset]: a 32-bit element count
+// followed by that many T values. 'offset' is advanced past everything read.
+// No bounds checking is done here.
+template <typename T>
+std::vector<T> readBamMultiValue(const uint8_t* src, size_t& offset)
+{
+    const auto count = readBamValue<uint32_t>(src, offset);
+
+    std::vector<T> values;
+    values.reserve(count);
+    for (uint32_t k = 0; k < count; ++k)
+        values.push_back(readBamValue<T>(src, offset));
+    return values;
+}
+
+}  // namespace internal
+
+// Decodes a buffer of binary-encoded tags into a TagCollection.
+//
+// The buffer is consumed as consecutive entries: a 2-byte tag name, a 1-byte
+// type code, then a payload whose layout depends on that type code ('B' array
+// payloads carry an extra element-type byte and a 32-bit element count).
+//
+// \throws std::runtime_error if an entry header is truncated or if an
+//         unsupported tag type / array element type is encountered.
+TagCollection BamTagCodec::Decode(const std::vector<uint8_t>& data)
+{
+    TagCollection tags;
+
+    // NOTE: not completely safe - only the 3-byte entry header is bounds-checked;
+    //       payload lengths are still taken on trust.
+
+    const uint8_t* pData = data.data();
+    const size_t numBytes = data.size();
+    size_t i = 0;
+    while (i < numBytes) {
+
+        // name (2 bytes) + type code (1 byte) must be present before we read them
+        if (numBytes - i < 3) throw std::runtime_error{"truncated tag data encountered"};
+
+        std::string tagName;
+        tagName.reserve(2);
+        tagName.append(1, pData[i++]);
+        tagName.append(1, pData[i++]);
+
+        using internal::readBamMultiValue;
+        using internal::readBamValue;
+
+        const auto tagType = static_cast<char>(pData[i++]);
+        switch (tagType) {
+            case 'A':
+            case 'a': {
+                tags[tagName] = readBamValue<uint8_t>(pData, i);
+                tags[tagName].Modifier(TagModifier::ASCII_CHAR);
+                break;
+            }
+
+            case 'c':
+                tags[tagName] = readBamValue<int8_t>(pData, i);
+                break;
+            case 'C':
+                tags[tagName] = readBamValue<uint8_t>(pData, i);
+                break;
+            case 's':
+                tags[tagName] = readBamValue<int16_t>(pData, i);
+                break;
+            case 'S':
+                tags[tagName] = readBamValue<uint16_t>(pData, i);
+                break;
+            case 'i':
+                tags[tagName] = readBamValue<int32_t>(pData, i);
+                break;
+            case 'I':
+                tags[tagName] = readBamValue<uint32_t>(pData, i);
+                break;
+            case 'f':
+                tags[tagName] = readBamValue<float>(pData, i);
+                break;
+
+            case 'Z':
+            case 'H': {
+                // null-terminated string; the terminator is skipped but not stored
+                const size_t dataLength = strlen(reinterpret_cast<const char*>(&pData[i]));
+                std::string value(reinterpret_cast<const char*>(&pData[i]), dataLength);
+                tags[tagName] = value;
+                if (tagType == 'H') tags[tagName].Modifier(TagModifier::HEX_STRING);
+                i += dataLength + 1;
+                break;
+            }
+
+            case 'B': {
+                const char subTagType = pData[i++];
+                switch (subTagType) {
+                    case 'c':
+                        tags[tagName] = readBamMultiValue<int8_t>(pData, i);
+                        break;
+                    case 'C':
+                        tags[tagName] = readBamMultiValue<uint8_t>(pData, i);
+                        break;
+                    case 's':
+                        tags[tagName] = readBamMultiValue<int16_t>(pData, i);
+                        break;
+                    case 'S':
+                        tags[tagName] = readBamMultiValue<uint16_t>(pData, i);
+                        break;
+                    case 'i':
+                        tags[tagName] = readBamMultiValue<int32_t>(pData, i);
+                        break;
+                    case 'I':
+                        tags[tagName] = readBamMultiValue<uint32_t>(pData, i);
+                        break;
+                    case 'f':
+                        tags[tagName] = readBamMultiValue<float>(pData, i);
+                        break;
+
+                    // unknown subTagType
+                    // (parentheses on purpose: std::string{1, c} would pick the
+                    //  initializer-list constructor and produce "\x01" + c)
+                    default:
+                        throw std::runtime_error{"unsupported array-tag-type encountered: " +
+                                                 std::string(1, subTagType)};
+                }
+                break;
+            }
+
+            // unknown tagType (parentheses on purpose, see above)
+            default:
+                throw std::runtime_error{"unsupported tag-type encountered: " +
+                                         std::string(1, tagType)};
+        }
+    }
+
+    return tags;
+}
+
+// Encodes every non-null tag in 'tags' as: 2-character name, 1-character type
+// code, then the value's raw bytes (strings include their terminating NUL;
+// arrays are written as 'B', element-type character, 32-bit count, elements).
+//
+// \throws std::runtime_error if a tag name is not exactly 2 characters long or
+//         a tag holds an unsupported data type.
+std::vector<uint8_t> BamTagCodec::Encode(const TagCollection& tags)
+{
+    kstring_t str = {0, 0, nullptr};
+
+    // Frees the htslib buffer on every exit path. Without this the buffer leaked
+    // whenever an exception left the loop below (e.g. the tag-name check).
+    struct KStringGuard
+    {
+        kstring_t& ks;
+        ~KStringGuard() { free(ks.s); }
+    } guard{str};
+
+    for (const auto& tagIter : tags) {
+
+        const auto& name = tagIter.first;
+        if (name.size() != 2) throw std::runtime_error{"malformatted tag name: " + name};
+
+        // null tags are silently skipped
+        const auto& tag = tagIter.second;
+        if (tag.IsNull()) continue;
+
+        // "<TAG>:"
+        kputsn_(name.c_str(), 2, &str);
+
+        // "<TYPE>:<DATA>" for printable, ASCII char
+        // (a '\0' result falls through and is encoded by its underlying data type)
+        if (tag.HasModifier(TagModifier::ASCII_CHAR)) {
+            const char c = tag.ToAscii();
+            if (c != '\0') {
+                kputc_('A', &str);
+                kputc_(c, &str);
+                continue;
+            }
+        }
+
+        using internal::appendBamMultiValue;
+        using internal::appendBamValue;
+
+        // "<TYPE>:<DATA>" for all other data
+        switch (tag.Type()) {
+            case TagDataType::INT8: {
+                kputc_('c', &str);
+                appendBamValue(tag.ToInt8(), &str);
+                break;
+            }
+            case TagDataType::UINT8: {
+                kputc_('C', &str);
+                appendBamValue(tag.ToUInt8(), &str);
+                break;
+            }
+            case TagDataType::INT16: {
+                kputc_('s', &str);
+                appendBamValue(tag.ToInt16(), &str);
+                break;
+            }
+            case TagDataType::UINT16: {
+                kputc_('S', &str);
+                appendBamValue(tag.ToUInt16(), &str);
+                break;
+            }
+            case TagDataType::INT32: {
+                kputc_('i', &str);
+                appendBamValue(tag.ToInt32(), &str);
+                break;
+            }
+            case TagDataType::UINT32: {
+                kputc_('I', &str);
+                appendBamValue(tag.ToUInt32(), &str);
+                break;
+            }
+            case TagDataType::FLOAT: {
+                kputc_('f', &str);
+                appendBamValue(tag.ToFloat(), &str);
+                break;
+            }
+
+            case TagDataType::STRING: {
+                if (tag.HasModifier(TagModifier::HEX_STRING))
+                    kputc_('H', &str);
+                else
+                    kputc_('Z', &str);
+                const auto s = tag.ToString();
+                kputsn_(s.c_str(), s.size() + 1, &str);  // this adds the null-term
+                break;
+            }
+
+            case TagDataType::INT8_ARRAY: {
+                kputc_('B', &str);
+                kputc_('c', &str);
+                appendBamMultiValue(tag.ToInt8Array(), &str);
+                break;
+            }
+            case TagDataType::UINT8_ARRAY: {
+                kputc_('B', &str);
+                kputc_('C', &str);
+                appendBamMultiValue(tag.ToUInt8Array(), &str);
+                break;
+            }
+            case TagDataType::INT16_ARRAY: {
+                kputc_('B', &str);
+                kputc_('s', &str);
+                appendBamMultiValue(tag.ToInt16Array(), &str);
+                break;
+            }
+            case TagDataType::UINT16_ARRAY: {
+                kputc_('B', &str);
+                kputc_('S', &str);
+                appendBamMultiValue(tag.ToUInt16Array(), &str);
+                break;
+            }
+            case TagDataType::INT32_ARRAY: {
+                kputc_('B', &str);
+                kputc_('i', &str);
+                appendBamMultiValue(tag.ToInt32Array(), &str);
+                break;
+            }
+            case TagDataType::UINT32_ARRAY: {
+                kputc_('B', &str);
+                kputc_('I', &str);
+                appendBamMultiValue(tag.ToUInt32Array(), &str);
+                break;
+            }
+            case TagDataType::FLOAT_ARRAY: {
+                kputc_('B', &str);
+                kputc_('f', &str);
+                appendBamMultiValue(tag.ToFloatArray(), &str);
+                break;
+            }
+
+            // unsupported tag type (the guard releases the buffer)
+            default: {
+                throw std::runtime_error{"unsupported tag-type encountered: " +
+                                         std::to_string(static_cast<uint16_t>(tag.Type()))};
+            }
+        }
+    }
+
+    // copy out of the temporary buffer; str.s is still null if nothing was written,
+    // so only call memcpy when there is something to copy
+    std::vector<uint8_t> result(str.l);
+    if (str.l > 0) memcpy(result.data(), str.s, str.l);
+    return result;
+}
+
+// Builds a Tag from raw encoded data. 'rawData' must point at the 1-byte type
+// code; the payload follows directly (for 'B' arrays: element-type byte, 32-bit
+// count, elements). No bounds checking is performed on the input.
+//
+// \throws std::runtime_error on an unsupported tag type or array element type.
+Tag BamTagCodec::FromRawData(uint8_t* rawData)
+{
+    using internal::readBamMultiValue;
+    using internal::readBamValue;
+
+    size_t offset = 0;
+    const auto tagType = static_cast<char>(*rawData++);
+    switch (tagType) {
+        case 'A':
+        case 'a': {
+            Tag t{readBamValue<uint8_t>(rawData, offset)};
+            t.Modifier(TagModifier::ASCII_CHAR);
+            return t;
+        }
+
+        case 'c':
+            return {readBamValue<int8_t>(rawData, offset)};
+        case 'C':
+            return {readBamValue<uint8_t>(rawData, offset)};
+        case 's':
+            return {readBamValue<int16_t>(rawData, offset)};
+        case 'S':
+            return {readBamValue<uint16_t>(rawData, offset)};
+        case 'i':
+            return {readBamValue<int32_t>(rawData, offset)};
+        case 'I':
+            return {readBamValue<uint32_t>(rawData, offset)};
+        case 'f':
+            return {readBamValue<float>(rawData, offset)};
+
+        case 'Z':
+        case 'H': {
+            // null-terminated string; 'H' only differs by the HEX_STRING modifier
+            const size_t dataLength = strlen(reinterpret_cast<const char*>(&rawData[0]));
+            std::string value(reinterpret_cast<const char*>(&rawData[0]), dataLength);
+            Tag t{value};
+            if (tagType == 'H') t.Modifier(TagModifier::HEX_STRING);
+            return t;
+        }
+
+        case 'B': {
+            const char subTagType = *rawData++;
+            switch (subTagType) {
+
+                case 'c':
+                    return {readBamMultiValue<int8_t>(rawData, offset)};
+                case 'C':
+                    return {readBamMultiValue<uint8_t>(rawData, offset)};
+                case 's':
+                    return {readBamMultiValue<int16_t>(rawData, offset)};
+                case 'S':
+                    return {readBamMultiValue<uint16_t>(rawData, offset)};
+                case 'i':
+                    return {readBamMultiValue<int32_t>(rawData, offset)};
+                case 'I':
+                    return {readBamMultiValue<uint32_t>(rawData, offset)};
+                case 'f':
+                    return {readBamMultiValue<float>(rawData, offset)};
+
+                // unknown subTagType
+                // (parentheses on purpose: std::string{1, c} would pick the
+                //  initializer-list constructor and produce "\x01" + c)
+                default:
+                    throw std::runtime_error{"unsupported array-tag-type encountered: " +
+                                             std::string(1, subTagType)};
+            }
+            break;
+        }
+
+        // unknown tagType (parentheses on purpose, see above)
+        default:
+            throw std::runtime_error{"unsupported tag-type encountered: " +
+                                     std::string(1, tagType)};
+    }
+    return Tag();  // to avoid compiler warning
+}
+
+// Serializes only the value of 'tag' (no tag name, no leading type code).
+// Array payloads start with their element-type character, followed by the
+// 32-bit count and the elements; strings include their terminating NUL.
+//
+// If the tag carries (or 'additionalModifier' requests) ASCII_CHAR, the single
+// character from tag.ToAscii() is written; a '\0' character yields an empty result.
+//
+// \throws std::runtime_error if the tag holds an unsupported data type.
+std::vector<uint8_t> BamTagCodec::ToRawData(const Tag& tag, const TagModifier& additionalModifier)
+{
+    // temp raw data destination (for use with htslib methods)
+    kstring_t str = {0, 0, nullptr};
+
+    // Frees the htslib buffer on every exit path, including exceptions
+    // propagating out of the calls below.
+    struct KStringGuard
+    {
+        kstring_t& ks;
+        ~KStringGuard() { free(ks.s); }
+    } guard{str};
+
+    // "<TYPE>:<DATA>" for printable, ASCII char
+    if (tag.HasModifier(TagModifier::ASCII_CHAR) || additionalModifier == TagModifier::ASCII_CHAR) {
+        const char c = tag.ToAscii();
+        if (c != '\0') kputc_(c, &str);
+    }
+
+    // for all others
+    else {
+
+        using internal::appendBamMultiValue;
+        using internal::appendBamValue;
+
+        switch (tag.Type()) {
+
+            // single, numeric values
+            case TagDataType::INT8:
+                appendBamValue(tag.ToInt8(), &str);
+                break;
+            case TagDataType::UINT8:
+                appendBamValue(tag.ToUInt8(), &str);
+                break;
+            case TagDataType::INT16:
+                appendBamValue(tag.ToInt16(), &str);
+                break;
+            case TagDataType::UINT16:
+                appendBamValue(tag.ToUInt16(), &str);
+                break;
+            case TagDataType::INT32:
+                appendBamValue(tag.ToInt32(), &str);
+                break;
+            case TagDataType::UINT32:
+                appendBamValue(tag.ToUInt32(), &str);
+                break;
+            case TagDataType::FLOAT:
+                appendBamValue(tag.ToFloat(), &str);
+                break;
+
+            // string & hex-string values
+            case TagDataType::STRING: {
+                const auto s = tag.ToString();
+                kputsn_(s.c_str(), s.size() + 1, &str);  // this adds the null-term
+                break;
+            }
+
+            // array-type values
+            case TagDataType::INT8_ARRAY: {
+                kputc_('c', &str);
+                appendBamMultiValue(tag.ToInt8Array(), &str);
+                break;
+            }
+            case TagDataType::UINT8_ARRAY: {
+                kputc_('C', &str);
+                appendBamMultiValue(tag.ToUInt8Array(), &str);
+                break;
+            }
+            case TagDataType::INT16_ARRAY: {
+                kputc_('s', &str);
+                appendBamMultiValue(tag.ToInt16Array(), &str);
+                break;
+            }
+            case TagDataType::UINT16_ARRAY: {
+                kputc_('S', &str);
+                appendBamMultiValue(tag.ToUInt16Array(), &str);
+                break;
+            }
+            case TagDataType::INT32_ARRAY: {
+                kputc_('i', &str);
+                appendBamMultiValue(tag.ToInt32Array(), &str);
+                break;
+            }
+            case TagDataType::UINT32_ARRAY: {
+                kputc_('I', &str);
+                appendBamMultiValue(tag.ToUInt32Array(), &str);
+                break;
+            }
+            case TagDataType::FLOAT_ARRAY: {
+                kputc_('f', &str);
+                appendBamMultiValue(tag.ToFloatArray(), &str);
+                break;
+            }
+
+            // unsupported tag type (the guard releases the buffer)
+            default: {
+                throw std::runtime_error{"unsupported tag-type encountered: " +
+                                         std::to_string(static_cast<uint16_t>(tag.Type()))};
+            }
+        }
+    }
+
+    // store temp contents in actual destination; nothing may have been written
+    // (ASCII '\0' case), in which case the result is empty and must not be indexed
+    std::vector<uint8_t> result(str.l);
+    if (str.l > 0) memcpy(result.data(), str.s, str.l);
+    return result;
+}
+
+// Returns the single-character type code used when 'tag' is stored.
+//
+// If the tag carries (or 'additionalModifier' requests) ASCII_CHAR, the tag must
+// hold an integer within [33, 126] and the code is 'A'. Otherwise the code
+// follows the tag's data type: every array type maps to 'B', and strings map to
+// 'H' when flagged HEX_STRING (on the tag or via 'additionalModifier'), else 'Z'.
+//
+// \throws std::runtime_error for non-integer ASCII requests, out-of-range ASCII
+//         values, and unsupported data types.
+uint8_t BamTagCodec::TagTypeCode(const Tag& tag, const TagModifier& additionalModifier)
+{
+    const bool wantsAscii = tag.HasModifier(TagModifier::ASCII_CHAR) ||
+                            additionalModifier == TagModifier::ASCII_CHAR;
+    if (wantsAscii) {
+
+        // widen whichever integer the tag holds, so one range check covers them all
+        int64_t asciiValue = 0;
+        switch (tag.Type()) {
+            case TagDataType::INT8:
+                asciiValue = tag.ToInt8();
+                break;
+            case TagDataType::UINT8:
+                asciiValue = tag.ToUInt8();
+                break;
+            case TagDataType::INT16:
+                asciiValue = tag.ToInt16();
+                break;
+            case TagDataType::UINT16:
+                asciiValue = tag.ToUInt16();
+                break;
+            case TagDataType::INT32:
+                asciiValue = tag.ToInt32();
+                break;
+            case TagDataType::UINT32:
+                asciiValue = tag.ToUInt32();
+                break;
+            default:
+                // non integers not allowed
+                throw std::runtime_error{"tag-type not convertible to ASCII, tag-type: " +
+                                         std::to_string(static_cast<uint16_t>(tag.Type()))};
+        }
+
+        // ensure value is in valid ASCII char range
+        const bool inRange = (asciiValue >= 33) && (asciiValue <= 126);
+        if (!inRange)
+            throw std::runtime_error{"invalid integer value for ASCII char, value: " +
+                                     std::to_string(asciiValue)};
+
+        return static_cast<uint8_t>('A');
+    }
+
+    char code = '\0';
+    switch (tag.Type()) {
+        case TagDataType::INT8:
+            code = 'c';
+            break;
+        case TagDataType::UINT8:
+            code = 'C';
+            break;
+        case TagDataType::INT16:
+            code = 's';
+            break;
+        case TagDataType::UINT16:
+            code = 'S';
+            break;
+        case TagDataType::INT32:
+            code = 'i';
+            break;
+        case TagDataType::UINT32:
+            code = 'I';
+            break;
+        case TagDataType::FLOAT:
+            code = 'f';
+            break;
+
+        case TagDataType::STRING: {
+            const bool isHex = tag.HasModifier(TagModifier::HEX_STRING) ||
+                               additionalModifier == TagModifier::HEX_STRING;
+            code = isHex ? 'H' : 'Z';
+            break;
+        }
+
+        // all array types share one code
+        case TagDataType::INT8_ARRAY:
+        case TagDataType::UINT8_ARRAY:
+        case TagDataType::INT16_ARRAY:
+        case TagDataType::UINT16_ARRAY:
+        case TagDataType::INT32_ARRAY:
+        case TagDataType::UINT32_ARRAY:
+        case TagDataType::FLOAT_ARRAY:
+            code = 'B';
+            break;
+
+        default:
+            throw std::runtime_error{"unsupported tag-type encountered: " +
+                                     std::to_string(static_cast<uint16_t>(tag.Type()))};
+    }
+    return static_cast<uint8_t>(code);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BamWriter.cpp b/src/BamWriter.cpp

new file mode 100644 (file)

index 0000000..ae8578e
--- /dev/null
+++ b/src/BamWriter.cpp
@@ -0,0 +1,171 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BamWriter.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <thread>
+
+#include <htslib/bgzf.h>
+#include <htslib/hfile.h>
+#include <htslib/hts.h>
+
+#include "Autovalidate.h"
+#include "FileProducer.h"
+#include "MemoryUtils.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/MakeUnique.h"
+#include "pbbam/Unused.h"
+#include "pbbam/Validator.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Implementation object behind BamWriter: holds the htslib output handle and
+// the raw header, and performs the actual record writes.
+class BamWriterPrivate
+{
+public:
+    // Opens 'filename' (or a temp file, if requested) and writes the header.
+    BamWriterPrivate(const std::string& filename, const std::shared_ptr<bam_hdr_t> rawHeader,
+                     const BamWriter::CompressionLevel compressionLevel, const size_t numThreads,
+                     const BamWriter::BinCalculationMode binCalculationMode,
+                     const bool useTempFile);
+
+    // record output
+    void Write(const BamRecord& record);
+    void Write(const BamRecord& record, int64_t* vOffset);
+    void Write(const BamRecordImpl& recordImpl);
+
+    // state
+    bool calculateBins_;                                          // recompute core.bin on write?
+    std::unique_ptr<samFile, internal::HtslibFileDeleter> file_;  // output handle
+    std::shared_ptr<bam_hdr_t> header_;                           // header written at open
+    std::unique_ptr<internal::FileProducer> fileProducer_;        // set only when useTempFile
+};
+
+// Opens the output file, optionally enables htslib multithreading, and writes
+// the header.
+//
+// numThreads == 0 means "ask std::thread::hardware_concurrency()", falling back
+// to a single thread if that is unknown as well.
+//
+// \throws std::runtime_error on a null header, if the file cannot be opened,
+//         or if the header cannot be written.
+BamWriterPrivate::BamWriterPrivate(const std::string& filename,
+                                   const std::shared_ptr<bam_hdr_t> rawHeader,
+                                   const BamWriter::CompressionLevel compressionLevel,
+                                   const size_t numThreads,
+                                   const BamWriter::BinCalculationMode binCalculationMode,
+                                   const bool useTempFile)
+    : calculateBins_{binCalculationMode == BamWriter::BinCalculation_ON}, header_{rawHeader}
+{
+    if (header_ == nullptr) throw std::runtime_error{"null header"};
+
+    // when a temp file is requested, the FileProducer supplies the name we open
+    if (useTempFile) fileProducer_ = std::make_unique<internal::FileProducer>(filename);
+    const std::string outputFilename = fileProducer_ ? fileProducer_->TempFilename() : filename;
+
+    // htslib mode string: "wb" followed by the numeric compression level
+    std::string mode{"wb"};
+    mode += std::to_string(static_cast<int>(compressionLevel));
+
+    file_.reset(sam_open(outputFilename.c_str(), mode.c_str()));
+    if (file_ == nullptr) throw std::runtime_error{"could not open file for writing"};
+
+    // resolve the thread count: explicit value, else built-in check, else 1
+    size_t threadCount = numThreads;
+    if (threadCount == 0) threadCount = std::thread::hardware_concurrency();
+    if (threadCount == 0) threadCount = 1;
+
+    // only bother htslib when multithreading is actually wanted
+    if (threadCount > 1) hts_set_threads(file_.get(), threadCount);
+
+    if (sam_hdr_write(file_.get(), header_.get()) != 0)
+        throw std::runtime_error{"could not write header"};
+}
+
+// Writes one record to the output file, recomputing its bin first when bin
+// calculation is enabled.
+//
+// \throws std::runtime_error if htslib reports that the record was not written.
+void BamWriterPrivate::Write(const BamRecord& record)
+{
+#if PBBAM_AUTOVALIDATE
+    Validator::Validate(record);
+#endif
+
+    const auto rawRecord = internal::BamRecordMemory::GetRawData(record);
+
+    if (calculateBins_) {
+        // min_shift=14 & n_lvls=5 are BAM "magic numbers"
+        const auto endPos = bam_endpos(rawRecord.get());
+        rawRecord->core.bin = hts_reg2bin(rawRecord->core.pos, endPos, 14, 5);
+    }
+
+    const auto written = sam_write1(file_.get(), header_.get(), rawRecord.get());
+    if (written <= 0) throw std::runtime_error{"could not write record"};
+}
+
+// Writes one record and reports, through 'vOffset', the BGZF virtual offset at
+// which it starts. The output buffer is flushed first so the offset is current;
+// the flush result is intentionally ignored.
+void BamWriterPrivate::Write(const BamRecord& record, int64_t* vOffset)
+{
+    BGZF* bgzf = file_->fp.bgzf;
+    assert(bgzf);
+    assert(vOffset);
+
+    // ensure offsets up-to-date
+    const auto flushResult = bgzf_flush(bgzf);
+    UNUSED(flushResult);
+
+    // virtual offset = (file position << 16) | offset within the current block
+    const auto filePosition = htell(bgzf->fp);
+    const auto blockOffset = bgzf->block_offset;
+    *vOffset = (filePosition << 16) | blockOffset;
+
+    // the record itself goes through the regular write path
+    Write(record);
+}
+
+// Wraps 'recordImpl' in a BamRecord and writes it through Write(const BamRecord&).
+inline void BamWriterPrivate::Write(const BamRecordImpl& recordImpl)
+{
+    const BamRecord record(recordImpl);
+    Write(record);
+}
+
+}  // namespace internal
+
+// Creates the writer's implementation object from the given settings. When
+// PBBAM_AUTOVALIDATE is enabled, the header is validated before anything is opened.
+BamWriter::BamWriter(const std::string& filename, const BamHeader& header,
+                     const BamWriter::CompressionLevel compressionLevel, const size_t numThreads,
+                     const BinCalculationMode binCalculationMode, const bool useTempFile)
+    : IRecordWriter()
+{
+#if PBBAM_AUTOVALIDATE
+    Validator::Validate(header);
+#endif
+    const auto rawHeader = internal::BamHeaderMemory::MakeRawHeader(header);
+    d_ = std::make_unique<internal::BamWriterPrivate>(filename, rawHeader, compressionLevel,
+                                                      numThreads, binCalculationMode,
+                                                      useTempFile);
+}
+
+// Convenience constructor: delegates to the constructor that takes the
+// individual settings, passing each field of 'config' in turn.
+BamWriter::BamWriter(const std::string& filename, const BamHeader& header,
+                     const BamWriter::Config& config)
+    : BamWriter{filename,
+                header,
+                config.compressionLevel,
+                config.numThreads,
+                config.binCalculationMode,
+                config.useTempFile}
+{
+}
+
+// Flushes any buffered output before the writer goes away; the flush result is
+// intentionally ignored here (use TryFlush() to observe failures).
+BamWriter::~BamWriter()
+{
+    BGZF* bgzf = d_->file_->fp.bgzf;
+    const auto flushResult = bgzf_flush(bgzf);
+    UNUSED(flushResult);
+}
+
+// Flushes the output buffer.
+//
+// \throws std::runtime_error if htslib reports a flush failure.
+void BamWriter::TryFlush()
+{
+    // TODO: sanity checks on file_ & fp
+    BGZF* bgzf = d_->file_->fp.bgzf;
+    if (bgzf_flush(bgzf) != 0)
+        throw std::runtime_error{"could not flush output buffer contents"};
+}
+
+// Forwards to the implementation object.
+void BamWriter::Write(const BamRecord& record)
+{
+    d_->Write(record);
+}
+
+// Forwards to the implementation object; 'vOffset' receives the virtual offset
+// computed there.
+void BamWriter::Write(const BamRecord& record, int64_t* vOffset)
+{
+    d_->Write(record, vOffset);
+}
+
+// Forwards to the implementation object.
+void BamWriter::Write(const BamRecordImpl& recordImpl)
+{
+    d_->Write(recordImpl);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/BarcodeQuery.cpp b/src/BarcodeQuery.cpp

new file mode 100644 (file)

index 0000000..3aa4542
--- /dev/null
+++ b/src/BarcodeQuery.cpp
@@ -0,0 +1,40 @@
+// File Description
+/// \file BarcodeQuery.cpp
+/// \brief Implements the BarcodeQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/BarcodeQuery.h"
+
+#include <cstdint>
+
+#include "pbbam/CompositeBamReader.h"
+#include "pbbam/MakeUnique.h"
+#include "pbbam/PbiFilterTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Implementation detail of BarcodeQuery: owns the composite reader that is
+// constructed from a PbiBarcodeFilter for the requested barcode.
+struct BarcodeQuery::BarcodeQueryPrivate
+{
+    // Builds the reader over 'dataset', filtered by PbiBarcodeFilter{barcode}.
+    BarcodeQueryPrivate(const int16_t barcode, const DataSet& dataset)
+        : reader_{PbiBarcodeFilter{barcode}, dataset}
+    {
+    }
+
+    PbiFilterCompositeBamReader<Compare::None> reader_;  // unsorted
+};
+
+// Creates a query over 'dataset' for records matching 'barcode'; all work is
+// delegated to the private implementation object.
+BarcodeQuery::BarcodeQuery(const int16_t barcode, const DataSet& dataset)
+    : internal::IQuery()
+    , d_(std::make_unique<BarcodeQueryPrivate>(barcode, dataset))
+{
+}
+
+// Defined here (not in the header) so the private implementation type is
+// complete when d_ is destroyed.
+BarcodeQuery::~BarcodeQuery() = default;
+
+// Fetches the next matching record into 'r'; returns whatever the underlying
+// reader's GetNext() returns.
+bool BarcodeQuery::GetNext(BamRecord& r)
+{
+    auto& reader = d_->reader_;
+    return reader.GetNext(r);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt

new file mode 100644 (file)

index 0000000..1847a90
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,65 @@
+
+# Build rules for the 'pbbam' library target: collects the sources, defines the
+# library, sets its output directories, include paths and link dependencies,
+# and exports include/library variables for projects that consume pbbam.
+
+# grab library source files
+# (files.cmake is expected to define PacBioBAM_H and PacBioBAM_CPP)
+include(files.cmake)
+set(SOURCES
+    ${PacBioBAM_H}
+    ${PacBioBAM_CPP}
+)
+# append the project-wide pbbam compiler flags to whatever is already set
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PacBioBAM_CXX_FLAGS}")
+
+# define actual library
+add_library(pbbam ${SOURCES})
+
+# library properties
+# (archive, runtime and library outputs all go to the same directory)
+set_target_properties(pbbam PROPERTIES
+    ARCHIVE_OUTPUT_DIRECTORY ${PacBioBAM_LibDir}
+    RUNTIME_OUTPUT_DIRECTORY ${PacBioBAM_LibDir}
+    LIBRARY_OUTPUT_DIRECTORY ${PacBioBAM_LibDir}
+)
+
+# optional: define PBBAM_AUTOVALIDATE=1, which the sources test with
+# '#if PBBAM_AUTOVALIDATE'; PUBLIC also propagates it to targets linking pbbam
+if(PacBioBAM_auto_validate)
+    target_compile_definitions(pbbam
+        PUBLIC "-DPBBAM_AUTOVALIDATE=1"
+    )
+endif()
+
+# pbbam includes
+target_include_directories(pbbam
+    PUBLIC
+    ${PacBioBAM_IncludeDir}
+    ${hts_INCLUDE_DIRS}
+    ${Boost_INCLUDE_DIRS}
+    ${ZLIB_INCLUDE_DIRS}
+)
+
+# set link dependencies
+#if(HTSLIB_LIBRARIES)
+set(pbbam_all_dependency_libs
+    ${hts_LIBRARIES}
+    ${ZLIB_LIBRARIES}
+    ${SOCKET_LIBRARIES}
+    ${CMAKE_THREAD_LIBS_INIT}
+)
+
+target_link_libraries(pbbam
+    PUBLIC
+    ${pbbam_all_dependency_libs}
+)
+
+# define include paths for projects that use pbbam
+# (stored as INTERNAL cache entries, overwritten on every configure via FORCE)
+set(PacBioBAM_INCLUDE_DIRS
+    ${PacBioBAM_IncludeDir}
+    ${hts_INCLUDE_DIRS}
+    ${Boost_INCLUDE_DIRS}
+    ${ZLIB_INCLUDE_DIRS}
+    CACHE INTERNAL
+    "${PROJECT_NAME}: Include Directories"
+    FORCE
+)
+# library list for consumers: the built pbbam file plus its dependencies
+set(PacBioBAM_LIBRARIES
+    $<TARGET_FILE:pbbam>
+    ${pbbam_all_dependency_libs}
+    CACHE INTERNAL
+    "${PROJECT_NAME}: Libraries"
+    FORCE
+)
diff --git a/src/ChemistryTable.cpp b/src/ChemistryTable.cpp

new file mode 100644 (file)

index 0000000..0d9509c
--- /dev/null
+++ b/src/ChemistryTable.cpp
@@ -0,0 +1,153 @@
+// Author: Lance Hepler
+
+#include "PbbamInternalConfig.h"
+
+#include "ChemistryTable.h"
+
+#include <cstdlib>
+#include <fstream>
+#include <map>
+
+#include "FileUtils.h"
+#include "pbbam/exception/BundleChemistryMappingException.h"
+#include "pugixml/pugixml.hpp"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// clang-format off
+
+// Chemistry mapping rows compiled into the library.
+//
+// Each row is a 4-element string array; the column order is given by the
+// comment at the top of the initializer. 'extern' gives this const object
+// external linkage so that it can be declared in ChemistryTable.h.
+extern const ChemistryTable BuiltInChemistryTable = {
+
+    // BindingKit, SequencingKit, BasecallerVersion, Chemistry
+
+    // RS
+    {{"100356300", "100356200", "2.1", "P6-C4"}},
+    {{"100356300", "100356200", "2.3", "P6-C4"}},
+    {{"100356300", "100612400", "2.1", "P6-C4"}},
+    {{"100356300", "100612400", "2.3", "P6-C4"}},
+    {{"100372700", "100356200", "2.1", "P6-C4"}},
+    {{"100372700", "100356200", "2.3", "P6-C4"}},
+    {{"100372700", "100612400", "2.1", "P6-C4"}},
+    {{"100372700", "100612400", "2.3", "P6-C4"}},
+
+    // 3.0 ("Dromedary"): S/P1-C1/beta
+    {{"100-619-300", "100-620-000", "3.0", "S/P1-C1/beta"}},
+    {{"100-619-300", "100-620-000", "3.1", "S/P1-C1/beta"}},
+
+    // 3.1 ("Echidna"): S/P1-C1.1
+    {{"100-619-300", "100-867-300", "3.1", "S/P1-C1.1"}},
+    {{"100-619-300", "100-867-300", "3.2", "S/P1-C1.1"}},
+    {{"100-619-300", "100-867-300", "3.3", "S/P1-C1.1"}},
+
+    // 3.1.1 ("Flea"): S/P1-C1.2
+    {{"100-619-300", "100-902-100", "3.1", "S/P1-C1.2"}},
+    {{"100-619-300", "100-902-100", "3.2", "S/P1-C1.2"}},
+    {{"100-619-300", "100-902-100", "3.3", "S/P1-C1.2"}},
+    {{"100-619-300", "100-902-100", "4.0", "S/P1-C1.2"}},
+    {{"100-619-300", "100-902-100", "4.1", "S/P1-C1.2"}},
+
+    // 3.2 ("Goat"): S/P1-C1.3
+    {{"100-619-300", "100-972-200", "3.2", "S/P1-C1.3"}},
+    {{"100-619-300", "100-972-200", "3.3", "S/P1-C1.3"}},
+    {{"100-619-300", "100-972-200", "4.0", "S/P1-C1.3"}},
+    {{"100-619-300", "100-972-200", "4.1", "S/P1-C1.3"}},
+
+    // 4.0 ("Seabiscuit"); S/P2-C2
+    {{"100-862-200", "100-861-800", "4.0", "S/P2-C2"}},
+    {{"100-862-200", "100-861-800", "4.1", "S/P2-C2"}},
+    {{"100-862-200", "101-093-700", "4.1", "S/P2-C2"}},
+
+    // 5.0 ("Iguana"); S/P2-C2
+    {{"100-862-200", "100-861-800", "5.0", "S/P2-C2/5.0"}},
+    {{"100-862-200", "101-093-700", "5.0", "S/P2-C2/5.0"}},
+
+    // 5.0.1 ChemRel ("Sequel® Sequencing Plate Silwet"); S/P2-C2
+    {{"100-862-200", "101-309-500", "5.0", "S/P2-C2/5.0"}},
+    // 5.0.1 ChemRel ("Sequel® Sequencing Plate Silwet (4 rxn)"); S/P2-C2
+    {{"100-862-200", "101-309-400", "5.0", "S/P2-C2/5.0"}},
+
+    // --- SG1/16509P/PA5.0 ---
+    // 2.1 binding kit/5.1PA support with ..
+    // 5.0 ("Iguana"); S/P2-C2
+    {{"101-365-900", "100-861-800", "5.0", "S/P2-C2/5.0"}},
+    {{"101-365-900", "101-093-700", "5.0", "S/P2-C2/5.0"}},
+
+    // 5.0.1 ChemRel; Sequel® Binding Kit 2.1; S/P2-C2
+    {{"101-365-900", "101-309-500", "5.0", "S/P2-C2/5.0"}}, // Sequel® Sequencing Plate 2.1 Silwet (8 rxn)
+    {{"101-365-900", "101-309-400", "5.0", "S/P2-C2/5.0"}}, // Sequel® Sequencing Plate 2.1 Silwet (4 rxn)
+
+    // 5.0.1 ChemRel; Sequel® Binding Kit 3.0; S/P3-C3
+    {{"101-500-400", "101-427-500", "5.0", "S/P3-C3/5.0"}}, // Sequel® Sequencing Plate 3.0 (8 rxn)
+    {{"101-500-400", "101-427-800", "5.0", "S/P3-C3/5.0"}}, // Sequel® Sequencing Plate 3.0 (4 rxn)
+
+    // 5.0.1 ChemRel; Sequel® Dev Binding Kit; S/P2-C2
+    {{"101-490-800", "101-490-900", "5.0", "S/P2-C2/5.0"}}, // Sequel® Dev Sequencing Plate (4 rxn)
+    {{"101-490-800", "101-491-000", "5.0", "S/P2-C2/5.0"}}, // Sequel® Dev Sequencing Plate (8 rxn)
+};
+
+// clang-format on
+
+ChemistryTable ChemistryTableFromXml(const std::string& mappingXml)
+{
+    if (!FileUtils::Exists(mappingXml))
+        throw BundleChemistryMappingException{
+            mappingXml, "SMRT_CHEMISTRY_BUNDLE_DIR defined but file not found"};
+
+    std::ifstream in(mappingXml);
+    pugi::xml_document doc;
+    const pugi::xml_parse_result loadResult = doc.load(in);
+    if (loadResult.status != pugi::status_ok)
+        throw BundleChemistryMappingException{
+            mappingXml, "unparseable XML, error code:" + std::to_string(loadResult.status)};
+
+    // parse top-level attributes
+    pugi::xml_node rootNode = doc.document_element();
+    if (rootNode == pugi::xml_node())
+        throw BundleChemistryMappingException{mappingXml, "could not fetch XML root node"};
+
+    if (std::string(rootNode.name()) != "MappingTable")
+        throw BundleChemistryMappingException{mappingXml, "MappingTable not found"};
+
+    ChemistryTable table;
+    try {
+        for (const auto& childNode : rootNode) {
+            const std::string childName = childNode.name();
+            if (childName != "Mapping") continue;
+            table.emplace_back(
+                std::array<std::string, 4>{{childNode.child("BindingKit").child_value(),
+                                            childNode.child("SequencingKit").child_value(),
+                                            childNode.child("SoftwareVersion").child_value(),
+                                            childNode.child("SequencingChemistry").child_value()}});
+        }
+    } catch (std::exception& e) {
+        const std::string msg = std::string{"Mapping entries unparseable - "} + e.what();
+        throw BundleChemistryMappingException{mappingXml, msg};
+    }
+    return table;
+}
+
+const ChemistryTable& GetChemistryTableFromEnv()
+{
+    static const ChemistryTable empty{};
+    static std::map<std::string, ChemistryTable> tableCache;
+
+    std::string chemPath;
+    const char* pth = getenv("SMRT_CHEMISTRY_BUNDLE_DIR");
+    if (pth != nullptr && pth[0] != '\0')
+        chemPath = pth;
+    else
+        return empty;
+
+    auto it = tableCache.find(chemPath);
+    if (it != tableCache.end()) return it->second;
+
+    auto tbl = ChemistryTableFromXml(chemPath + "/chemistry.xml");
+    it = tableCache.emplace(std::move(chemPath), std::move(tbl)).first;
+    return it->second;
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ChemistryTable.h b/src/ChemistryTable.h

new file mode 100644 (file)

index 0000000..4b16dcb
--- /dev/null
+++ b/src/ChemistryTable.h
@@ -0,0 +1,24 @@
+// Author: Lance Hepler
+
+#ifndef CHEMISTRYTABLE_H
+#define CHEMISTRYTABLE_H
+
+#include <array>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// A list of chemistry mapping rows; each row is an array of four strings.
+using ChemistryTable = std::vector<std::array<std::string, 4>>;
+
+/// Mapping rows that are compiled into the library (defined in ChemistryTable.cpp).
+extern const ChemistryTable BuiltInChemistryTable;
+
+/// Returns the table selected through the process environment
+/// (defined in ChemistryTable.cpp).
+const ChemistryTable& GetChemistryTableFromEnv();
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // CHEMISTRYTABLE_H
diff --git a/src/Cigar.cpp b/src/Cigar.cpp

new file mode 100644 (file)

index 0000000..7e04324
--- /dev/null
+++ b/src/Cigar.cpp
@@ -0,0 +1,45 @@
+// File Description
+/// \file Cigar.cpp
+/// \brief Implements the Cigar class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/Cigar.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <sstream>
+
+namespace PacBio {
+namespace BAM {
+
+Cigar::Cigar(const std::string& cigarString) : std::vector<CigarOperation>{}
+{
+    size_t numberStart = 0;
+    const size_t numChars = cigarString.size();
+    for (size_t i = 0; i < numChars; ++i) {
+        const char c = cigarString.at(i);
+        if (!isdigit(c)) {
+            const size_t distance = i - numberStart;
+            const uint32_t length = stoul(cigarString.substr(numberStart, distance));
+            push_back(CigarOperation(c, length));
+            numberStart = i + 1;
+        }
+    }
+}
+
+/// Renders this CIGAR as text: for every operation, in order, its length
+/// immediately followed by its operation character.
+std::string Cigar::ToStdString() const
+{
+    std::ostringstream text;
+    for (const CigarOperation& op : *this)
+        text << op.Length() << op.Char();
+    return text.str();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/CigarOperation.cpp b/src/CigarOperation.cpp

new file mode 100644 (file)

index 0000000..ab00131
--- /dev/null
+++ b/src/CigarOperation.cpp
@@ -0,0 +1,50 @@
+// File Description
+/// \file CigarOperation.cpp
+/// \brief Implements the CigarOperation class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/CigarOperation.h"
+
+#include <htslib/sam.h>
+
+namespace PacBio {
+namespace BAM {
+
+CigarOperationType CigarOperation::CharToType(const char c)
+{
+    switch (c) {
+        case 'S':
+            return CigarOperationType::SOFT_CLIP;
+        case '=':
+            return CigarOperationType::SEQUENCE_MATCH;
+        case 'X':
+            return CigarOperationType::SEQUENCE_MISMATCH;
+        case 'I':
+            return CigarOperationType::INSERTION;
+        case 'D':
+            return CigarOperationType::DELETION;
+        case 'N':
+            return CigarOperationType::REFERENCE_SKIP;
+        case 'H':
+            return CigarOperationType::HARD_CLIP;
+        case 'P':
+            return CigarOperationType::PADDING;
+        case 'M':
+            return CigarOperationType::ALIGNMENT_MATCH;
+        default:
+            return CigarOperationType::UNKNOWN_OP;
+    }
+}
+
+// Converts an operation type to its character by handing the enum's integer
+// value to htslib's bam_cigar_opchr().
+// NOTE(review): no range check is done here - confirm that every enumerator
+// (UNKNOWN_OP in particular) is a value bam_cigar_opchr() accepts.
+char CigarOperation::TypeToChar(const CigarOperationType type)
+{
+    return bam_cigar_opchr(static_cast<int>(type));
+}
+
+// Definition of the static class member; the flag starts out as true.
+// (Presumably it toggles validation of operations - see the class declaration.)
+bool CigarOperation::validate_ = true;
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/Compare.cpp b/src/Compare.cpp

new file mode 100644 (file)

index 0000000..39f5382
--- /dev/null
+++ b/src/Compare.cpp
@@ -0,0 +1,111 @@
+// File Description
+/// \file Compare.cpp
+/// \brief Implements the Compare class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/Compare.h"
+
+#include <cstddef>
+#include <functional>
+#include <unordered_map>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// The three textual spellings stored for one comparison type.
+struct TypeAlias
+{
+    std::string name_;     ///< enumerator name, e.g. "Compare::EQUAL"
+    std::string op_;       ///< symbolic operator, e.g. "=="
+    std::string opAlpha_;  ///< alphabetic operator, e.g. "eq"
+
+    /// Every argument defaults to an empty string and is moved into its member.
+    TypeAlias(std::string name = {}, std::string op = {}, std::string opAlpha = {})
+        : name_{std::move(name)}, op_{std::move(op)}, opAlpha_{std::move(opAlpha)}
+    {
+    }
+};
+
+/// Hash functor for Compare::Type keys: hashes the enumerator's int value.
+struct CompareTypeHash
+{
+    size_t operator()(const Compare::Type& t) const
+    {
+        const int asInt = static_cast<int>(t);
+        return std::hash<int>{}(asInt);
+    }
+};
+
+// clang-format off
+// Lookup from operator text to comparison type; used by Compare::TypeFromOperator().
+// Several spellings (symbolic, alphabetic, XML-escaped) map to the same type.
+static const std::unordered_map<std::string, Compare::Type> opToTypeMap =
+{
+    // basic operators plus some permissiveness for other representations
+    { "==",     Compare::EQUAL },
+    { "=",      Compare::EQUAL },
+    { "eq",     Compare::EQUAL },
+    { "in",     Compare::EQUAL },
+    { "!=",     Compare::NOT_EQUAL },
+    { "ne",     Compare::NOT_EQUAL },
+    { "not_in", Compare::NOT_EQUAL },
+    { "<",      Compare::LESS_THAN },
+    { "lt",     Compare::LESS_THAN },
+    { "&lt;",   Compare::LESS_THAN },
+    { "<=",     Compare::LESS_THAN_EQUAL },
+    { "lte",    Compare::LESS_THAN_EQUAL },
+    { "&lt;=",  Compare::LESS_THAN_EQUAL },
+    { ">",      Compare::GREATER_THAN },
+    { "gt",     Compare::GREATER_THAN },
+    { "&gt;",   Compare::GREATER_THAN },
+    { ">=",     Compare::GREATER_THAN_EQUAL },
+    { "gte",    Compare::GREATER_THAN_EQUAL },
+    { "&gt;=",  Compare::GREATER_THAN_EQUAL },
+    { "&",      Compare::CONTAINS },
+    { "~",      Compare::NOT_CONTAINS }
+};
+
+// Reverse lookup: comparison type -> { enumerator name, symbolic operator,
+// alphabetic operator }. Used by Compare::TypeToName() and Compare::TypeToOperator().
+static const std::unordered_map<Compare::Type, TypeAlias, CompareTypeHash> typeAliases =
+{
+    { Compare::EQUAL,              TypeAlias{ "Compare::EQUAL",              "==", "eq" } },
+    { Compare::NOT_EQUAL,          TypeAlias{ "Compare::NOT_EQUAL",          "!=", "ne" } },
+    { Compare::LESS_THAN,          TypeAlias{ "Compare::LESS_THAN",          "<",  "lt"  } },
+    { Compare::LESS_THAN_EQUAL,    TypeAlias{ "Compare::LESS_THAN_EQUAL",    "<=", "lte" } },
+    { Compare::GREATER_THAN,       TypeAlias{ "Compare::GREATER_THAN",       ">",  "gt"  } },
+    { Compare::GREATER_THAN_EQUAL, TypeAlias{ "Compare::GREATER_THAN_EQUAL", ">=", "gte" } },
+    { Compare::CONTAINS,           TypeAlias{ "Compare::CONTAINS",           "&",  "and" } },
+    { Compare::NOT_CONTAINS,       TypeAlias{ "Compare::NOT_CONTAINS",       "~",  "not" } }
+};
+// clang-format on
+
+}  // namespace internal
+
+/// Translates operator text (e.g. "==", "lte", "&gt;") into a Compare::Type.
+///
+/// \throws std::runtime_error if \p opString is not a key of internal::opToTypeMap.
+Compare::Type Compare::TypeFromOperator(const std::string& opString)
+{
+    const auto found = internal::opToTypeMap.find(opString);
+    if (found == internal::opToTypeMap.cend())
+        throw std::runtime_error{opString + " is not a valid comparison operator."};
+    return found->second;
+}
+
+/// Returns the enumerator name (e.g. "Compare::EQUAL") stored for \p type.
+///
+/// \throws std::runtime_error if the lookup raises any std::exception, e.g.
+///         because \p type has no entry in internal::typeAliases.
+std::string Compare::TypeToName(const Compare::Type& type)
+{
+    try {
+        const internal::TypeAlias& alias = internal::typeAliases.at(type);
+        return alias.name_;
+    } catch (std::exception&) {
+        throw std::runtime_error{"invalid comparison type encountered"};
+    }
+}
+
+/// Returns the operator text stored for \p type: the alphabetic form
+/// (e.g. "lte") when \p asAlpha is true, otherwise the symbolic form (e.g. "<=").
+///
+/// \throws std::runtime_error if the lookup raises any std::exception, e.g.
+///         because \p type has no entry in internal::typeAliases.
+std::string Compare::TypeToOperator(const Compare::Type& type, bool asAlpha)
+{
+    try {
+        const internal::TypeAlias& alias = internal::typeAliases.at(type);
+        if (asAlpha) return alias.opAlpha_;
+        return alias.op_;
+    } catch (std::exception&) {
+        throw std::runtime_error{"invalid comparison type encountered"};
+    }
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/Config.cpp b/src/Config.cpp

new file mode 100644 (file)

index 0000000..02f0d26
--- /dev/null
+++ b/src/Config.cpp
@@ -0,0 +1,25 @@
+// File Description
+/// \file Config.cpp
+/// \brief Initializes global variable defaults.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Global verbosity setting that client code may assign.
+//
+// Initialized to -1 to indicate "default". Client code may set this or not.
+//
+// To respect client code, or else fall back to the default [OFF], this value
+// should be used like this:
+//
+//    hts_verbose = ( PacBio::BAM::HtslibVerbosity == -1 ? 0 : PacBio::BAM::HtslibVerbosity);
+//
+int HtslibVerbosity = -1;
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/DataSet.cpp b/src/DataSet.cpp

new file mode 100644 (file)

index 0000000..edd930a
--- /dev/null
+++ b/src/DataSet.cpp
@@ -0,0 +1,332 @@
+// File Description
+/// \file DataSet.cpp
+/// \brief Implements the DataSet class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/DataSet.h"
+
+#include <algorithm>
+#include <unordered_map>
+
+#include <boost/algorithm/string.hpp>
+
+#include "DataSetIO.h"
+#include "FileUtils.h"
+#include "TimeUtils.h"
+#include "pbbam/DataSetTypes.h"
+#include "pbbam/MakeUnique.h"
+#include "pbbam/internal/DataSetBaseTypes.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// 'Version' attribute value that InitDefaults() applies when a dataset has none.
+static const std::string defaultVersion{"4.0.0"};
+
+static void GetAllFiles(const ExternalResources& resources, std::vector<std::string>* result)
+{
+    for (const auto& resource : resources) {
+
+        // store this resource's path
+        result->push_back(resource.ResourceId());
+
+        // store any child indices
+        for (const auto& idx : resource.FileIndices())
+            result->push_back(idx.ResourceId());
+
+        // recurse into any other child resources
+        GetAllFiles(resource.ExternalResources(), result);
+    }
+}
+
+/// Fills in the 'CreatedAt' and 'Version' attributes of \p ds when they are
+/// empty: CreatedAt becomes the current time in ISO-8601 form, Version becomes
+/// internal::defaultVersion. Attributes that already have a value are kept.
+static inline void InitDefaults(DataSet& ds)
+{
+    if (ds.CreatedAt().empty()) {
+        ds.CreatedAt(internal::ToIso8601(CurrentTime()));
+    }
+    if (ds.Version().empty()) {
+        ds.Version(internal::defaultVersion);
+    }
+}
+
+}  // namespace internal
+
+using internal::DataSetElement;
+using internal::DataSetIO;
+using internal::FileUtils;
+
+/// Creates a generic dataset.
+///
+/// Delegates to DataSet(TypeEnum), whose body already ends with
+/// internal::InitDefaults(*this); repeating that call here would only re-check
+/// attributes that have just been filled in, so this body is empty.
+DataSet::DataSet() : DataSet(DataSet::GENERIC) {}
+
+DataSet::DataSet(const DataSet::TypeEnum type)
+    : d_(nullptr), path_(FileUtils::CurrentWorkingDirectory())
+{
+    switch (type) {
+        case DataSet::GENERIC:
+            d_ = std::make_unique<DataSetBase>();
+            break;
+        case DataSet::ALIGNMENT:
+            d_ = std::make_unique<AlignmentSet>();
+            break;
+        case DataSet::BARCODE:
+            d_ = std::make_unique<BarcodeSet>();
+            break;
+        case DataSet::CONSENSUS_ALIGNMENT:
+            d_ = std::make_unique<ConsensusAlignmentSet>();
+            break;
+        case DataSet::CONSENSUS_READ:
+            d_ = std::make_unique<ConsensusReadSet>();
+            break;
+        case DataSet::CONTIG:
+            d_ = std::make_unique<ContigSet>();
+            break;
+        case DataSet::HDF_SUBREAD:
+            d_ = std::make_unique<HdfSubreadSet>();
+            break;
+        case DataSet::REFERENCE:
+            d_ = std::make_unique<ReferenceSet>();
+            break;
+        case DataSet::SUBREAD:
+            d_ = std::make_unique<SubreadSet>();
+            break;
+        case DataSet::TRANSCRIPT:
+            d_ = std::make_unique<TranscriptSet>();
+            break;
+        case DataSet::TRANSCRIPT_ALIGNMENT:
+            d_ = std::make_unique<TranscriptAlignmentSet>();
+            break;
+        default:
+            throw std::runtime_error{"unsupported dataset type"};
+    }
+
+    internal::InitDefaults(*this);
+}
+
+// Builds a dataset from the file name of an existing BamFile (via
+// DataSetIO::FromUri); path_ is the current working directory.
+DataSet::DataSet(const BamFile& bamFile)
+    : d_(DataSetIO::FromUri(bamFile.Filename())), path_(FileUtils::CurrentWorkingDirectory())
+{
+    internal::InitDefaults(*this);
+}
+
+DataSet::DataSet(const std::string& filename)
+    : d_(DataSetIO::FromUri(filename)), path_(FileUtils::DirectoryName(filename))
+{
+    // for FOFN contents and raw BAM filenames, we can just use the current
+    // directory as the starting path.
+    //
+    // (any relative paths in the FOFN have already been resolved)
+    //
+    if (boost::algorithm::iends_with(filename, ".fofn") ||
+        boost::algorithm::iends_with(filename, ".bam") ||
+        boost::algorithm::iends_with(filename, ".fasta") ||
+        boost::algorithm::iends_with(filename, ".fa")) {
+        path_ = FileUtils::CurrentWorkingDirectory();
+    }
+    internal::InitDefaults(*this);
+}
+
+// Builds one dataset from several input files (via DataSetIO::FromUris);
+// path_ is the current working directory.
+DataSet::DataSet(const std::vector<std::string>& filenames)
+    : d_(DataSetIO::FromUris(filenames)), path_(FileUtils::CurrentWorkingDirectory())
+{
+    internal::InitDefaults(*this);
+}
+
+// Copy constructor: copies path_ and takes ownership of a fresh copy of the
+// other dataset's underlying object.
+//
+// NOTE(review): the copy is created as a DataSetElement and the resulting
+// pointer is static_cast to DataSetBase*. Whether that downcast is valid
+// depends on the class definitions, which are not shown here - confirm before
+// touching this code.
+DataSet::DataSet(const DataSet& other) : path_(other.path_)
+{
+    DataSetBase* otherDataset = other.d_.get();
+    auto copyDataset = new DataSetElement(*otherDataset);
+    d_.reset(static_cast<DataSetBase*>(copyDataset));
+}
+
+// Copy assignment: a no-op on self-assignment; otherwise replaces the
+// underlying object with a fresh copy of the other dataset's object and then
+// copies path_.
+//
+// NOTE(review): as in the copy constructor, the copy is created as a
+// DataSetElement and then static_cast to DataSetBase* - confirm the validity
+// of that cast against the class definitions.
+DataSet& DataSet::operator=(const DataSet& other)
+{
+    if (this != &other) {
+        DataSetBase* otherDataset = other.d_.get();
+        auto copyDataset = new DataSetElement(*otherDataset);
+        d_.reset(static_cast<DataSetBase*>(copyDataset));
+        path_ = other.path_;
+    }
+    return *this;
+}
+
+/// Merges \p other into this dataset by applying operator+= to the two
+/// underlying dataset objects. path_ is left as it is.
+DataSet& DataSet::operator+=(const DataSet& other)
+{
+    *d_ += *other.d_;
+    return *this;
+}
+
+/// Returns the resolved path of every file collected by internal::GetAllFiles()
+/// from this dataset's external resources.
+std::vector<std::string> DataSet::AllFiles() const
+{
+    // collect the raw resource ids first ...
+    std::vector<std::string> files;
+    internal::GetAllFiles(ExternalResources(), &files);
+
+    // ... then resolve each one in place
+    for (auto& file : files)
+        file = ResolvePath(file);
+    return files;
+}
+
+std::vector<BamFile> DataSet::BamFiles() const
+{
+    const PacBio::BAM::ExternalResources& resources = ExternalResources();
+
+    std::vector<BamFile> result;
+    result.reserve(resources.Size());
+    for (const ExternalResource& ext : resources) {
+
+        // only bother resolving file path if this is a BAM file
+        boost::iterator_range<std::string::const_iterator> bamFound =
+            boost::algorithm::ifind_first(ext.MetaType(), "bam");
+        if (!bamFound.empty()) {
+            const std::string fn = ResolvePath(ext.ResourceId());
+            result.emplace_back(fn);
+        }
+    }
+    return result;
+}
+
+std::vector<std::string> DataSet::FastaFiles() const
+{
+    const PacBio::BAM::ExternalResources& resources = ExternalResources();
+
+    std::vector<std::string> result;
+    result.reserve(resources.Size());
+    for (const ExternalResource& ext : resources) {
+
+        // only bother resolving file path if this is a BAM file
+        boost::iterator_range<std::string::const_iterator> fastaFound =
+            boost::algorithm::ifind_first(ext.MetaType(), "fasta");
+        if (!fastaFound.empty()) {
+            const std::string fn = ResolvePath(ext.ResourceId());
+            result.push_back(fn);
+        }
+    }
+    return result;
+}
+
+// Builds a dataset from XML text: a default-constructed DataSet gets its
+// underlying object replaced by the result of DataSetIO::FromXmlString(), and
+// the default attributes are then applied to whatever the XML left empty.
+DataSet DataSet::FromXml(const std::string& xml)
+{
+    DataSet result;
+    result.d_ = DataSetIO::FromXmlString(xml);
+    internal::InitDefaults(result);
+    return result;
+}
+
+// Read-only access to the namespace registry of the underlying dataset object.
+const NamespaceRegistry& DataSet::Namespaces() const { return d_->Namespaces(); }
+
+// Mutable access to the namespace registry of the underlying dataset object.
+NamespaceRegistry& DataSet::Namespaces() { return d_->Namespaces(); }
+
+DataSet::TypeEnum DataSet::NameToType(const std::string& typeName)
+{
+    static std::unordered_map<std::string, DataSet::TypeEnum> lookup;
+    if (lookup.empty()) {
+        lookup["DataSet"] = DataSet::GENERIC;
+        lookup["AlignmentSet"] = DataSet::ALIGNMENT;
+        lookup["BarcodeSet"] = DataSet::BARCODE;
+        lookup["ConsensusAlignmentSet"] = DataSet::CONSENSUS_ALIGNMENT;
+        lookup["ConsensusReadSet"] = DataSet::CONSENSUS_READ;
+        lookup["ContigSet"] = DataSet::CONTIG;
+        lookup["HdfSubreadSet"] = DataSet::HDF_SUBREAD;
+        lookup["ReferenceSet"] = DataSet::REFERENCE;
+        lookup["SubreadSet"] = DataSet::SUBREAD;
+        lookup["TranscriptSet"] = DataSet::TRANSCRIPT;
+        lookup["TranscriptAlignmentSet"] = DataSet::TRANSCRIPT_ALIGNMENT;
+    }
+    return lookup.at(typeName);  // throws if unknown typename
+}
+
+/// Returns the resolved ResourceId of every top-level external resource, in
+/// the order in which the resources are stored.
+std::vector<std::string> DataSet::ResolvedResourceIds() const
+{
+    const PacBio::BAM::ExternalResources& resources = ExternalResources();
+
+    std::vector<std::string> resolved;
+    resolved.reserve(resources.Size());
+    for (const ExternalResource& resource : resources)
+        resolved.push_back(ResolvePath(resource.ResourceId()));
+    return resolved;
+}
+
+// Resolves originalPath against this dataset's path_ by delegating to
+// FileUtils::ResolvedFilePath().
+std::string DataSet::ResolvePath(const std::string& originalPath) const
+{
+    return internal::FileUtils::ResolvedFilePath(originalPath, path_);
+}
+
+// Writes the underlying dataset object to the named file via DataSetIO::ToFile().
+void DataSet::Save(const std::string& outputFilename) { DataSetIO::ToFile(d_, outputFilename); }
+
+// Writes the underlying dataset object to a stream via DataSetIO::ToStream().
+void DataSet::SaveToStream(std::ostream& out) { DataSetIO::ToStream(d_, out); }
+
+/// Collects the distinct sequencing chemistries of every read group in every
+/// BAM file returned by BamFiles().
+///
+/// \throws std::runtime_error if any of those files is not a PacBio BAM.
+std::set<std::string> DataSet::SequencingChemistries() const
+{
+    std::set<std::string> chemistries;
+
+    const std::vector<BamFile> bamFiles = BamFiles();
+    for (const BamFile& bam : bamFiles) {
+        if (!bam.IsPacBioBAM()) {
+            throw std::runtime_error{"only PacBio BAMs are supported"};
+        }
+
+        const std::vector<ReadGroupInfo> readGroups = bam.Header().ReadGroups();
+        for (const ReadGroupInfo& readGroup : readGroups) {
+            chemistries.insert(readGroup.SequencingChemistry());
+        }
+    }
+    return chemistries;
+}
+
+std::string DataSet::TypeToName(const DataSet::TypeEnum& type)
+{
+    switch (type) {
+        case DataSet::GENERIC:
+            return "DataSet";
+        case DataSet::ALIGNMENT:
+            return "AlignmentSet";
+        case DataSet::BARCODE:
+            return "BarcodeSet";
+        case DataSet::CONSENSUS_ALIGNMENT:
+            return "ConsensusAlignmentSet";
+        case DataSet::CONSENSUS_READ:
+            return "ConsensusReadSet";
+        case DataSet::CONTIG:
+            return "ContigSet";
+        case DataSet::HDF_SUBREAD:
+            return "HdfSubreadSet";
+        case DataSet::REFERENCE:
+            return "ReferenceSet";
+        case DataSet::SUBREAD:
+            return "SubreadSet";
+        case DataSet::TRANSCRIPT:
+            return "TranscriptSet";
+        case DataSet::TRANSCRIPT_ALIGNMENT:
+            return "TranscriptAlignmentSet";
+        default:
+            throw std::runtime_error{"unsupported dataset type"};
+    }
+}
+
+// Exposed timestamp utils
+//
+// Thin public wrappers around the formatting helpers in namespace internal.
+
+// The current time, in the internal "dataset" timestamp format.
+std::string CurrentTimestamp() { return internal::ToDataSetFormat(internal::CurrentTime()); }
+
+// Formats a time_point in the "dataset" timestamp format.
+std::string ToDataSetFormat(const std::chrono::system_clock::time_point& tp)
+{
+    return internal::ToDataSetFormat(tp);
+}
+
+// time_t overload: converts to a system_clock time_point and forwards.
+std::string ToDataSetFormat(const time_t& t)
+{
+    return ToDataSetFormat(std::chrono::system_clock::from_time_t(t));
+}
+
+// Formats a time_point via internal::ToIso8601().
+std::string ToIso8601(const std::chrono::system_clock::time_point& tp)
+{
+    return internal::ToIso8601(tp);
+}
+
+// time_t overload: converts to a system_clock time_point and forwards.
+std::string ToIso8601(const time_t& t)
+{
+    return ToIso8601(std::chrono::system_clock::from_time_t(t));
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/DataSetBaseTypes.cpp b/src/DataSetBaseTypes.cpp

new file mode 100644 (file)

index 0000000..c2d121f
--- /dev/null
+++ b/src/DataSetBaseTypes.cpp
@@ -0,0 +1,104 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/internal/DataSetBaseTypes.h"
+
+#include <cstddef>
+
+#include <boost/algorithm/string.hpp>
+
+#include "DataSetUtils.h"
+#include "TimeUtils.h"
+#include "pbbam/DataSetTypes.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// ----------------
+// BaseEntityType
+// ----------------
+
+// Forwards label and xsd to DataSetElement, then applies internal::XML_VERSION
+// as the Version if none is set.
+BaseEntityType::BaseEntityType(const std::string& label, const XsdType& xsd)
+    : DataSetElement(label, xsd)
+{
+    if (Version().empty()) Version(internal::XML_VERSION);
+}
+
+// Accessors for the Extensions child, generated by a macro defined elsewhere
+// (presumably the const / non-const Extensions() getters used below).
+DEFINE_ACCESSORS(BaseEntityType, Extensions, Extensions)
+
+// Setter: overwrites the Extensions child with a copy of the argument and
+// returns *this so that calls can be chained.
+BaseEntityType& BaseEntityType::Extensions(const PacBio::BAM::Extensions& extensions)
+{
+    Extensions() = extensions;
+    return *this;
+}
+
+// ----------------
+// DataEntityType
+// ----------------
+
+// Adds nothing of its own: both arguments go straight to BaseEntityType.
+DataEntityType::DataEntityType(const std::string& label, const XsdType& xsd)
+    : BaseEntityType(label, xsd)
+{
+}
+
+// -----------------
+// IndexedDataType
+// -----------------
+
+// Adds nothing of its own: all arguments go straight to InputOutputDataType.
+IndexedDataType::IndexedDataType(const std::string& metatype, const std::string& filename,
+                                 const std::string& label, const XsdType& xsd)
+    : InputOutputDataType(metatype, filename, label, xsd)
+{
+}
+
+// Accessors for the FileIndices child, generated by a macro defined elsewhere
+// (presumably the const / non-const FileIndices() getters used below).
+DEFINE_ACCESSORS(IndexedDataType, FileIndices, FileIndices)
+
+// Setter: overwrites the FileIndices child with a copy of the argument and
+// returns *this so that calls can be chained.
+IndexedDataType& IndexedDataType::FileIndices(const PacBio::BAM::FileIndices& indices)
+{
+    FileIndices() = indices;
+    return *this;
+}
+
+// ---------------------
+// InputOutputDataType
+// ---------------------
+
+// Forwards metatype, label and xsd to StrictEntityType and stores filename as
+// the ResourceId.
+InputOutputDataType::InputOutputDataType(const std::string& metatype, const std::string& filename,
+                                         const std::string& label, const XsdType& xsd)
+    : StrictEntityType(metatype, label, xsd)
+{
+    ResourceId(filename);
+}
+
+// ----------------
+// StrictEntityType
+// ----------------
+
+StrictEntityType::StrictEntityType(const std::string& metatype, const std::string& label,
+                                   const XsdType& xsd)
+    : BaseEntityType(label, xsd)
+{
+    // MetaType
+    MetaType(metatype);
+
+    // TimeStampedName
+    const size_t numChars = metatype.size();
+    std::string transformedMetatype;
+    transformedMetatype.resize(numChars);
+    for (size_t i = 0; i < numChars; ++i) {
+        const char c = metatype.at(i);
+        transformedMetatype[i] = ((c == '.') ? '_' : tolower(c));
+    }
+    const std::string tsn =
+        transformedMetatype + "-" + internal::ToDataSetFormat(internal::CurrentTime());
+    TimeStampedName(tsn);
+
+    // UniqueId
+    UniqueId(internal::GenerateUuid());
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/DataSetElement.cpp b/src/DataSetElement.cpp

new file mode 100644 (file)

index 0000000..fa8ee67
--- /dev/null
+++ b/src/DataSetElement.cpp
@@ -0,0 +1,20 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/internal/DataSetElement.h"
+
+#include "DataSetUtils.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Returns a reference to the std::string instance provided by
+// internal::NullObject<std::string>().
+const std::string& DataSetElement::SharedNullString()
+{
+    return internal::NullObject<std::string>();
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/DataSetIO.cpp b/src/DataSetIO.cpp

new file mode 100644 (file)

index 0000000..7c41ba3
--- /dev/null
+++ b/src/DataSetIO.cpp
@@ -0,0 +1,148 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "DataSetIO.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <exception>
+#include <fstream>
+#include <iostream>
+
+#include <boost/algorithm/string.hpp>
+
+#include "FileUtils.h"
+#include "FofnReader.h"
+#include "StringUtils.h"
+#include "XmlReader.h"
+#include "XmlWriter.h"
+#include "pbbam/MakeUnique.h"
+
+namespace PacBio {
+namespace BAM {
+
+// NOTE(review): this alias is not referenced anywhere else in this file (the
+// functions below all use std::unique_ptr<DataSetBase>) - candidate for removal.
+using DataSetPtr = std::shared_ptr<DataSetBase>;
+
+namespace internal {
+
+// Parses the dataset XML file at xmlFn.
+// Throws std::runtime_error if the file cannot be opened for reading.
+static std::unique_ptr<DataSetBase> FromXml(const std::string& xmlFn)
+{
+    std::ifstream in(xmlFn);
+    if (!in) throw std::runtime_error{"could not open XML file for reading: " + xmlFn};
+    return XmlReader::FromStream(in);
+}
+
+static std::unique_ptr<DataSetBase> FromBam(const std::string& bamFn)
+{
+    // peek at sort order to determine if file should be an AlignmentSet or else SubreadSet
+    const auto bamFile = BamFile{bamFn};
+    const auto& header = bamFile.Header();
+    const auto aligned = header.SortOrder() == "coordinate";
+
+    std::unique_ptr<DataSetBase> dataset;
+    if (aligned)
+        dataset = std::make_unique<AlignmentSet>();
+    else
+        dataset = std::make_unique<SubreadSet>();
+
+    auto& resources = dataset->ExternalResources();
+    resources.Add(ExternalResource(BamFile(bamFn)));
+    return dataset;
+}
+
+/// Wraps a single FASTA file in a ReferenceSet whose only external resource
+/// has metatype "PacBio.ReferenceFile.ReferenceFastaFile".
+static std::unique_ptr<DataSetBase> FromFasta(const std::string& fasta)
+{
+    auto referenceSet = std::make_unique<ReferenceSet>();
+    referenceSet->ExternalResources().Add(
+        ExternalResource("PacBio.ReferenceFile.ReferenceFastaFile", fasta));
+
+    // the return converts unique_ptr<ReferenceSet> to unique_ptr<DataSetBase>
+    return std::move(referenceSet);
+}
+
+static std::unique_ptr<DataSetBase> FromFofn(const std::string& fofn)
+{
+    const auto fofnDir = FileUtils::DirectoryName(fofn);
+    std::ifstream in(fofn);
+    if (!in) throw std::runtime_error{"could not open FOFN for reading: " + fofn};
+
+    auto filenames = FofnReader::Files(in);
+    std::transform(
+        filenames.begin(), filenames.end(), filenames.begin(),
+        [&fofnDir](const std::string fn) { return FileUtils::ResolvedFilePath(fn, fofnDir); });
+    return DataSetIO::FromUris(filenames);
+}
+
+static std::unique_ptr<DataSetBase> FromUri(const std::string& uri)
+{
+    // NOTE: this says URI, but we're not quite handling filenames as true URIs
+    //       basically just treating as a regular filename for now
+
+    // handle on extension
+    if (boost::algorithm::iends_with(uri, ".xml"))
+        return FromXml(uri);
+    else if (boost::algorithm::iends_with(uri, ".bam"))
+        return FromBam(uri);
+    else if (boost::algorithm::iends_with(uri, ".fofn"))
+        return FromFofn(uri);
+    else if (boost::algorithm::iends_with(uri, ".fasta") ||
+             boost::algorithm::iends_with(uri, ".fa")) {
+        return FromFasta(uri);
+    }
+
+    // unknown filename extension
+    throw std::runtime_error{"unsupported extension on input file: " + uri};
+}
+
+// Single-input convenience: wraps uri in a one-element list and forwards to
+// FromUris().
+std::unique_ptr<DataSetBase> DataSetIO::FromUri(const std::string& uri)
+{
+    return FromUris(std::vector<std::string>(1, uri));
+}
+
+std::unique_ptr<DataSetBase> DataSetIO::FromUris(const std::vector<std::string>& uris)
+{
+    if (uris.empty())
+        throw std::runtime_error{"empty input URI list"};  // or just return empty, generic DataSet?
+
+    // create dataset(s) from URI(s)
+    std::vector<std::unique_ptr<DataSetBase> > datasets;
+    datasets.reserve(uris.size());
+    for (const auto& uri : uris)
+        datasets.emplace_back(internal::FromUri(uri));
+    assert(!datasets.empty());
+
+    // if only 1, just return
+    if (datasets.size() == 1) return std::unique_ptr<DataSetBase>(datasets.front().release());
+
+    // else merge
+    else {
+        auto& result = datasets.front();
+        for (const auto& dataset : datasets)
+            *result += *dataset;
+        return std::move(result);
+    }
+}
+
+// Parses a dataset from in-memory XML text.
+// Throws std::runtime_error if the text is empty.
+std::unique_ptr<DataSetBase> DataSetIO::FromXmlString(const std::string& xml)
+{
+    if (xml.empty()) throw std::runtime_error{"empty XML string"};
+    std::istringstream s{xml};
+    return XmlReader::FromStream(s);
+}
+
+// Writes the dataset as XML to the file fn.
+// Throws std::runtime_error if the file cannot be opened for writing.
+void DataSetIO::ToFile(const std::unique_ptr<DataSetBase>& dataset, const std::string& fn)
+{
+    std::ofstream out(fn);
+    if (!out) throw std::runtime_error{"could not open XML file for writing: " + fn};
+    XmlWriter::ToStream(dataset, out);
+}
+
+// Writes the dataset as XML to an already open stream.
+void DataSetIO::ToStream(const std::unique_ptr<DataSetBase>& dataset, std::ostream& out)
+{
+    XmlWriter::ToStream(dataset, out);
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/DataSetIO.h b/src/DataSetIO.h

new file mode 100644 (file)

index 0000000..7a966ba
--- /dev/null
+++ b/src/DataSetIO.h
@@ -0,0 +1,37 @@
+// Author: Derek Barnett
+
+#ifndef DATASETIO_H
+#define DATASETIO_H
+
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <vector>
+#include "pbbam/DataSet.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// Groups the static entry points used to read datasets from URIs / XML text
+/// and to write them back out. All members are static.
+class DataSetIO
+{
+public:
+    // input
+
+    /// Creates a dataset from a single URI.
+    static std::unique_ptr<DataSetBase> FromUri(const std::string& uri);
+    /// Creates a dataset from a list of URIs.
+    static std::unique_ptr<DataSetBase> FromUris(const std::vector<std::string>& uris);
+
+    /// Creates a dataset from XML text held in memory.
+    static std::unique_ptr<DataSetBase> FromXmlString(const std::string& xml);
+
+    //    static DataSetBase FromUri(const std::string& uri);
+    //    static DataSetBase FromUris(const std::vector<std::string>& uris);
+
+    // output
+
+    /// Writes the dataset to the file named by fn.
+    static void ToFile(const std::unique_ptr<DataSetBase>& dataset, const std::string& fn);
+    /// Writes the dataset to an already-open output stream.
+    static void ToStream(const std::unique_ptr<DataSetBase>& dataset, std::ostream& out);
+};
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // DATASETIO_H
diff --git a/src/DataSetTypes.cpp b/src/DataSetTypes.cpp

new file mode 100644 (file)

index 0000000..45e597e
--- /dev/null
+++ b/src/DataSetTypes.cpp
@@ -0,0 +1,453 @@
+// File Description
+/// \file DataSetTypes.cpp
+/// \brief Implementations for the public DataSet component classes.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/DataSetTypes.h"
+
+#include <cstddef>
+#include <set>
+
+#include "DataSetUtils.h"
+#include "FileUtils.h"
+#include "TimeUtils.h"
+#include "pbbam/internal/DataSetBaseTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+// -------------------
+// AlignmentSet
+// -------------------
+
+// Default constructor: forwards this type's metatype, element label and XSD
+// type to the DataSetBase constructor.
+AlignmentSet::AlignmentSet()
+    : DataSetBase("PacBio.DataSet.AlignmentSet", "AlignmentSet", XsdType::DATASETS)
+{
+}
+
+// -------------------
+// BarcodeSet
+// -------------------
+
+// Default constructor: forwards this type's metatype, element label and XSD
+// type to the DataSetBase constructor.
+BarcodeSet::BarcodeSet() : DataSetBase("PacBio.DataSet.BarcodeSet", "BarcodeSet", XsdType::DATASETS)
+{
+}
+
+// -----------------------
+// ConsensusAlignmentSet
+// -----------------------
+
+// Default constructor: forwards this type's metatype, element label and XSD
+// type to the DataSetBase constructor.
+ConsensusAlignmentSet::ConsensusAlignmentSet()
+    : DataSetBase("PacBio.DataSet.ConsensusAlignmentSet", "ConsensusAlignmentSet",
+                  XsdType::DATASETS)
+{
+}
+
+// -------------------
+// ConsensusReadSet
+// -------------------
+
+// Default constructor: forwards this type's metatype, element label and XSD
+// type to the DataSetBase constructor.
+ConsensusReadSet::ConsensusReadSet()
+    : DataSetBase("PacBio.DataSet.ConsensusReadSet", "ConsensusReadSet", XsdType::DATASETS)
+{
+}
+
+// -------------------
+// ContigSet
+// -------------------
+
+// Default constructor: forwards this type's metatype, element label and XSD
+// type to the DataSetBase constructor.
+ContigSet::ContigSet() : DataSetBase("PacBio.DataSet.ContigSet", "ContigSet", XsdType::DATASETS) {}
+
+// -------------------
+// DataSetBase
+// -------------------
+
+// Default constructor for the generic dataset: uses the "DataSet" label and
+// the "PacBio.DataSet.DataSet" metatype.
+DataSetBase::DataSetBase()
+    : StrictEntityType("PacBio.DataSet.DataSet", "DataSet", XsdType::DATASETS)
+{
+}
+
+// Constructor used by the concrete dataset types in this file: passes the
+// caller-supplied metatype, label and XSD type through to StrictEntityType.
+DataSetBase::DataSetBase(const std::string& metatype, const std::string& label, const XsdType& xsd)
+    : StrictEntityType(metatype, label, xsd)
+{
+}
+
+// Generates the const and non-const ExternalResources() getters
+// (see DEFINE_ACCESSORS in DataSetUtils.h).
+DEFINE_ACCESSORS(DataSetBase, ExternalResources, ExternalResources)
+
+// Setter: overwrites the ExternalResources child with a copy of `resources`
+// and returns *this so calls can be chained.
+DataSetBase& DataSetBase::ExternalResources(const PacBio::BAM::ExternalResources& resources)
+{
+    ExternalResources() = resources;
+    return *this;
+}
+
+// Generates the const and non-const Filters() getters
+// (see DEFINE_ACCESSORS in DataSetUtils.h).
+DEFINE_ACCESSORS(DataSetBase, Filters, Filters)
+
+// Setter: overwrites the Filters child with a copy of `filters` and returns
+// *this so calls can be chained.
+DataSetBase& DataSetBase::Filters(const PacBio::BAM::Filters& filters)
+{
+    Filters() = filters;
+    return *this;
+}
+
+// Generates the const and non-const Metadata() getters; the child element is
+// looked up under the name "DataSetMetadata" (see DEFINE_ACCESSORS in
+// DataSetUtils.h).
+DEFINE_ACCESSORS(DataSetBase, DataSetMetadata, Metadata)
+
+// Setter: overwrites the metadata child with a copy of `metadata` and returns
+// *this so calls can be chained.
+DataSetBase& DataSetBase::Metadata(const PacBio::BAM::DataSetMetadata& metadata)
+{
+    Metadata() = metadata;
+    return *this;
+}
+
+// Read-only access to the "DataSets" child. If fetching the child throws any
+// std::exception, a shared default-constructed SubDataSets object is returned
+// instead of propagating the error.
+const PacBio::BAM::SubDataSets& DataSetBase::SubDataSets() const
+{
+    try {
+        return Child<PacBio::BAM::SubDataSets>("DataSets");
+    } catch (std::exception&) {
+        return internal::NullObject<PacBio::BAM::SubDataSets>();
+    }
+}
+
+// Mutable access to the "DataSets" child; a default-constructed child is
+// added first when none exists yet, so the returned reference is always to a
+// child of this element.
+PacBio::BAM::SubDataSets& DataSetBase::SubDataSets()
+{
+    if (!HasChild("DataSets")) AddChild(internal::NullObject<PacBio::BAM::SubDataSets>());
+    return Child<PacBio::BAM::SubDataSets>("DataSets");
+}
+
+// Setter: overwrites the "DataSets" child with a copy of `subdatasets` and
+// returns *this so calls can be chained.
+DataSetBase& DataSetBase::SubDataSets(const PacBio::BAM::SubDataSets& subdatasets)
+{
+    SubDataSets() = subdatasets;
+    return *this;
+}
+
+// Returns a heap-allocated copy of this dataset; the caller receives a raw
+// pointer to the new object. The element data is copied through the
+// DataSetElement copy constructor and the namespace registry is copied
+// separately afterwards.
+//
+// NOTE(review): the new object is created as a DataSetElement and then
+// static_cast to DataSetBase*. That downcast is only well-defined if the
+// object really is a DataSetBase - confirm against the class definitions.
+DataSetBase* DataSetBase::DeepCopy() const
+{
+    auto* copyDataset = new DataSetElement(*this);
+    auto* result = static_cast<DataSetBase*>(copyDataset);
+    result->registry_ = registry_;
+    return result;
+}
+
+// Merges `other` into this dataset.
+//
+// The merge is refused with std::runtime_error unless both datasets carry the
+// same local name label or `other` is the generic "DataSet". On success the
+// metadata, external resources and filters of `other` are merged into this
+// dataset's corresponding children, and `other` itself is appended to the
+// sub-dataset list. Returns *this.
+DataSetBase& DataSetBase::operator+=(const DataSetBase& other)
+{
+    // must be same dataset types (or 'other' must be generic)
+    if (other.LocalNameLabel() != LocalNameLabel() && other.LocalNameLabel() != "DataSet")
+        throw std::runtime_error{"cannot merge different dataset types"};
+
+    // check filter match
+    // check object metadata
+    Metadata() += other.Metadata();
+    ExternalResources() += other.ExternalResources();
+    Filters() += other.Filters();
+    SubDataSets() += other;
+
+    return *this;
+}
+
+std::shared_ptr<DataSetBase> DataSetBase::Create(const std::string& typeName)
+{
+    if (typeName == std::string("DataSet")) return std::make_shared<DataSetBase>();
+    if (typeName == std::string("SubreadSet")) return std::make_shared<SubreadSet>();
+    if (typeName == std::string("AlignmentSet")) return std::make_shared<AlignmentSet>();
+    if (typeName == std::string("BarcodeSet")) return std::make_shared<BarcodeSet>();
+    if (typeName == std::string("ConsensusAlignmentSet"))
+        return std::make_shared<ConsensusAlignmentSet>();
+    if (typeName == std::string("ConsensusReadSet")) return std::make_shared<ConsensusReadSet>();
+    if (typeName == std::string("ContigSet")) return std::make_shared<ContigSet>();
+    if (typeName == std::string("HdfSubreadSet")) return std::make_shared<HdfSubreadSet>();
+    if (typeName == std::string("ReferenceSet")) return std::make_shared<ReferenceSet>();
+    if (typeName == std::string("TranscriptSet")) return std::make_shared<TranscriptSet>();
+    if (typeName == std::string("TranscriptAlignmentSet"))
+        return std::make_shared<TranscriptAlignmentSet>();
+
+    // unknown typename
+    throw std::runtime_error{"unsupported dataset type"};
+}
+
+// -------------------
+// DataSetMetadata
+// -------------------
+
+// Creates a "DataSetMetadata" element and stores the two values passed in.
+// Note that TotalLength is set before NumRecords even though the parameters
+// arrive in the opposite order.
+// NOTE(review): presumably this fixes the order of the child elements in the
+// written XML - confirm before reordering.
+DataSetMetadata::DataSetMetadata(const std::string& numRecords, const std::string& totalLength)
+    : DataSetElement("DataSetMetadata", XsdType::DATASETS)
+{
+    TotalLength(totalLength);
+    NumRecords(numRecords);
+}
+
+// Generates the const and non-const Provenance() getters
+// (see DEFINE_ACCESSORS in DataSetUtils.h).
+DEFINE_ACCESSORS(DataSetMetadata, Provenance, Provenance)
+
+// Setter: overwrites the Provenance child with a copy of `provenance` and
+// returns *this so calls can be chained.
+DataSetMetadata& DataSetMetadata::Provenance(const PacBio::BAM::Provenance& provenance)
+{
+    Provenance() = provenance;
+    return *this;
+}
+
+// Combines the TotalLength and NumRecords values of `other` into this object
+// using operator+ on whatever TotalLength()/NumRecords() return; no other
+// metadata is merged. Returns *this.
+//
+// NOTE(review): the constructor takes both values as std::string. If the
+// accessors return strings too, '+' concatenates the digits ("10" + "5" gives
+// "105") instead of adding the numbers - confirm against the accessor
+// declarations.
+DataSetMetadata& DataSetMetadata::operator+=(const DataSetMetadata& other)
+{
+    TotalLength() = TotalLength() + other.TotalLength();
+    NumRecords() = NumRecords() + other.NumRecords();
+    // merge add'l
+    return *this;
+}
+
+// -------------------
+// ExtensionElement
+// -------------------
+
+// Default constructor: creates an element labelled "ExtensionElement" in the
+// base-data-model XSD.
+ExtensionElement::ExtensionElement() : DataSetElement("ExtensionElement", XsdType::BASE_DATA_MODEL)
+{
+}
+
+// -------------------
+// Extensions
+// -------------------
+
+// Default constructor: creates the "Extensions" list element (a list of
+// ExtensionElement) in the base-data-model XSD.
+Extensions::Extensions()
+    : DataSetListElement<ExtensionElement>("Extensions", XsdType::BASE_DATA_MODEL)
+{
+}
+
+// -------------------
+// ExternalResource
+// -------------------
+
+// Creates an external resource for a BAM file: the file's name is passed as
+// the filename and the metatype is fixed to
+// "PacBio.SubreadFile.SubreadBamFile".
+ExternalResource::ExternalResource(const BamFile& bamFile)
+    : IndexedDataType("PacBio.SubreadFile.SubreadBamFile", bamFile.Filename(), "ExternalResource",
+                      XsdType::BASE_DATA_MODEL)
+{
+}
+
+// Creates an external resource from an explicit metatype and filename.
+ExternalResource::ExternalResource(const std::string& metatype, const std::string& filename)
+    : IndexedDataType(metatype, filename, "ExternalResource", XsdType::BASE_DATA_MODEL)
+{
+}
+
+// Generates the const and non-const ExternalResources() getters for the
+// nested resource list (see DEFINE_ACCESSORS in DataSetUtils.h).
+DEFINE_ACCESSORS(ExternalResource, ExternalResources, ExternalResources)
+
+// Setter: overwrites the nested ExternalResources child with a copy of
+// `resources` and returns *this so calls can be chained.
+ExternalResource& ExternalResource::ExternalResources(
+    const PacBio::BAM::ExternalResources& resources)
+{
+    ExternalResources() = resources;
+    return *this;
+}
+
+// Constructs a BamFile from this resource's ResourceId.
+BamFile ExternalResource::ToBamFile() const { return BamFile(ResourceId()); }
+
+// -------------------
+// ExternalResources
+// -------------------
+
+// Default constructor: creates the "ExternalResources" list element (a list
+// of ExternalResource) in the base-data-model XSD.
+ExternalResources::ExternalResources()
+    : DataSetListElement<ExternalResource>("ExternalResources", XsdType::BASE_DATA_MODEL)
+{
+}
+
+ExternalResources& ExternalResources::operator+=(const ExternalResources& other)
+{
+    // only keep unique resource ids
+
+    std::set<std::string> myResourceIds;
+    for (size_t i = 0; i < Size(); ++i) {
+        const ExternalResource& resource = this->operator[](i);
+        myResourceIds.insert(resource.ResourceId());
+    }
+
+    std::vector<size_t> newResourceIndices;
+    const size_t numOtherResourceIds = other.Size();
+    for (size_t i = 0; i < numOtherResourceIds; ++i) {
+        const std::string& resourceId = other[i].ResourceId();
+        auto found = myResourceIds.find(resourceId);
+        if (found == myResourceIds.cend()) newResourceIndices.push_back(i);
+    }
+
+    for (size_t index : newResourceIndices)
+        Add(other[index]);
+
+    return *this;
+}
+
+// Appends `ext` unless an entry with the same ResourceId is already present,
+// in which case the list is left unchanged.
+void ExternalResources::Add(const ExternalResource& ext)
+{
+    // disallow external resources w/ duplicate ResourceIds
+    for (size_t i = 0; i < Size(); ++i) {
+        // bind to a const reference so the const ResourceId() accessor is used
+        const ExternalResource& existing = (*this)[i];
+        if (existing.ResourceId() == ext.ResourceId()) return;
+    }
+    AddChild(ext);
+}
+
+// Returns one BamFile per resource in this list, in list order, each built
+// via ExternalResource::ToBamFile().
+std::vector<BamFile> ExternalResources::BamFiles() const
+{
+    std::vector<BamFile> result;
+    // pass the element count straight through; no narrowing to int
+    result.reserve(Size());
+    for (const ExternalResource& ext : *this)
+        result.push_back(ext.ToBamFile());
+    return result;
+}
+
+// Removes `ext` from this list by forwarding to RemoveChild().
+void ExternalResources::Remove(const ExternalResource& ext) { RemoveChild(ext); }
+
+// -------------------
+// FileIndex
+// -------------------
+
+// Creates a "FileIndex" element from an explicit metatype and filename.
+FileIndex::FileIndex(const std::string& metatype, const std::string& filename)
+    : InputOutputDataType(metatype, filename, "FileIndex", XsdType::BASE_DATA_MODEL)
+{
+}
+
+// -------------------
+// FileIndices
+// -------------------
+
+// Default constructor: creates the "FileIndices" list element (a list of
+// FileIndex) in the base-data-model XSD.
+FileIndices::FileIndices() : DataSetListElement<FileIndex>("FileIndices", XsdType::BASE_DATA_MODEL)
+{
+}
+
+// Appends `index` to this list by forwarding to AddChild(); unlike
+// ExternalResources::Add(), no duplicate check is made here.
+void FileIndices::Add(const FileIndex& index) { AddChild(index); }
+
+// Removes `index` from this list by forwarding to RemoveChild().
+void FileIndices::Remove(const FileIndex& index) { RemoveChild(index); }
+
+// -------------------
+// Filter
+// -------------------
+
+// Default constructor: creates an element labelled "Filter" in the datasets XSD.
+Filter::Filter() : DataSetElement("Filter", XsdType::DATASETS) {}
+
+// Generates the const and non-const Properties() getters
+// (see DEFINE_ACCESSORS in DataSetUtils.h).
+DEFINE_ACCESSORS(Filter, Properties, Properties)
+
+// Setter: overwrites the Properties child with a copy of `properties` and
+// returns *this so calls can be chained.
+Filter& Filter::Properties(const PacBio::BAM::Properties& properties)
+{
+    Properties() = properties;
+    return *this;
+}
+
+// -------------------
+// Filters
+// -------------------
+
+// Default constructor: creates the "Filters" list element (a list of Filter)
+// in the datasets XSD.
+Filters::Filters() : DataSetListElement<Filter>("Filters", XsdType::DATASETS) {}
+
+// Appends every filter of `other` to this list, in order and without any
+// duplicate check. Returns *this.
+Filters& Filters::operator+=(const Filters& other)
+{
+    for (const auto& incoming : other) {
+        AddChild(incoming);
+    }
+    return *this;
+}
+
+// Appends `filter` to this list by forwarding to AddChild().
+void Filters::Add(const Filter& filter) { AddChild(filter); }
+
+// Removes `filter` from this list by forwarding to RemoveChild().
+void Filters::Remove(const Filter& filter) { RemoveChild(filter); }
+
+// -------------------
+// HdfSubreadSet
+// -------------------
+
+// Default constructor: forwards this type's metatype, element label and XSD
+// type to the DataSetBase constructor.
+HdfSubreadSet::HdfSubreadSet()
+    : DataSetBase("PacBio.DataSet.HdfSubreadSet", "HdfSubreadSet", XsdType::DATASETS)
+{
+}
+
+// -------------------
+// ParentTool
+// -------------------
+
+// Default constructor: creates an element labelled "ParentTool" in the
+// datasets XSD.
+ParentTool::ParentTool() : BaseEntityType("ParentTool", XsdType::DATASETS) {}
+
+// -------------------
+// Properties
+// -------------------
+
+// Default constructor: creates the "Properties" list element (a list of
+// Property) in the base-data-model XSD.
+Properties::Properties() : DataSetListElement<Property>("Properties", XsdType::BASE_DATA_MODEL) {}
+
+// Appends `property` to this list by forwarding to AddChild().
+void Properties::Add(const Property& property) { AddChild(property); }
+
+// Removes `property` from this list by forwarding to RemoveChild().
+void Properties::Remove(const Property& property) { RemoveChild(property); }
+
+// -------------------
+// Property
+// -------------------
+
+// Creates a "Property" element and stores the given name, value and operator
+// through the corresponding setters, in that order.
+Property::Property(const std::string& name, const std::string& value, const std::string& op)
+    : DataSetElement("Property", XsdType::BASE_DATA_MODEL)
+{
+    Name(name);
+    Value(value);
+    Operator(op);
+}
+
+// -------------------
+// Provenance
+// -------------------
+
+// Default constructor: creates an element labelled "Provenance" in the
+// datasets XSD.
+Provenance::Provenance() : DataSetElement("Provenance", XsdType::DATASETS) {}
+
+// Generates the const and non-const ParentTool() getters
+// (see DEFINE_ACCESSORS in DataSetUtils.h).
+DEFINE_ACCESSORS(Provenance, ParentTool, ParentTool)
+
+// -------------------
+// ReferenceSet
+// -------------------
+
+// Default constructor: forwards this type's metatype, element label and XSD
+// type to the DataSetBase constructor.
+ReferenceSet::ReferenceSet()
+    : DataSetBase("PacBio.DataSet.ReferenceSet", "ReferenceSet", XsdType::DATASETS)
+{
+}
+
+// -------------------
+// SubDataSets
+// -------------------
+
+// Default constructor: creates the list element labelled "DataSets" (a list
+// of DataSetBase) in the datasets XSD.
+SubDataSets::SubDataSets()
+    : internal::DataSetListElement<DataSetBase>("DataSets", XsdType::DATASETS)
+{
+}
+
+// Appends a single dataset to this list via AddChild(). Returns *this.
+SubDataSets& SubDataSets::operator+=(const DataSetBase& other)
+{
+    AddChild(other);
+    return *this;
+}
+
+// Appends every dataset held by `other` to this list, in order and without
+// any duplicate check. Returns *this.
+SubDataSets& SubDataSets::operator+=(const SubDataSets& other)
+{
+    for (const auto& incoming : other) {
+        AddChild(incoming);
+    }
+    return *this;
+}
+
+// Appends `subdataset` to this list by forwarding to AddChild().
+void SubDataSets::Add(const DataSetBase& subdataset) { AddChild(subdataset); }
+
+// Removes `subdataset` from this list by forwarding to RemoveChild().
+void SubDataSets::Remove(const DataSetBase& subdataset) { RemoveChild(subdataset); }
+
+// -------------------
+// SubreadSet
+// -------------------
+
+// Default constructor: forwards this type's metatype, element label and XSD
+// type to the DataSetBase constructor.
+SubreadSet::SubreadSet() : DataSetBase("PacBio.DataSet.SubreadSet", "SubreadSet", XsdType::DATASETS)
+{
+}
+
+// -------------------
+// TranscriptSet
+// -------------------
+
+// Default constructor: forwards this type's metatype, element label and XSD
+// type to the DataSetBase constructor.
+TranscriptSet::TranscriptSet()
+    : DataSetBase("PacBio.DataSet.TranscriptSet", "TranscriptSet", XsdType::DATASETS)
+{
+}
+
+// -------------------
+// TranscriptAlignmentSet
+// -------------------
+
+// Default constructor: forwards this type's metatype, element label and XSD
+// type to the DataSetBase constructor.
+TranscriptAlignmentSet::TranscriptAlignmentSet()
+    : DataSetBase("PacBio.DataSet.TranscriptAlignmentSet", "TranscriptAlignmentSet",
+                  XsdType::DATASETS)
+{
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/DataSetUtils.h b/src/DataSetUtils.h

new file mode 100644 (file)

index 0000000..1b05f4c
--- /dev/null
+++ b/src/DataSetUtils.h
@@ -0,0 +1,70 @@
+// Author: Derek Barnett
+
+#ifndef DATASETUTILS_H
+#define DATASETUTILS_H
+
+#include <boost/uuid/random_generator.hpp>
+#include <boost/uuid/uuid_io.hpp>
+#include "pbbam/DataSetTypes.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// Version string constant ("3.0.1"). Declared `static` in a header, so each
+/// translation unit that includes this file gets its own copy.
+static const std::string XML_VERSION = std::string{"3.0.1"};
+
+/// Returns a reference to a single, default-constructed, immutable T.
+/// The object is a function-local static, so every call for the same T yields
+/// the same object. Used as the fallback value by the child accessors
+/// generated further down in this header.
+template <typename T>
+inline const T& NullObject()
+{
+    static const T empty;
+    return empty;
+}
+
+/// Specialization for DataSetMetadata, whose shared instance is built by
+/// passing two empty strings to its two-argument constructor.
+template <>
+inline const PacBio::BAM::DataSetMetadata& NullObject()
+{
+    static const PacBio::BAM::DataSetMetadata empty("", "");
+    return empty;
+}
+
+/// Returns a newly generated random UUID rendered as text.
+/// One boost random_generator is created on first use and reused by all
+/// later calls.
+inline std::string GenerateUuid()
+{
+    static boost::uuids::random_generator generator;
+    return boost::uuids::to_string(generator());
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#ifndef FETCH_CHILD_CONST_REF
+// Defines `const PacBio::BAM::Type& Class::Method() const`.
+// The generated getter looks up the child element whose name is the spelling
+// of `Type` (#Type). If that lookup throws any std::exception, the shared
+// NullObject<Type>() is returned instead, so the getter itself does not
+// propagate the error.
+#define FETCH_CHILD_CONST_REF(Class, Type, Method)            \
+                                                              \
+    const PacBio::BAM::Type& Class::Method() const            \
+    {                                                         \
+        try {                                                 \
+            return Child<PacBio::BAM::Type>(#Type);           \
+        } catch (std::exception&) {                           \
+            return internal::NullObject<PacBio::BAM::Type>(); \
+        }                                                     \
+    }
+#endif
+
+#ifndef FETCH_CHILD_REF
+// Defines `PacBio::BAM::Type& Class::Method()`.
+// The generated getter first adds a copy of NullObject<Type>() as a child when
+// no child named after `Type` (#Type) exists, then returns a mutable
+// reference to that child. Calling it can therefore modify the element.
+#define FETCH_CHILD_REF(Class, Type, Method)                                       \
+                                                                                   \
+    PacBio::BAM::Type& Class::Method()                                             \
+    {                                                                              \
+        if (!HasChild(#Type)) AddChild(internal::NullObject<PacBio::BAM::Type>()); \
+        return Child<PacBio::BAM::Type>(#Type);                                    \
+    }
+#endif
+
+#ifndef DEFINE_ACCESSORS
+// Emits both getters for a child element: the const one from
+// FETCH_CHILD_CONST_REF and the mutable one from FETCH_CHILD_REF.
+#define DEFINE_ACCESSORS(Class, Type, Method)  \
+    FETCH_CHILD_CONST_REF(Class, Type, Method) \
+    FETCH_CHILD_REF(Class, Type, Method)
+#endif
+
+#endif  // DATASETUTILS_H
diff --git a/src/DataSetXsd.cpp b/src/DataSetXsd.cpp

new file mode 100644 (file)

index 0000000..6ad8ebd
--- /dev/null
+++ b/src/DataSetXsd.cpp
@@ -0,0 +1,199 @@
+// File Description
+/// \file DataSetXsd.cpp
+/// \brief Implements the XSD- and namespace-related classes for DataSetXML.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/DataSetXsd.h"
+
+#include <unordered_map>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// clang-format off
+static std::map<XsdType, NamespaceInfo> DefaultRegistry()
+{
+    const auto result = std::map<XsdType, NamespaceInfo>
+    {
+        { XsdType::NONE,                   NamespaceInfo{ "", "" } },
+        { XsdType::AUTOMATION_CONSTRAINTS, NamespaceInfo{ "",       "http://pacificbiosciences.com/PacBioAutomationConstraints.xsd" } },
+        { XsdType::BASE_DATA_MODEL,        NamespaceInfo{ "pbbase", "http://pacificbiosciences.com/PacBioBaseDataModel.xsd" } },
+        { XsdType::COLLECTION_METADATA,    NamespaceInfo{ "pbmeta", "http://pacificbiosciences.com/PacBioCollectionMetadata.xsd" } },
+        { XsdType::COMMON_MESSAGES,        NamespaceInfo{ "",       "http://pacificbiosciences.com/PacBioCommonMessages.xsd" } },
+        { XsdType::DATA_MODEL,             NamespaceInfo{ "pbdm",   "http://pacificbiosciences.com/PacBioDataModel.xsd" } },
+        { XsdType::DATA_STORE,             NamespaceInfo{ "",       "http://pacificbiosciences.com/PacBioDataStore.xsd" } },
+        { XsdType::DATASETS,               NamespaceInfo{ "pbds",   "http://pacificbiosciences.com/PacBioDatasets.xsd" } },
+        { XsdType::DECL_DATA,              NamespaceInfo{ "",       "http://pacificbiosciences.com/PacBioDeclData.xsd" } },
+        { XsdType::PART_NUMBERS,           NamespaceInfo{ "pbpn",   "http://pacificbiosciences.com/PacBioPartNumbers.xsd" } },
+        { XsdType::PRIMARY_METRICS,        NamespaceInfo{ "",       "http://pacificbiosciences.com/PacBioPrimaryMetrics.xsd" } },
+        { XsdType::REAGENT_KIT,            NamespaceInfo{ "pbrk",   "http://pacificbiosciences.com/PacBioReagentKit.xsd" } },
+        { XsdType::RIGHTS_AND_ROLES,       NamespaceInfo{ "",       "http://pacificbiosciences.com/PacBioRightsAndRoles.xsd" } },
+        { XsdType::SAMPLE_INFO,            NamespaceInfo{ "pbsample", "http://pacificbiosciences.com/PacBioSampleInfo.xsd" } },
+        { XsdType::SEEDING_DATA,           NamespaceInfo{ "",       "http://pacificbiosciences.com/PacBioSeedingData.xsd" } }
+    };
+    return result;
+}
+
+static const auto elementRegistry = std::unordered_map<std::string, XsdType>
+{
+    // 'pbbase' elements
+    //
+    { "AutomationParameter" ,  XsdType::BASE_DATA_MODEL },
+    { "AutomationParameters" , XsdType::BASE_DATA_MODEL },
+    { "BinCount" ,             XsdType::BASE_DATA_MODEL },
+    { "BinCounts" ,            XsdType::BASE_DATA_MODEL },
+    { "BinLabel" ,             XsdType::BASE_DATA_MODEL },
+    { "BinLabels" ,            XsdType::BASE_DATA_MODEL },
+    { "BinWidth" ,             XsdType::BASE_DATA_MODEL },
+    { "ExternalResource" ,     XsdType::BASE_DATA_MODEL },
+    { "ExternalResources" ,    XsdType::BASE_DATA_MODEL },
+    { "FileIndex" ,            XsdType::BASE_DATA_MODEL },
+    { "FileIndices" ,          XsdType::BASE_DATA_MODEL },
+    { "MaxBinValue" ,          XsdType::BASE_DATA_MODEL },
+    { "MaxOutlierValue" ,      XsdType::BASE_DATA_MODEL },
+    { "MetricDescription" ,    XsdType::BASE_DATA_MODEL },
+    { "NumBins" ,              XsdType::BASE_DATA_MODEL },
+    { "Properties" ,           XsdType::BASE_DATA_MODEL },
+    { "Property" ,             XsdType::BASE_DATA_MODEL },
+    { "Sample95thPct" ,        XsdType::BASE_DATA_MODEL },
+    { "SampleMean" ,           XsdType::BASE_DATA_MODEL },
+    { "SampleMed" ,            XsdType::BASE_DATA_MODEL },
+    { "SampleSize" ,           XsdType::BASE_DATA_MODEL },
+    { "SampleStd" ,            XsdType::BASE_DATA_MODEL },
+
+    // 'pbds' elements
+    //
+    { "AdapterDimerFraction",  XsdType::DATASETS },
+    { "AlignmentSet",          XsdType::DATASETS },
+    { "BarcodeConstruction",   XsdType::DATASETS },
+    { "BarcodeSet",            XsdType::DATASETS },
+    { "ConsensusAlignmentSet", XsdType::DATASETS },
+    { "ConsensusReadSet",      XsdType::DATASETS },
+    { "Contig",                XsdType::DATASETS },
+    { "Contigs",               XsdType::DATASETS },
+    { "ContigSet",             XsdType::DATASETS },
+    { "ControlReadLenDist",    XsdType::DATASETS },
+    { "ControlReadQualDist",   XsdType::DATASETS },
+    { "DataSetMetdata",        XsdType::DATASETS },
+    { "DataSet",               XsdType::DATASETS },
+    { "DataSets",              XsdType::DATASETS },
+    { "Filter",                XsdType::DATASETS },
+    { "Filters",               XsdType::DATASETS },
+    { "HdfSubreadSet",         XsdType::DATASETS },
+    { "InsertReadLenDist",     XsdType::DATASETS },
+    { "InsertReadQualDist" ,   XsdType::DATASETS },
+    { "MedianInsertDist",      XsdType::DATASETS },
+    { "NumRecords",            XsdType::DATASETS },
+    { "NumSequencingZmws",     XsdType::DATASETS },
+    { "Organism",              XsdType::DATASETS },
+    { "ParentTool",            XsdType::DATASETS },
+    { "Ploidy",                XsdType::DATASETS },
+    { "ProdDist",              XsdType::DATASETS },
+    { "Provenance",            XsdType::DATASETS },
+    { "ReadLenDist",           XsdType::DATASETS },
+    { "ReadQualDist",          XsdType::DATASETS },
+    { "ReadTypeDist",          XsdType::DATASETS },
+    { "ReferenceSet",          XsdType::DATASETS },
+    { "ShortInsertFraction",   XsdType::DATASETS },
+    { "SubreadSet",            XsdType::DATASETS },
+    { "SummaryStats",          XsdType::DATASETS },
+    { "TotalLength",           XsdType::DATASETS },
+    { "TranscriptSet",         XsdType::DATASETS },
+    { "TranscriptAlignmentSet",XsdType::DATASETS },
+
+    // 'pbmeta' elements
+    //
+    { "Automation",           XsdType::COLLECTION_METADATA },
+    { "AutomationName",       XsdType::COLLECTION_METADATA },
+    { "CellIndex",            XsdType::COLLECTION_METADATA },
+    { "CellPac",              XsdType::COLLECTION_METADATA },
+    { "CollectionFileCopy",   XsdType::COLLECTION_METADATA },
+    { "CollectionMetadata",   XsdType::COLLECTION_METADATA },
+    { "CollectionNumber",     XsdType::COLLECTION_METADATA },
+    { "CollectionPathUri",    XsdType::COLLECTION_METADATA },
+    { "Collections",          XsdType::COLLECTION_METADATA },
+    { "Concentration",        XsdType::COLLECTION_METADATA },
+    { "ConfigFileName",       XsdType::COLLECTION_METADATA },
+    { "CopyFiles",            XsdType::COLLECTION_METADATA },
+    { "InstCtrlVer",          XsdType::COLLECTION_METADATA },
+    { "MetricsVerbosity",     XsdType::COLLECTION_METADATA },
+    { "Name",                 XsdType::COLLECTION_METADATA },
+    { "OutputOptions",        XsdType::COLLECTION_METADATA },
+    { "PlateId",              XsdType::COLLECTION_METADATA },
+    { "Primary",              XsdType::COLLECTION_METADATA },
+    { "Readout",              XsdType::COLLECTION_METADATA },
+    { "ResultsFolder",        XsdType::COLLECTION_METADATA },
+    { "RunDetails",           XsdType::COLLECTION_METADATA },
+    { "RunId",                XsdType::COLLECTION_METADATA },
+    { "SampleReuseEnabled",   XsdType::COLLECTION_METADATA },
+    { "SequencingCondition",  XsdType::COLLECTION_METADATA },
+    { "SigProcVer",           XsdType::COLLECTION_METADATA },
+    { "SizeSelectionEnabled", XsdType::COLLECTION_METADATA },
+    { "StageHotstartEnabled", XsdType::COLLECTION_METADATA },
+    { "UseCount",             XsdType::COLLECTION_METADATA },
+    { "WellName",             XsdType::COLLECTION_METADATA },
+    { "WellSample",           XsdType::COLLECTION_METADATA },
+
+    // 'pbsample' elements
+    //
+    { "BioSample",         XsdType::SAMPLE_INFO },
+    { "BioSamplePointer",  XsdType::SAMPLE_INFO },
+    { "BioSamplePointers", XsdType::SAMPLE_INFO },
+    { "BioSamples",        XsdType::SAMPLE_INFO }
+};
+// clang-format on
+
+}  // namespace internal
+
+// ---------------
+// NamespaceInfo
+// ---------------
+
+// Stores the namespace name and URI. Both strings are taken by value and
+// moved into the members.
+NamespaceInfo::NamespaceInfo(std::string name, std::string uri)
+    : name_(std::move(name)), uri_(std::move(uri))
+{
+}
+
+// -------------------
+// NamespaceRegistry
+// -------------------
+
+// Default constructor: starts from the table built by internal::DefaultRegistry().
+NamespaceRegistry::NamespaceRegistry() : data_(internal::DefaultRegistry()) {}
+
+// Returns the namespace info registered for the current default XSD type.
+const NamespaceInfo& NamespaceRegistry::DefaultNamespace() const { return Namespace(DefaultXsd()); }
+
+// Returns the XSD type currently set as the default.
+XsdType NamespaceRegistry::DefaultXsd() const { return defaultXsdType_; }
+
+// Returns the namespace info registered for `xsd`. The lookup uses at(), so
+// an XSD type with no entry raises std::out_of_range.
+const NamespaceInfo& NamespaceRegistry::Namespace(const XsdType& xsd) const
+{
+    return data_.at(xsd);
+}
+
+// Associates `namespaceInfo` with `xsd`, replacing any entry already stored
+// for that XSD type.
+void NamespaceRegistry::Register(const XsdType& xsd, const NamespaceInfo& namespaceInfo)
+{
+    data_[xsd] = namespaceInfo;
+}
+
+// Sets the XSD type reported by DefaultXsd() and used by DefaultNamespace().
+void NamespaceRegistry::SetDefaultXsd(const XsdType& xsd) { defaultXsdType_ = xsd; }
+
+// Returns the XSD type recorded for an element label in
+// internal::elementRegistry, or XsdType::NONE when the label is not listed.
+XsdType NamespaceRegistry::XsdForElement(const std::string& elementLabel) const
+{
+    const auto match = internal::elementRegistry.find(elementLabel);
+    if (match == internal::elementRegistry.cend()) {
+        return XsdType::NONE;
+    }
+    return match->second;
+}
+
+// Reverse lookup: scans the registered namespaces and returns the XSD type of
+// the first entry whose URI equals `uri`; XsdType::NONE if none matches.
+XsdType NamespaceRegistry::XsdForUri(const std::string& uri) const
+{
+    for (const auto& registered : data_) {
+        if (registered.second.Uri() == uri) {
+            return registered.first;
+        }
+    }
+    return XsdType::NONE;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/EntireFileQuery.cpp b/src/EntireFileQuery.cpp

new file mode 100644 (file)

index 0000000..d13f82c
--- /dev/null
+++ b/src/EntireFileQuery.cpp
@@ -0,0 +1,33 @@
+// File Description
+/// \file EntireFileQuery.cpp
+/// \brief Implements the EntireFileQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/EntireFileQuery.h"
+
+#include "pbbam/CompositeBamReader.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Private implementation of EntireFileQuery: holds the composite reader that
+// is constructed from the dataset handed to the query.
+struct EntireFileQuery::EntireFileQueryPrivate
+{
+    EntireFileQueryPrivate(const DataSet &dataset) : reader_(dataset) {}
+
+    // reader that GetNext() pulls records from
+    SequentialCompositeBamReader reader_;
+};
+
+// Creates the query and its private implementation for the given dataset.
+EntireFileQuery::EntireFileQuery(const DataSet &dataset)
+    : internal::IQuery(), d_(new EntireFileQueryPrivate(dataset))
+{
+}
+
+// Defined in this file, where EntireFileQueryPrivate is a complete type.
+EntireFileQuery::~EntireFileQuery() {}
+
+// Fetches the next record into `r` by delegating to the composite reader and
+// returns that reader's result.
+bool EntireFileQuery::GetNext(BamRecord &r) { return d_->reader_.GetNext(r); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/EnumClassHash.h b/src/EnumClassHash.h

new file mode 100644 (file)

index 0000000..13b8575
--- /dev/null
+++ b/src/EnumClassHash.h
@@ -0,0 +1,53 @@
+// File Description
+/// \file EnumClassHash.h
+/// \brief Defines the EnumClassHash class.
+//
+// Author: Derek Barnett
+
+#ifndef ENUMCLASSHASH_H
+#define ENUMCLASSHASH_H
+
+#include <cstddef>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+///
+/// \brief The EnumClassHash struct enables the use of enum class types as keys
+///        for std::unordered_map.
+///
+/// Allows something like:
+///
+/// \code{.cpp}
+///    std::unordered_map<Key_t, Value_t, EnumClassHash> myLookup;
+/// \endcode
+///
+/// where Key_t is an enum class. Without this sort of extra hand-holding to
+/// provide a 'manual' hash value, enum classes as keys will fail to compile.
+///
+/// \note This approach might be unnecessary in C++14, if I understand some of
+/// the changes correctly. But this works for C++11 and should continue beyond.
+///
+/// \sa http://stackoverflow.com/questions/18837857/cant-use-enum-class-as-unordered-map-key
+///
+struct EnumClassHash
+{
+    // *** NOTE ***
+    //
+    // Remove this when we integrate pbcopper.
+    // This is a duplicate of pbcopper/utility/EnumClassHash.h
+    //
+
+    /// Hash value of an enumerator: its underlying value converted to
+    /// std::size_t.
+    ///
+    /// \param[in] t  enumerator (or any value convertible via static_cast)
+    /// \returns static_cast<std::size_t>(t)
+    ///
+    // std::size_t is spelled out because <cstddef> only guarantees the name
+    // inside namespace std.
+    template <typename T>
+    std::size_t operator()(const T t) const
+    {
+        return static_cast<std::size_t>(t);
+    }
+};
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // ENUMCLASSHASH_H
diff --git a/src/FastaReader.cpp b/src/FastaReader.cpp

new file mode 100644 (file)

index 0000000..8698e6e
--- /dev/null
+++ b/src/FastaReader.cpp
@@ -0,0 +1,110 @@
+// File Description
+/// \file FastaReader.cpp
+/// \brief Implements the FastaReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/FastaReader.h"
+
+#include <fstream>
+#include <iostream>
+#include <limits>
+#include <stdexcept>
+
+#include <htslib/faidx.h>
+
+#include "pbbam/MakeUnique.h"
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+struct FastaReaderPrivate
+{
+    std::ifstream stream_;
+    std::string name_;
+    std::string bases_;
+
+    FastaReaderPrivate(const std::string& fn) : stream_{fn}
+    {
+        if (!stream_)
+            throw std::runtime_error{"FastaReader - could not open " + fn + " for reading"};
+        FetchNext();
+    }
+
+    bool GetNext(FastaSequence& record)
+    {
+        if (name_.empty() && bases_.empty()) return false;
+        record = FastaSequence{name_, bases_};
+        FetchNext();
+        return true;
+    }
+
+private:
+    void FetchNext()
+    {
+        name_.clear();
+        bases_.clear();
+
+        SkipNewlines();
+        ReadName();
+        ReadBases();
+
+        bases_ = RemoveAllWhitespace(std::move(bases_));
+    }
+
+    inline void SkipNewlines()
+    {
+        if (!stream_) return;
+        if (stream_.peek() == '\n')
+            stream_.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
+    }
+
+    void ReadName()
+    {
+        if (!stream_) return;
+        if (stream_.get() == '>') std::getline(stream_, name_, '\n');
+    }
+
+    void ReadBases()
+    {
+        if (!stream_) return;
+        int p = stream_.peek();
+        while (static_cast<char>(p) != '>' && p != EOF) {
+            if (!stream_) return;
+            std::string line;
+            std::getline(stream_, line, '\n');
+            bases_ += line;
+            if (!stream_) return;
+            p = stream_.peek();
+        }
+    }
+};
+
+}  // namespace internal
+
+// Opens fn for reading by creating the private implementation; that
+// constructor throws std::runtime_error if the file cannot be opened.
+FastaReader::FastaReader(const std::string& fn)
+    : d_{std::make_unique<internal::FastaReaderPrivate>(fn)}
+{
+}
+
+// Defined in this file, where FastaReaderPrivate is a complete type.
+FastaReader::~FastaReader() {}
+
+// Stores the next sequence in `record`; returns false when no record is left.
+bool FastaReader::GetNext(FastaSequence& record) { return d_->GetNext(record); }
+
+// Reads every sequence of the FASTA file fn into a vector, in file order.
+// Space for 256 entries is reserved up front. Errors from opening the file
+// propagate from the FastaReader constructor.
+std::vector<FastaSequence> FastaReader::ReadAll(const std::string& fn)
+{
+    std::vector<FastaSequence> sequences;
+    sequences.reserve(256);
+
+    FastaReader reader{fn};
+    for (FastaSequence current; reader.GetNext(current);) {
+        sequences.emplace_back(current);
+    }
+    return sequences;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FastaSequenceQuery.cpp b/src/FastaSequenceQuery.cpp

new file mode 100644 (file)

index 0000000..55d2bdd
--- /dev/null
+++ b/src/FastaSequenceQuery.cpp
@@ -0,0 +1,34 @@
+// File Description
+/// \file FastaSequenceQuery.cpp
+/// \brief Implements the FastaSequenceQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/FastaSequenceQuery.h"
+
+#include "pbbam/CompositeFastaReader.h"
+#include "pbbam/MakeUnique.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Private implementation of FastaSequenceQuery: holds the composite FASTA
+// reader that is constructed from the dataset handed to the query.
+struct FastaSequenceQuery::FastaSequenceQueryPrivate
+{
+    FastaSequenceQueryPrivate(const DataSet& dataset) : reader_{dataset} {}
+
+    // reader that GetNext() pulls sequences from
+    CompositeFastaReader reader_;
+};
+
+// Creates the query and its private implementation for the given dataset.
+FastaSequenceQuery::FastaSequenceQuery(const DataSet& dataset)
+    : internal::QueryBase<FastaSequence>(), d_{std::make_unique<FastaSequenceQueryPrivate>(dataset)}
+{
+}
+
+// Defined in this file, where FastaSequenceQueryPrivate is a complete type.
+FastaSequenceQuery::~FastaSequenceQuery() {}
+
+// Fetches the next sequence into `seq` by delegating to the composite reader
+// and returns that reader's result.
+bool FastaSequenceQuery::GetNext(FastaSequence& seq) { return d_->reader_.GetNext(seq); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FastqReader.cpp b/src/FastqReader.cpp

new file mode 100644 (file)

index 0000000..60ce12b
--- /dev/null
+++ b/src/FastqReader.cpp
@@ -0,0 +1,104 @@
+// File Description
+/// \file FastqReader.cpp
+/// \brief Implements the FastqReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/FastqReader.h"
+
+#include <fstream>
+#include <iostream>
+#include <limits>
+#include <stdexcept>
+
+#include <htslib/faidx.h>
+
+#include "pbbam/MakeUnique.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Private implementation of FastqReader.
+//
+// Reads one record ahead: the constructor and every successful GetNext() call
+// FetchNext(), so name_/bases_/quals_ always hold the record that the next
+// GetNext() will hand out (or are all empty once nothing more could be read).
+struct FastqReaderPrivate
+{
+public:
+    // Opens 'fn' and pre-loads the first record.
+    // Throws std::runtime_error if the stream is not usable after opening.
+    explicit FastqReaderPrivate(const std::string& fn) : stream_{fn}
+    {
+        if (!stream_)
+            throw std::runtime_error{"FastqReader - could not open " + fn + " for reading"};
+        FetchNext();
+    }
+
+    // Copies the buffered record into 'record', then buffers the following one.
+    // Returns false (leaving 'record' untouched) when name, bases and quals are all empty.
+    bool GetNext(FastqSequence& record)
+    {
+        if (name_.empty() && bases_.empty() && quals_.empty()) return false;
+        record = FastqSequence{name_, bases_, quals_};
+        FetchNext();
+        return true;
+    }
+
+private:
+    // Clears the buffered fields, then reads in order: name line, bases line,
+    // one ignored line, quals line. Returns early - leaving the fields empty -
+    // if the stream is already in a failed or EOF state.
+    void FetchNext()
+    {
+        name_.clear();
+        bases_.clear();
+        quals_.clear();
+
+        if (!stream_ || stream_.eof()) return;
+
+        SkipNewlines();
+
+        ReadName();
+        ReadBases();
+        stream_.ignore(std::numeric_limits<std::streamsize>::max(), '\n');  // ignore "comment line"
+        ReadQuals();
+    }
+
+    // If the next character is a newline, consumes input up to and including it.
+    // Note: one call removes at most a single newline.
+    inline void SkipNewlines()
+    {
+        if (stream_.peek() == '\n')
+            stream_.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
+    }
+
+    // Consumes one character; only if it is '@' is the rest of that line stored in name_.
+    void ReadName()
+    {
+        if (stream_.get() == '@') std::getline(stream_, name_, '\n');
+    }
+
+    // Stores the next line in bases_.
+    void ReadBases() { std::getline(stream_, bases_, '\n'); }
+
+    // Stores the next line in quals_.
+    void ReadQuals() { std::getline(stream_, quals_, '\n'); }
+
+private:
+    std::ifstream stream_;  // input file
+    std::string name_;      // buffered record: text after the leading '@'
+    std::string bases_;     // buffered record: bases line
+    std::string quals_;     // buffered record: qualities line
+};
+
+}  // namespace internal
+
+// Builds the reader by handing the filename to a heap-allocated
+// internal::FastqReaderPrivate, whose constructor throws std::runtime_error
+// if the file cannot be opened.
+FastqReader::FastqReader(const std::string& fn)
+    : d_{std::make_unique<internal::FastqReaderPrivate>(fn)}
+{
+}
+
+// Empty out-of-line destructor; d_ is released by its own destructor.
+FastqReader::~FastqReader() {}
+
+// Forwards to the private implementation's GetNext() and returns its result unchanged.
+bool FastqReader::GetNext(FastqSequence& record) { return d_->GetNext(record); }
+
+// Convenience helper: reads every record from the file 'fn' into a vector.
+//
+// A fresh FastqReader is created for 'fn' and drained with GetNext() until it
+// returns false; each record is copied into the result in file order.
+// Any exception raised while constructing the reader propagates to the caller.
+std::vector<FastqSequence> FastqReader::ReadAll(const std::string& fn)
+{
+    std::vector<FastqSequence> sequences;
+    sequences.reserve(256);  // initial capacity only; the vector grows as needed
+
+    FastqReader reader{fn};
+    for (FastqSequence current; reader.GetNext(current);)
+        sequences.push_back(current);
+
+    return sequences;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FileProducer.cpp b/src/FileProducer.cpp

new file mode 100644 (file)

index 0000000..36dc164
--- /dev/null
+++ b/src/FileProducer.cpp
@@ -0,0 +1,40 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "FileProducer.h"
+
+#include <cstdio>
+#include <exception>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Initializes with the target filename; the temp filename is the target plus ".tmp".
+//
+// Both arguments of the delegated call are derived from targetFilename, and the
+// order in which function arguments are evaluated is not specified. Moving the
+// string into the first argument could therefore empty it before the second
+// argument builds "<target>.tmp". Passing a copy keeps both arguments well-defined.
+FileProducer::FileProducer(std::string targetFilename)
+    : FileProducer(targetFilename, targetFilename + ".tmp")
+{
+}
+
+// Initializes with an explicit target filename and temp filename.
+//
+// A target of "-" means stdout. In that case the temp filename is forced to "-"
+// as well, so derived classes can always operate on the temp filename and no
+// renaming is attempted on destruction.
+FileProducer::FileProducer(std::string targetFilename, std::string tempFilename)
+    : targetFilename_{std::move(targetFilename)}, tempFilename_{std::move(tempFilename)}
+{
+    const bool writingToStdout = (targetFilename_ == "-");
+    if (writingToStdout) tempFilename_ = "-";
+}
+
+// Renames the temp file to the target filename, unless the destructor runs as
+// part of stack unwinding or the output goes to stdout ("-").
+//
+// std::current_exception() only reports an exception that is currently being
+// *handled* (inside a catch block). It is null while the stack is still
+// unwinding, so it cannot tell "destroyed because of a throw" from normal
+// destruction. The uncaught-exception query is the one that can.
+FileProducer::~FileProducer()
+{
+#if defined(__cpp_lib_uncaught_exceptions)
+    const bool unwinding = (std::uncaught_exceptions() > 0);
+#else
+    const bool unwinding = std::uncaught_exception();
+#endif
+
+    // the result of std::rename is not checked: a destructor must not throw
+    if (!unwinding && (tempFilename_ != "-")) {
+        std::rename(tempFilename_.c_str(), targetFilename_.c_str());
+    }
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FileProducer.h b/src/FileProducer.h

new file mode 100644 (file)

index 0000000..f8b0b94
--- /dev/null
+++ b/src/FileProducer.h
@@ -0,0 +1,58 @@
+// Author: Derek Barnett
+
+#ifndef FILEPRODUCER_H
+#define FILEPRODUCER_H
+
+#include <cstdio>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// The FileProducer class provides functionality for working with a temp
+// file until successful destruction of a FileProducer-derived class.
+//
+// Derived classes should be sure to flush/close the temp file, and the
+// FileProducer's destructor will ensure that the temp file will be renamed to
+// the target filename.
+//
+// If destruction is triggered by an exception, no renaming will occur.
+// A target filename of "-" (stdout) also disables renaming.
+//
+class FileProducer
+{
+
+public:
+    FileProducer() = delete;
+
+    // Initializes FileProducer with specified target filename. Temp filename is
+    // set to target filename plus ".tmp" suffix.
+    FileProducer(std::string targetFilename);
+
+    // Initializes FileProducer with specified target filename & explicit temp
+    // filename.
+    FileProducer(std::string targetFilename, std::string tempFilename);
+
+    // Renames temp file to target filename.
+    //
+    // Derived classes should ensure that data is flushed and file handle closed
+    // before or during their destructor.
+    //
+    // Renaming will not occur if there is a 'live' exception being thrown.
+    //
+    ~FileProducer();
+
+public:
+    // Final output filename (the rename destination).
+    const std::string& TargetFilename() const { return targetFilename_; }
+
+    // Filename that derived classes should actually write to.
+    const std::string& TempFilename() const { return tempFilename_; }
+
+private:
+    std::string targetFilename_;
+    std::string tempFilename_;
+};
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FILEPRODUCER_H
diff --git a/src/FileUtils.cpp b/src/FileUtils.cpp

new file mode 100644 (file)

index 0000000..1199976
--- /dev/null
+++ b/src/FileUtils.cpp
@@ -0,0 +1,196 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "FileUtils.h"
+
+#include <sys/stat.h>
+#include <unistd.h>
+#include <cassert>
+#include <cstddef>
+#include <exception>
+#include <fstream>
+#include <iostream>
+#include <memory>
+
+#include <boost/algorithm/string.hpp>
+
+#include "StringUtils.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Strips a leading "file://" scheme from a URI/filepath.
+//
+// Input without the scheme is returned unchanged. If "file://" occurs anywhere
+// other than the very beginning, the input is treated as malformed and a
+// std::runtime_error is thrown. The input is expected to be non-empty (asserted).
+static std::string removeFileUriScheme(const std::string& uri)
+{
+    assert(!uri.empty());
+
+    const std::string scheme{"file://"};
+    const auto schemePos = uri.find(scheme);
+
+    // no scheme at all -> nothing to strip
+    if (schemePos == std::string::npos) return uri;
+
+    if (schemePos != 0) throw std::runtime_error{"Malformed URI: scheme not at beginning"};
+    return uri.substr(scheme.size());
+}
+
+#ifdef PBBAM_WIN_FILEPATHS
+
+// Drops a leading drive specifier ("C:") from a file path.
+//
+// A drive specifier is one alphabetic character followed by ':'. Paths without
+// one (including paths shorter than two characters) are returned unchanged.
+static std::string removeDiskName(const std::string& filePath)
+{
+    const bool hasDrivePrefix =
+        (filePath.size() >= 2) && (isalpha(filePath.at(0)) != 0) && (filePath.at(1) == ':');
+    return hasDrivePrefix ? filePath.substr(2) : filePath;
+}
+
+// path separator used when PBBAM_WIN_FILEPATHS is defined
+static const char native_pathSeparator = '\\';
+
+// Windows-path variant: reports whether filePath is absolute.
+//
+// Absolute means: starts with a backslash, or starts with a drive specifier
+// ("C:") whose remainder is itself absolute. Paths starting with '.' and
+// everything else are treated as relative. filePath must not be empty (asserted).
+static bool native_pathIsAbsolute(const std::string& filePath)
+{
+    assert(!filePath.empty());
+
+    // if starts with single slash or double slash
+    if (boost::algorithm::starts_with(filePath, "\\")) return true;
+
+    // if starts with single or double-dots -> not absolute
+    if (boost::algorithm::starts_with(filePath, ".")) return false;
+
+    // if starts with disk drive name and colon ("C:\foo\bar.txt")
+    // strip the drive name and check to see if the remaining path is absolute
+    if (filePath.size() >= 2) {
+        const char firstChar = filePath.at(0);
+        if ((isalpha(firstChar) != 0) && (filePath.at(1) == ':'))
+            return native_pathIsAbsolute(removeDiskName(filePath));
+    }
+
+    // otherwise, likely relative
+    return false;
+}
+
+// Windows-path variant: resolves filePath against the directory 'from'.
+//
+// The "file://" scheme is stripped first. An empty or absolute result is
+// returned as is; otherwise the drive name and a leading ".\" are removed and
+// the remainder is appended to 'from' with a backslash in between.
+static std::string native_resolvedFilePath(const std::string& filePath, const std::string& from)
+{
+    // strip file:// scheme if present
+    auto schemeLess = removeFileUriScheme(filePath);
+
+    // if empty or already absolute path, just return it
+    // upfront empty check simplifies further parsing logic
+    if (schemeLess.empty() || native_pathIsAbsolute(schemeLess)) return schemeLess;
+
+    // else make relative from the provided 'from' directory
+    //
+    // first pop disk name, then any leading single-dot '.'
+    //
+    // since we're prepending the 'from' directory, we can remove
+    // any leading './' from our file path. this may just mean that
+    // we pop it off to add it right back (when from == '.'), but this
+    // keeps it consistent with other 'from' parent directories
+    //
+    schemeLess = removeDiskName(schemeLess);
+
+    const bool thisDirAtStart = (schemeLess.find(".") == 0);
+    if (thisDirAtStart) {
+        // only strip when the first separator directly follows the leading dot
+        if (schemeLess.find(native_pathSeparator) == 1) schemeLess = schemeLess.substr(2);
+    }
+    return from + native_pathSeparator + schemeLess;
+}
+
+#else  // else for non-Windows systems
+
+// path separator used when PBBAM_WIN_FILEPATHS is not defined
+static const char native_pathSeparator = '/';
+
+// Non-Windows variant: a path is absolute when its first character is '/'.
+//
+// Uses the bounds-checked at(), so an empty path throws std::out_of_range.
+static bool native_pathIsAbsolute(const std::string& filePath)
+{
+    const char leadingChar = filePath.at(0);
+    return leadingChar == '/';
+}
+
+// Non-Windows variant: resolves filePath against the directory 'from'.
+//
+// The "file://" scheme is stripped first. An empty or absolute result is
+// returned as is. Otherwise a leading "./" is dropped and the remainder is
+// appended to 'from' with the native separator in between.
+static std::string native_resolvedFilePath(const std::string& filePath, const std::string& from)
+{
+    auto path = removeFileUriScheme(filePath);
+
+    // checking for empty up front keeps the absolute-path test (which reads the
+    // first character) and the prefix handling below simple
+    if (path.empty() || native_pathIsAbsolute(path)) return path;
+
+    // 'from' is prepended anyway, so a leading "./" carries no information.
+    // Removing it may just mean re-adding it (when from == "."), but it keeps
+    // results consistent for every other 'from' directory.
+    const bool startsWithThisDir =
+        (path.size() >= 2) && (path[0] == '.') && (path[1] == native_pathSeparator);
+    if (startsWithThisDir) path = path.substr(2);
+
+    return from + native_pathSeparator + path;
+}
+
+#endif  // PBBAM_WIN_FILEPATHS
+
+// see http://stackoverflow.com/questions/2869594/how-return-a-stdstring-from-cs-getcwd-function
+//
+// Returns the application's current working directory.
+//
+// A 1024-byte stack buffer is tried first. If getcwd() reports ERANGE (buffer
+// too small), heap buffers of 2..19 chunks of 1024 bytes are tried in turn.
+//
+// Throws std::runtime_error if getcwd() fails for any reason other than ERANGE,
+// or if the path does not fit in the largest buffer attempted.
+std::string FileUtils::CurrentWorkingDirectory()
+{
+    const size_t chunkSize = 1024;
+    const size_t maxNumChunks = 20;
+
+    // stack-based buffer for 'normal' case
+    char buffer[chunkSize];
+    if (getcwd(buffer, sizeof(buffer)) != nullptr) return std::string(buffer);
+
+    // if error is not ERANGE, then it's not a problem of too-long name... something else happened
+    if (errno != ERANGE)
+        throw std::runtime_error{"could not determine current working directory path"};
+
+    // long path - use heap, trying progressively longer buffers
+    for (size_t chunks = 2; chunks < maxNumChunks; ++chunks) {
+        const size_t capacity = chunkSize * chunks;
+
+        // array form of unique_ptr, so the buffer is released with delete[]
+        std::unique_ptr<char[]> cwd(new char[capacity]);
+        if (getcwd(cwd.get(), capacity) != nullptr) return std::string(cwd.get());
+
+        // if error is not ERANGE, then it's not a problem of too-long name... something else happened
+        if (errno != ERANGE)
+            throw std::runtime_error{"could not determine current working directory path"};
+    }
+
+    // crazy long path name
+    throw std::runtime_error{
+        "could not determine current working directory - extremely long path"};
+}
+
+// Returns everything before the last path separator in 'file'
+// (path/to/file => path/to), or "." when 'file' contains no separator.
+std::string FileUtils::DirectoryName(const std::string& file)
+{
+    const auto lastSeparator = file.rfind(Separator());
+    return (lastSeparator == std::string::npos) ? std::string(".")
+                                                : file.substr(0, lastSeparator);
+}
+
+// Reports whether stat() succeeds for 'fn'.
+// Only the stat() result is consulted; the file is not opened.
+bool FileUtils::Exists(const char* fn)
+{
+    struct stat buf;
+    return (stat(fn, &buf) != -1);
+}
+
+// Returns the file's st_mtime (from stat()) converted to a system_clock time point.
+// Throws std::runtime_error if stat() fails.
+std::chrono::system_clock::time_point FileUtils::LastModified(const char* fn)
+{
+    struct stat s;
+    if (stat(fn, &s) != 0) throw std::runtime_error{"could not get file timestamp"};
+    return std::chrono::system_clock::from_time_t(s.st_mtime);
+}
+
+// Public entry point; delegates to whichever native_resolvedFilePath() was
+// selected at compile time via PBBAM_WIN_FILEPATHS.
+std::string FileUtils::ResolvedFilePath(const std::string& filePath, const std::string& from)
+{
+    return native_resolvedFilePath(filePath, from);
+}
+
+// Returns the platform's path separator ('\\' with PBBAM_WIN_FILEPATHS, '/' otherwise).
+// NOTE(review): this constexpr function is defined only in this source file, so
+// it is presumably usable only from this translation unit - confirm no other
+// file calls it.
+constexpr char FileUtils::Separator() { return native_pathSeparator; }
+
+// Returns the file's st_size as reported by stat().
+// Throws std::runtime_error if stat() fails.
+off_t FileUtils::Size(const char* fn)
+{
+    struct stat s;
+    if (stat(fn, &s) != 0) throw std::runtime_error{"could not determine file size"};
+    return s.st_size;
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FileUtils.h b/src/FileUtils.h

new file mode 100644 (file)

index 0000000..4803a9c
--- /dev/null
+++ b/src/FileUtils.h
@@ -0,0 +1,108 @@
+// Author: Derek Barnett
+
+#ifndef FILEUTILS_H
+#define FILEUTILS_H
+
+#include <chrono>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// Collection of static helpers for file paths and file metadata.
+struct FileUtils
+{
+public:
+    /// \returns application's current working directory
+    /// \throws runtime_error if the directory cannot be determined
+    static std::string CurrentWorkingDirectory();
+
+    /// Parses a filepath for the directory name for a file.
+    ///
+    /// Essentially this method strips the filename from the string provided (/path/to/file => /path/to).
+    /// If only a filename is provided, then "." is returned to indicate the current directory.
+    ///
+    /// \param[in] file name of file (can be just a filename or path/to/filename)
+    /// \returns file's directory name
+    ///
+    static std::string DirectoryName(const std::string& file);
+
+    /// Check for existence of a file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns true if file information could be retrieved (stat succeeds)
+    ///
+    static bool Exists(const char* fn);
+
+    /// Check for existence of a file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns true if file information could be retrieved (stat succeeds)
+    ///
+    static bool Exists(const std::string& fn);
+
+    /// Check "last modified" timestamp for a file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns time of last modification
+    /// \throws runtime_error if file info can't be accessed
+    ///
+    static std::chrono::system_clock::time_point LastModified(const char* fn);
+
+    /// Check "last modified" timestamp for a file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns time of last modification
+    /// \throws runtime_error if file info can't be accessed
+    ///
+    static std::chrono::system_clock::time_point LastModified(const std::string& fn);
+
+    /// Resolves input file path using optional starting directory.
+    ///
+    /// \verbatim
+    ///   /absolute/path/to/file.txt   => /absolute/path/to/file.txt
+    ///   ../relative/path/to/file.txt => <from>/../relative/path/to/file.txt
+    ///   file.txt                     => <from>/file.txt
+    /// \endverbatim
+    ///
+    /// \note This method will strip any URI scheme as well ("file://") so that the result is immediately ready for I/O operations.
+    ///
+    /// \param[in] filePath file path to be resolved
+    /// \param[in] from     optional starting directory (useful if not same as application's working directory)
+    /// \returns resolved file path
+    ///
+    static std::string ResolvedFilePath(const std::string& filePath, const std::string& from = ".");
+
+    /// \returns native path separator
+    constexpr static char Separator();
+
+    /// Check size of file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns file size in bytes
+    /// \throws runtime_error if file info can't be accessed
+    ///
+    static off_t Size(const char* fn);
+
+    /// Check size of file.
+    ///
+    /// \param[in] fn full path to file
+    /// \returns file size in bytes
+    /// \throws runtime_error if file info can't be accessed
+    ///
+    static off_t Size(const std::string& fn);
+};
+
+// std::string convenience overload; forwards to the const char* version.
+inline bool FileUtils::Exists(const std::string& fn) { return FileUtils::Exists(fn.c_str()); }
+
+// std::string convenience overload; forwards to the const char* version.
+inline std::chrono::system_clock::time_point FileUtils::LastModified(const std::string& fn)
+{
+    return FileUtils::LastModified(fn.c_str());
+}
+
+// std::string convenience overload; forwards to the const char* version.
+inline off_t FileUtils::Size(const std::string& fn) { return FileUtils::Size(fn.c_str()); }
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FILEUTILS_H
diff --git a/src/FofnReader.cpp b/src/FofnReader.cpp

new file mode 100644 (file)

index 0000000..cd72d59
--- /dev/null
+++ b/src/FofnReader.cpp
@@ -0,0 +1,24 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "FofnReader.h"
+
+#include <iostream>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Reads 'in' line by line and returns the lines in input order.
+//
+// Lines are stored verbatim: nothing is trimmed and empty lines are kept.
+std::vector<std::string> FofnReader::Files(std::istream& in)
+{
+    std::vector<std::string> filenames;
+    for (std::string line; std::getline(in, line);)
+        filenames.push_back(line);
+    return filenames;
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/FofnReader.h b/src/FofnReader.h

new file mode 100644 (file)

index 0000000..34e3c7a
--- /dev/null
+++ b/src/FofnReader.h
@@ -0,0 +1,25 @@
+// Author: Derek Barnett
+
+#ifndef FOFNREADER_H
+#define FOFNREADER_H
+
+#include <iosfwd>
+#include <string>
+#include <vector>
+#include "pbbam/DataSet.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Reader for "file of file names" input.
+class FofnReader
+{
+public:
+    // Returns the lines read from 'in', one entry per line, in input order.
+    static std::vector<std::string> Files(std::istream& in);
+};
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // FOFNREADER_H
diff --git a/src/Frames.cpp b/src/Frames.cpp

new file mode 100644 (file)

index 0000000..c0161af
--- /dev/null
+++ b/src/Frames.cpp
@@ -0,0 +1,126 @@
+// File Description
+/// \file Frames.cpp
+/// \brief Implements the Frames class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/Frames.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <mutex>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Lookup tables for the frame <-> 8-bit code conversion.
+// All four are filled in by InitIpdDownsampling().
+static std::vector<uint16_t> framepoints;   // code -> frame value
+static std::vector<uint8_t> frameToCode;    // frame value -> code
+static uint16_t maxFramepoint;              // largest frame value present in framepoints
+static std::mutex initIpdDownsamplingMutex; // serializes InitIpdDownsampling()
+
+// Builds the framepoints / frameToCode tables on first use.
+//
+// The mutex is taken on every call; once framepoints is non-empty the function
+// returns immediately.
+//
+// With B = 2 and t = 6 the outer loop runs 256 / 64 = 4 times, each time
+// appending 64 framepoints whose spacing ('grain') is 1, 2, 4, then 8, each
+// group starting where the previous one ended.
+//
+// frameToCode then maps every frame value up to the largest framepoint to a
+// code: values between two neighbouring framepoints go to the lower code below
+// the midpoint and to the upper code from the midpoint on.
+static void InitIpdDownsampling()
+{
+    std::lock_guard<std::mutex> lock(initIpdDownsamplingMutex);
+
+    // tables already built by an earlier call
+    if (!framepoints.empty()) return;
+
+    // liftover from Dave's python code:
+    // .../bioinformatics/tools/kineticsTools/kineticsTools/_downsampling.py
+
+    const int B = 2;
+    const int t = 6;
+    const double T = pow(B, t);
+
+    int next = 0;
+    double grain;
+    const int end = 256 / T;
+    for (int i = 0; i < end; ++i) {
+        grain = pow(B, i);
+        std::vector<uint16_t> nextOnes;
+        for (double j = 0; j < T; ++j)
+            nextOnes.push_back(j * grain + next);
+        // first value of the following group: one grain past this group's last value
+        next = nextOnes.back() + grain;
+        framepoints.insert(framepoints.end(), nextOnes.cbegin(), nextOnes.cend());
+    }
+    // every index into framepoints must fit into a uint8_t code
+    assert(framepoints.size() - 1 <= std::numeric_limits<uint8_t>::max());
+
+    const uint16_t maxElement = (*max_element(framepoints.cbegin(), framepoints.cend()));
+    frameToCode.assign(maxElement + 1, 0);
+
+    const int fpEnd = framepoints.size() - 1;
+    uint8_t i = 0;
+    uint16_t fl = 0;
+    uint16_t fu = 0;
+    for (; i < fpEnd; ++i) {
+        // fl / fu: the framepoints for code i and code i + 1
+        fl = framepoints[i];
+        fu = framepoints[i + 1];
+        if (fu > fl + 1) {
+            // values strictly between two framepoints: split at the midpoint
+            const int middle = (fl + fu) / 2;
+            for (int f = fl; f < middle; ++f)
+                frameToCode[f] = i;
+            for (int f = middle; f < fu; ++f)
+                frameToCode[f] = i + 1;
+        } else
+            frameToCode[fl] = i;
+    }
+
+    // this next line differs from the python implementation (there, it's "i+1")
+    // our C++ for loop has incremented our index counter one more time than the indexes from python enumerate(...)
+    frameToCode[fu] = i;
+    maxFramepoint = fu;
+}
+
+// Looks up the frame value for one code.
+// Does not call InitIpdDownsampling() itself and does not bounds-check the index.
+static inline uint16_t CodeToFrames(const uint8_t code) { return framepoints[code]; }
+
+// Decodes a sequence of 8-bit codes into frame values, element by element.
+// Makes sure the lookup tables are built before the first lookup.
+static std::vector<uint16_t> CodeToFrames(const std::vector<uint8_t>& codedData)
+{
+    InitIpdDownsampling();
+
+    std::vector<uint16_t> frames;
+    frames.reserve(codedData.size());
+    for (const uint8_t code : codedData)
+        frames.push_back(CodeToFrames(code));
+    return frames;
+}
+
+// Looks up the code for one frame value.
+// Values above maxFramepoint are clamped to it, so they map to the last code.
+// Does not call InitIpdDownsampling() itself.
+static inline uint8_t FramesToCode(const uint16_t frame)
+{
+    return frameToCode[std::min(maxFramepoint, frame)];
+}
+
+// Encodes a sequence of frame values into 8-bit codes, element by element.
+// Makes sure the lookup tables are built before the first lookup.
+static std::vector<uint8_t> FramesToCode(const std::vector<uint16_t>& frames)
+{
+    InitIpdDownsampling();
+
+    std::vector<uint8_t> codes;
+    codes.reserve(frames.size());
+    for (const uint16_t frame : frames)
+        codes.push_back(FramesToCode(frame));
+    return codes;
+}
+
+}  // namespace internal
+
+// Default constructor; members are default-initialized.
+Frames::Frames() {}
+
+// Takes ownership of the given frame values (moved into data_).
+Frames::Frames(std::vector<uint16_t> frames) : data_{std::move(frames)} {}
+
+// Builds a Frames object from 8-bit coded data by decoding each code via
+// internal::CodeToFrames().
+Frames Frames::Decode(const std::vector<uint8_t>& codedData)
+{
+    return Frames{internal::CodeToFrames(codedData)};
+}
+
+// Converts frame values to their 8-bit codes via internal::FramesToCode().
+// Frame values above the largest framepoint map to the last code.
+std::vector<uint8_t> Frames::Encode(const std::vector<uint16_t>& frames)
+{
+    return internal::FramesToCode(frames);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/GenomicInterval.cpp b/src/GenomicInterval.cpp

new file mode 100644 (file)

index 0000000..d92cc5c
--- /dev/null
+++ b/src/GenomicInterval.cpp
@@ -0,0 +1,81 @@
+// File Description
+/// \file GenomicInterval.cpp
+/// \brief Implements the GenomicInterval class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/GenomicInterval.h"
+
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+#include <stdexcept>
+
+#include "StringUtils.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// returns sequence name & sets begin/end, from input regionString
+//
+// Accepted forms:
+//   "name"            -> begin = 0, end = 1 << 29
+//   "name:begin-end"  -> begin/end parsed with std::stoi
+//
+// Throws std::runtime_error{"malformed region string"} when the input does not
+// split into one or two ':'-separated parts, or when the interval part does not
+// split into exactly two '-'-separated parts. (Previously an interval with a
+// single part, e.g. "name:100", escaped the check and surfaced as
+// std::out_of_range from vector::at.)
+// Exceptions thrown by std::stoi for non-numeric input propagate unchanged.
+std::string parseRegionString(const std::string& reg, PacBio::BAM::Position* begin,
+                              PacBio::BAM::Position* end)
+{
+    const std::vector<std::string> parts = internal::Split(reg, ':');
+    if (parts.empty() || parts.size() > 2) throw std::runtime_error{"malformed region string"};
+
+    // given name only, default min,max intervals
+    if (parts.size() == 1) {
+        *begin = 0;
+        *end = 1 << 29;
+    }
+
+    // parse interval from input
+    else {
+        const std::vector<std::string> intervalParts = internal::Split(parts.at(1), '-');
+        if (intervalParts.size() != 2) throw std::runtime_error{"malformed region string"};
+        *begin = std::stoi(intervalParts.at(0));
+        *end = std::stoi(intervalParts.at(1));
+    }
+
+    return parts.at(0);
+}
+
+}  // namespace internal
+
+// Constructs the interval from an explicit sequence name and start/stop positions.
+GenomicInterval::GenomicInterval(std::string name, Position start, Position stop)
+    : name_{std::move(name)}, interval_{std::move(start), std::move(stop)}
+{
+}
+
+// Constructs the interval from a region string ("name" or "name:begin-end").
+// Parsing is done by internal::parseRegionString(); its exceptions propagate.
+GenomicInterval::GenomicInterval(const std::string& samtoolsRegionString)
+{
+    // both values are assigned by parseRegionString() on every successful return
+    Position begin;
+    Position end;
+    name_ = internal::parseRegionString(samtoolsRegionString, &begin, &end);
+    interval_ = PacBio::BAM::Interval<Position>(begin, end);
+}
+
+// True when both intervals are on the same sequence (equal names) and this
+// interval is covered by 'other'. Different names always yield false.
+bool GenomicInterval::CoveredBy(const GenomicInterval& other) const
+{
+    const bool sameSequence = (name_ == other.name_);
+    return sameSequence && interval_.CoveredBy(other.interval_);
+}
+
+// True when both intervals are on the same sequence (equal names) and this
+// interval covers 'other'. Different names always yield false.
+bool GenomicInterval::Covers(const GenomicInterval& other) const
+{
+    const bool sameSequence = (name_ == other.name_);
+    return sameSequence && interval_.Covers(other.interval_);
+}
+
+// True when both intervals are on the same sequence (equal names) and the two
+// intervals intersect. Different names always yield false.
+bool GenomicInterval::Intersects(const GenomicInterval& other) const
+{
+    const bool sameSequence = (name_ == other.name_);
+    return sameSequence && interval_.Intersects(other.interval_);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/GenomicIntervalQuery.cpp b/src/GenomicIntervalQuery.cpp

new file mode 100644 (file)

index 0000000..a60904d
--- /dev/null
+++ b/src/GenomicIntervalQuery.cpp
@@ -0,0 +1,45 @@
+// File Description
+/// \file GenomicIntervalQuery.cpp
+/// \brief Implements the GenomicIntervalQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/GenomicIntervalQuery.h"
+
+#include "pbbam/CompositeBamReader.h"
+#include "pbbam/MakeUnique.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Private implementation: holds the composite BAM reader that serves the query.
+struct GenomicIntervalQuery::GenomicIntervalQueryPrivate
+{
+    // Constructs the reader from the interval and the dataset.
+    GenomicIntervalQueryPrivate(const GenomicInterval& interval, const DataSet& dataset)
+        : reader_{interval, dataset}
+    {
+    }
+
+    // reader that GetNext() and Interval() forward to
+    GenomicIntervalCompositeBamReader reader_;
+};
+
+// Default-constructs the IQuery base and creates the private implementation
+// from the interval and dataset.
+GenomicIntervalQuery::GenomicIntervalQuery(const GenomicInterval& interval, const DataSet& dataset)
+    : internal::IQuery(), d_{std::make_unique<GenomicIntervalQueryPrivate>(interval, dataset)}
+{
+}
+
+// Empty out-of-line destructor; d_ is released by its own destructor.
+GenomicIntervalQuery::~GenomicIntervalQuery() {}
+
+// Forwards to the composite reader's GetNext() and returns its result unchanged.
+bool GenomicIntervalQuery::GetNext(BamRecord& r) { return d_->reader_.GetNext(r); }
+
+// Passes the new interval to the underlying reader.
+// Returns *this so calls can be chained.
+GenomicIntervalQuery& GenomicIntervalQuery::Interval(const GenomicInterval& interval)
+{
+    d_->reader_.Interval(interval);
+    return *this;
+}
+
+// Returns the interval currently reported by the underlying reader.
+const GenomicInterval& GenomicIntervalQuery::Interval() const { return d_->reader_.Interval(); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/IndexedBamWriter.cpp b/src/IndexedBamWriter.cpp

new file mode 100644 (file)

index 0000000..b5920be
--- /dev/null
+++ b/src/IndexedBamWriter.cpp
@@ -0,0 +1,129 @@
+// File Description
+/// \file IndexedBamWriter.cpp
+/// \brief Implements the IndexedBamWriter class
+//
+// Author: Derek Barnett
+
+#include "pbbam/IndexedBamWriter.h"
+
+#include <cassert>
+#include <cstdint>
+#include <stdexcept>
+
+#include <htslib/bgzf.h>
+#include <htslib/hfile.h>
+#include <htslib/hts.h>
+
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/MakeUnique.h"
+#include "pbbam/PbiBuilder.h"
+#include "pbbam/Unused.h"
+#include "pbbam/Validator.h"
+
+#include "FileProducer.h"
+#include "MemoryUtils.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Private implementation of IndexedBamWriter.
+//
+// Writes BAM records to the FileProducer's temp file and feeds each record,
+// together with the virtual offset it was written at, to a PbiBuilder created
+// for "<outputFilename>.pbi".
+class IndexedBamWriterPrivate : public internal::FileProducer
+{
+public:
+    // Opens the temp file for BAM writing and writes the header.
+    //
+    // Throws std::runtime_error if rawHeader is null, if the file cannot be
+    // opened, or if the header cannot be written.
+    IndexedBamWriterPrivate(const std::string& outputFilename, std::shared_ptr<bam_hdr_t> rawHeader)
+        : internal::FileProducer{outputFilename}
+        , header_{rawHeader}
+        , builder_{outputFilename + ".pbi"}
+        , previousBlockAddress_{0}
+    {
+        if (!header_) throw std::runtime_error{"null header"};
+
+        // open file
+        const auto& usingFilename = TempFilename();
+        file_.reset(sam_open(usingFilename.c_str(), "wb"));
+        if (!file_)
+            throw std::runtime_error{"could not open file " + usingFilename + " for writing"};
+
+        // write header
+        const auto ret = sam_hdr_write(file_.get(), header_.get());
+        if (ret != 0) throw std::runtime_error{"could not write header"};
+
+        // store first alignment block
+        previousBlockAddress_ = file_.get()->fp.bgzf->block_address;
+    }
+
+    // Flushes any buffered data; the result is ignored because a destructor
+    // must not throw.
+    ~IndexedBamWriterPrivate()
+    {
+        // ensure last remaining bits are flushed to file
+        const auto ret = bgzf_flush(file_.get()->fp.bgzf);
+        UNUSED(ret);
+    }
+
+public:
+    // Writes one record to the BAM file and registers it with the PBI builder.
+    // Throws std::runtime_error if the record cannot be written.
+    void Write(const BamRecord& record)
+    {
+#if PBBAM_AUTOVALIDATE
+        Validator::Validate(record);
+#endif
+        const auto rawRecord = internal::BamRecordMemory::GetRawData(record);
+        BGZF* bgzf = file_.get()->fp.bgzf;
+        assert(bgzf);
+
+        //
+        // Fetch record's start offset.
+        //
+        // If we're still in the same block from the last record written, we
+        // need to flush to get the proper offset.
+        //
+        if (bgzf->block_address == previousBlockAddress_) {
+            const auto ret = bgzf_flush(bgzf);
+            UNUSED(ret);
+        }
+        const int64_t vOffset = bgzf_tell(bgzf);
+
+        // update bin
+        rawRecord->core.bin = hts_reg2bin(rawRecord->core.pos, bam_endpos(rawRecord.get()), 14, 5);
+
+        // write record to file & PBI builder
+        const auto ret = sam_write1(file_.get(), header_.get(), rawRecord.get());
+        if (ret <= 0) throw std::runtime_error{"could not write record"};
+        builder_.AddRecord(record, vOffset);
+
+        // update block address
+        previousBlockAddress_ = bgzf->block_address;
+    }
+
+public:
+    std::unique_ptr<samFile, internal::HtslibFileDeleter> file_;  // output BAM handle
+    std::shared_ptr<bam_hdr_t> header_;                           // raw header written to file_
+    PbiBuilder builder_;                                          // collects the .pbi index data
+    int64_t previousBlockAddress_;  // BGZF block address seen after the last write
+};
+
+}  // namespace internal
+
+// Creates the writer for 'outputFilename' using the given header.
+//
+// When PBBAM_AUTOVALIDATE is enabled the header is validated first. The
+// private implementation receives the raw header produced by
+// BamHeaderMemory::MakeRawHeader(); its constructor's exceptions propagate.
+IndexedBamWriter::IndexedBamWriter(const std::string& outputFilename, const BamHeader& header)
+    : IRecordWriter()
+{
+#if PBBAM_AUTOVALIDATE
+    Validator::Validate(header);
+#endif
+    d_ = std::make_unique<internal::IndexedBamWriterPrivate>(
+        outputFilename, internal::BamHeaderMemory::MakeRawHeader(header));
+}
+
+// Empty out-of-line destructor; d_ is released by its own destructor.
+IndexedBamWriter::~IndexedBamWriter() {}
+
+// Flushes the BGZF output buffer.
+// Throws std::runtime_error if bgzf_flush() reports a non-zero result.
+void IndexedBamWriter::TryFlush()
+{
+    const auto ret = bgzf_flush(d_->file_.get()->fp.bgzf);
+    if (ret != 0) throw std::runtime_error{"could not flush output buffer contents"};
+}
+
+// Forwards the record to the private implementation's Write().
+void IndexedBamWriter::Write(const BamRecord& record) { d_->Write(record); }
+
+// Wraps the BamRecordImpl in a temporary BamRecord and writes that.
+void IndexedBamWriter::Write(const BamRecordImpl& record) { d_->Write(BamRecord{record}); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/IndexedFastaReader.cpp b/src/IndexedFastaReader.cpp

new file mode 100644 (file)

index 0000000..60a04e7
--- /dev/null
+++ b/src/IndexedFastaReader.cpp
@@ -0,0 +1,196 @@
+// File Description
+/// \file IndexedFastaReader.cpp
+/// \brief Implements the IndexedFastaReader class.
+//
+// Author: David Alexander
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/IndexedFastaReader.h"
+
+#include <cstddef>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+
+#include <htslib/faidx.h>
+
+#include "SequenceUtils.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/GenomicInterval.h"
+#include "pbbam/Orientation.h"
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Loads the FASTA index for `filename`.
+//
+// Throws std::runtime_error if Open() reports failure.
+IndexedFastaReader::IndexedFastaReader(const std::string& filename)
+{
+    const bool opened = Open(filename);
+    if (!opened) throw std::runtime_error{"Cannot open file " + filename};
+}
+
+// Copy constructor: opens its own handle on the file `src` refers to, rather than
+// sharing src's handle.
+//
+// Throws std::runtime_error if Open() reports failure.
+IndexedFastaReader::IndexedFastaReader(const IndexedFastaReader& src)
+{
+    const bool opened = Open(src.filename_);
+    if (!opened) throw std::runtime_error{"Cannot open file " + src.filename_};
+}
+
+// Copy assignment: re-opens the file that `rhs` refers to.
+//
+// The replacement handle is loaded before the currently owned one is released, so
+// a failed load leaves this reader untouched (Open() only modifies members on
+// success). The previously owned handle is destroyed once the new one is in
+// place; without that step it would be leaked.
+//
+// Throws std::runtime_error if the file cannot be opened, matching the copy
+// constructor.
+IndexedFastaReader& IndexedFastaReader::operator=(const IndexedFastaReader& rhs)
+{
+    if (&rhs == this) return *this;
+
+    // remember what we own now; Open() overwrites handle_ on success
+    auto* const previousHandle = handle_;
+    if (!Open(rhs.filename_)) throw std::runtime_error{"Cannot open file " + rhs.filename_};
+
+    // release the old index only after the new one is in place
+    if (previousHandle != nullptr) fai_destroy(previousHandle);
+    return *this;
+}
+
+// Releases the index handle (if any) via Close().
+IndexedFastaReader::~IndexedFastaReader() { Close(); }
+
+// Attempts to load the FASTA index for `filename` via fai_load().
+//
+// On success, stores the new handle and filename and returns true. On failure,
+// returns false and leaves the members unchanged. Any handle already stored in
+// handle_ is overwritten, not destroyed.
+bool IndexedFastaReader::Open(std::string filename)
+{
+    auto* loaded = fai_load(filename.c_str());
+    if (loaded == nullptr) return false;
+
+    handle_ = loaded;
+    filename_ = std::move(filename);
+    return true;
+}
+
+// Destroys the index handle (when one is held), nulls it, and clears the stored filename.
+void IndexedFastaReader::Close()
+{
+    if (handle_ != nullptr) {
+        fai_destroy(handle_);
+        handle_ = nullptr;
+    }
+    filename_.clear();
+}
+
+// Guard placed at the top of the accessors below: throws a default-constructed
+// std::exception when no FASTA index handle is loaded (handle_ is null).
+#define REQUIRE_FAIDX_LOADED                     \
+    if (handle_ == nullptr) throw std::exception \
+        {                                        \
+        }
+
+std::string IndexedFastaReader::Subsequence(const std::string& id, Position begin,
+                                            Position end) const
+{
+    REQUIRE_FAIDX_LOADED;
+
+    int len;
+    // Derek: *Annoyingly* htslib seems to interpret "end" as inclusive in
+    // faidx_fetch_seq, whereas it considers it exclusive in the region spec in
+    // fai_fetch.  Can you please verify?
+    const std::unique_ptr<char> rawSeq{faidx_fetch_seq(handle_, id.c_str(), begin, end - 1, &len)};
+    if (rawSeq == nullptr) throw std::runtime_error{"could not fetch FASTA sequence"};
+    return RemoveAllWhitespace(rawSeq.get());
+}
+
+// Convenience overload: unpacks `interval` into name/start/stop and defers to the
+// (id, begin, end) overload.
+std::string IndexedFastaReader::Subsequence(const GenomicInterval& interval) const
+{
+    REQUIRE_FAIDX_LOADED;
+
+    return Subsequence(interval.Name(),
+                       interval.Start(),
+                       interval.Stop());
+}
+
+// Fetches the bases described by an htslib-style region string using fai_fetch().
+//
+// Throws std::exception if no index is loaded, and std::runtime_error if htslib
+// cannot fetch the sequence. The fetched text is passed through
+// RemoveAllWhitespace() before being returned.
+std::string IndexedFastaReader::Subsequence(const char* htslibRegion) const
+{
+    REQUIRE_FAIDX_LOADED;
+
+    // htslib hands back a malloc()-allocated buffer, so it must be released with
+    // free() - the default unique_ptr deleter would call delete on it
+    struct FreeDeleter
+    {
+        void operator()(char* p) const { std::free(p); }
+    };
+
+    int len = 0;
+    const std::unique_ptr<char, FreeDeleter> rawSeq{fai_fetch(handle_, htslibRegion, &len)};
+    if (rawSeq == nullptr) throw std::runtime_error{"could not fetch FASTA sequence"};
+    return RemoveAllWhitespace(rawSeq.get());
+}
+
+std::string IndexedFastaReader::ReferenceSubsequence(const BamRecord& bamRecord,
+                                                     const Orientation orientation,
+                                                     const bool gapped,
+                                                     const bool exciseSoftClips) const
+{
+    REQUIRE_FAIDX_LOADED;
+
+    std::string subseq = Subsequence(bamRecord.ReferenceName(), bamRecord.ReferenceStart(),
+                                     bamRecord.ReferenceEnd());
+    const auto reverse = orientation != Orientation::GENOMIC && bamRecord.Impl().IsReverseStrand();
+
+    if (bamRecord.Impl().IsMapped() && gapped) {
+        size_t seqIndex = 0;
+
+        const auto cigar = bamRecord.Impl().CigarData();
+        for (const auto& op : cigar) {
+            const auto type = op.Type();
+
+            // do nothing for hard clips
+            if (type != CigarOperationType::HARD_CLIP) {
+                const auto opLength = op.Length();
+
+                // maybe remove soft clips
+                if (type == CigarOperationType::SOFT_CLIP) {
+                    if (!exciseSoftClips) {
+                        subseq.reserve(subseq.size() + opLength);
+                        subseq.insert(seqIndex, opLength, '-');
+                        seqIndex += opLength;
+                    }
+                }
+
+                // for non-clipping operations
+                else {
+
+                    // maybe add gaps/padding
+                    if (type == CigarOperationType::INSERTION) {
+                        subseq.reserve(subseq.size() + opLength);
+                        subseq.insert(seqIndex, opLength, '-');
+                    } else if (type == CigarOperationType::PADDING) {
+                        subseq.reserve(subseq.size() + opLength);
+                        subseq.insert(seqIndex, opLength, '*');
+                    }
+
+                    // update index
+                    seqIndex += opLength;
+                }
+            }
+        }
+    }
+
+    if (reverse) internal::ReverseComplementCaseSens(subseq);
+
+    return subseq;
+}
+
+// Reports the value of faidx_nseq() for the loaded index.
+//
+// Throws std::exception if no index is loaded.
+int IndexedFastaReader::NumSequences() const
+{
+    REQUIRE_FAIDX_LOADED;
+
+    const auto count = faidx_nseq(handle_);
+    return count;
+}
+
+// Collects the name of every sequence in the index, in index order.
+//
+// Throws std::exception if no index is loaded.
+std::vector<std::string> IndexedFastaReader::Names() const
+{
+    REQUIRE_FAIDX_LOADED;
+
+    const int numSequences = NumSequences();
+    std::vector<std::string> names;
+    names.reserve(numSequences);
+    for (int i = 0; i < numSequences; ++i)
+        names.emplace_back(faidx_iseq(handle_, i));
+    return names;
+}
+
+// Returns the name of the sequence at position `idx` in the index.
+//
+// Throws std::exception if no index is loaded, and std::runtime_error if `idx` is
+// not a valid sequence index.
+std::string IndexedFastaReader::Name(const size_t idx) const
+{
+    REQUIRE_FAIDX_LOADED;
+
+    // Compare in the unsigned domain: narrowing a large idx to int first could
+    // wrap to a negative value and slip past the range check.
+    const int numSequences = NumSequences();
+    if (numSequences <= 0 || idx >= static_cast<size_t>(numSequences))
+        throw std::runtime_error{"FASTA index out of range"};
+    return {faidx_iseq(handle_, static_cast<int>(idx))};
+}
+
+// Returns true when faidx_has_seq() reports a non-zero value for `name`.
+//
+// Throws std::exception if no index is loaded.
+bool IndexedFastaReader::HasSequence(const std::string& name) const
+{
+    REQUIRE_FAIDX_LOADED;
+
+    const auto found = faidx_has_seq(handle_, name.c_str());
+    return found != 0;
+}
+
+// Returns the length htslib reports for the sequence called `name`.
+//
+// Throws std::exception if no index is loaded, and std::runtime_error if
+// faidx_seq_len() returns a negative value.
+int IndexedFastaReader::SequenceLength(const std::string& name) const
+{
+    REQUIRE_FAIDX_LOADED;
+
+    const auto length = faidx_seq_len(handle_, name.c_str());
+    if (length >= 0) return length;
+    throw std::runtime_error{"could not determine FASTA sequence length"};
+}
+}
+}  // PacBio::BAM
diff --git a/src/MD5.cpp b/src/MD5.cpp

new file mode 100644 (file)

index 0000000..25e3368
--- /dev/null
+++ b/src/MD5.cpp
@@ -0,0 +1,55 @@
+// File Description
+/// \file MD5.cpp
+/// \brief Implements basic MD5 hash utilities
+//
+// Author: Brett Bowman
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/MD5.h"
+
+#include <stdexcept>
+
+#include <htslib/hts.h>
+
+namespace PacBio {
+namespace BAM {
+
+class Md5ContextHelper
+{
+public:
+    Md5ContextHelper() : data_(hts_md5_init())
+    {
+        if (data_ == nullptr) throw std::runtime_error{"could not initialize MD5 context"};
+    }
+
+    ~Md5ContextHelper() { hts_md5_destroy(data_); }
+
+public:
+    std::string Encoded(const std::string& str)
+    {
+        hts_md5_update(data_, reinterpret_cast<void*>(const_cast<char*>(str.c_str())), str.size());
+
+        unsigned char digest[16];
+        hts_md5_final(digest, data_);
+
+        char hexdigest[33];  // leave space for null-term
+        hts_md5_hex(hexdigest, digest);
+
+        return std::string{hexdigest, 32};
+    }
+
+private:
+    hts_md5_context* data_;
+};
+
+/// \brief Computes the MD5 digest of \p str.
+///
+/// \returns the digest as a 32-digit hexadecimal string
+///
+std::string MD5Hash(const std::string& str)
+{
+    Md5ContextHelper context;
+    std::string hexDigest = context.Encoded(str);
+    return hexDigest;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/MemoryUtils.cpp b/src/MemoryUtils.cpp

new file mode 100644 (file)

index 0000000..93fa841
--- /dev/null
+++ b/src/MemoryUtils.cpp
@@ -0,0 +1,46 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "MemoryUtils.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// -----------------
+// BamHeaderMemory
+// -----------------
+
+// Builds a BamHeader from a raw htslib header.
+//
+// A null `hdr` is an error (std::runtime_error). A header with no text yields a
+// default-constructed BamHeader; otherwise the header's text is handed to the
+// BamHeader constructor.
+BamHeader BamHeaderMemory::FromRawData(bam_hdr_t* hdr)
+{
+    if (hdr == nullptr) throw std::runtime_error{"invalid BAM header"};
+
+    const bool hasText = (hdr->text != nullptr) && (hdr->l_text != 0);
+    if (!hasText) return BamHeader();
+
+    return BamHeader(std::string(hdr->text, hdr->l_text));
+}
+
+std::shared_ptr<bam_hdr_t> BamHeaderMemory::MakeRawHeader(const BamHeader& header)
+{
+    const std::string text = header.ToSam();
+    std::shared_ptr<bam_hdr_t> rawData(sam_hdr_parse(text.size(), text.c_str()),
+                                       internal::HtslibHeaderDeleter());
+    rawData->ignore_sam_err = 0;
+    rawData->cigar_tab = nullptr;
+    rawData->l_text = text.size();
+    rawData->text = static_cast<char*>(calloc(rawData->l_text + 1, 1));
+    memcpy(rawData->text, text.c_str(), rawData->l_text);
+    return rawData;
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/MemoryUtils.h b/src/MemoryUtils.h

new file mode 100644 (file)

index 0000000..2e239e2
--- /dev/null
+++ b/src/MemoryUtils.h
@@ -0,0 +1,142 @@
+// Author: Derek Barnett
+
+#ifndef MEMORYUTILS_H
+#define MEMORYUTILS_H
+
+#include <cstdio>
+#include <memory>
+
+#include <htslib/bgzf.h>
+#include <htslib/sam.h>
+
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/BamRecordImpl.h"
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+
+class BamHeader;
+
+namespace internal {
+
+// intended for use with std::shared_ptr<T>, std::unique_ptr<T>, etc
+
+// Deleter that closes a C stdio stream with std::fclose(); null pointers are ignored.
+struct FileDeleter
+{
+    void operator()(std::FILE* fp) const
+    {
+        // fp is received by value, so there is no caller-side pointer to reset
+        if (fp != nullptr) std::fclose(fp);
+    }
+};
+
+// Deleter that closes a BGZF handle with bgzf_close(); null pointers are ignored.
+struct HtslibBgzfDeleter
+{
+    void operator()(BGZF* bgzf) const
+    {
+        // bgzf is received by value, so there is no caller-side pointer to reset
+        if (bgzf != nullptr) bgzf_close(bgzf);
+    }
+};
+
+// Deleter that closes a samFile with sam_close(); null pointers are ignored.
+struct HtslibFileDeleter
+{
+    void operator()(samFile* file) const
+    {
+        // file is received by value, so there is no caller-side pointer to reset
+        if (file != nullptr) sam_close(file);
+    }
+};
+
+// Deleter that frees a bam_hdr_t with bam_hdr_destroy(); null pointers are ignored.
+struct HtslibHeaderDeleter
+{
+    void operator()(bam_hdr_t* hdr) const
+    {
+        // hdr is received by value, so there is no caller-side pointer to reset
+        if (hdr != nullptr) bam_hdr_destroy(hdr);
+    }
+};
+
+// Deleter that frees an hts_idx_t with hts_idx_destroy(); null pointers are ignored.
+struct HtslibIndexDeleter
+{
+    void operator()(hts_idx_t* index) const
+    {
+        // index is received by value, so there is no caller-side pointer to reset
+        if (index != nullptr) hts_idx_destroy(index);
+    }
+};
+
+// Deleter that frees an hts_itr_t with hts_itr_destroy(); null pointers are ignored.
+struct HtslibIteratorDeleter
+{
+    void operator()(hts_itr_t* iter) const
+    {
+        // iter is received by value, so there is no caller-side pointer to reset
+        if (iter != nullptr) hts_itr_destroy(iter);
+    }
+};
+
+// Deleter that frees a bam1_t with bam_destroy1(); null pointers are ignored.
+struct HtslibRecordDeleter
+{
+    void operator()(bam1_t* b) const
+    {
+        // b is received by value, so there is no caller-side pointer to reset
+        if (b != nullptr) bam_destroy1(b);
+    }
+};
+
+// Static helpers for converting between BamHeader and htslib's raw bam_hdr_t.
+class BamHeaderMemory
+{
+public:
+    // Builds a BamHeader from a raw htslib header.
+    static BamHeader FromRawData(bam_hdr_t* header);
+
+    // Builds a raw htslib header (held by shared_ptr) from a BamHeader.
+    static std::shared_ptr<bam_hdr_t> MakeRawHeader(const BamHeader& header);
+};
+
+// Static helpers that expose the internals of BamRecord / BamRecordImpl
+// (the impl_ and d_ members) to library-internal code.
+class BamRecordMemory
+{
+public:
+    // access to the BamRecordImpl held by a BamRecord
+    static const BamRecordImpl& GetImpl(const BamRecord& r);
+    static const BamRecordImpl& GetImpl(const BamRecord* r);
+
+    // access to the shared raw bam1_t held by a record
+    static std::shared_ptr<bam1_t> GetRawData(const BamRecord& r);
+    static std::shared_ptr<bam1_t> GetRawData(const BamRecord* r);
+    static std::shared_ptr<bam1_t> GetRawData(const BamRecordImpl& impl);
+    static std::shared_ptr<bam1_t> GetRawData(const BamRecordImpl* impl);
+
+    // triggers BamRecordImpl::UpdateTagMap() on the record
+    static void UpdateRecordTags(const BamRecord& r);
+    static void UpdateRecordTags(const BamRecordImpl& r);
+};
+
+// Returns a reference to the BamRecordImpl stored in `r`.
+inline const BamRecordImpl& BamRecordMemory::GetImpl(const BamRecord& r) { return r.impl_; }
+
+// Pointer flavor of GetImpl(); `r` is dereferenced without a null check.
+inline const BamRecordImpl& BamRecordMemory::GetImpl(const BamRecord* r)
+{
+    return GetImpl(*r);
+}
+
+// Returns the shared raw bam1_t belonging to the BamRecordImpl inside `r`.
+inline std::shared_ptr<bam1_t> BamRecordMemory::GetRawData(const BamRecord& r)
+{
+    const BamRecordImpl& impl = r.impl_;
+    return GetRawData(impl);
+}
+
+// Pointer flavor of GetRawData(); `r` is dereferenced without a null check.
+inline std::shared_ptr<bam1_t> BamRecordMemory::GetRawData(const BamRecord* r)
+{
+    const BamRecordImpl& impl = r->impl_;
+    return GetRawData(impl);
+}
+
+// Returns a copy of the shared_ptr to the raw bam1_t stored in `impl`.
+inline std::shared_ptr<bam1_t> BamRecordMemory::GetRawData(const BamRecordImpl& impl)
+{
+    return impl.d_;
+}
+
+// Pointer flavor: returns a copy of the shared_ptr to the raw bam1_t stored in
+// `*impl`; the pointer is dereferenced without a null check.
+inline std::shared_ptr<bam1_t> BamRecordMemory::GetRawData(const BamRecordImpl* impl)
+{
+    return impl->d_;
+}
+
+// Forwards to the BamRecordImpl overload using the impl stored in `r`.
+inline void BamRecordMemory::UpdateRecordTags(const BamRecord& r) { UpdateRecordTags(r.impl_); }
+
+// Calls UpdateTagMap() on `r`.
+inline void BamRecordMemory::UpdateRecordTags(const BamRecordImpl& r) { r.UpdateTagMap(); }
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // MEMORYUTILS_H
diff --git a/src/PbbamInternalConfig.h b/src/PbbamInternalConfig.h

new file mode 100644 (file)

index 0000000..340b97d
--- /dev/null
+++ b/src/PbbamInternalConfig.h
@@ -0,0 +1,18 @@
+// File Description
+/// \file PbbamInternalConfig.h
+/// \brief Defines internal macros for symbol visibility
+//
+// Author: Derek Barnett
+
+#ifndef PBBAMINTERNALCONFIG_H
+#define PBBAMINTERNALCONFIG_H
+
+// PBBAM_EXPORT expands to __declspec(dllexport) when WIN32 is defined, and to
+// the "default" visibility attribute otherwise.
+#if defined(WIN32)
+#define PBBAM_EXPORT __declspec(dllexport)
+#else
+#define PBBAM_EXPORT __attribute__((visibility("default")))
+#endif
+
+#include "pbbam/Config.h"
+
+#endif  // PBBAMINTERNALCONFIG_H
diff --git a/src/PbiBuilder.cpp b/src/PbiBuilder.cpp

new file mode 100644 (file)

index 0000000..5eacd3e
--- /dev/null
+++ b/src/PbiBuilder.cpp
@@ -0,0 +1,717 @@
+// File Description
+/// \file PbiBuilder.cpp
+/// \brief Implements the PbiBuilder class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/PbiBuilder.h"
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <stdexcept>
+#include <thread>
+#include <tuple>
+
+#include <htslib/bgzf.h>
+#include <htslib/hfile.h>
+#include <htslib/hts.h>
+
+#include <boost/numeric/conversion/cast.hpp>
+
+#include "MemoryUtils.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/MakeUnique.h"
+#include "pbbam/PbiRawData.h"
+
+namespace {
+
+using RecordType = PacBio::BAM::RecordType;
+std::string ToString(const RecordType type)
+{
+    // clang-format off
+    static const auto lookup = std::map<RecordType, std::string>
+    {
+        { RecordType::ZMW,        "ZMW" },
+        { RecordType::HQREGION,   "HQREGION" },
+        { RecordType::SUBREAD,    "SUBREAD" },
+        { RecordType::CCS,        "CCS" },
+        { RecordType::SCRAP,      "SCRAP" },
+        { RecordType::TRANSCRIPT, "TRANSCRIPT" },
+        { RecordType::UNKNOWN,    "UNKNOWN" }
+    };
+    // clang-format on
+
+    try {
+        return lookup.at(type);
+    } catch (std::exception&) {
+        throw std::runtime_error{"error: unknown RecordType encountered"};
+    }
+}
+
+}  // namespace anonymous
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+template <typename T>
+inline void SwapEndianness(std::vector<T>& data)
+{
+    const size_t elementSize = sizeof(T);
+    const size_t numReads = data.size();
+    switch (elementSize) {
+        case 1:
+            break;  // no swapping necessary
+        case 2:
+            for (size_t i = 0; i < numReads; ++i)
+                ed_swap_2p(&data[i]);
+            break;
+        case 4:
+            for (size_t i = 0; i < numReads; ++i)
+                ed_swap_4p(&data[i]);
+            break;
+        case 8:
+            for (size_t i = 0; i < numReads; ++i)
+                ed_swap_8p(&data[i]);
+            break;
+        default:
+            throw std::runtime_error{"unsupported element size"};
+    }
+}
+
+// Wrapper around bgzf_write() that turns a negative return value into a
+// std::runtime_error.
+void bgzf_write_safe(BGZF* fp, const void* data, size_t length)
+{
+    const auto numWritten = bgzf_write(fp, data, length);
+    const bool failed = (numWritten < 0L);
+    if (failed)
+        throw std::runtime_error{"Non-zero returned from bgzf_write(). Out of disk space?"};
+}
+
+// Writes the first `numElements` elements of `data` to `fp`.
+//
+// When fp->is_be is set, the whole vector is byte-swapped in place first (so
+// `data` is modified). Write failures surface as std::runtime_error from
+// bgzf_write_safe().
+template <typename T>
+inline void WriteBgzfVector(BGZF* fp, std::vector<T>& data, const size_t numElements)
+{
+    assert(fp);
+    if (fp->is_be) SwapEndianness(data);
+
+    // data() is well-defined even for an empty vector, unlike &data[0]
+    bgzf_write_safe(fp, data.data(), numElements * sizeof(T));
+}
+
+// --------------------------
+// PbiTempFile
+// --------------------------
+
+// Buffered temp file holding a sequence of values of type T.
+//
+// Values are appended with Write() and spilled to the file whenever the in-memory
+// buffer reaches MaxElementCount elements. After Rewind(), Read() loads chunks
+// back into the same buffer, which is exposed through Data(). The file is
+// removed by the destructor.
+template <typename T>
+class PbiTempFile
+{
+public:
+    constexpr static const size_t MaxBufferSize = 0x10000;  // 64K
+    constexpr static const size_t ElementSize = sizeof(T);
+    constexpr static const size_t MaxElementCount = MaxBufferSize / ElementSize;
+
+public:
+    // opens (and truncates) the file `fn` for reading & writing
+    PbiTempFile(std::string fn);
+    ~PbiTempFile();
+
+public:
+    void Close();
+    const std::vector<T>& Data() const;
+    std::vector<T>& Data();
+    void Flush();
+    // loads up to `count` elements into the buffer; returns the number read
+    size_t Read(const size_t count);
+    void Rewind();
+    void Write(T value);
+
+private:
+    // appends the buffer contents to the file
+    void WriteToFile();
+
+private:
+    // file info
+    std::string fn_;
+    std::unique_ptr<FILE, internal::FileDeleter> fp_;
+
+    // data storage/tracking
+    std::vector<T> buffer_;
+    size_t numElementsWritten_ = 0;  // running total of elements written to the file
+};
+
+// Opens `fn` in "w+b" mode and pre-allocates room for one full buffer.
+//
+// Throws std::runtime_error if the file cannot be opened.
+template <typename T>
+PbiTempFile<T>::PbiTempFile(std::string fn)
+    : fn_{std::move(fn)}, fp_{std::fopen(fn_.c_str(), "w+b")}
+{
+    if (!fp_) throw std::runtime_error{"could not open temp file: " + fn_};
+
+    buffer_.reserve(MaxElementCount);
+}
+
+// Closes the file handle, then deletes the temp file from disk.
+//
+// The handle is released explicitly first: member destructors only run after
+// this body, and removing a file that is still open fails on some platforms.
+template <typename T>
+PbiTempFile<T>::~PbiTempFile()
+{
+    fp_.reset();
+    remove(fn_.c_str());
+}
+
+// Writes any buffered values out to the file. The file handle itself stays open
+// until the object is destroyed.
+template <typename T>
+void PbiTempFile<T>::Close()
+{
+    Flush();  // dtor will take care of closing file handle
+}
+
+// Read-only access to the in-memory buffer.
+template <typename T>
+const std::vector<T>& PbiTempFile<T>::Data() const
+{
+    return buffer_;
+}
+
+// Mutable access to the in-memory buffer.
+template <typename T>
+std::vector<T>& PbiTempFile<T>::Data()
+{
+    return buffer_;
+}
+
+// Appends the buffered values to the file, then empties the buffer.
+template <typename T>
+void PbiTempFile<T>::Flush()
+{
+    WriteToFile();
+    buffer_.clear();
+}
+
+// Reads the next chunk from the file into the buffer.
+//
+// The buffer is resized to min(count, total elements written so far) and filled
+// with fread(); the return value is the element count fread() reports, which can
+// be smaller than the buffer size.
+template <typename T>
+size_t PbiTempFile<T>::Read(const size_t count)
+{
+    buffer_.resize(std::min(count, numElementsWritten_));
+    return fread(buffer_.data(), ElementSize, buffer_.size(), fp_.get());
+}
+
+// Flushes buffered values to the file and moves the file position back to the
+// start, ready for Read().
+//
+// Throws std::runtime_error if the seek fails.
+template <typename T>
+void PbiTempFile<T>::Rewind()
+{
+    Flush();
+
+    const auto ret = fseek(fp_.get(), 0, SEEK_SET);
+    // separator added so the filename does not run into the message text
+    if (ret != 0) throw std::runtime_error{"could not rewind temp file: " + fn_};
+}
+
+// Appends `value` to the buffer, spilling the buffer to the file once it holds
+// MaxElementCount elements.
+template <typename T>
+void PbiTempFile<T>::Write(T value)
+{
+    buffer_.push_back(value);
+
+    const bool bufferFull = (buffer_.size() == MaxElementCount);
+    if (bufferFull) Flush();
+}
+
+// Appends the current buffer contents to the file and updates the running count
+// of elements written.
+//
+// Throws std::runtime_error if fwrite() stores fewer elements than requested
+// (e.g. the disk is full); otherwise the shortfall would go unnoticed and the
+// data read back later would be incomplete.
+template <typename T>
+void PbiTempFile<T>::WriteToFile()
+{
+    const auto numWritten = fwrite(buffer_.data(), ElementSize, buffer_.size(), fp_.get());
+    numElementsWritten_ += numWritten;
+    if (numWritten != buffer_.size())
+        throw std::runtime_error{"could not write to temp file: " + fn_};
+}
+
+// --------------------------
+// PbiReferenceDataBuilder
+// --------------------------
+
+// Tracks, per reference ID, the range of record rows observed for it, while
+// checking that records arrive in coordinate-sorted order.
+class PbiReferenceDataBuilder
+{
+public:
+    using ReferenceRows = std::pair<int32_t, int32_t>;  // [startRow, endRow)
+
+public:
+    // creates one entry per expected reference, plus an "unmapped" entry
+    explicit PbiReferenceDataBuilder(const size_t numReferenceSequences);
+
+public:
+    // returns false once the records are found to be out of sorted order
+    bool AddRecord(const BamRecord& record, const int32_t rowNumber);
+
+    // snapshot of the entries collected so far, in map (key) order
+    PbiRawReferenceData Result() const;
+
+    // writes the entry count & entries to `bgzf`
+    void WriteData(BGZF* bgzf);
+
+private:
+    int32_t lastRefId_ = -1;  // reference ID of the previous record
+    Position lastPos_ = -1;   // reference start of the previous record
+    std::map<uint32_t, PbiReferenceEntry> rawReferenceEntries_;
+};
+
+// Seeds the entry table with one PbiReferenceEntry per expected reference ID
+// (0 .. numReferenceSequences-1), followed by a default-constructed entry under
+// PbiReferenceEntry::UNMAPPED_ID.
+PbiReferenceDataBuilder::PbiReferenceDataBuilder(const size_t numReferenceSequences)
+{
+    // every known reference gets an entry up front, so that it is present
+    // even if no record ever maps to it
+    for (size_t tId = 0; tId < numReferenceSequences; ++tId) {
+        rawReferenceEntries_[tId] = PbiReferenceEntry(tId);
+    }
+
+    // the "unmapped" bucket
+    rawReferenceEntries_[PbiReferenceEntry::UNMAPPED_ID] = PbiReferenceEntry{};
+}
+
+// Registers `record` (stored at row `rowNumber`) with its reference's entry.
+//
+// Returns false, without updating any entry, as soon as the record breaks the
+// expected ordering: a mapped record after unmapped ones, a reference ID that
+// was already seen before switching away from it, or a start position lower
+// than the previous record's on the same reference. Otherwise the entry's
+// [beginRow_, endRow_) range is extended to include this row and true is
+// returned.
+//
+// The entry lookups use map::at(), which throws std::out_of_range for an ID
+// that has no entry.
+bool PbiReferenceDataBuilder::AddRecord(const BamRecord& record, const int32_t rowNumber)
+{
+    // fetch ref ID & pos for record
+    const int32_t tId = record.ReferenceId();
+    const int32_t pos = record.ReferenceStart();
+
+    // sanity checks to protect against non-coordinate-sorted BAMs
+    if (lastRefId_ != tId || (lastRefId_ >= 0 && tId < 0)) {
+        if (tId >= 0) {
+
+            // if we've already seen unmapped reads, but our current tId is valid
+            //
+            // error: unmapped reads should all be at the end (can stop checking refs)
+            //
+            PbiReferenceEntry& unmappedEntry =
+                rawReferenceEntries_.at(PbiReferenceEntry::UNMAPPED_ID);
+            if (unmappedEntry.beginRow_ != PbiReferenceEntry::UNSET_ROW) return false;
+
+            // if we've already seen data for this new tId
+            // (remember we're coming from another tId)
+            //
+            // error: refs are out of order (can stop checking refs)
+            //
+            PbiReferenceEntry& currentEntry = rawReferenceEntries_.at(static_cast<uint32_t>(tId));
+            if (currentEntry.beginRow_ != PbiReferenceEntry::UNSET_ROW) return false;
+        }
+        lastRefId_ = tId;
+    } else if (tId >= 0 && lastPos_ > pos)
+        return false;  // error: positions out of order
+
+    // update row numbers
+    // (a negative tId is looked up by its uint32_t cast; presumably that equals
+    //  PbiReferenceEntry::UNMAPPED_ID - TODO confirm)
+    PbiReferenceEntry& entry = rawReferenceEntries_.at(static_cast<uint32_t>(tId));
+    if (entry.beginRow_ == PbiReferenceEntry::UNSET_ROW) entry.beginRow_ = rowNumber;
+    entry.endRow_ = rowNumber + 1;
+
+    // update pos (for sorting check next go-round)
+    lastPos_ = pos;
+    return true;
+}
+
+// Copies the collected entries into a PbiRawReferenceData, in the key order of
+// the underlying std::map.
+PbiRawReferenceData PbiReferenceDataBuilder::Result() const
+{
+    PbiRawReferenceData result;
+    auto& entries = result.entries_;
+    entries.reserve(rawReferenceEntries_.size());
+
+    // std::map iterates by ascending key, so the output is sorted by ID
+    // (the uint cast of -1 sorts last)
+    for (const auto& idAndEntry : rawReferenceEntries_) {
+        entries.push_back(idAndEntry.second);
+    }
+    return result;
+}
+
+void PbiReferenceDataBuilder::WriteData(BGZF* bgzf)
+{
+    const auto refData = Result();
+
+    // num_refs
+    uint32_t numRefs = refData.entries_.size();
+    if (bgzf->is_be) numRefs = ed_swap_4(numRefs);
+    bgzf_write_safe(bgzf, &numRefs, 4);
+
+    // reference entries
+    numRefs = refData.entries_.size();  // need to reset after maybe endian-swapping
+    for (size_t i = 0; i < numRefs; ++i) {
+        auto& entry = refData.entries_[i];
+        auto tId = entry.tId_;
+        auto beginRow = entry.beginRow_;
+        auto endRow = entry.endRow_;
+        if (bgzf->is_be) {
+            tId = ed_swap_4(tId);
+            beginRow = ed_swap_4(beginRow);
+            endRow = ed_swap_4(endRow);
+        }
+        bgzf_write_safe(bgzf, &tId, 4);
+        bgzf_write_safe(bgzf, &beginRow, 4);
+        bgzf_write_safe(bgzf, &endRow, 4);
+    }
+}
+
+// --------------------------------------------
+// PbiBuilderPrivate - builder implementation
+// --------------------------------------------
+
+// Implementation behind PbiBuilder.
+//
+// Each per-record field is appended to its own temp file as records are added;
+// Close() then opens the PBI file and writes the header followed by the contents
+// of the temp files, section by section.
+class PbiBuilderPrivate
+{
+    using CompressionLevel = PacBio::BAM::PbiBuilder::CompressionLevel;
+
+public:
+    PbiBuilderPrivate(const std::string& pbiFilename, const size_t numReferenceSequences,
+                      const bool isCoordinateSorted, const CompressionLevel compressionLevel,
+                      const size_t numThreads);
+
+    // calls Close() if that has not happened yet; exceptions are swallowed
+    ~PbiBuilderPrivate() noexcept;
+
+public:
+    void AddRecord(const BamRecord& record, const int64_t vOffset);
+    void Close();
+
+private:
+    // store record data
+    void AddBarcodeData(const BamRecord& record);
+    void AddBasicData(const BamRecord& record, const int64_t vOffset);
+    void AddMappedData(const BamRecord& record);
+    void AddReferenceData(const BamRecord& record, const uint32_t currentRow);
+
+    // read from temp files & write PBI data
+    void OpenPbiFile();
+
+    template <typename T>
+    void WriteFromTempFile(PbiTempFile<T>& tempFile, BGZF* bgzf);
+
+    void WritePbiHeader(BGZF* bgzf);
+    void WriteReferenceData(BGZF* bgzf);
+
+private:
+    // basic data
+    PbiTempFile<int32_t> rgIdFile_;
+    PbiTempFile<int32_t> qStartFile_;
+    PbiTempFile<int32_t> qEndFile_;
+    PbiTempFile<int32_t> holeNumFile_;
+    PbiTempFile<float> readQualFile_;
+    PbiTempFile<uint8_t> ctxtFile_;
+    PbiTempFile<int64_t> fileOffsetFile_;
+
+    // mapped data
+    PbiTempFile<int32_t> tIdFile_;
+    PbiTempFile<uint32_t> tStartFile_;
+    PbiTempFile<uint32_t> tEndFile_;
+    PbiTempFile<uint32_t> aStartFile_;
+    PbiTempFile<uint32_t> aEndFile_;
+    PbiTempFile<uint8_t> revStrandFile_;
+    PbiTempFile<uint32_t> nMFile_;
+    PbiTempFile<uint32_t> nMMFile_;
+    PbiTempFile<uint8_t> mapQualFile_;
+
+    // barcode data
+    PbiTempFile<int16_t> bcForwardFile_;
+    PbiTempFile<int16_t> bcReverseFile_;
+    PbiTempFile<int8_t> bcQualFile_;
+
+    // reference data
+    // (null when reference tracking is off, or once unsorted input is detected)
+    std::unique_ptr<PbiReferenceDataBuilder> refDataBuilder_ = nullptr;
+
+    // output file info
+    std::string pbiFilename_;
+    std::unique_ptr<BGZF, internal::HtslibBgzfDeleter> outFile_ = nullptr;
+    CompressionLevel compressionLevel_;
+    size_t numThreads_;
+
+    // tracking data
+    uint32_t currentRow_ = 0;      // number of records added so far
+    bool isClosed_ = false;        // set at the end of Close()
+    bool hasBarcodeData_ = false;  // set once a record provides barcode data
+    bool hasMappedData_ = false;   // set once a record has a reference ID >= 0
+};
+
+PbiBuilderPrivate::PbiBuilderPrivate(const std::string& pbiFilename,
+                                     const size_t numReferenceSequences,
+                                     const bool isCoordinateSorted,
+                                     const CompressionLevel compressionLevel,
+                                     const size_t numThreads)
+    : rgIdFile_{pbiFilename + ".rgId.tmp"}
+    , qStartFile_{pbiFilename + ".qStart.tmp"}
+    , qEndFile_{pbiFilename + ".qEnd.tmp"}
+    , holeNumFile_{pbiFilename + ".holeNum.tmp"}
+    , readQualFile_{pbiFilename + ".rq.tmp"}
+    , ctxtFile_{pbiFilename + ".ctxt.tmp"}
+    , fileOffsetFile_{pbiFilename + ".offset.tmp"}
+    , tIdFile_{pbiFilename + ".tId.tmp"}
+    , tStartFile_{pbiFilename + ".tStart.tmp"}
+    , tEndFile_{pbiFilename + ".tEnd.tmp"}
+    , aStartFile_{pbiFilename + ".aStart.tmp"}
+    , aEndFile_{pbiFilename + ".aEnd.tmp"}
+    , revStrandFile_{pbiFilename + ".revStrand.tmp"}
+    , nMFile_{pbiFilename + ".nm.tmp"}
+    , nMMFile_{pbiFilename + ".nmm.tmp"}
+    , mapQualFile_{pbiFilename + ".mapQual.tmp"}
+    , bcForwardFile_{pbiFilename + ".bcForward.tmp"}
+    , bcReverseFile_{pbiFilename + ".bcReverse.tmp"}
+    , bcQualFile_{pbiFilename + ".bcQual.tmp"}
+    , pbiFilename_{pbiFilename}
+    , compressionLevel_{compressionLevel}
+    , numThreads_{numThreads}
+{
+    if (isCoordinateSorted && numReferenceSequences > 0)
+        refDataBuilder_ = std::make_unique<PbiReferenceDataBuilder>(numReferenceSequences);
+}
+
+// Writes the index if Close() has not completed yet. Any exception raised while
+// doing so is discarded, keeping the destructor no-throw.
+PbiBuilderPrivate::~PbiBuilderPrivate() noexcept
+{
+    if (isClosed_) return;
+
+    try {
+        Close();
+    } catch (...) {
+        // nothing can be reported from here; callers who need to see errors
+        // should call Close() themselves
+    }
+}
+
+void PbiBuilderPrivate::AddBarcodeData(const BamRecord& b)
+{
+    // initialize w/ 'missing' value
+    int16_t bcForward = -1;
+    int16_t bcReverse = -1;
+    int8_t bcQuality = -1;
+
+    // check for any barcode data (both required)
+    if (b.HasBarcodes() && b.HasBarcodeQuality()) {
+        // fetch data from record
+        std::tie(bcForward, bcReverse) = b.Barcodes();
+        bcQuality = static_cast<int8_t>(b.BarcodeQuality());
+
+        // double-check & reset to 'missing' value if any less than zero
+        if (bcForward < 0 && bcReverse < 0 && bcQuality < 0) {
+            bcForward = -1;
+            bcReverse = -1;
+            bcQuality = -1;
+        } else
+            hasBarcodeData_ = true;
+    }
+
+    // store
+    bcForwardFile_.Write(bcForward);
+    bcReverseFile_.Write(bcReverse);
+    bcQualFile_.Write(bcQuality);
+}
+
+void PbiBuilderPrivate::AddBasicData(const BamRecord& b, const int64_t vOffset)
+{
+    // read group ID
+    const auto rgId = [&b]() -> int32_t {
+        auto rgIdString = b.ReadGroupId();
+        if (rgIdString.empty()) rgIdString = MakeReadGroupId(b.MovieName(), ToString(b.Type()));
+        const auto rawId = std::stoul(rgIdString, nullptr, 16);
+        return static_cast<int32_t>(rawId);
+    }();
+
+    // query start/end
+    const auto isCcsOrTranscript = (IsCcsOrTranscript(b.Type()));
+    const int32_t qStart = (isCcsOrTranscript ? -1 : b.QueryStart());
+    const int32_t qEnd = (isCcsOrTranscript ? -1 : b.QueryEnd());
+
+    // add'l data
+    const int32_t holeNum = (b.HasHoleNumber() ? b.HoleNumber() : 0);
+    const float readAccuracy =
+        (b.HasReadAccuracy() ? boost::numeric_cast<float>(b.ReadAccuracy()) : 0.0F);
+    const uint8_t ctxt =
+        (b.HasLocalContextFlags() ? b.LocalContextFlags() : LocalContextFlags::NO_LOCAL_CONTEXT);
+
+    // store
+    rgIdFile_.Write(rgId);
+    qStartFile_.Write(qStart);
+    qEndFile_.Write(qEnd);
+    holeNumFile_.Write(holeNum);
+    ctxtFile_.Write(ctxt);
+    readQualFile_.Write(readAccuracy);
+    fileOffsetFile_.Write(vOffset);
+}
+
+void PbiBuilderPrivate::AddMappedData(const BamRecord& b)
+{
+    // fetch data
+    const auto tId = b.ReferenceId();
+    const auto tStart = static_cast<uint32_t>(b.ReferenceStart());
+    const auto tEnd = static_cast<uint32_t>(b.ReferenceEnd());
+    const auto aStart = static_cast<uint32_t>(b.AlignedStart());
+    const auto aEnd = static_cast<uint32_t>(b.AlignedEnd());
+
+    const auto isReverseStrand = [&b]() -> uint8_t {
+        return (b.AlignedStrand() == Strand::REVERSE ? 1 : 0);
+    }();
+
+    const auto matchData = b.NumMatchesAndMismatches();
+    const auto nM = static_cast<uint32_t>(matchData.first);
+    const auto nMM = static_cast<uint32_t>(matchData.second);
+    const auto mapQuality = b.MapQuality();
+
+    if (tId >= 0) hasMappedData_ = true;
+
+    // store
+    tIdFile_.Write(tId);
+    tStartFile_.Write(tStart);
+    tEndFile_.Write(tEnd);
+    aStartFile_.Write(aStart);
+    aEndFile_.Write(aEnd);
+    revStrandFile_.Write(isReverseStrand);
+    nMFile_.Write(nM);
+    nMMFile_.Write(nMM);
+    mapQualFile_.Write(mapQuality);
+}
+
+// Adds one record to the index under construction.
+//
+// The record's tag map and cached positions are refreshed first, then each data
+// section stores its fields, and finally the row counter is advanced. `vOffset`
+// is stored as the record's file offset.
+void PbiBuilderPrivate::AddRecord(const BamRecord& b, const int64_t vOffset)
+{
+    // ensure updated data
+    internal::BamRecordMemory::UpdateRecordTags(b);
+    b.ResetCachedPositions();
+
+    // store data
+    AddBasicData(b, vOffset);
+    AddMappedData(b);
+    AddBarcodeData(b);
+    AddReferenceData(b, currentRow_);
+
+    // increment row counter
+    ++currentRow_;
+}
+
+// Feeds the record to the reference-data builder, if one is active. The builder
+// is discarded as soon as it reports that the records are not sorted.
+void PbiBuilderPrivate::AddReferenceData(const BamRecord& b, const uint32_t currentRow)
+{
+    // reference tracking is off (or was already abandoned)
+    if (!refDataBuilder_) return;
+
+    const bool stillSorted = refDataBuilder_->AddRecord(b, currentRow);
+    if (!stillSorted) refDataBuilder_.reset(nullptr);
+}
+
+void PbiBuilderPrivate::Close()
+{
+    // open PBI file for writing
+    OpenPbiFile();
+    auto* bgzf = outFile_.get();
+
+    // header section
+    WritePbiHeader(bgzf);
+
+    // 'basic' data section
+    WriteFromTempFile(rgIdFile_, bgzf);
+    WriteFromTempFile(qStartFile_, bgzf);
+    WriteFromTempFile(qEndFile_, bgzf);
+    WriteFromTempFile(holeNumFile_, bgzf);
+    WriteFromTempFile(readQualFile_, bgzf);
+    WriteFromTempFile(ctxtFile_, bgzf);
+    WriteFromTempFile(fileOffsetFile_, bgzf);
+
+    // mapped data section
+    if (hasMappedData_) {
+        WriteFromTempFile(tIdFile_, bgzf);
+        WriteFromTempFile(tStartFile_, bgzf);
+        WriteFromTempFile(tEndFile_, bgzf);
+        WriteFromTempFile(aStartFile_, bgzf);
+        WriteFromTempFile(aEndFile_, bgzf);
+        WriteFromTempFile(revStrandFile_, bgzf);
+        WriteFromTempFile(nMFile_, bgzf);
+        WriteFromTempFile(nMMFile_, bgzf);
+        WriteFromTempFile(mapQualFile_, bgzf);
+    }
+
+    // reference data section
+    if (refDataBuilder_) WriteReferenceData(bgzf);
+
+    // barcode data section
+    if (hasBarcodeData_) {
+        WriteFromTempFile(bcForwardFile_, bgzf);
+        WriteFromTempFile(bcReverseFile_, bgzf);
+        WriteFromTempFile(bcQualFile_, bgzf);
+    }
+
+    // finally, set flag
+    isClosed_ = true;
+}
+
+void PbiBuilderPrivate::OpenPbiFile()
+{
+    // open file handle
+    const auto mode = std::string{"wb"} + std::to_string(static_cast<int>(compressionLevel_));
+    outFile_.reset(bgzf_open(pbiFilename_.c_str(), mode.c_str()));
+    if (outFile_ == nullptr) throw std::runtime_error{"could not open output file"};
+
+    // if no explicit thread count given, attempt built-in check
+    size_t actualNumThreads = numThreads_;
+    if (actualNumThreads == 0) {
+        actualNumThreads = std::thread::hardware_concurrency();
+
+        // if still unknown, default to single-threaded
+        if (actualNumThreads == 0) actualNumThreads = 1;
+    }
+
+    // if multithreading requested, enable it
+    if (actualNumThreads > 1) bgzf_mt(outFile_.get(), actualNumThreads, 256);
+}
+
+template <typename T>
+void PbiBuilderPrivate::WriteFromTempFile(PbiTempFile<T>& tempFile, BGZF* bgzf)
+{
+    using TempFileType = PbiTempFile<T>;
+    static constexpr const auto maxElementCount = TempFileType::MaxElementCount;
+
+    tempFile.Rewind();
+
+    size_t totalNumRead = 0;
+    for (size_t i = 0; totalNumRead < currentRow_; ++i) {
+        const auto numRead = tempFile.Read(maxElementCount);
+        auto& data = tempFile.Data();
+        WriteBgzfVector(bgzf, data, numRead);
+        totalNumRead += numRead;
+    }
+}
+
+void PbiBuilderPrivate::WritePbiHeader(BGZF* bgzf)
+{
+    // 'magic' string
+    static constexpr const std::array<char, 4> magic{{'P', 'B', 'I', '\1'}};
+    bgzf_write_safe(bgzf, magic.data(), 4);
+
+    PbiFile::Sections sections = PbiFile::BASIC;
+    if (hasMappedData_) sections |= PbiFile::MAPPED;
+    if (hasBarcodeData_) sections |= PbiFile::BARCODE;
+    if (refDataBuilder_) sections |= PbiFile::REFERENCE;
+
+    // version, pbi_flags, & n_reads
+    auto version = static_cast<uint32_t>(PbiFile::CurrentVersion);
+    uint16_t pbi_flags = sections;
+    auto numReads = currentRow_;
+    if (bgzf->is_be) {
+        version = ed_swap_4(version);
+        pbi_flags = ed_swap_2(pbi_flags);
+        numReads = ed_swap_4(numReads);
+    }
+    bgzf_write_safe(bgzf, &version, 4);
+    bgzf_write_safe(bgzf, &pbi_flags, 2);
+    bgzf_write_safe(bgzf, &numReads, 4);
+
+    // reserved space
+    char reserved[18];
+    memset(reserved, 0, 18);
+    bgzf_write_safe(bgzf, reserved, 18);
+}
+
+// Delegates to the reference-data builder, which is dereferenced without a null
+// check, so callers must make sure it exists.
+void PbiBuilderPrivate::WriteReferenceData(BGZF* bgzf) { refDataBuilder_->WriteData(bgzf); }
+
+}  // namespace internal
+
+// --------------------------------------------
+// PbiBuilder - builder API
+// --------------------------------------------
+
+// Builder without reference tracking: delegates with zero reference sequences
+// and the coordinate-sorted flag off.
+PbiBuilder::PbiBuilder(const std::string& pbiFilename, const CompressionLevel compressionLevel,
+                       const size_t numThreads)
+    : PbiBuilder{pbiFilename, 0, false, compressionLevel, numThreads}
+{
+}
+
+// Builder for a known number of references: delegates with the coordinate-sorted
+// flag set whenever `numReferenceSequences` is greater than zero.
+PbiBuilder::PbiBuilder(const std::string& pbiFilename, const size_t numReferenceSequences,
+                       const CompressionLevel compressionLevel, const size_t numThreads)
+    : PbiBuilder{pbiFilename, numReferenceSequences, (numReferenceSequences > 0), compressionLevel,
+                 numThreads}
+{
+}
+
+// Fully-specified constructor: creates the private implementation object with
+// all of the given settings.
+PbiBuilder::PbiBuilder(const std::string& pbiFilename, const size_t numReferenceSequences,
+                       const bool isCoordinateSorted, const CompressionLevel compressionLevel,
+                       const size_t numThreads)
+    : d_{std::make_unique<internal::PbiBuilderPrivate>(
+          pbiFilename, numReferenceSequences, isCoordinateSorted, compressionLevel, numThreads)}
+{
+}
+
+// Empty body: destroying d_ runs the private implementation's destructor, which
+// writes the index if Close() has not been called (swallowing any exception).
+PbiBuilder::~PbiBuilder() noexcept {}
+
+// Adds `record`, located at `vOffset`, to the index by forwarding to the
+// private implementation.
+void PbiBuilder::AddRecord(const BamRecord& record, const int64_t vOffset)
+{
+    d_->AddRecord(record, vOffset);
+}
+
+// Writes the PBI file by forwarding to the private implementation; unlike the
+// destructor path, exceptions raised while writing reach the caller.
+void PbiBuilder::Close() { d_->Close(); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/PbiFile.cpp b/src/PbiFile.cpp

new file mode 100644 (file)

index 0000000..fe5fdc0
--- /dev/null
+++ b/src/PbiFile.cpp
@@ -0,0 +1,37 @@
+// File Description
+/// \file PbiFile.cpp
+/// \brief Implements the PbiFile methods.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/PbiFile.h"
+
+#include <cstddef>
+#include <cstdint>
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamReader.h"
+#include "pbbam/PbiBuilder.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Builds a PBI index for 'bamFile'.
+//
+// The index is written to bamFile.PacBioIndexFilename(). Every record of the BAM
+// file is read in order and registered with the builder, together with the
+// reader's virtual offset as sampled just before that record was fetched.
+void PbiFile::CreateFrom(const BamFile& bamFile,
+                         const PbiBuilder::CompressionLevel compressionLevel,
+                         const size_t numThreads)
+{
+    PbiBuilder builder(bamFile.PacBioIndexFilename(), bamFile.Header().Sequences().size(),
+                       compressionLevel, numThreads);
+    BamReader reader(bamFile);
+
+    BamRecord record;
+    for (int64_t recordOffset = reader.VirtualTell(); reader.GetNext(record);
+         recordOffset = reader.VirtualTell()) {
+        builder.AddRecord(record, recordOffset);
+    }
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/PbiFilter.cpp b/src/PbiFilter.cpp

new file mode 100644 (file)

index 0000000..68be02f
--- /dev/null
+++ b/src/PbiFilter.cpp
@@ -0,0 +1,467 @@
+// File Description
+/// \file PbiFilter.cpp
+/// \brief Implements the PbiFilter class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/PbiFilter.h"
+
+#include <algorithm>
+#include <cctype>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+
+#include <boost/algorithm/string/case_conv.hpp>
+#include <boost/algorithm/string/trim.hpp>
+#include <boost/numeric/conversion/cast.hpp>
+
+#include "FileUtils.h"
+#include "StringUtils.h"
+#include "pbbam/PbiFilterTypes.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// clang-format off
+// Codes for the built-in filter kinds that a DataSet property can select.
+// Not every code has a property name registered in builtInLookup.
+enum class BuiltIn
+{
+    AlignedEndFilter,
+    AlignedLengthFilter,
+    AlignedStartFilter,
+    AlignedStrandFilter,
+    BarcodeFilter,
+    BarcodeForwardFilter,
+    BarcodeQualityFilter,
+    BarcodeReverseFilter,
+    BarcodesFilter,
+    IdentityFilter,
+    LocalContextFilter,
+    MovieNameFilter,
+    NumDeletedBasesFilter,
+    NumInsertedBasesFilter,
+    NumMatchesFilter,
+    NumMismatchesFilter,
+    QueryEndFilter,
+    QueryLengthFilter,
+    QueryNameFilter,
+    QueryNamesFromFileFilter,
+    QueryStartFilter,
+    ReadAccuracyFilter,
+    ReadGroupFilter,
+    ReferenceEndFilter,
+    ReferenceIdFilter,
+    ReferenceNameFilter,
+    ReferenceStartFilter,
+    ZmwFilter
+};
+
+// Maps a DataSet property name to its built-in filter code.
+// FromDataSetProperty lower-cases the property name before the lookup, so every
+// key here is lower case. Several names may alias the same filter.
+static const std::unordered_map<std::string, BuiltIn> builtInLookup =
+{
+    {"ae", BuiltIn::AlignedEndFilter},
+    {"aend", BuiltIn::AlignedEndFilter},
+    {"alignedlength", BuiltIn::AlignedLengthFilter},
+    {"as", BuiltIn::AlignedStartFilter},
+    {"astart", BuiltIn::AlignedStartFilter},
+    {"readstart", BuiltIn::AlignedStartFilter},
+    {"bc", BuiltIn::BarcodeFilter},
+    {"barcode", BuiltIn::BarcodeFilter},
+    {"bcf", BuiltIn::BarcodeForwardFilter},
+    {"bq", BuiltIn::BarcodeQualityFilter},
+    {"bcq", BuiltIn::BarcodeQualityFilter},
+    {"bcr", BuiltIn::BarcodeReverseFilter},
+    {"accuracy", BuiltIn::IdentityFilter},
+    {"identity", BuiltIn::IdentityFilter},
+    {"cx", BuiltIn::LocalContextFilter},
+    {"movie", BuiltIn::MovieNameFilter},
+    {"qe", BuiltIn::QueryEndFilter},
+    {"qend", BuiltIn::QueryEndFilter},
+    {"length", BuiltIn::QueryLengthFilter},
+    {"querylength", BuiltIn::QueryLengthFilter},
+    {"qname", BuiltIn::QueryNameFilter},
+    {"qname_file", BuiltIn::QueryNamesFromFileFilter},
+    {"qs", BuiltIn::QueryStartFilter},
+    {"qstart", BuiltIn::QueryStartFilter},
+    {"rq", BuiltIn::ReadAccuracyFilter},
+    {"te", BuiltIn::ReferenceEndFilter},
+    {"tend", BuiltIn::ReferenceEndFilter},
+    {"rname", BuiltIn::ReferenceNameFilter},
+    {"ts", BuiltIn::ReferenceStartFilter},
+    {"tstart", BuiltIn::ReferenceStartFilter},
+    {"pos", BuiltIn::ReferenceStartFilter},
+    {"zm", BuiltIn::ZmwFilter},
+    {"zmw", BuiltIn::ZmwFilter}
+};
+
+// Flag names accepted by CreateLocalContextFilter when the property value is
+// given as '|'-separated names. Names are matched exactly as spelled here.
+static const std::unordered_map<std::string, LocalContextFlags> contextFlagNames =
+{
+    {"NO_LOCAL_CONTEXT", LocalContextFlags::NO_LOCAL_CONTEXT},
+    {"ADAPTER_BEFORE", LocalContextFlags::ADAPTER_BEFORE},
+    {"ADAPTER_AFTER", LocalContextFlags::ADAPTER_AFTER},
+    {"BARCODE_BEFORE", LocalContextFlags::BARCODE_BEFORE},
+    {"BARCODE_AFTER", LocalContextFlags::BARCODE_AFTER},
+    {"FORWARD_PASS", LocalContextFlags::FORWARD_PASS},
+    {"REVERSE_PASS", LocalContextFlags::REVERSE_PASS}
+};
+// clang-format on
+
+// helper methods (for handling maybe-list strings)
+// Returns true if 'value' starts with an opening bracket ('[', '(' or '{') and
+// ends with a closing bracket (']', ')' or '}'). The two brackets need not be of
+// the same kind. An empty string is never bracketed.
+static inline bool isBracketed(const std::string& value)
+{
+    // guard first: there is no first/last character to inspect on an empty string
+    if (value.empty()) return false;
+
+    static const std::string openBrackets = "[({";
+    static const std::string closeBrackets = "])}";
+    return openBrackets.find(value.front()) != std::string::npos &&
+           closeBrackets.find(value.back()) != std::string::npos;
+}
+
+// A value is treated as a list when it contains at least one comma.
+static inline bool isList(const std::string& value)
+{
+    return std::find(value.begin(), value.end(), ',') != value.end();
+}
+
+// Builds a barcode filter from a property value.
+//
+// One pair of enclosing brackets is stripped if present. A comma-separated value
+// must hold exactly two barcodes and yields a PbiBarcodesFilter; any other value
+// is parsed as a single barcode and yields a PbiBarcodeFilter.
+// Throws std::runtime_error on an empty value or a list that is not of size 2;
+// number parsing and the int16_t range check may throw as well.
+static PbiFilter CreateBarcodeFilter(std::string value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"empty value for barcode filter property"};
+
+    if (isBracketed(value)) {
+        value.erase(0, 1);
+        value.pop_back();
+    }
+
+    // single barcode
+    if (!isList(value))
+        return PbiBarcodeFilter{boost::numeric_cast<int16_t>(std::stoi(value)), compareType};
+
+    // pair of barcodes
+    const std::vector<std::string> parts = internal::Split(value, ',');
+    if (parts.size() != 2) throw std::runtime_error{"only 2 barcode values expected"};
+    const auto first = boost::numeric_cast<int16_t>(std::stoi(parts.at(0)));
+    const auto second = boost::numeric_cast<int16_t>(std::stoi(parts.at(1)));
+    return PbiBarcodesFilter{first, second, compareType};
+}
+
+// Builds a forward-barcode filter from a property value.
+//
+// One pair of enclosing brackets is stripped if present. A comma-separated value
+// becomes a whitelist of barcodes; any other value is parsed as a single barcode.
+// Throws std::runtime_error on an empty value; number parsing and the int16_t
+// range check may throw as well.
+static PbiFilter CreateBarcodeForwardFilter(std::string value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"empty value for barcode_forward filter property"};
+
+    if (isBracketed(value)) {
+        value.erase(0, 1);
+        value.pop_back();
+    }
+
+    // single barcode
+    if (!isList(value)) {
+        const auto barcode = boost::numeric_cast<int16_t>(std::stoi(value));
+        return PbiBarcodeForwardFilter{barcode, compareType};
+    }
+
+    // whitelist of barcodes; compareType is not passed along on this path
+    const std::vector<std::string> parts = internal::Split(value, ',');
+    std::vector<int16_t> whitelist;
+    whitelist.reserve(parts.size());
+    for (const std::string& part : parts)
+        whitelist.push_back(boost::numeric_cast<int16_t>(std::stoi(part)));
+    return PbiBarcodeForwardFilter{std::move(whitelist)};
+}
+
+// Builds a reverse-barcode filter from a property value.
+//
+// One pair of enclosing brackets is stripped if present. A comma-separated value
+// becomes a whitelist of barcodes; any other value is parsed as a single barcode.
+// Throws std::runtime_error on an empty value; number parsing and the int16_t
+// range check may throw as well.
+static PbiFilter CreateBarcodeReverseFilter(std::string value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"empty value for barcode_reverse filter property"};
+
+    if (isBracketed(value)) {
+        value.erase(0, 1);
+        value.pop_back();
+    }
+
+    // single barcode
+    if (!isList(value)) {
+        const auto barcode = boost::numeric_cast<int16_t>(std::stoi(value));
+        return PbiBarcodeReverseFilter{barcode, compareType};
+    }
+
+    // whitelist of barcodes; compareType is not passed along on this path
+    const std::vector<std::string> parts = internal::Split(value, ',');
+    std::vector<int16_t> whitelist;
+    whitelist.reserve(parts.size());
+    for (const std::string& part : parts)
+        whitelist.push_back(boost::numeric_cast<int16_t>(std::stoi(part)));
+    return PbiBarcodeReverseFilter{std::move(whitelist)};
+}
+
+// Builds a local-context filter from a property value.
+//
+// A value whose first character is a decimal digit is parsed as a raw integer
+// and cast to LocalContextFlags. Anything else is read as '|'-separated flag
+// names (surrounding whitespace ignored) that are OR-ed together.
+// Throws std::runtime_error on an empty value or an unknown flag name; integer
+// parsing may throw as well.
+static PbiFilter CreateLocalContextFilter(const std::string& value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"empty value for local context filter property"};
+
+    LocalContextFlags filterValue = LocalContextFlags::NO_LOCAL_CONTEXT;
+
+    // if raw integer
+    // (cast to unsigned char first: isdigit is undefined for negative char values)
+    if (std::isdigit(static_cast<unsigned char>(value.front())))
+        filterValue = static_cast<LocalContextFlags>(std::stoi(value));
+
+    // else interpret as flag names
+    else {
+        std::vector<std::string> tokens = internal::Split(value, '|');
+        for (std::string& token : tokens) {
+            boost::algorithm::trim(token);  // trim whitespace
+
+            // report the offending name instead of a bare out_of_range from at()
+            const auto flagFound = contextFlagNames.find(token);
+            if (flagFound == contextFlagNames.end())
+                throw std::runtime_error{"unknown local context flag: " + token};
+            filterValue = (filterValue | flagFound->second);
+        }
+    }
+
+    return PbiFilter{PbiLocalContextFilter{filterValue, compareType}};
+}
+
+// Builds a movie-name filter from a property value.
+//
+// One pair of enclosing brackets is stripped if present. A comma-separated value
+// becomes a whitelist of movie names, which only supports EQUAL / NOT_EQUAL.
+// Throws std::runtime_error on an empty value or an unsupported compare type.
+static PbiFilter CreateMovieNameFilter(std::string value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"empty value for movie property"};
+
+    if (isBracketed(value)) value = value.substr(1, value.size() - 2);
+
+    // single movie name
+    if (!isList(value)) return PbiMovieNameFilter{value, compareType};
+
+    // whitelist of movie names
+    const bool isEqualityCompare =
+        (compareType == Compare::EQUAL) || (compareType == Compare::NOT_EQUAL);
+    if (!isEqualityCompare) throw std::runtime_error{"unsupported compare type on movie property"};
+
+    std::vector<std::string> movieNames = internal::Split(value, ',');
+    return PbiMovieNameFilter{std::move(movieNames), compareType};
+}
+
+// Builds a query-name filter from a file holding one query name per line.
+//
+// 'value' is resolved to a path through dataset.ResolvePath(). Only EQUAL and
+// NOT_EQUAL are supported; any other compare type throws std::runtime_error.
+// NOTE(review): the stream is not checked after opening, so a file that cannot
+// be read produces an empty whitelist rather than an error.
+static PbiFilter CreateQueryNamesFilterFromFile(const std::string& value, const DataSet& dataset, const Compare::Type compareType)
+{
+    const bool isEqualityCompare =
+        (compareType == Compare::EQUAL) || (compareType == Compare::NOT_EQUAL);
+    if (!isEqualityCompare)
+        throw std::runtime_error{"unsupported compare type on query name property"};
+
+    // resolve file from dataset, value
+    const std::string resolvedFilename = dataset.ResolvePath(value);
+    std::ifstream input(resolvedFilename);
+
+    // every line is taken verbatim as one query name
+    std::vector<std::string> queryNames;
+    for (std::string line; std::getline(input, line);)
+        queryNames.push_back(line);
+    return PbiQueryNameFilter{queryNames, compareType};
+}
+
+// Builds a query-name filter from a property value.
+//
+// The value is first tried as a filename, resolved through dataset.ResolvePath():
+// if that path exists, the query names are read from the file. Otherwise the
+// value is a single query name or, after stripping one pair of enclosing
+// brackets, a comma-separated list of names. Lists only support EQUAL /
+// NOT_EQUAL.
+// Throws std::runtime_error on an empty value or an unsupported compare type.
+static PbiFilter CreateQueryNameFilter(std::string value, const DataSet& dataset, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"empty value for query name property"};
+
+    // try possible filename first
+    // (test the resolved path: that is the file CreateQueryNamesFilterFromFile opens)
+    const std::string resolvedFilename = dataset.ResolvePath(value);
+    if (internal::FileUtils::Exists(resolvedFilename))
+        return CreateQueryNamesFilterFromFile(value, dataset, compareType);
+
+    // otherwise "normal" qname (single, or list)
+
+    if (isBracketed(value)) {
+        value.erase(0, 1);
+        value.pop_back();
+    }
+
+    if (isList(value)) {
+
+        if (compareType != Compare::EQUAL && compareType != Compare::NOT_EQUAL)
+            throw std::runtime_error{"unsupported compare type on query name property"};
+
+        std::vector<std::string> tokens = internal::Split(value, ',');
+        return PbiQueryNameFilter{std::move(tokens), compareType};
+    } else
+        return PbiQueryNameFilter{value, compareType};
+}
+
+// Builds a read-group filter from a property value.
+//
+// One pair of enclosing brackets is stripped if present. A comma-separated value
+// becomes a whitelist of read groups, which only supports EQUAL / NOT_EQUAL.
+// Throws std::runtime_error on an empty value or an unsupported compare type.
+static PbiFilter CreateReadGroupFilter(std::string value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"empty value for read group property"};
+
+    if (isBracketed(value)) value = value.substr(1, value.size() - 2);
+
+    // single read group
+    if (!isList(value)) return PbiReadGroupFilter{value, compareType};
+
+    // whitelist of read groups
+    const bool isEqualityCompare =
+        (compareType == Compare::EQUAL) || (compareType == Compare::NOT_EQUAL);
+    if (!isEqualityCompare)
+        throw std::runtime_error{"unsupported compare type on read group property"};
+
+    std::vector<std::string> readGroups = internal::Split(value, ',');
+    return PbiReadGroupFilter{std::move(readGroups), compareType};
+}
+
+// Builds a reference-ID filter from a property value.
+//
+// One pair of enclosing brackets is stripped if present. A comma-separated value
+// becomes a whitelist of IDs, which only supports EQUAL / NOT_EQUAL; any other
+// value is parsed as a single ID.
+// Throws std::runtime_error on an empty value or an unsupported compare type;
+// number parsing may throw as well.
+static PbiFilter CreateReferenceIdFilter(std::string value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"empty value for reference ID property"};
+
+    if (isBracketed(value)) {
+        value.erase(0, 1);
+        value.pop_back();
+    }
+
+    // single ID
+    if (!isList(value)) {
+        const auto id = boost::numeric_cast<int32_t>(std::stoi(value));
+        return PbiReferenceIdFilter{id, compareType};
+    }
+
+    // whitelist of IDs
+    const bool isEqualityCompare =
+        (compareType == Compare::EQUAL) || (compareType == Compare::NOT_EQUAL);
+    if (!isEqualityCompare)
+        throw std::runtime_error{"unsupported compare type on reference name ID property"};
+
+    const std::vector<std::string> parts = internal::Split(value, ',');
+    std::vector<int32_t> whitelist;
+    whitelist.reserve(parts.size());
+    for (const std::string& part : parts)
+        whitelist.push_back(boost::numeric_cast<int32_t>(std::stoi(part)));
+    return PbiReferenceIdFilter{std::move(whitelist), compareType};
+}
+
+// Builds a reference-name filter from a property value.
+//
+// One pair of enclosing brackets is stripped if present. A comma-separated value
+// becomes a whitelist of names, which only supports EQUAL / NOT_EQUAL.
+// Throws std::runtime_error on an empty value or an unsupported compare type.
+static PbiFilter CreateReferenceNameFilter(std::string value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"empty value for reference name property"};
+
+    if (isBracketed(value)) value = value.substr(1, value.size() - 2);
+
+    // single reference name
+    if (!isList(value)) return PbiReferenceNameFilter{value, compareType};
+
+    // whitelist of reference names
+    const bool isEqualityCompare =
+        (compareType == Compare::EQUAL) || (compareType == Compare::NOT_EQUAL);
+    if (!isEqualityCompare)
+        throw std::runtime_error{"unsupported compare type on reference name property"};
+
+    std::vector<std::string> referenceNames = internal::Split(value, ',');
+    return PbiReferenceNameFilter{std::move(referenceNames), compareType};
+}
+
+// Builds a ZMW filter from a property value.
+//
+// One pair of enclosing brackets is stripped if present. A comma-separated value
+// becomes a whitelist of ZMW numbers; any other value is parsed as a single ZMW.
+// Throws std::runtime_error on an empty value; number parsing may throw as well.
+static PbiFilter CreateZmwFilter(std::string value, const Compare::Type compareType)
+{
+    if (value.empty()) throw std::runtime_error{"empty value for ZMW filter property"};
+
+    if (isBracketed(value)) {
+        value.erase(0, 1);
+        value.pop_back();
+    }
+
+    // single ZMW
+    if (!isList(value)) {
+        const auto zmw = boost::numeric_cast<int32_t>(std::stoi(value));
+        return PbiZmwFilter{zmw, compareType};
+    }
+
+    // whitelist of ZMWs; compareType is not passed along on this path
+    const std::vector<std::string> parts = internal::Split(value, ',');
+    std::vector<int32_t> whitelist;
+    whitelist.reserve(parts.size());
+    for (const std::string& part : parts)
+        whitelist.push_back(boost::numeric_cast<int32_t>(std::stoi(part)));
+    return PbiZmwFilter{std::move(whitelist)};
+}
+
+// Builds a ZMW modulo filter from a "zm" property carrying "Modulo" and "Hash"
+// attributes.
+//
+// "Hash" is matched case-insensitively against "uint32cast" and
+// "boosthashcombine". "Modulo" and the property value are parsed as unsigned
+// integers. The resulting filter always compares with Compare::EQUAL.
+// Throws std::runtime_error if the property is not "zm", lacks either attribute,
+// or names an unsupported hash type; number parsing may throw as well.
+// NOTE(review): a "Modulo" of 0 is not rejected here - confirm the filter copes.
+static PbiFilter CreateZmwModuloFilter(const Property& property)
+{
+    if (!property.HasAttribute("Modulo") || !property.HasAttribute("Hash") ||
+        property.Name() != "zm")
+    {
+        throw std::runtime_error{"Modulo filter not supported on property: " + property.Name()};
+    }
+
+    const auto hashType = property.Attribute("Hash");
+    const FilterHash hash = [&hashType]()
+    {
+        // lower-case once, compare against every supported spelling
+        const auto lowered = boost::algorithm::to_lower_copy(hashType);
+        if (lowered == "uint32cast")
+            return FilterHash::UNSIGNED_LONG_CAST;
+        if (lowered == "boosthashcombine")
+            return FilterHash::BOOST_HASH_COMBINE;
+        throw std::runtime_error{"unsupported hash type: " + hashType};
+    }();
+
+    const uint32_t denom = std::stoul(property.Attribute("Modulo"));
+    const uint32_t value = std::stoul(property.Value());
+
+    return PbiZmwModuloFilter{ denom, value, hash, Compare::EQUAL };
+}
+
+// Translates one DataSet XML <Property> element into a PbiFilter.
+//
+// A "zm" property with a "Modulo" attribute becomes a ZMW modulo filter. Every
+// other property is looked up by its lower-cased name in builtInLookup and
+// dispatched to the matching filter, using the compare type derived from the
+// property's operator.
+//
+// Any failure (unknown name, unparsable value, unsupported compare type, ...) is
+// rethrown as std::runtime_error whose message lists the property's name, value
+// and operator, followed by the underlying reason.
+static PbiFilter FromDataSetProperty(const Property& property, const DataSet& dataset)
+{
+    try {
+        const std::string& value = property.Value();
+
+        if (property.Name() == "zm" && property.HasAttribute("Modulo"))
+            return CreateZmwModuloFilter(property);
+
+        const Compare::Type compareType = Compare::TypeFromOperator(property.Operator());
+
+        // look the name up explicitly so an unknown property gets a readable reason
+        // rather than the library's generic out_of_range text
+        const auto builtInFound =
+            builtInLookup.find(boost::algorithm::to_lower_copy(property.Name()));
+        if (builtInFound == builtInLookup.end())
+            throw std::runtime_error{"unsupported filter property name"};
+        const BuiltIn builtInCode = builtInFound->second;
+
+        // clang-format off
+        switch (builtInCode) {
+
+            // single-value filters
+            case BuiltIn::AlignedEndFilter     : return PbiAlignedEndFilter{ static_cast<uint32_t>(std::stoul(value)), compareType };
+            case BuiltIn::AlignedLengthFilter  : return PbiAlignedLengthFilter{ static_cast<uint32_t>(std::stoul(value)), compareType };
+            case BuiltIn::AlignedStartFilter   : return PbiAlignedStartFilter{ static_cast<uint32_t>(std::stoul(value)), compareType };
+            case BuiltIn::BarcodeQualityFilter : return PbiBarcodeQualityFilter{ static_cast<uint8_t>(std::stoul(value)), compareType };
+            case BuiltIn::IdentityFilter       : return PbiIdentityFilter{ std::stof(value), compareType };
+            case BuiltIn::QueryEndFilter       : return PbiQueryEndFilter{ std::stoi(value), compareType };
+            case BuiltIn::QueryLengthFilter    : return PbiQueryLengthFilter{ std::stoi(value), compareType };
+            case BuiltIn::QueryStartFilter     : return PbiQueryStartFilter{ std::stoi(value), compareType };
+            case BuiltIn::ReadAccuracyFilter   : return PbiReadAccuracyFilter{ std::stof(value), compareType };
+            case BuiltIn::ReferenceEndFilter   : return PbiReferenceEndFilter{ static_cast<uint32_t>(std::stoul(value)), compareType };
+            case BuiltIn::ReferenceStartFilter : return PbiReferenceStartFilter{ static_cast<uint32_t>(std::stoul(value)), compareType };
+
+            // (maybe) list-value filters
+            case BuiltIn::BarcodeFilter        : return CreateBarcodeFilter(value, compareType);
+            case BuiltIn::BarcodeForwardFilter : return CreateBarcodeForwardFilter(value, compareType);
+            case BuiltIn::BarcodeReverseFilter : return CreateBarcodeReverseFilter(value, compareType);
+            case BuiltIn::LocalContextFilter   : return CreateLocalContextFilter(value, compareType);
+            case BuiltIn::MovieNameFilter      : return CreateMovieNameFilter(value, compareType);
+            case BuiltIn::QueryNameFilter      : return CreateQueryNameFilter(value, dataset, compareType);
+            case BuiltIn::ReadGroupFilter      : return CreateReadGroupFilter(value, compareType);
+            case BuiltIn::ReferenceIdFilter    : return CreateReferenceIdFilter(value, compareType);
+            case BuiltIn::ReferenceNameFilter  : return CreateReferenceNameFilter(value, compareType);
+            case BuiltIn::ZmwFilter            : return CreateZmwFilter(value, compareType);
+
+            // other built-ins
+            case BuiltIn::QueryNamesFromFileFilter : return CreateQueryNamesFilterFromFile(value, dataset, compareType);
+
+            // built-in codes that have no filter wired up here
+            default :
+            throw std::runtime_error{"built-in filter is not supported as a DataSet property"};
+        }
+        // clang-format on
+
+        // unreachable
+        return PbiFilter{};
+
+    } catch (const std::exception& e) {
+        // wrap the cause with the offending property so the user can locate it
+        std::ostringstream s;
+        s << "error: could not create filter from XML Property element:\n"
+          << "  Name:     " << property.Name() << '\n'
+          << "  Value:    " << property.Value() << '\n'
+          << "  Operator: " << property.Operator() << '\n'
+          << "  reason:   " << e.what() << '\n';
+        throw std::runtime_error{s.str()};
+    }
+}
+
+}  // namespace internal
+
+// Builds the filter described by a DataSet's XML <Filter> elements.
+//
+// Each <Filter> contributes one default-constructed composite holding all of its
+// properties; those composites are then gathered into a single UNION filter.
+PbiFilter PbiFilter::FromDataSet(const DataSet& dataset)
+{
+    PbiFilter combined{PbiFilter::UNION};
+    for (const auto& filterElement : dataset.Filters()) {
+        PbiFilter perFilter;
+        for (const auto& propertyElement : filterElement.Properties()) {
+            perFilter.Add(internal::FromDataSetProperty(propertyElement, dataset));
+        }
+        combined.Add(perFilter);
+    }
+    return combined;
+}
+
+// Returns an INTERSECT composite holding the given filters.
+PbiFilter PbiFilter::Intersection(std::vector<PbiFilter> filters)
+{
+    PbiFilter composite{PbiFilter::INTERSECT};
+    composite.Add(std::move(filters));
+    return composite;
+}
+
+// Returns a UNION composite holding the given filters.
+PbiFilter PbiFilter::Union(std::vector<PbiFilter> filters)
+{
+    PbiFilter composite{PbiFilter::UNION};
+    composite.Add(std::move(filters));
+    return composite;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/PbiFilterQuery.cpp b/src/PbiFilterQuery.cpp

new file mode 100644 (file)

index 0000000..31f3325
--- /dev/null
+++ b/src/PbiFilterQuery.cpp
@@ -0,0 +1,46 @@
+// File Description
+/// \file PbiFilterQuery.cpp
+/// \brief Implements the PbiFilterQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/PbiFilterQuery.h"
+
+#include <iostream>
+
+#include "pbbam/CompositeBamReader.h"
+#include "pbbam/MakeUnique.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Private implementation of PbiFilterQuery: owns the composite reader that
+// applies the filter to the dataset.
+struct PbiFilterQuery::PbiFilterQueryPrivate
+{
+    // Constructs the reader directly from the filter and dataset.
+    PbiFilterQueryPrivate(const PbiFilter& filter, const DataSet& dataset)
+        : reader_{filter, dataset}
+    {
+    }
+
+    // Compare::None: no ordering is imposed on the records (unsorted).
+    PbiFilterCompositeBamReader<Compare::None> reader_;
+};
+
+// Queries 'dataset' using the filter described by the dataset itself.
+PbiFilterQuery::PbiFilterQuery(const DataSet& dataset)
+    : PbiFilterQuery(PbiFilter::FromDataSet(dataset), dataset)
+{
+}
+
+// Queries 'dataset' using an explicitly supplied filter.
+PbiFilterQuery::PbiFilterQuery(const PbiFilter& filter, const DataSet& dataset)
+    : internal::IQuery()
+    , d_(std::make_unique<PbiFilterQueryPrivate>(filter, dataset))
+{
+}
+
+// Defined out-of-line so that d_ is destroyed where PbiFilterQueryPrivate is complete.
+PbiFilterQuery::~PbiFilterQuery() = default;
+
+// Fetches the next record from the underlying reader into 'r' and returns the
+// reader's result.
+bool PbiFilterQuery::GetNext(BamRecord& r)
+{
+    auto& reader = d_->reader_;
+    return reader.GetNext(r);
+}
+
+// Returns the read count reported by the underlying reader.
+uint32_t PbiFilterQuery::NumReads() const
+{
+    const auto& reader = d_->reader_;
+    return reader.NumReads();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/PbiFilterTypes.cpp b/src/PbiFilterTypes.cpp

new file mode 100644 (file)

index 0000000..6f2f359
--- /dev/null
+++ b/src/PbiFilterTypes.cpp
@@ -0,0 +1,387 @@
+// File Description
+/// \file PbiFilterTypes.cpp
+/// \brief Implements the built-in PBI filters.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/PbiFilterTypes.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+
+#include <boost/algorithm/string.hpp>
+
+#include "StringUtils.h"
+#include "pbbam/MakeUnique.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Returns the indices i for which (end[i] - start[i]) compares to 'value' as
+// requested by 'cmp'. 'start' and 'end' must have the same size.
+// Throws std::runtime_error on a compare type other than the six relational ones.
+template <typename T>
+IndexList readLengthHelper(const std::vector<T>& start, const std::vector<T>& end, const T& value,
+                           const Compare::Type cmp)
+{
+    assert(start.size() == end.size());
+
+    IndexList matches;
+    const size_t count = start.size();
+    for (size_t row = 0; row < count; ++row) {
+        const auto length = end[row] - start[row];
+
+        bool accepted = false;
+        switch (cmp) {
+            case Compare::EQUAL:
+                accepted = (length == value);
+                break;
+            case Compare::NOT_EQUAL:
+                accepted = (length != value);
+                break;
+            case Compare::LESS_THAN:
+                accepted = (length < value);
+                break;
+            case Compare::LESS_THAN_EQUAL:
+                accepted = (length <= value);
+                break;
+            case Compare::GREATER_THAN:
+                accepted = (length > value);
+                break;
+            case Compare::GREATER_THAN_EQUAL:
+                accepted = (length >= value);
+                break;
+            default:
+                // trips in debug builds; release builds report the error instead
+                assert(false);
+                throw std::runtime_error{"read length filter encountered unknown Compare::Type: " +
+                                         Compare::TypeToName(cmp)};
+        }
+
+        if (!accepted) continue;
+        matches.push_back(row);
+    }
+    return matches;
+}
+
+// Builds a filter that selects the read groups belonging to a movie name.
+//
+// "transcript" maps to the single transcript read group. For any other movie
+// name the read type is not known up front, so the result is a UNION over the
+// read group IDs of each candidate read type, with CCS added on request.
+static PbiFilter filterFromMovieName(const std::string& movieName, bool includeCcs)
+{
+    // Transcript reads all share one read group ID: compute it once and reuse it.
+    if (movieName == "transcript") {
+        static const auto transcriptRgId = MakeReadGroupId("transcript", "TRANSCRIPT");
+        return PbiFilter{PbiReadGroupFilter{transcriptRgId}};
+    }
+
+    // Any other movie: accept every read group ID the movie could have produced.
+    static const char* const candidateReadTypes[] = {"POLYMERASE", "HQREGION", "SUBREAD", "SCRAP",
+                                                     "UNKNOWN"};
+    std::vector<PbiFilter> candidates;
+    for (const char* const readType : candidateReadTypes)
+        candidates.emplace_back(PbiReadGroupFilter{MakeReadGroupId(movieName, readType)});
+
+    PbiFilter filter{PbiFilter::UNION};
+    filter.Add(std::move(candidates));
+    if (includeCcs) filter.Add(PbiReadGroupFilter{MakeReadGroupId(movieName, "CCS")});
+
+    return filter;
+}
+
+}  // namespace internal
+
+// PbiAlignedLengthFilter
+
+// Accepts a row when its aligned length (aEnd - aStart) satisfies the filter's
+// comparison. Row access is bounds-checked.
+bool PbiAlignedLengthFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    const auto& mapped = idx.MappedData();
+    const auto& alignedEnd = mapped.aEnd_.at(row);
+    const auto& alignedStart = mapped.aStart_.at(row);
+    return CompareHelper(alignedEnd - alignedStart);
+}
+
+// PbiIdentityFilter
+
+// Accepts a row when its identity satisfies the filter's comparison, where
+//   identity = 1 - (mismatches + deletions + insertions) / (qEnd - qStart).
+// Row access is bounds-checked.
+// NOTE(review): a row with qEnd == qStart divides by zero in float arithmetic -
+// confirm such rows cannot occur for mapped data.
+bool PbiIdentityFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    // non-matching bases come from the mapped section
+    const auto& mapped = idx.MappedData();
+    const auto& numMismatches = mapped.nMM_.at(row);
+    const auto& indels = mapped.NumDeletedAndInsertedBasesAt(row);
+    const auto& numDeleted = indels.first;
+    const auto& numInserted = indels.second;
+
+    // read extent comes from the basic section
+    const auto& basic = idx.BasicData();
+    const auto& queryStart = basic.qStart_.at(row);
+    const auto& queryEnd = basic.qEnd_.at(row);
+
+    const float length = queryEnd - queryStart;
+    const float differences = numMismatches + numDeleted + numInserted;
+    return CompareHelper(1.0f - (differences / length));
+}
+
+// PbiMovieNameFilter
+
+// Single movie name: the composite filter covers that movie's candidate read
+// groups, CCS included.
+PbiMovieNameFilter::PbiMovieNameFilter(const std::string& movieName, const Compare::Type cmp)
+    : compositeFilter_{internal::filterFromMovieName(movieName, /* includeCcs = */ true)}
+    , cmp_{cmp}
+{
+}
+
+// Movie name whitelist: the composite filter is a UNION of the per-movie
+// filters, each covering that movie's candidate read groups, CCS included.
+PbiMovieNameFilter::PbiMovieNameFilter(const std::vector<std::string>& whitelist,
+                                       const Compare::Type cmp)
+    : compositeFilter_{PbiFilter::UNION}
+    , cmp_{cmp}
+{
+    for (const std::string& name : whitelist) {
+        compositeFilter_.Add(internal::filterFromMovieName(name, /* includeCcs = */ true));
+    }
+}
+
+// PbiQueryLengthFilter
+
+// Accepts a row when its query length (qEnd - qStart) satisfies the filter's
+// comparison. Row access is bounds-checked.
+bool PbiQueryLengthFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    const auto& basic = idx.BasicData();
+    const auto& queryStart = basic.qStart_.at(row);
+    const auto& queryEnd = basic.qEnd_.at(row);
+    return CompareHelper(queryEnd - queryStart);
+}
+
+// PbiQueryNameFilter
+
+// Private implementation of PbiQueryNameFilter.
+//
+// The requested query names are broken down into a nested lookup:
+//   read group ID -> ZMW (hole number) -> set of (queryStart, queryEnd) pairs
+// so that a PBI row can be tested without rebuilding its name.
+struct PbiQueryNameFilter::PbiQueryNameFilterPrivate
+{
+public:
+    using QueryInterval = std::pair<int32_t, int32_t>;
+    using QueryIntervals = std::set<QueryInterval>;
+    using ZmwLookup = std::unordered_map<int32_t, QueryIntervals>;
+    using ZmwLookupPtr = std::shared_ptr<ZmwLookup>;  // may be shared by more than one rgId
+    using RgIdLookup = std::unordered_map<int32_t, ZmwLookupPtr>;
+
+public:
+    // Builds the lookup from a list of query names.
+    // Names starting with "transcript/" are handled as transcripts, names
+    // containing "/ccs" as CCS reads, and everything else as a read of unknown
+    // type. Throws std::runtime_error on a malformed name (see HandleName).
+    PbiQueryNameFilterPrivate(const std::vector<std::string>& whitelist,
+                              const Compare::Type cmp = Compare::EQUAL)
+        : cmp_{cmp}
+    {
+        for (const auto& queryName : whitelist) {
+
+            if (queryName.find("transcript/") == 0)
+                HandleName(queryName, RecordType::TRANSCRIPT);
+            else if (queryName.find("/ccs") != std::string::npos)
+                HandleName(queryName, RecordType::CCS);
+            else
+                HandleName(queryName, RecordType::UNKNOWN);
+        }
+    }
+
+    // Copies lookup and compare type from 'other'. When 'other' is null, the
+    // lookup stays empty and cmp_ keeps its default (Compare::EQUAL).
+    PbiQueryNameFilterPrivate(const std::unique_ptr<PbiQueryNameFilterPrivate>& other)
+    {
+        if (other) {
+            lookup_ = other->lookup_;
+            cmp_ = other->cmp_;
+        }
+    }
+
+    // Tests the PBI row against the lookup.
+    // Throws std::runtime_error if the compare type is neither EQUAL nor
+    // NOT_EQUAL; row access is bounds-checked.
+    // NOTE(review): rows whose read group or ZMW is missing from the lookup are
+    // rejected even when cmp_ is NOT_EQUAL - confirm this is intended.
+    bool Accepts(const PbiRawData& idx, const size_t row) const
+    {
+        const auto& basicData = idx.BasicData();
+
+        // see if row's RGID known
+        const auto& rgId = basicData.rgId_.at(row);
+        const auto rgFound = lookup_.find(rgId);
+        if (rgFound == lookup_.end()) return false;
+
+        // see if row's ZMW known
+        const auto& zmwPtr = rgFound->second;
+        const auto zmw = basicData.holeNumber_.at(row);
+        const auto zmwFound = zmwPtr->find(zmw);
+        if (zmwFound == zmwPtr->end()) return false;
+
+        // see if row's QueryStart/QueryEnd known
+        // CCS names already covered in lookup construction phase
+        const auto& queryIntervals = zmwFound->second;
+        const auto qStart = basicData.qStart_.at(row);
+        const auto qEnd = basicData.qEnd_.at(row);
+        const auto queryInterval = std::make_pair(qStart, qEnd);
+
+        const bool found = queryIntervals.find(queryInterval) != queryIntervals.end();
+        if (cmp_ == Compare::EQUAL)
+            return found;
+        else if (cmp_ == Compare::NOT_EQUAL)
+            return !found;
+        else
+            throw std::runtime_error{"unsupported compare type on query name filter"};
+    }
+
+    // Returns the numeric read group IDs a record of the given type could carry
+    // for this movie: exactly one for CCS and transcript reads, otherwise one
+    // per candidate read type.
+    std::vector<int32_t> CandidateRgIds(const std::string& movieName, const RecordType type)
+    {
+        if (type == RecordType::CCS)
+            return {ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "CCS"))};
+
+        if (type == RecordType::TRANSCRIPT)
+            return {ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "TRANSCRIPT"))};
+
+        // we can't know for sure from QNAME alone
+        return {ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "POLYMERASE")),
+                ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "HQREGION")),
+                ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "SUBREAD")),
+                ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "SCRAP")),
+                ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "UNKNOWN")),
+                ReadGroupInfo::IdToInt(MakeReadGroupId(movieName, "ZMW"))};
+    }
+
+    // Parses one query name and records it in the lookup.
+    // CCS/transcript names must have 2 '/'-separated parts and are stored with
+    // the interval (-1, -1); all other names must have 3 parts, the last being
+    // "<queryStart>_<queryEnd>". Throws std::runtime_error on any other shape;
+    // number parsing may throw as well.
+    void HandleName(const std::string& queryName, const RecordType type)
+    {
+        // split name into main parts
+        const auto nameParts = internal::Split(queryName, '/');
+
+        // verify syntax
+        if (IsCcsOrTranscript(type)) {
+            if (nameParts.size() != 2) {
+                const auto typeName = (type == RecordType::CCS) ? "CCS" : "transcript";
+                throw std::runtime_error{"PbiQueryNameFilter error: requested QNAME (" + queryName +
+                                         ") is not valid for PacBio " + typeName +
+                                         " reads. See spec for details."};
+            }
+        } else {
+            if (nameParts.size() != 3) {
+                throw std::runtime_error{"PbiQueryNameFilter error: requested QNAME (" + queryName +
+                                         ") is not a valid PacBio BAM QNAME. See spec for details"};
+            }
+        }
+
+        // generate candidate read group IDs from movie name & record type, then
+        // add to lookup table
+        const auto zmwPtr = UpdateRgLookup(CandidateRgIds(nameParts.at(0), type));
+
+        // add qStart/qEnd interval to zmw lookup
+        const auto zmw = std::stoi(nameParts.at(1));
+        if (IsCcsOrTranscript(type))
+            UpdateZmwQueryIntervals(zmwPtr.get(), zmw, -1, -1);
+        else {
+            const auto queryIntervalParts = Split(nameParts.at(2), '_');
+            if (queryIntervalParts.size() != 2) {
+                throw std::runtime_error{"PbiQueryNameFilter error: requested QNAME (" + queryName +
+                                         ") is not a valid PacBio BAM QNAME. See spec for details"};
+            }
+            UpdateZmwQueryIntervals(zmwPtr.get(), zmw, std::stoi(queryIntervalParts.at(0)),
+                                    std::stoi(queryIntervalParts.at(1)));
+        }
+    }
+
+    // Returns the ZMW lookup shared by all of the given read group IDs,
+    // creating it and registering it under every ID if the first one is not
+    // yet known. 'rgIds' must not be empty.
+    ZmwLookupPtr UpdateRgLookup(std::vector<int32_t>&& rgIds)
+    {
+        assert(!rgIds.empty());
+
+        ZmwLookupPtr zmwPtr;
+
+        const auto rgFound = lookup_.find(rgIds.front());
+        if (rgFound == lookup_.end()) {
+            zmwPtr = std::make_shared<ZmwLookup>();
+            for (const auto& rg : rgIds) {
+                assert(lookup_.find(rg) == lookup_.end());
+                lookup_.emplace(rg, zmwPtr);
+            }
+        } else {
+#ifndef NDEBUG
+            for (const auto& rg : rgIds)
+                assert(lookup_.find(rg) != lookup_.end());
+#endif
+            zmwPtr = rgFound->second;
+        }
+        return zmwPtr;
+    }
+
+    // add QS/QE pair to ZMW lookup
+    void UpdateZmwQueryIntervals(ZmwLookup* const zmwPtr, const int32_t zmw,
+                                 const int32_t queryStart, const int32_t queryEnd)
+    {
+        // operator[] creates the (empty) interval set on first use of this ZMW
+        (*zmwPtr)[zmw].emplace(queryStart, queryEnd);
+    }
+
+private:
+    RgIdLookup lookup_;
+    Compare::Type cmp_ = Compare::EQUAL;  // default keeps a copy-from-null object well-defined
+};
+
+// Single query name: handled as a whitelist holding just that one name.
+PbiQueryNameFilter::PbiQueryNameFilter(const std::string& qname, const Compare::Type cmp)
+    : d_(std::make_unique<PbiQueryNameFilter::PbiQueryNameFilterPrivate>(
+          std::vector<std::string>(1, qname), cmp))
+{
+}
+
+// Query name whitelist: the names are handed to the private implementation,
+// which builds its lookup from them.
+PbiQueryNameFilter::PbiQueryNameFilter(const std::vector<std::string>& whitelist,
+                                       const Compare::Type cmp)
+    : d_(std::make_unique<PbiQueryNameFilter::PbiQueryNameFilterPrivate>(whitelist, cmp))
+{
+}
+
+// Copy constructor: creates a fresh private implementation initialized from the
+// other filter's one.
+PbiQueryNameFilter::PbiQueryNameFilter(const PbiQueryNameFilter& other)
+    : d_(std::make_unique<PbiQueryNameFilter::PbiQueryNameFilterPrivate>(other.d_))
+{
+}
+
+// Defined out-of-line so that d_ is destroyed where PbiQueryNameFilterPrivate is complete.
+PbiQueryNameFilter::~PbiQueryNameFilter() = default;
+
+// Forwards the row test to the private implementation.
+bool PbiQueryNameFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    const auto& impl = *d_;
+    return impl.Accepts(idx, row);
+}
+
+// PbiReferenceNameFilter
+
+// Single reference name. Only Compare::EQUAL and Compare::NOT_EQUAL are
+// supported; any other compare type throws std::runtime_error.
+PbiReferenceNameFilter::PbiReferenceNameFilter(std::string rname, Compare::Type cmp)
+    : rname_{std::move(rname)}
+    , cmp_{cmp}
+{
+    const bool isEqualityCompare = (cmp == Compare::EQUAL) || (cmp == Compare::NOT_EQUAL);
+    if (isEqualityCompare) return;
+
+    throw std::runtime_error{
+        "Compare type: " + Compare::TypeToName(cmp) +
+        " not supported for PbiReferenceNameFilter (use one of Compare::EQUAL or "
+        "Compare::NOT_EQUAL)."};
+}
+
+// Reference name whitelist. The compare type is stored as given; it is not
+// validated here.
+PbiReferenceNameFilter::PbiReferenceNameFilter(std::vector<std::string> whitelist,
+                                               const Compare::Type cmp)
+    : rnameWhitelist_{std::move(whitelist)}
+    , cmp_{cmp}
+{
+}
+
+// Tests the row with the reference-ID sub-filter, which is built from 'idx' on
+// the first call.
+bool PbiReferenceNameFilter::Accepts(const PbiRawData& idx, const size_t row) const
+{
+    if (!initialized_) {
+        Initialize(idx);
+    }
+    return subFilter_.Accepts(idx, row);
+}
+
+// Builds the reference-ID sub-filter by translating the stored reference
+// name(s) into IDs, using the BAM file that belongs to the index.
+//
+// The BAM filename is the index filename minus its last 4 characters.
+// NOTE(review): this presumably strips a ".pbi" suffix - the suffix itself is
+// not verified here.
+void PbiReferenceNameFilter::Initialize(const PbiRawData& idx) const
+{
+    const auto indexFilename = idx.Filename();
+    const BamFile bamFile{indexFilename.substr(0, indexFilename.length() - 4)};
+
+    if (rnameWhitelist_) {
+        // multi-value whitelist
+        std::vector<int32_t> referenceIds;
+        for (const auto& name : *rnameWhitelist_)
+            referenceIds.push_back(bamFile.ReferenceId(name));
+        subFilter_ = PbiReferenceIdFilter{std::move(referenceIds), cmp_};
+    } else {
+        // single-value
+        const auto referenceId = bamFile.ReferenceId(rname_);
+        subFilter_ = PbiReferenceIdFilter{referenceId, cmp_};
+    }
+    initialized_ = true;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/PbiIndexIO.cpp b/src/PbiIndexIO.cpp

new file mode 100644 (file)

index 0000000..1e9a3bd
--- /dev/null
+++ b/src/PbiIndexIO.cpp
@@ -0,0 +1,393 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "PbiIndexIO.h"
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include <boost/algorithm/string.hpp>
+
+#include "MemoryUtils.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/EntireFileQuery.h"
+#include "pbbam/MoveAppend.h"
+#include "pbbam/PbiBuilder.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// Verifies that a PBI field holds the expected number of records.
+///
+/// \param container  field name, used only in the error message
+/// \param expected   number of records the field should hold
+/// \param observed   number of records actually present
+/// \throws std::runtime_error if the two counts differ
+static void CheckContainer(const std::string& container, const size_t expected,
+                           const size_t observed)
+{
+    if (observed == expected) return;
+
+    std::ostringstream text;
+    text << "PBI index error: expected " << expected << " records in " << container
+         << " field, but found " << observed << " instead";
+    throw std::runtime_error{text.str()};
+}
+
+/// Verifies that every barcode column holds exactly \p numReads entries.
+///
+/// \throws std::runtime_error (via CheckContainer) naming the first column whose size differs
+static void CheckExpectedSize(const PbiRawBarcodeData& barcodeData, const size_t numReads)
+{
+    CheckContainer("BarcodeData.bc_forward", numReads, barcodeData.bcForward_.size());
+    CheckContainer("BarcodeData.bc_reverse", numReads, barcodeData.bcReverse_.size());
+    // check the quality column itself (previously bcReverse_ was checked a second time)
+    CheckContainer("BarcodeData.bc_qual", numReads, barcodeData.bcQual_.size());
+}
+
+/// Verifies that every basic-data column holds exactly \p numReads entries.
+///
+/// \throws std::runtime_error (via CheckContainer) naming the first column whose size differs
+static void CheckExpectedSize(const PbiRawBasicData& basicData, const size_t numReads)
+{
+    const auto verify = [numReads](const char* field, const size_t observed) {
+        CheckContainer(field, numReads, observed);
+    };
+
+    verify("BasicData.rgId", basicData.rgId_.size());
+    verify("BasicData.qStart", basicData.qStart_.size());
+    verify("BasicData.qEnd", basicData.qEnd_.size());
+    verify("BasicData.holeNumber", basicData.holeNumber_.size());
+    verify("BasicData.readQual", basicData.readQual_.size());
+    verify("BasicData.ctxt_flag", basicData.ctxtFlag_.size());
+    verify("BasicData.fileOffset", basicData.fileOffset_.size());
+}
+
+/// Verifies that every mapped-data column holds exactly \p numReads entries.
+///
+/// \throws std::runtime_error (via CheckContainer) naming the first column whose size differs
+static void CheckExpectedSize(const PbiRawMappedData& mappedData, const size_t numReads)
+{
+    const auto verify = [numReads](const char* field, const size_t observed) {
+        CheckContainer(field, numReads, observed);
+    };
+
+    verify("MappedData.tId", mappedData.tId_.size());
+    verify("MappedData.tStart", mappedData.tStart_.size());
+    verify("MappedData.tEnd", mappedData.tEnd_.size());
+    verify("MappedData.aStart", mappedData.aStart_.size());
+    verify("MappedData.aEnd", mappedData.aEnd_.size());
+    verify("MappedData.revStrand", mappedData.revStrand_.size());
+    verify("MappedData.nM", mappedData.nM_.size());
+    verify("MappedData.nMM", mappedData.nMM_.size());
+    verify("MappedData.mapQV", mappedData.mapQV_.size());
+}
+
+// ---------------------------
+// PbiIndexIO implementation
+// ---------------------------
+
+/// Convenience overload: loads \p pbiFilename into a fresh PbiRawData and returns it.
+/// Any exception from the two-argument Load() propagates unchanged.
+PbiRawData PbiIndexIO::Load(const std::string& pbiFilename)
+{
+    PbiRawData loaded;
+    Load(loaded, pbiFilename);
+    return loaded;
+}
+
+/// Loads the PBI file \p filename into \p rawData.
+///
+/// The header is always read. The data sections are read only when the header reports at
+/// least one read, and the optional sections only when the header flags them as present.
+///
+/// \throws std::runtime_error if \p filename does not end in ".pbi" (case-insensitive),
+///         if the file cannot be opened, or if a section fails validation
+void PbiIndexIO::Load(PbiRawData& rawData, const std::string& filename)
+{
+    // open file for reading
+    if (!boost::algorithm::iends_with(filename, ".pbi"))
+        throw std::runtime_error{"unsupported file extension on " + filename};
+    std::unique_ptr<BGZF, HtslibBgzfDeleter> bgzf(bgzf_open(filename.c_str(), "rb"));
+    auto* fp = bgzf.get();
+    if (fp == nullptr)
+        throw std::runtime_error{"could not open PBI file: " + filename + " for reading"};
+
+    // load data
+    LoadHeader(rawData, fp);
+    const auto numReads = rawData.NumReads();
+    if (numReads > 0) {
+        LoadBasicData(rawData.BasicData(), numReads, fp);
+        if (rawData.HasMappedData()) LoadMappedData(rawData.MappedData(), numReads, fp);
+        if (rawData.HasReferenceData()) LoadReferenceData(rawData.ReferenceData(), fp);
+        if (rawData.HasBarcodeData()) LoadBarcodeData(rawData.BarcodeData(), numReads, fp);
+    }
+}
+
+/// Builds one aggregate index from the PBI files of every BAM file in \p dataset.
+///
+/// Columns are appended file by file. A file without barcode or mapped data contributes
+/// placeholder values for those columns, so all columns stay the same length.
+/// Each appended row is tagged with the zero-based position of its source file.
+void PbiIndexIO::LoadFromDataSet(PbiRawData& aggregateData, const DataSet& dataset)
+{
+    // start from an empty aggregate that advertises basic, mapped & barcode sections
+    aggregateData.NumReads(0);
+    aggregateData.FileSections(PbiFile::BASIC | PbiFile::MAPPED | PbiFile::BARCODE);
+    aggregateData.Version(PbiFile::CurrentVersion);
+
+    const auto bamFiles = dataset.BamFiles();
+    uint16_t fileNumber = 0;
+    for (const auto& bamFile : bamFiles) {
+        PbiRawData pbi{bamFile.PacBioIndexFilename()};
+        const auto count = pbi.NumReads();
+
+        // running total of reads
+        aggregateData.NumReads(aggregateData.NumReads() + count);
+
+        // ---- BasicData (always present) ----
+        auto& dstBasic = aggregateData.BasicData();
+        auto& srcBasic = pbi.BasicData();
+        MoveAppend(std::move(srcBasic.rgId_), dstBasic.rgId_);
+        MoveAppend(std::move(srcBasic.qStart_), dstBasic.qStart_);
+        MoveAppend(std::move(srcBasic.qEnd_), dstBasic.qEnd_);
+        MoveAppend(std::move(srcBasic.holeNumber_), dstBasic.holeNumber_);
+        MoveAppend(std::move(srcBasic.readQual_), dstBasic.readQual_);
+        MoveAppend(std::move(srcBasic.ctxtFlag_), dstBasic.ctxtFlag_);
+        MoveAppend(std::move(srcBasic.fileOffset_), dstBasic.fileOffset_);
+        MoveAppend(std::vector<uint16_t>(count, fileNumber), dstBasic.fileNumber_);
+
+        // ---- BarcodeData (real values, or -1 placeholders) ----
+        auto& dstBarcode = aggregateData.BarcodeData();
+        if (!pbi.HasBarcodeData()) {
+            MoveAppend(std::vector<int16_t>(count, -1), dstBarcode.bcForward_);
+            MoveAppend(std::vector<int16_t>(count, -1), dstBarcode.bcReverse_);
+            MoveAppend(std::vector<int8_t>(count, -1), dstBarcode.bcQual_);
+        } else {
+            auto& srcBarcode = pbi.BarcodeData();
+            MoveAppend(std::move(srcBarcode.bcForward_), dstBarcode.bcForward_);
+            MoveAppend(std::move(srcBarcode.bcReverse_), dstBarcode.bcReverse_);
+            MoveAppend(std::move(srcBarcode.bcQual_), dstBarcode.bcQual_);
+        }
+
+        // ---- MappedData (real values, or "unmapped" placeholders) ----
+        auto& dstMapped = aggregateData.MappedData();
+        if (!pbi.HasMappedData()) {
+            MoveAppend(std::vector<int32_t>(count, -1), dstMapped.tId_);
+            MoveAppend(std::vector<uint32_t>(count, UnmappedPosition), dstMapped.tStart_);
+            MoveAppend(std::vector<uint32_t>(count, UnmappedPosition), dstMapped.tEnd_);
+            MoveAppend(std::vector<uint32_t>(count, UnmappedPosition), dstMapped.aStart_);
+            MoveAppend(std::vector<uint32_t>(count, UnmappedPosition), dstMapped.aEnd_);
+            MoveAppend(std::vector<uint8_t>(count, 0), dstMapped.revStrand_);
+            MoveAppend(std::vector<uint32_t>(count, 0), dstMapped.nM_);
+            MoveAppend(std::vector<uint32_t>(count, 0), dstMapped.nMM_);
+            MoveAppend(std::vector<uint8_t>(count, 255), dstMapped.mapQV_);
+        } else {
+            auto& srcMapped = pbi.MappedData();
+            MoveAppend(std::move(srcMapped.tId_), dstMapped.tId_);
+            MoveAppend(std::move(srcMapped.tStart_), dstMapped.tStart_);
+            MoveAppend(std::move(srcMapped.tEnd_), dstMapped.tEnd_);
+            MoveAppend(std::move(srcMapped.aStart_), dstMapped.aStart_);
+            MoveAppend(std::move(srcMapped.aEnd_), dstMapped.aEnd_);
+            MoveAppend(std::move(srcMapped.revStrand_), dstMapped.revStrand_);
+            MoveAppend(std::move(srcMapped.nM_), dstMapped.nM_);
+            MoveAppend(std::move(srcMapped.nMM_), dstMapped.nMM_);
+            MoveAppend(std::move(srcMapped.mapQV_), dstMapped.mapQV_);
+        }
+
+        ++fileNumber;
+    }
+}
+
+/// Reads the barcode section from \p fp into \p barcodeData.
+///
+/// Columns are read in the order forward barcode, reverse barcode, barcode quality,
+/// \p numReads entries each, and the resulting container sizes are then checked.
+void PbiIndexIO::LoadBarcodeData(PbiRawBarcodeData& barcodeData, const uint32_t numReads, BGZF* fp)
+{
+    // read from file
+    LoadBgzfVector(fp, barcodeData.bcForward_, numReads);
+    LoadBgzfVector(fp, barcodeData.bcReverse_, numReads);
+    LoadBgzfVector(fp, barcodeData.bcQual_, numReads);
+
+    // validate
+    CheckExpectedSize(barcodeData, numReads);
+}
+
+/// Reads the PBI header from \p fp and stores version, section flags and read count in
+/// \p index.
+///
+/// Layout read here: 4-byte magic "PBI\1", 4-byte version, 2-byte section flags,
+/// 4-byte read count, then 18 reserved bytes that are skipped.
+///
+/// \throws std::runtime_error if the magic string is missing or wrong, or if any of the
+///         version / flags / read-count fields cannot be read in full
+void PbiIndexIO::LoadHeader(PbiRawData& index, BGZF* fp)
+{
+    // 'magic' string
+    char magic[4];
+    auto bytesRead = bgzf_read(fp, magic, 4);
+    if (bytesRead != 4 || strncmp(magic, "PBI\1", 4))
+        throw std::runtime_error{"expected PBI file, found unknown format instead"};
+
+    // version, pbi_flags, & n_reads
+    // (zero-initialized and length-checked, so a truncated header can never leak
+    //  indeterminate values into the index)
+    uint32_t version = 0;
+    uint16_t sections = 0;
+    uint32_t numReads = 0;
+    const auto readField = [fp](void* dest, const size_t numBytes) {
+        const auto n = bgzf_read(fp, dest, numBytes);
+        if (n < 0 || static_cast<size_t>(n) != numBytes)
+            throw std::runtime_error{"PBI index error: truncated or unreadable header"};
+    };
+    readField(&version, sizeof(version));
+    readField(&sections, sizeof(sections));
+    readField(&numReads, sizeof(numReads));
+    if (fp->is_be) {
+        version = ed_swap_4(version);
+        sections = ed_swap_2(sections);
+        numReads = ed_swap_4(numReads);
+    }
+
+    index.Version(PbiFile::VersionEnum(version));
+    index.FileSections(sections);
+    index.NumReads(numReads);
+
+    // skip reserved section (contents are ignored)
+    char reserved[18];
+    bytesRead = bgzf_read(fp, reserved, sizeof(reserved));
+    UNUSED(bytesRead);
+}
+
+/// Reads the mapped-data section from \p fp into \p mappedData.
+///
+/// Columns are read in the order tId, tStart, tEnd, aStart, aEnd, revStrand, nM, nMM, mapQV,
+/// \p numReads entries each, and the resulting container sizes are then checked.
+void PbiIndexIO::LoadMappedData(PbiRawMappedData& mappedData, const uint32_t numReads, BGZF* fp)
+{
+    // read from file
+    LoadBgzfVector(fp, mappedData.tId_, numReads);
+    LoadBgzfVector(fp, mappedData.tStart_, numReads);
+    LoadBgzfVector(fp, mappedData.tEnd_, numReads);
+    LoadBgzfVector(fp, mappedData.aStart_, numReads);
+    LoadBgzfVector(fp, mappedData.aEnd_, numReads);
+    LoadBgzfVector(fp, mappedData.revStrand_, numReads);
+    LoadBgzfVector(fp, mappedData.nM_, numReads);
+    LoadBgzfVector(fp, mappedData.nMM_, numReads);
+    LoadBgzfVector(fp, mappedData.mapQV_, numReads);
+
+    // validate
+    CheckExpectedSize(mappedData, numReads);
+}
+
+/// Reads the reference section from \p fp into \p referenceData.
+///
+/// The section is a 4-byte entry count followed by that many (tId, beginRow, endRow)
+/// triples of 4 bytes each. Existing entries are discarded first.
+///
+/// \throws std::runtime_error if the entry count cannot be read
+void PbiIndexIO::LoadReferenceData(PbiRawReferenceData& referenceData, BGZF* fp)
+{
+    // each field is read as 4 raw bytes, so the in-memory types must be 4 bytes wide
+    static_assert(sizeof(PbiReferenceEntry::ID) == 4, "PbiReferenceEntry::ID must be 4 bytes");
+    static_assert(sizeof(PbiReferenceEntry::Row) == 4, "PbiReferenceEntry::Row must be 4 bytes");
+
+    // num refs (never used to size the table unless it was actually read)
+    uint32_t numRefs = 0;
+    auto ret = bgzf_read(fp, &numRefs, 4);
+    if (ret != 4) throw std::runtime_error{"PBI index error: could not read reference count"};
+    if (fp->is_be) numRefs = ed_swap_4(numRefs);
+
+    // reference entries
+    referenceData.entries_.clear();
+    referenceData.entries_.resize(numRefs);
+    for (auto& entry : referenceData.entries_) {
+        ret = bgzf_read(fp, &entry.tId_, 4);
+        ret = bgzf_read(fp, &entry.beginRow_, 4);
+        ret = bgzf_read(fp, &entry.endRow_, 4);
+        if (fp->is_be) {
+            entry.tId_ = ed_swap_4(entry.tId_);
+            entry.beginRow_ = ed_swap_4(entry.beginRow_);
+            entry.endRow_ = ed_swap_4(entry.endRow_);
+        }
+    }
+    UNUSED(ret);
+}
+
+/// Reads the basic-data section from \p fp into \p basicData.
+///
+/// Columns are read in the order rgId, qStart, qEnd, holeNumber, readQual, ctxtFlag,
+/// fileOffset, \p numReads entries each, and the resulting container sizes are then checked.
+void PbiIndexIO::LoadBasicData(PbiRawBasicData& basicData, const uint32_t numReads, BGZF* fp)
+{
+    // read from file
+    LoadBgzfVector(fp, basicData.rgId_, numReads);
+    LoadBgzfVector(fp, basicData.qStart_, numReads);
+    LoadBgzfVector(fp, basicData.qEnd_, numReads);
+    LoadBgzfVector(fp, basicData.holeNumber_, numReads);
+    LoadBgzfVector(fp, basicData.readQual_, numReads);
+    LoadBgzfVector(fp, basicData.ctxtFlag_, numReads);
+    LoadBgzfVector(fp, basicData.fileOffset_, numReads);
+
+    // validate
+    CheckExpectedSize(basicData, numReads);
+}
+
+/// Writes \p index to the BGZF-compressed file \p filename.
+///
+/// The header is always written. The data sections are written only when the index holds at
+/// least one read, and the optional sections only when the index reports having them.
+///
+/// \throws std::runtime_error if the file cannot be opened for writing, or if a section's
+///         column sizes do not match the read count
+void PbiIndexIO::Save(const PbiRawData& index, const std::string& filename)
+{
+    std::unique_ptr<BGZF, HtslibBgzfDeleter> bgzf(bgzf_open(filename.c_str(), "wb"));
+    auto* fp = bgzf.get();
+    if (fp == nullptr)
+        throw std::runtime_error{"could not open PBI file: " + filename + " for writing"};
+
+    WriteHeader(index, fp);
+    const auto numReads = index.NumReads();
+    if (numReads > 0) {
+        WriteBasicData(index.BasicData(), numReads, fp);
+
+        if (index.HasMappedData()) WriteMappedData(index.MappedData(), numReads, fp);
+        if (index.HasReferenceData()) WriteReferenceData(index.ReferenceData(), fp);
+        if (index.HasBarcodeData()) WriteBarcodeData(index.BarcodeData(), numReads, fp);
+    }
+}
+
+/// Writes the barcode section to \p fp.
+///
+/// Column sizes are checked against \p numReads first; the columns are then written in the
+/// order forward barcode, reverse barcode, barcode quality.
+void PbiIndexIO::WriteBarcodeData(const PbiRawBarcodeData& barcodeData, const uint32_t numReads,
+                                  BGZF* fp)
+{
+    // validate
+    CheckExpectedSize(barcodeData, numReads);
+
+    // write to file
+    WriteBgzfVector(fp, barcodeData.bcForward_);
+    WriteBgzfVector(fp, barcodeData.bcReverse_);
+    WriteBgzfVector(fp, barcodeData.bcQual_);
+}
+
+/// Writes the PBI header for \p index to \p fp.
+///
+/// Layout written here: 4-byte magic "PBI\1", 4-byte version, 2-byte section flags,
+/// 4-byte read count, then 18 zeroed reserved bytes. Write results are not checked.
+void PbiIndexIO::WriteHeader(const PbiRawData& index, BGZF* fp)
+{
+    // 'magic' string
+    constexpr static const std::array<char, 4> magic{{'P', 'B', 'I', '\1'}};
+    auto ret = bgzf_write(fp, magic.data(), 4);
+
+    // version, pbi_flags, & n_reads
+    auto version = static_cast<uint32_t>(index.Version());
+    uint16_t pbi_flags = index.FileSections();
+    // must be a full 32-bit value: 4 bytes are swapped and written below
+    auto numReads = static_cast<uint32_t>(index.NumReads());
+    if (fp->is_be) {
+        version = ed_swap_4(version);
+        pbi_flags = ed_swap_2(pbi_flags);
+        numReads = ed_swap_4(numReads);
+    }
+    ret = bgzf_write(fp, &version, 4);
+    ret = bgzf_write(fp, &pbi_flags, 2);
+    ret = bgzf_write(fp, &numReads, 4);
+
+    // reserved space
+    char reserved[18];
+    memset(reserved, 0, 18);
+    ret = bgzf_write(fp, reserved, 18);
+    UNUSED(ret);
+}
+
+/// Writes the mapped-data section to \p fp.
+///
+/// Column sizes are checked against \p numReads first; the columns are then written in the
+/// order tId, tStart, tEnd, aStart, aEnd, revStrand, nM, nMM, mapQV.
+void PbiIndexIO::WriteMappedData(const PbiRawMappedData& mappedData, const uint32_t numReads,
+                                 BGZF* fp)
+{
+    // validate
+    CheckExpectedSize(mappedData, numReads);
+
+    // write to file
+    WriteBgzfVector(fp, mappedData.tId_);
+    WriteBgzfVector(fp, mappedData.tStart_);
+    WriteBgzfVector(fp, mappedData.tEnd_);
+    WriteBgzfVector(fp, mappedData.aStart_);
+    WriteBgzfVector(fp, mappedData.aEnd_);
+    WriteBgzfVector(fp, mappedData.revStrand_);
+    WriteBgzfVector(fp, mappedData.nM_);
+    WriteBgzfVector(fp, mappedData.nMM_);
+    WriteBgzfVector(fp, mappedData.mapQV_);
+}
+
+/// Writes the reference section to \p fp: a 4-byte entry count followed by one
+/// (tId, beginRow, endRow) triple of 4 bytes each per entry. Write results are not checked.
+void PbiIndexIO::WriteReferenceData(const PbiRawReferenceData& referenceData, BGZF* fp)
+{
+    // num_refs
+    // held in exactly 32 bits, so the 4 bytes written are the value itself on any host
+    auto numRefs = static_cast<uint32_t>(referenceData.entries_.size());
+    if (fp->is_be) numRefs = ed_swap_4(numRefs);
+    auto ret = bgzf_write(fp, &numRefs, 4);
+
+    // reference entries
+    for (const auto& entry : referenceData.entries_) {
+        auto tId = entry.tId_;
+        auto beginRow = entry.beginRow_;
+        auto endRow = entry.endRow_;
+        if (fp->is_be) {
+            tId = ed_swap_4(tId);
+            beginRow = ed_swap_4(beginRow);
+            endRow = ed_swap_4(endRow);
+        }
+        ret = bgzf_write(fp, &tId, 4);
+        ret = bgzf_write(fp, &beginRow, 4);
+        ret = bgzf_write(fp, &endRow, 4);
+    }
+    UNUSED(ret);
+}
+
+/// Writes the basic-data section to \p fp.
+///
+/// Column sizes are checked against \p numReads first; the columns are then written in the
+/// order rgId, qStart, qEnd, holeNumber, readQual, ctxtFlag, fileOffset.
+void PbiIndexIO::WriteBasicData(const PbiRawBasicData& basicData, const uint32_t numReads, BGZF* fp)
+{
+    // validate
+    CheckExpectedSize(basicData, numReads);
+
+    // write to file
+    WriteBgzfVector(fp, basicData.rgId_);
+    WriteBgzfVector(fp, basicData.qStart_);
+    WriteBgzfVector(fp, basicData.qEnd_);
+    WriteBgzfVector(fp, basicData.holeNumber_);
+    WriteBgzfVector(fp, basicData.readQual_);
+    WriteBgzfVector(fp, basicData.ctxtFlag_);
+    WriteBgzfVector(fp, basicData.fileOffset_);
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/PbiIndexIO.h b/src/PbiIndexIO.h

new file mode 100644 (file)

index 0000000..b7eb4e8
--- /dev/null
+++ b/src/PbiIndexIO.h
@@ -0,0 +1,117 @@
+// Author: Derek Barnett
+
+#ifndef PBIINDEXIO_H
+#define PBIINDEXIO_H
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <htslib/bgzf.h>
+#include <htslib/sam.h>
+
+#include "pbbam/BamFile.h"
+#include "pbbam/DataSet.h"
+#include "pbbam/PbiFile.h"
+#include "pbbam/PbiRawData.h"
+#include "pbbam/Unused.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// Collection of static helpers that read and write PBI index data through htslib BGZF
+/// handles. The class only groups functions; it declares no data members.
+class PbiIndexIO
+{
+public:
+    // top-level entry points
+
+    /// Loads the PBI file \p filename and returns its contents.
+    static PbiRawData Load(const std::string& filename);
+    /// Loads the PBI file \p filename into \p rawData.
+    static void Load(PbiRawData& rawData, const std::string& filename);
+    /// Fills \p aggregateData with the concatenated indices of all BAM files in \p dataset.
+    static void LoadFromDataSet(PbiRawData& aggregateData, const DataSet& dataset);
+    /// Writes \p rawData to the file \p filename.
+    static void Save(const PbiRawData& rawData, const std::string& filename);
+
+public:
+    // per-component load
+    static void LoadBarcodeData(PbiRawBarcodeData& barcodeData, const uint32_t numReads, BGZF* fp);
+    static void LoadHeader(PbiRawData& index, BGZF* fp);
+    static void LoadMappedData(PbiRawMappedData& mappedData, const uint32_t numReads, BGZF* fp);
+    static void LoadReferenceData(PbiRawReferenceData& referenceData, BGZF* fp);
+    static void LoadBasicData(PbiRawBasicData& basicData, const uint32_t numReads, BGZF* fp);
+
+    // per-data-field load
+
+    /// Resizes \p data to \p numReads elements and fills it from \p fp.
+    template <typename T>
+    static void LoadBgzfVector(BGZF* fp, std::vector<T>& data, const uint32_t numReads);
+
+public:
+    // per-component write
+    static void WriteBarcodeData(const PbiRawBarcodeData& barcodeData, const uint32_t numReads,
+                                 BGZF* fp);
+    static void WriteHeader(const PbiRawData& index, BGZF* fp);
+    static void WriteMappedData(const PbiRawMappedData& mappedData, const uint32_t numReads,
+                                BGZF* fp);
+    static void WriteReferenceData(const PbiRawReferenceData& referenceData, BGZF* fp);
+    static void WriteBasicData(const PbiRawBasicData& subreadData, const uint32_t numReads,
+                               BGZF* fp);
+
+    // per-data-field write
+
+    /// Writes every element of \p data to \p fp.
+    template <typename T>
+    static void WriteBgzfVector(BGZF* fp, const std::vector<T>& data);
+
+private:
+    // helper functions
+
+    /// Byte-swaps every element of \p data in place (element sizes 1, 2, 4 and 8 only).
+    template <typename T>
+    static void SwapEndianness(std::vector<T>& data);
+};
+
+/// Resizes \p data to \p numReads elements and fills it with raw bytes read from \p fp,
+/// byte-swapping the elements when the handle is flagged big-endian.
+///
+/// The number of bytes actually read is not checked.
+template <typename T>
+inline void PbiIndexIO::LoadBgzfVector(BGZF* fp, std::vector<T>& data, const uint32_t numReads)
+{
+    assert(fp);
+    data.resize(numReads);
+    // data() is well-defined for an empty vector (numReads == 0), unlike &data[0]
+    auto ret = bgzf_read(fp, data.data(), numReads * sizeof(T));
+    if (fp->is_be) SwapEndianness(data);
+    UNUSED(ret);
+}
+
+/// Byte-swaps every element of \p data in place.
+///
+/// Single-byte elements are left untouched.
+/// \throws std::runtime_error if sizeof(T) is not 1, 2, 4 or 8
+template <typename T>
+inline void PbiIndexIO::SwapEndianness(std::vector<T>& data)
+{
+    switch (sizeof(T)) {
+        case 1:
+            // nothing to swap
+            break;
+        case 2:
+            for (auto& element : data)
+                ed_swap_2p(&element);
+            break;
+        case 4:
+            for (auto& element : data)
+                ed_swap_4p(&element);
+            break;
+        case 8:
+            for (auto& element : data)
+                ed_swap_8p(&element);
+            break;
+        default:
+            throw std::runtime_error{"unsupported element size"};
+    }
+}
+
+/// Writes every element of \p data to \p fp as raw bytes.
+///
+/// When the handle is flagged big-endian, a byte-swapped copy is written instead, so the
+/// caller's data is never modified. The write result is not checked.
+template <typename T>
+inline void PbiIndexIO::WriteBgzfVector(BGZF* fp, const std::vector<T>& data)
+{
+    assert(fp);
+    const auto numBytes = data.size() * sizeof(T);
+    if (fp->is_be) {
+        // only this path needs a private copy to swap
+        std::vector<T> swapped = data;
+        SwapEndianness(swapped);
+        auto ret = bgzf_write(fp, swapped.data(), numBytes);
+        UNUSED(ret);
+    } else {
+        auto ret = bgzf_write(fp, data.data(), numBytes);
+        UNUSED(ret);
+    }
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PBIINDEXIO_H
diff --git a/src/PbiIndexedBamReader.cpp b/src/PbiIndexedBamReader.cpp

new file mode 100644 (file)

index 0000000..3b0628e
--- /dev/null
+++ b/src/PbiIndexedBamReader.cpp
@@ -0,0 +1,170 @@
+// File Description
+/// \file PbiIndexedBamReader.cpp
+/// \brief Implements the PbiIndexedBamReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/PbiIndexedBamReader.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+
+#include <htslib/bgzf.h>
+
+#include "pbbam/MakeUnique.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// Private state of PbiIndexedBamReader: the loaded index, the active filter, and the queue
+/// of contiguous record blocks that still have to be read.
+struct PbiIndexedBamReaderPrivate
+{
+public:
+    /// Loads the index from \p pbiFilename; no filter is applied yet.
+    PbiIndexedBamReaderPrivate(const std::string& pbiFilename)
+        : index_{pbiFilename}, currentBlockReadCount_{0}, numMatchingReads_{0}
+    {
+    }
+
+    /// Sets each block's virtual offset to the stored file offset of its first record.
+    void ApplyOffsets()
+    {
+        const auto& fileOffsets = index_.BasicData().fileOffset_;
+        for (IndexResultBlock& block : blocks_)
+            block.virtualOffset_ = fileOffsets.at(block.firstIndex_);
+    }
+
+    /// Installs \p filter and recomputes the blocks of matching records.
+    ///
+    /// The parameter is taken by non-const value so that it is actually moved into place
+    /// (moving from a const object silently copies).
+    void Filter(PbiFilter filter)
+    {
+        // store request & reset counters
+        filter_ = std::move(filter);
+        currentBlockReadCount_ = 0;
+        blocks_.clear();
+        numMatchingReads_ = 0;
+
+        // find blocks of reads passing filter criteria
+        const auto totalReads = index_.NumReads();
+        if (totalReads == 0) {  // empty PBI - no reads to use
+            return;
+        } else if (filter_.IsEmpty()) {  // empty filter - use all reads
+            numMatchingReads_ = totalReads;
+            blocks_.emplace_back(0, totalReads);
+        } else {
+            IndexList indices;
+            indices.reserve(totalReads);
+            for (size_t i = 0; i < totalReads; ++i) {
+                if (filter_.Accepts(index_, i)) {
+                    indices.push_back(i);
+                    ++numMatchingReads_;
+                }
+            }
+            blocks_ = MergedIndexBlocks(std::move(indices));
+        }
+
+        // apply offsets
+        ApplyOffsets();
+    }
+
+    /// Collapses \p indices (sorted and de-duplicated here) into blocks of consecutive rows.
+    IndexResultBlocks MergedIndexBlocks(IndexList indices) const
+    {
+        if (indices.empty()) return {};
+
+        std::sort(indices.begin(), indices.end());
+        auto newEndIter = std::unique(indices.begin(), indices.end());
+        // unsigned count: avoids comparing an int counter against a ptrdiff_t
+        const auto numIndices = static_cast<size_t>(std::distance(indices.begin(), newEndIter));
+        auto result = IndexResultBlocks{IndexResultBlock{indices.at(0), 1}};
+        for (size_t i = 1; i < numIndices; ++i) {
+            if (indices.at(i) == indices.at(i - 1) + 1)
+                ++result.back().numReads_;
+            else
+                result.emplace_back(indices.at(i), 1);
+        }
+        return result;
+    }
+
+    /// Reads the next matching record into \p b.
+    ///
+    /// \returns -1 when no blocks remain, otherwise the result of bam_read1()
+    /// \throws std::runtime_error if seeking to the start of a block fails
+    int ReadRawData(BGZF* bgzf, bam1_t* b)
+    {
+        // no data to fetch, return false
+        if (blocks_.empty()) return -1;  // "EOF"
+
+        // if on new block, seek to its first record
+        if (currentBlockReadCount_ == 0) {
+            const auto seekResult = bgzf_seek(bgzf, blocks_.at(0).virtualOffset_, SEEK_SET);
+            if (seekResult == -1) throw std::runtime_error{"could not seek in BAM file"};
+        }
+
+        // read next record
+        const auto result = bam_read1(bgzf, b);
+
+        // update counters. if block finished, pop & reset
+        ++currentBlockReadCount_;
+        if (currentBlockReadCount_ == blocks_.at(0).numReads_) {
+            blocks_.pop_front();
+            currentBlockReadCount_ = 0;
+        }
+
+        return result;
+    }
+
+public:
+    PbiFilter filter_;              // filter currently in effect
+    PbiRawData index_;              // index data loaded at construction
+    IndexResultBlocks blocks_;      // blocks still to be read; front is the current one
+    size_t currentBlockReadCount_;  // records already read from the current block
+    uint32_t numMatchingReads_;     // total records matching the current filter
+};
+
+}  // namespace internal
+
+/// Opens the BAM file \p filename and applies \p filter (delegates to the BamFile overload).
+PbiIndexedBamReader::PbiIndexedBamReader(PbiFilter filter, const std::string& filename)
+    : PbiIndexedBamReader{std::move(filter), BamFile{filename}}
+{
+}
+
+/// Opens \p bamFile (loading its index via the unfiltered constructor), then applies
+/// \p filter.
+PbiIndexedBamReader::PbiIndexedBamReader(PbiFilter filter, BamFile bamFile)
+    : PbiIndexedBamReader{std::move(bamFile)}
+{
+    Filter(std::move(filter));
+}
+
+/// Opens the BAM file \p bamFilename without applying any filter.
+PbiIndexedBamReader::PbiIndexedBamReader(const std::string& bamFilename)
+    : PbiIndexedBamReader{BamFile{bamFilename}}
+{
+}
+
+/// Opens \p bamFile and loads the index named by its PacBioIndexFilename().
+/// No filter is applied here.
+PbiIndexedBamReader::PbiIndexedBamReader(BamFile bamFile)
+    : BamReader{std::move(bamFile)}
+    , d_{std::make_unique<internal::PbiIndexedBamReaderPrivate>(File().PacBioIndexFilename())}
+{
+}
+
+// Out-of-line destructor; there is nothing to release beyond the members themselves.
+PbiIndexedBamReader::~PbiIndexedBamReader() = default;
+
+/// Reads the next record that passes the current filter into \p b.
+/// Forwards to the private implementation and returns its result unchanged.
+int PbiIndexedBamReader::ReadRawData(BGZF* bgzf, bam1_t* b)
+{
+    assert(d_);
+    return d_->ReadRawData(bgzf, b);
+}
+
+/// \returns the filter currently in effect
+const PbiFilter& PbiIndexedBamReader::Filter() const
+{
+    assert(d_);
+    return d_->filter_;
+}
+
+/// Replaces the current filter with \p filter and recomputes the matching records.
+/// \returns a reference to this reader
+PbiIndexedBamReader& PbiIndexedBamReader::Filter(PbiFilter filter)
+{
+    assert(d_);
+    d_->Filter(std::move(filter));
+    return *this;
+}
+
+/// \returns the number of records that match the current filter
+uint32_t PbiIndexedBamReader::NumReads() const
+{
+    assert(d_);
+    return d_->numMatchingReads_;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/PbiRawData.cpp b/src/PbiRawData.cpp

new file mode 100644 (file)

index 0000000..412ce18
--- /dev/null
+++ b/src/PbiRawData.cpp
@@ -0,0 +1,233 @@
+// File Description
+/// \file PbiRawData.cpp
+/// \brief Implements the classes used for working with raw PBI data.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/PbiRawData.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <tuple>
+
+#include <boost/numeric/conversion/cast.hpp>
+
+#include "PbiIndexIO.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/BamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// Returns the upper-case name of \p type.
+///
+/// \throws std::runtime_error if \p type is not one of the known record types
+static std::string ToString(const RecordType type)
+{
+    // clang-format off
+    static const auto lookup = std::map<RecordType, std::string>
+    {
+        { RecordType::ZMW,        "ZMW" },
+        { RecordType::HQREGION,   "HQREGION" },
+        { RecordType::SUBREAD,    "SUBREAD" },
+        { RecordType::CCS,        "CCS" },
+        { RecordType::SCRAP,      "SCRAP" },
+        { RecordType::TRANSCRIPT, "TRANSCRIPT" },
+        { RecordType::UNKNOWN,    "UNKNOWN" }
+    };
+    // clang-format on
+
+    // explicit lookup: a missing key is reported directly, and unrelated exceptions
+    // are no longer caught and relabelled
+    const auto found = lookup.find(type);
+    if (found == lookup.cend())
+        throw std::runtime_error{"error: unknown RecordType encountered"};
+    return found->second;
+}
+
+}  // namespace internal
+
+// ----------------------------------
+// PbiRawBarcodeData implementation
+// ----------------------------------
+
+/// Creates empty barcode columns with capacity reserved for \p numReads entries each.
+PbiRawBarcodeData::PbiRawBarcodeData(uint32_t numReads)
+{
+    bcForward_.reserve(numReads);
+    bcReverse_.reserve(numReads);
+    bcQual_.reserve(numReads);
+}
+
+/// Appends one row of barcode data for record \p b.
+///
+/// Real values are stored only when the record has both barcodes and a barcode quality and
+/// all three values are non-negative; otherwise -1 is stored in all three columns.
+void PbiRawBarcodeData::AddRecord(const BamRecord& b)
+{
+    // placeholders, overwritten below only by a complete and valid barcode triple
+    int16_t storedForward = -1;
+    int16_t storedReverse = -1;
+    int8_t storedQuality = -1;
+
+    if (b.HasBarcodes() && b.HasBarcodeQuality()) {
+        int16_t forward;
+        int16_t reverse;
+        std::tie(forward, reverse) = b.Barcodes();
+        const auto quality = boost::numeric_cast<int8_t>(b.BarcodeQuality());
+
+        const bool allValid = (forward >= 0) && (reverse >= 0) && (quality >= 0);
+        if (allValid) {
+            storedForward = forward;
+            storedReverse = reverse;
+            storedQuality = quality;
+        }
+    }
+
+    bcForward_.push_back(storedForward);
+    bcReverse_.push_back(storedReverse);
+    bcQual_.push_back(storedQuality);
+}
+
+// ----------------------------------
+// PbiRawMappedData implementation
+// ----------------------------------
+
+/// Creates empty mapped-data columns with capacity reserved for \p numReads entries each.
+PbiRawMappedData::PbiRawMappedData(uint32_t numReads)
+{
+    tId_.reserve(numReads);
+    tStart_.reserve(numReads);
+    tEnd_.reserve(numReads);
+    aStart_.reserve(numReads);
+    aEnd_.reserve(numReads);
+    revStrand_.reserve(numReads);
+    nM_.reserve(numReads);
+    nMM_.reserve(numReads);
+    mapQV_.reserve(numReads);
+}
+
+/// Appends one row of mapping data taken from record \p b.
+void PbiRawMappedData::AddRecord(const BamRecord& b)
+{
+    tId_.push_back(b.ReferenceId());
+    tStart_.push_back(b.ReferenceStart());
+    tEnd_.push_back(b.ReferenceEnd());
+    aStart_.push_back(b.AlignedStart());
+    aEnd_.push_back(b.AlignedEnd());
+    // strand is stored as a flag: 1 for reverse, 0 otherwise
+    revStrand_.push_back((b.AlignedStrand() == Strand::REVERSE ? 1 : 0));
+    mapQV_.push_back(b.MapQuality());
+
+    // one call yields both counts: first = matches, second = mismatches
+    const auto matchesAndMismatches = b.NumMatchesAndMismatches();
+    nM_.push_back(matchesAndMismatches.first);
+    nMM_.push_back(matchesAndMismatches.second);
+}
+
+/// \returns the number of deleted bases for the record at \p recordIndex
+uint32_t PbiRawMappedData::NumDeletedBasesAt(size_t recordIndex) const
+{
+    const auto deletedAndInserted = NumDeletedAndInsertedBasesAt(recordIndex);
+    return deletedAndInserted.first;
+}
+
+/// Derives the deletion and insertion counts for the record at \p recordIndex.
+///
+/// insertions = (aEnd - aStart) - nM - nMM, deletions = (tEnd - tStart) - nM - nMM.
+///
+/// \returns {deletions, insertions}
+/// \throws std::out_of_range (from the bounds-checked lookups) if \p recordIndex is invalid
+std::pair<uint32_t, uint32_t> PbiRawMappedData::NumDeletedAndInsertedBasesAt(
+    size_t recordIndex) const
+{
+    const auto alignedStart = aStart_.at(recordIndex);
+    const auto alignedEnd = aEnd_.at(recordIndex);
+    const auto refStart = tStart_.at(recordIndex);
+    const auto refEnd = tEnd_.at(recordIndex);
+    const auto numMatches = nM_.at(recordIndex);
+    const auto numMismatches = nMM_.at(recordIndex);
+
+    const auto numInsertions = (alignedEnd - alignedStart - numMatches - numMismatches);
+    const auto numDeletions = (refEnd - refStart - numMatches - numMismatches);
+    return std::pair<uint32_t, uint32_t>{numDeletions, numInsertions};
+}
+
+/// \returns the number of inserted bases for the record at \p recordIndex
+uint32_t PbiRawMappedData::NumInsertedBasesAt(size_t recordIndex) const
+{
+    const auto deletedAndInserted = NumDeletedAndInsertedBasesAt(recordIndex);
+    return deletedAndInserted.second;
+}
+
+// ------------------------------------
+// PbiReferenceEntry implementation
+// ------------------------------------
+
+// Sentinel values: -1 converted to the respective type. The constructors below use them as
+// the initial value of an entry's ID and rows.
+const PbiReferenceEntry::ID PbiReferenceEntry::UNMAPPED_ID = static_cast<PbiReferenceEntry::ID>(-1);
+const PbiReferenceEntry::Row PbiReferenceEntry::UNSET_ROW = static_cast<PbiReferenceEntry::Row>(-1);
+
+/// Creates an entry with the unmapped ID and both rows unset.
+PbiReferenceEntry::PbiReferenceEntry() : tId_{UNMAPPED_ID}, beginRow_{UNSET_ROW}, endRow_{UNSET_ROW}
+{
+}
+
+/// Creates an entry for reference \p id with both rows unset.
+PbiReferenceEntry::PbiReferenceEntry(ID id) : tId_{id}, beginRow_{UNSET_ROW}, endRow_{UNSET_ROW} {}
+
+/// Creates an entry for reference \p id with the given begin and end rows.
+PbiReferenceEntry::PbiReferenceEntry(ID id, Row beginRow, Row endRow)
+    : tId_(id), beginRow_(beginRow), endRow_(endRow)
+{
+}
+
+// ------------------------------------
+// PbiRawReferenceData implementation
+// ------------------------------------
+
+/// Creates an empty entry table with capacity reserved for \p numRefs entries.
+PbiRawReferenceData::PbiRawReferenceData(uint32_t numRefs) { entries_.reserve(numRefs); }
+
+// ----------------------------------
+// PbiRawBasicData implementation
+// ----------------------------------
+
+/// Creates empty basic-data columns with capacity reserved for \p numReads entries each.
+PbiRawBasicData::PbiRawBasicData(uint32_t numReads)
+{
+    rgId_.reserve(numReads);
+    qStart_.reserve(numReads);
+    qEnd_.reserve(numReads);
+    holeNumber_.reserve(numReads);
+    readQual_.reserve(numReads);
+    ctxtFlag_.reserve(numReads);
+    fileOffset_.reserve(numReads);
+    fileNumber_.reserve(numReads);
+}
+
+/// Appends one row of basic data for record \p b.
+///
+/// \param b       record to index
+/// \param offset  virtual file offset at which the record starts
+void PbiRawBasicData::AddRecord(const BamRecord& b, int64_t offset)
+{
+    // read group ID: use the record's own, or derive one from movie name & record type
+    auto readGroup = b.ReadGroupId();
+    if (readGroup.empty())
+        readGroup = MakeReadGroupId(b.MovieName(), internal::ToString(b.Type()));
+    // the ID string is parsed as hexadecimal and stored as a signed 32-bit value
+    const auto parsed = std::stoul(readGroup, nullptr, 16);
+    rgId_.push_back(static_cast<int32_t>(parsed));
+
+    // query start/end: -1 for CCS & transcript records
+    if (!IsCcsOrTranscript(b.Type())) {
+        qStart_.push_back(b.QueryStart());
+        qEnd_.push_back(b.QueryEnd());
+    } else {
+        qStart_.push_back(-1);
+        qEnd_.push_back(-1);
+    }
+
+    // optional tags, each with a fallback when the record lacks it
+    const auto holeNumber = b.HasHoleNumber() ? b.HoleNumber() : 0;
+    holeNumber_.push_back(holeNumber);
+
+    const auto readQuality = b.HasReadAccuracy() ? static_cast<float>(b.ReadAccuracy()) : 0.0f;
+    readQual_.push_back(readQuality);
+
+    const auto contextFlags = b.HasLocalContextFlags() ? b.LocalContextFlags()
+                                                       : LocalContextFlags::NO_LOCAL_CONTEXT;
+    ctxtFlag_.push_back(contextFlags);
+
+    // where the record starts in the file, and the default file number
+    fileOffset_.push_back(offset);
+    fileNumber_.push_back(0);
+}
+
+// ----------------------------------
+// PbiRawData implementation
+// ----------------------------------
+
+/// Loads the index stored in \p pbiFilename; the filename is kept in filename_.
+/// Exceptions thrown by the loader propagate to the caller.
+PbiRawData::PbiRawData(std::string pbiFilename) : filename_{std::move(pbiFilename)}
+{
+    internal::PbiIndexIO::Load(*this, filename_);
+}
+
+/// Builds an aggregate index over \p dataset via PbiIndexIO::LoadFromDataSet.
+/// The section flags start out as basic, mapped and barcode.
+PbiRawData::PbiRawData(const DataSet& dataset)
+    : sections_{PbiFile::BASIC | PbiFile::MAPPED | PbiFile::BARCODE}
+{
+    internal::PbiIndexIO::LoadFromDataSet(*this, dataset);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ProgramInfo.cpp b/src/ProgramInfo.cpp

new file mode 100644 (file)

index 0000000..7ff7579
--- /dev/null
+++ b/src/ProgramInfo.cpp
@@ -0,0 +1,83 @@
+// File Description
+/// \file ProgramInfo.cpp
+/// \brief Implements the ProgramInfo class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ProgramInfo.h"
+
+#include <sstream>
+
+#include "SequenceUtils.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Two-letter field tags of a SAM @PG header line.
+// They are read-only lookup keys, so they are declared const to rule out
+// accidental modification at runtime.
+static const std::string ProgramInfoTokenID{"ID"};  // program record identifier
+static const std::string ProgramInfoTokenCL{"CL"};  // command line
+static const std::string ProgramInfoTokenDS{"DS"};  // description
+static const std::string ProgramInfoTokenPN{"PN"};  // program name
+static const std::string ProgramInfoTokenPP{"PP"};  // previous program ID
+static const std::string ProgramInfoTokenVN{"VN"};  // program version
+
+}  // namespace internal
+
+/// \brief Creates a program record with the given ID; all other fields stay at their defaults.
+ProgramInfo::ProgramInfo(std::string id) : id_{std::move(id)} {}
+
+/// \brief Parses a SAM "@PG" header line into a ProgramInfo.
+///
+/// The first four characters (expected to be "@PG\t") are dropped and the rest
+/// is split on tabs into "TG:value" fields. Known tags fill the matching
+/// property; every other tag is kept as a custom tag.
+///
+/// \param[in] sam  full header line, including the "@PG\t" prefix
+/// \returns the parsed record, or a default-constructed ProgramInfo if the line
+///          contains no fields
+///
+/// \note Lines too short to hold the prefix plus a field, and fields too short
+///       to hold "TG:", are ignored rather than letting std::string::substr()
+///       throw std::out_of_range.
+///
+ProgramInfo ProgramInfo::FromSam(const std::string& sam)
+{
+    // nothing beyond the 4-character '@PG\t' prefix -> nothing to parse
+    if (sam.size() <= 4) return {};
+
+    // pop off '@PG\t', then split rest of line into tokens
+    const auto tokens = internal::Split(sam.substr(4), '\t');
+    if (tokens.empty()) return {};
+
+    ProgramInfo prog;
+    std::map<std::string, std::string> custom;
+
+    // iterate over tokens
+    for (const auto& token : tokens) {
+
+        // need at least the 2-character tag and its ':' separator
+        if (token.size() < 3) continue;
+
+        const auto tokenTag = token.substr(0, 2);
+        auto tokenValue = token.substr(3);
+
+        // set program contents
+        // clang-format off
+        if      (tokenTag == internal::ProgramInfoTokenID) prog.Id(std::move(tokenValue));
+        else if (tokenTag == internal::ProgramInfoTokenCL) prog.CommandLine(std::move(tokenValue));
+        else if (tokenTag == internal::ProgramInfoTokenDS) prog.Description(std::move(tokenValue));
+        else if (tokenTag == internal::ProgramInfoTokenPN) prog.Name(std::move(tokenValue));
+        else if (tokenTag == internal::ProgramInfoTokenPP) prog.PreviousProgramId(std::move(tokenValue));
+        else if (tokenTag == internal::ProgramInfoTokenVN) prog.Version(std::move(tokenValue));
+        // clang-format on
+
+        // otherwise, "custom" tag
+        else
+            custom[tokenTag] = std::move(tokenValue);
+    }
+
+    // the local map is not used again, so hand it over instead of copying
+    prog.CustomTags(std::move(custom));
+    return prog;
+}
+
+/// \brief Formats this program record as a SAM "@PG" header line.
+///
+/// The ID field is always written. Name, version, description, previous
+/// program ID and command line follow in that order when non-empty, then all
+/// custom tags.
+///
+/// \returns the header line (without a trailing newline)
+///
+std::string ProgramInfo::ToSam() const
+{
+    std::ostringstream sam;
+    sam << "@PG" << internal::MakeSamTag(internal::ProgramInfoTokenID, id_);
+
+    // writes a field only if it holds a value
+    const auto writeIfSet = [&sam](const std::string& tag, const std::string& value) {
+        if (!value.empty()) sam << internal::MakeSamTag(tag, value);
+    };
+    writeIfSet(internal::ProgramInfoTokenPN, name_);
+    writeIfSet(internal::ProgramInfoTokenVN, version_);
+    writeIfSet(internal::ProgramInfoTokenDS, description_);
+    writeIfSet(internal::ProgramInfoTokenPP, previousProgramId_);
+    writeIfSet(internal::ProgramInfoTokenCL, commandLine_);
+
+    // custom tags are written unconditionally
+    for (const auto& entry : custom_) {
+        sam << internal::MakeSamTag(entry.first, entry.second);
+    }
+    return sam.str();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/Pulse2BaseCache.h b/src/Pulse2BaseCache.h

new file mode 100644 (file)

index 0000000..ea63815
--- /dev/null
+++ b/src/Pulse2BaseCache.h
@@ -0,0 +1,111 @@
+// Author: Derek Barnett
+
+#ifndef PULSE2BASECACHE_H
+#define PULSE2BASECACHE_H
+
+#include <cassert>
+#include <cctype>
+#include <cstddef>
+#include <string>
+
+#include <boost/dynamic_bitset.hpp>
+
+#include "pbbam/Config.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// \brief Bit mask over the pulses of a record: 1 = basecalled, 0 = squashed.
+///
+/// Built once from the pulse-call string and then used to strip squashed
+/// pulses from any other per-pulse data of the same length.
+///
+class Pulse2BaseCache
+{
+public:
+    /// \brief Creates a Pulse2BaseCache from pulseCall data ('pc' tag)
+    ///
+    /// Computes & stores cache of basecalled vs. squashed pulse positions for
+    /// later masking of pulse data.
+    ///
+    /// \param[in] pulseCalls   string contents of 'pc' tag
+    ///
+    /// \note Left non-explicit on purpose, so that existing code relying on the
+    ///       implicit conversion from std::string keeps compiling.
+    ///
+    Pulse2BaseCache(const std::string& pulseCalls) : data_(pulseCalls.size())
+    {
+        // basecalled pulse -> data[i] == 1   (uppercase character)
+        // squashed pulse   -> data[i] == 0   (any other character)
+        //
+        // std::isupper() is undefined for negative arguments other than EOF,
+        // so each char goes through unsigned char before classification.
+        //
+        const auto numPulses = pulseCalls.size();
+        for (size_t i = 0; i < numPulses; ++i)
+            data_[i] = (std::isupper(static_cast<unsigned char>(pulseCalls[i])) != 0);
+    }
+
+    Pulse2BaseCache() = delete;
+    Pulse2BaseCache(const Pulse2BaseCache&) = default;
+    Pulse2BaseCache(Pulse2BaseCache&&) = default;
+    Pulse2BaseCache& operator=(const Pulse2BaseCache&) = default;
+    Pulse2BaseCache& operator=(Pulse2BaseCache&&) = default;
+    ~Pulse2BaseCache() = default;
+
+public:
+    /// \returns position of the first basecalled pulse, as reported by
+    ///          boost::dynamic_bitset<>::find_first() (npos if there is none)
+    ///
+    size_t FindFirst() const { return data_.find_first(); }
+
+    /// \param[in] from  pulse position to search after
+    /// \returns position of the next basecalled pulse after \p from, as
+    ///          reported by boost::dynamic_bitset<>::find_next() (npos if
+    ///          there is none)
+    ///
+    size_t FindNext(size_t from) const { return data_.find_next(from); }
+
+    /// \param[in] pos  pulse position; must be less than NumPulses()
+    /// \returns true if the pulse at \p pos was basecalled
+    ///
+    bool IsBasecallAt(const size_t pos) const { return data_[pos]; }
+
+    /// \returns the total number of pulses (basecalled & squashed)
+    ///
+    size_t NumPulses() const { return data_.size(); }
+
+    /// \returns the total number of basecalled pulses
+    ///
+    size_t NumBases() const { return data_.count(); }
+
+    /// \brief Removes squashed pulse positions from input data.
+    ///
+    /// \param[in] pulseData  contents of any per-pulse tag; must hold exactly
+    ///                       NumPulses() elements (checked by assert only)
+    /// \returns    Input \p pulseData less all squashed pulses
+    ///
+    template <typename T>
+    T RemoveSquashedPulses(const T& pulseData) const
+    {
+        const auto numPulses = pulseData.size();
+        assert(numPulses == data_.size());
+
+        // The reserve() below overshoots the required space, but numPulses is cheap
+        // to compute, and by definition will be sufficient to hold the result. Thus
+        // we only ever need to do one allocation.
+        //
+        T result;
+        result.reserve(numPulses);
+
+        // Only include data at positions that match our cached pulse data.
+        //
+        for (size_t i = 0; i < numPulses; ++i) {
+            if (data_[i]) result.push_back(pulseData.at(i));
+        }
+        return result;
+    }
+
+private:
+    boost::dynamic_bitset<> data_;  // one bit per pulse; set = basecalled
+};
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PULSE2BASECACHE_H
diff --git a/src/QNameQuery.cpp b/src/QNameQuery.cpp

new file mode 100644 (file)

index 0000000..e78a260
--- /dev/null
+++ b/src/QNameQuery.cpp
@@ -0,0 +1,75 @@
+// File Description
+/// \file QNameQuery.cpp
+/// \brief Implements the QNameQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/QNameQuery.h"
+
+#include <cassert>
+
+#include <boost/optional.hpp>
+
+#include "pbbam/CompositeBamReader.h"
+#include "pbbam/MakeUnique.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// Private implementation of QNameQuery: reads records sequentially and hands
+/// them out in runs of consecutive records sharing the same full name.
+struct QNameQuery::QNameQueryPrivate
+{
+    QNameQueryPrivate(const DataSet& dataset)
+        : reader_{std::make_unique<SequentialCompositeBamReader>(dataset)}, nextRecord_(boost::none)
+    {
+    }
+
+    /// \brief Fetches the next run of consecutive records with the same name.
+    ///
+    /// \param[out] records  cleared, then filled with the run
+    /// \returns true if at least one record was stored, false once the reader
+    ///          is exhausted
+    ///
+    bool GetNext(std::vector<BamRecord>& records)
+    {
+        records.clear();
+        std::string runName;
+
+        // A previous call may have read one record too far; it opens this run.
+        if (nextRecord_) {
+            BamRecord carried = *nextRecord_;
+            runName = carried.FullName();
+            records.push_back(std::move(carried));
+            nextRecord_ = boost::none;
+        }
+
+        BamRecord current;
+        while (reader_->GetNext(current)) {
+
+            // First record of a different name: keep it for the next call.
+            if (!records.empty() && !(current.FullName() == runName)) {
+                nextRecord_ = current;
+                return true;
+            }
+
+            // Either the very first record of the run, or another member of it.
+            if (records.empty()) runName = current.FullName();
+            records.push_back(current);
+        }
+
+        // Reader exhausted: report whatever was collected.
+        return !records.empty();
+    }
+
+    std::unique_ptr<SequentialCompositeBamReader> reader_;
+    boost::optional<BamRecord> nextRecord_;  // look-ahead record belonging to the next run
+};
+
+/// \brief Creates a query that groups the dataset's records by name.
+///
+/// \param[in] dataset  input dataset; passed on to the private implementation
+///
+QNameQuery::QNameQuery(const DataSet& dataset)
+    : internal::IGroupQuery(), d_{std::make_unique<QNameQueryPrivate>(dataset)}
+{
+}
+
+// Defined here, where QNameQueryPrivate is a complete type.
+QNameQuery::~QNameQuery() = default;
+
+/// \brief Fetches the next group of records; forwards to the private implementation.
+bool QNameQuery::GetNext(std::vector<BamRecord>& records)
+{
+    return d_->GetNext(records);
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/QualityValue.cpp b/src/QualityValue.cpp

new file mode 100644 (file)

index 0000000..dfece05
--- /dev/null
+++ b/src/QualityValue.cpp
@@ -0,0 +1,19 @@
+// File Description
+/// \file QualityValue.cpp
+/// \brief Implements the QualityValue class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/QualityValue.h"
+
+#include <cstdint>
+
+namespace PacBio {
+namespace BAM {
+
+// Out-of-class definition of the largest quality value.
+// NOTE(review): 93 is presumably the highest Phred score that still maps to a
+// printable ASCII character with the +33 offset ('~') - confirm against the header.
+const uint8_t QualityValue::MAX = 93;
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ReadAccuracyQuery.cpp b/src/ReadAccuracyQuery.cpp

new file mode 100644 (file)

index 0000000..63e99a8
--- /dev/null
+++ b/src/ReadAccuracyQuery.cpp
@@ -0,0 +1,43 @@
+// File Description
+/// \file ReadAccuracyQuery.cpp
+/// \brief Implements the ReadAccuracyQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ReadAccuracyQuery.h"
+
+#include "pbbam/CompositeBamReader.h"
+#include "pbbam/MakeUnique.h"
+#include "pbbam/PbiFilterTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// Private implementation of ReadAccuracyQuery: a filtering reader over the
+/// dataset, set up with a PbiReadAccuracyFilter.
+struct ReadAccuracyQuery::ReadAccuracyQueryPrivate
+{
+    /// \param[in] accuracy     accuracy value handed to the filter
+    /// \param[in] compareType  comparison operator handed to the filter
+    /// \param[in] dataset      dataset to read from
+    ReadAccuracyQueryPrivate(const Accuracy accuracy, const Compare::Type compareType,
+                             const DataSet& dataset)
+        : reader_{PbiReadAccuracyFilter{accuracy, compareType}, dataset}
+    {
+    }
+
+    PbiFilterCompositeBamReader<Compare::None> reader_;  // unsorted
+};
+
+/// \brief Creates a query over the records selected by a read-accuracy filter.
+///
+/// \param[in] accuracy     accuracy value to compare against
+/// \param[in] compareType  comparison operator to apply
+/// \param[in] dataset      input dataset
+///
+ReadAccuracyQuery::ReadAccuracyQuery(const Accuracy accuracy, const Compare::Type compareType,
+                                     const DataSet& dataset)
+    : internal::IQuery()
+    , d_{std::make_unique<ReadAccuracyQueryPrivate>(accuracy, compareType, dataset)}
+{
+}
+
+// Defined here, where ReadAccuracyQueryPrivate is a complete type.
+ReadAccuracyQuery::~ReadAccuracyQuery() = default;
+
+/// \brief Fetches the next record from the filtering reader.
+bool ReadAccuracyQuery::GetNext(BamRecord& r)
+{
+    return d_->reader_.GetNext(r);
+}
+
+/// \returns the read count reported by the filtering reader
+uint32_t ReadAccuracyQuery::NumReads() const
+{
+    return d_->reader_.NumReads();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ReadGroupInfo.cpp b/src/ReadGroupInfo.cpp

new file mode 100644 (file)

index 0000000..ade324a
--- /dev/null
+++ b/src/ReadGroupInfo.cpp
@@ -0,0 +1,542 @@
+// File Description
+/// \file ReadGroupInfo.cpp
+/// \brief Implements the ReadGroupInfo class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ReadGroupInfo.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <iomanip>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+
+#include "ChemistryTable.h"
+#include "SequenceUtils.h"
+#include "pbbam/MD5.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// String constants used when reading and writing @RG header lines.
+//
+// The name <-> enum lookup maps further down in this file are built from
+// these strings during static initialization, so the constants must stay
+// defined before those maps.
+
+// two-letter SAM @RG field tags
+static const std::string sam_ID{"ID"};
+static const std::string sam_CN{"CN"};
+static const std::string sam_DS{"DS"};
+static const std::string sam_DT{"DT"};
+static const std::string sam_FO{"FO"};
+static const std::string sam_KS{"KS"};
+static const std::string sam_LB{"LB"};
+static const std::string sam_PG{"PG"};
+static const std::string sam_PI{"PI"};
+static const std::string sam_PL{"PL"};
+static const std::string sam_PM{"PM"};
+static const std::string sam_PU{"PU"};
+static const std::string sam_SM{"SM"};
+
+// base feature names, as they appear as keys in the description (DS) field
+// NOTE(review): feature_LT ("Label") is not referenced by BaseFeatureName() or
+// nameToFeature in this file - confirm whether a mapping is missing.
+static const std::string feature_DQ{"DeletionQV"};
+static const std::string feature_DT{"DeletionTag"};
+static const std::string feature_IQ{"InsertionQV"};
+static const std::string feature_MQ{"MergeQV"};
+static const std::string feature_SQ{"SubstitutionQV"};
+static const std::string feature_ST{"SubstitutionTag"};
+static const std::string feature_IP{"Ipd"};
+static const std::string feature_PW{"PulseWidth"};
+static const std::string feature_PM{"PkMid"};
+static const std::string feature_PA{"PkMean"};
+static const std::string feature_PI{"PkMid2"};
+static const std::string feature_PS{"PkMean2"};
+static const std::string feature_LT{"Label"};
+static const std::string feature_PQ{"LabelQV"};
+static const std::string feature_PT{"AltLabel"};
+static const std::string feature_PV{"AltLabelQV"};
+static const std::string feature_PG{"PulseMergeQV"};
+static const std::string feature_PC{"PulseCall"};
+static const std::string feature_PD{"PrePulseFrames"};
+static const std::string feature_PX{"PulseCallWidth"};
+static const std::string feature_SF{"StartFrame"};
+static const std::string feature_PE{"PulseExclusion"};
+
+// upper-case keys of the description field
+static const std::string token_RT{"READTYPE"};
+static const std::string token_BK{"BINDINGKIT"};
+static const std::string token_SK{"SEQUENCINGKIT"};
+static const std::string token_BV{"BASECALLERVERSION"};
+static const std::string token_FR{"FRAMERATEHZ"};
+static const std::string token_CT{"CONTROL"};
+
+// barcode keys of the description field
+static const std::string token_BF{"BarcodeFile"};
+static const std::string token_BH{"BarcodeHash"};
+static const std::string token_BC{"BarcodeCount"};
+static const std::string token_BM{"BarcodeMode"};
+static const std::string token_BQ{"BarcodeQuality"};
+
+// frame codec names (suffix of "Ipd:<codec>" / "PulseWidth:<codec>" keys)
+static const std::string codec_RAW{"Frames"};
+static const std::string codec_V1{"CodecV1"};
+
+// values of the BarcodeMode key
+static const std::string barcodemode_NONE{"None"};
+static const std::string barcodemode_SYM{"Symmetric"};
+static const std::string barcodemode_ASYM{"Asymmetric"};
+static const std::string barcodemode_TAIL{"Tailed"};
+
+// values of the BarcodeQuality key
+static const std::string barcodequal_NONE{"None"};
+static const std::string barcodequal_SCORE{"Score"};
+static const std::string barcodequal_PROB{"Probability"};
+
+// values of the platform model (PM) field
+static const std::string platformModelType_ASTRO{"ASTRO"};
+static const std::string platformModelType_RS{"RS"};
+static const std::string platformModelType_SEQUEL{"SEQUEL"};
+
+// clang-format off
+// Returns the description-field name of a base feature.
+// Throws std::runtime_error for any value without a case below.
+// NOTE(review): there is no case returning feature_LT ("Label") - confirm
+// whether the enum has a matching value that should be handled here.
+static std::string BaseFeatureName(const BaseFeature& feature)
+{
+    switch (feature) {
+        case BaseFeature::DELETION_QV      : return feature_DQ;
+        case BaseFeature::DELETION_TAG     : return feature_DT;
+        case BaseFeature::INSERTION_QV     : return feature_IQ;
+        case BaseFeature::MERGE_QV         : return feature_MQ;
+        case BaseFeature::SUBSTITUTION_QV  : return feature_SQ;
+        case BaseFeature::SUBSTITUTION_TAG : return feature_ST;
+        case BaseFeature::IPD              : return feature_IP;
+        case BaseFeature::PULSE_WIDTH      : return feature_PW;
+        case BaseFeature::PKMID            : return feature_PM;
+        case BaseFeature::PKMEAN           : return feature_PA;
+        case BaseFeature::PKMID2           : return feature_PI;
+        case BaseFeature::PKMEAN2          : return feature_PS;
+        case BaseFeature::LABEL_QV         : return feature_PQ;
+        case BaseFeature::ALT_LABEL        : return feature_PT;
+        case BaseFeature::ALT_LABEL_QV     : return feature_PV;
+        case BaseFeature::PULSE_MERGE_QV   : return feature_PG;
+        case BaseFeature::PULSE_CALL       : return feature_PC;
+        case BaseFeature::PRE_PULSE_FRAMES : return feature_PD;
+        case BaseFeature::PULSE_CALL_WIDTH : return feature_PX;
+        case BaseFeature::START_FRAME      : return feature_SF;
+        case BaseFeature::PULSE_EXCLUSION  : return feature_PE;
+        default:
+            throw std::runtime_error{ "unrecognized base feature" };
+    }
+}
+
+// Returns the description-field name of a frame codec.
+// Throws std::runtime_error for a value that has no name.
+static std::string FrameCodecName(const FrameCodec& codec)
+{
+    if (codec == FrameCodec::RAW) return codec_RAW;
+    if (codec == FrameCodec::V1)  return codec_V1;
+    throw std::runtime_error{ "unrecognized frame codec" };
+}
+
+// Returns the description-field value for a barcode mode.
+// Throws std::runtime_error for a value that has no name.
+static std::string BarcodeModeName(const BarcodeModeType& mode)
+{
+    if (mode == BarcodeModeType::NONE)       return barcodemode_NONE;
+    if (mode == BarcodeModeType::SYMMETRIC)  return barcodemode_SYM;
+    if (mode == BarcodeModeType::ASYMMETRIC) return barcodemode_ASYM;
+    if (mode == BarcodeModeType::TAILED)     return barcodemode_TAIL;
+    throw std::runtime_error{ "unrecognized barcode mode type" };
+}
+
+// Returns the description-field value for a barcode quality type.
+// Throws std::runtime_error for a value that has no name.
+static std::string BarcodeQualityName(const BarcodeQualityType& type)
+{
+    if (type == BarcodeQualityType::NONE)        return barcodequal_NONE;
+    if (type == BarcodeQualityType::SCORE)       return barcodequal_SCORE;
+    if (type == BarcodeQualityType::PROBABILITY) return barcodequal_PROB;
+    throw std::runtime_error{ "unrecognized barcode quality type" };
+}
+
+// Returns the PM field value for a platform model.
+// Throws std::runtime_error for a value that has no name.
+static std::string PlatformModelName(const PlatformModelType& type)
+{
+    if (type == PlatformModelType::ASTRO)  return platformModelType_ASTRO;
+    if (type == PlatformModelType::RS)     return platformModelType_RS;
+    if (type == PlatformModelType::SEQUEL) return platformModelType_SEQUEL;
+    throw std::runtime_error{ "unrecognized platform model type" };
+}
+
+// Reverse lookup: description-field name -> base feature.
+// Holds the same pairs as the switch in BaseFeatureName().
+static const std::map<std::string, BaseFeature> nameToFeature
+{
+    { feature_DQ, BaseFeature::DELETION_QV },
+    { feature_DT, BaseFeature::DELETION_TAG },
+    { feature_IQ, BaseFeature::INSERTION_QV },
+    { feature_MQ, BaseFeature::MERGE_QV },
+    { feature_SQ, BaseFeature::SUBSTITUTION_QV },
+    { feature_ST, BaseFeature::SUBSTITUTION_TAG },
+    { feature_IP, BaseFeature::IPD },
+    { feature_PW, BaseFeature::PULSE_WIDTH },
+    { feature_PM, BaseFeature::PKMID },
+    { feature_PA, BaseFeature::PKMEAN },
+    { feature_PI, BaseFeature::PKMID2 },
+    { feature_PS, BaseFeature::PKMEAN2 },
+    { feature_PQ, BaseFeature::LABEL_QV },
+    { feature_PT, BaseFeature::ALT_LABEL },
+    { feature_PV, BaseFeature::ALT_LABEL_QV },
+    { feature_PC, BaseFeature::PULSE_CALL },
+    { feature_PG, BaseFeature::PULSE_MERGE_QV },
+    { feature_PD, BaseFeature::PRE_PULSE_FRAMES },
+    { feature_PX, BaseFeature::PULSE_CALL_WIDTH },
+    { feature_SF, BaseFeature::START_FRAME },
+    { feature_PE, BaseFeature::PULSE_EXCLUSION }
+};
+
+// Reverse lookup: codec name -> frame codec (inverse of FrameCodecName()).
+static const std::map<std::string, FrameCodec> nameToCodec
+{
+    { codec_RAW, FrameCodec::RAW },
+    { codec_V1,  FrameCodec::V1 }
+};
+
+// Reverse lookup: BarcodeMode value -> barcode mode (inverse of BarcodeModeName()).
+static const std::map<std::string, BarcodeModeType> nameToBarcodeMode
+{
+    { barcodemode_NONE, BarcodeModeType::NONE },
+    { barcodemode_SYM,  BarcodeModeType::SYMMETRIC },
+    { barcodemode_ASYM, BarcodeModeType::ASYMMETRIC },
+    { barcodemode_TAIL, BarcodeModeType::TAILED }
+};
+
+// Reverse lookup: BarcodeQuality value -> quality type (inverse of BarcodeQualityName()).
+static const std::map<std::string, BarcodeQualityType> nameToBarcodeQuality
+{
+    { barcodequal_NONE,  BarcodeQualityType::NONE },
+    { barcodequal_SCORE, BarcodeQualityType::SCORE },
+    { barcodequal_PROB,  BarcodeQualityType::PROBABILITY }
+};
+
+// Reverse lookup: PM field value -> platform model (inverse of PlatformModelName()).
+static const std::map<std::string, PlatformModelType> nameToPlatformModel
+{
+    { platformModelType_ASTRO,  PlatformModelType::ASTRO },
+    { platformModelType_RS,     PlatformModelType::RS },
+    { platformModelType_SEQUEL, PlatformModelType::SEQUEL }
+};
+// clang-format on
+
+// Returns true if `name` starts with "Barcode".
+//
+// rfind() with a start position of 0 can only match at the very beginning of
+// the string, so a key without the prefix is rejected after one comparison
+// instead of being scanned to its end (which find() == 0 would do).
+static inline bool IsLikelyBarcodeKey(const std::string& name)
+{
+    return name.rfind("Barcode", 0) == 0;
+}
+
+// Returns true if `name` is a key of nameToFeature.
+static inline bool IsBaseFeature(const std::string& name)
+{
+    return nameToFeature.count(name) != 0;
+}
+
+// Looks up the base feature for a description-field name.
+// Throws std::out_of_range (from std::map::at) if the name is not in nameToFeature.
+static inline BaseFeature BaseFeatureFromName(const std::string& name)
+{
+    return nameToFeature.at(name);
+}
+
+// Looks up the frame codec for a codec name.
+// Throws std::out_of_range (from std::map::at) if the name is not in nameToCodec.
+static inline FrameCodec FrameCodecFromName(const std::string& name)
+{
+    return nameToCodec.at(name);
+}
+
+// Looks up the barcode mode for a BarcodeMode value.
+// Throws std::out_of_range (from std::map::at) if the name is not in nameToBarcodeMode.
+static inline BarcodeModeType BarcodeModeFromName(const std::string& name)
+{
+    return nameToBarcodeMode.at(name);
+}
+
+// Looks up the barcode quality type for a BarcodeQuality value.
+// Throws std::out_of_range (from std::map::at) if the name is not in nameToBarcodeQuality.
+static inline BarcodeQualityType BarcodeQualityFromName(const std::string& name)
+{
+    return nameToBarcodeQuality.at(name);
+}
+
+// Looks up the platform model for a PM field value.
+// Throws std::out_of_range (from std::map::at) if the name is not in nameToPlatformModel.
+//
+// Takes the name by const reference, like the other *FromName helpers: the
+// string is only read, so a by-value parameter was a needless copy. Existing
+// callers passing an lvalue or an rvalue both still bind.
+static inline PlatformModelType PlatformModelFromName(const std::string& name)
+{
+    return nameToPlatformModel.at(name);
+}
+
+}  // namespace internal
+
+/// \brief Creates an empty read group whose read type is "UNKNOWN".
+ReadGroupInfo::ReadGroupInfo() : readType_{"UNKNOWN"} {}
+
+/// \brief Creates a read group with the given ID and a read type of "UNKNOWN".
+ReadGroupInfo::ReadGroupInfo(std::string id) : id_{std::move(id)}, readType_{"UNKNOWN"} {}
+
+/// \brief Creates a read group from movie name and read type.
+///
+/// The ID is generated with MakeReadGroupId() from both arguments.
+///
+/// \param[in] movieName  movie name
+/// \param[in] readType   read type string
+///
+// NOTE(review): id_ reads movieName/readType before they are moved from below;
+// this relies on id_ being declared ahead of movieName_ and readType_ in the
+// class - confirm against the header before reordering members.
+ReadGroupInfo::ReadGroupInfo(std::string movieName, std::string readType)
+    : id_{MakeReadGroupId(movieName, readType)}
+    , movieName_{std::move(movieName)}
+    , readType_{std::move(readType)}
+{
+}
+
+/// \brief Creates a read group from movie name, read type and platform model.
+///
+/// The ID is generated with MakeReadGroupId() from movie name and read type.
+///
+/// \param[in] movieName  movie name
+/// \param[in] readType   read type string
+/// \param[in] platform   platform model to store
+///
+// NOTE(review): id_ reads movieName/readType before they are moved from below;
+// this relies on id_ being declared ahead of movieName_ and readType_ in the
+// class - confirm against the header before reordering members.
+ReadGroupInfo::ReadGroupInfo(std::string movieName, std::string readType,
+                             PlatformModelType platform)
+    : id_{MakeReadGroupId(movieName, readType)}
+    , movieName_{std::move(movieName)}
+    , platformModel_{std::move(platform)}
+    , readType_{std::move(readType)}
+{
+}
+
+/// \brief Fills this read group from the contents of a description (DS) field.
+///
+/// The description is a ';'-separated list of "key=value" items. Items without
+/// '=' are skipped. Each key is tried, in this order, as: one of the fixed
+/// upper-case keys, a base feature name, a "Barcode..." key, and finally a
+/// "<feature>:<codec>" key for Ipd or PulseWidth. Unrecognized keys are ignored.
+///
+/// \param[in] description  contents of the DS field
+///
+/// \throws std::invalid_argument / std::out_of_range from std::stoul() if the
+///         BarcodeCount value is not a valid number
+/// \throws std::out_of_range if a barcode mode, barcode quality or frame codec
+///         name is unknown (from the *FromName lookups)
+///
+void ReadGroupInfo::DecodeSamDescription(const std::string& description)
+{
+    // split on semicolons
+    // for each, split on equal
+    //    determine which property the key names, then store the value there
+
+    const auto tokens = internal::Split(description, ';');
+    if (tokens.empty()) return;
+
+    // barcode data only counts as present if all five barcode keys were seen
+    bool hasBarcodeFile = false;
+    bool hasBarcodeHash = false;
+    bool hasBarcodeCount = false;
+    bool hasBarcodeMode = false;
+    bool hasBarcodeQuality = false;
+
+    // iterate over tokens
+    for (const auto& token : tokens) {
+
+        // everything before the first '=' is the key, everything after it the value
+        const auto foundEqual = token.find('=');
+        if (foundEqual == std::string::npos) continue;
+
+        const auto key = token.substr(0, foundEqual);
+        auto value = token.substr(foundEqual + 1);
+
+        // 'mandatory' items
+        // clang-format off
+        if      (key == internal::token_RT) readType_ = std::move(value);
+        else if (key == internal::token_BK) bindingKit_ = std::move(value);
+        else if (key == internal::token_BV) basecallerVersion_ = std::move(value);
+        else if (key == internal::token_SK) sequencingKit_ = std::move(value);
+        else if (key == internal::token_FR) frameRateHz_ = std::move(value);
+        else if (key == internal::token_CT) control_ = (value == "TRUE");
+        // clang-format on
+
+        // base features
+        else if (internal::IsBaseFeature(key))
+            features_[internal::BaseFeatureFromName(key)] = std::move(value);
+
+        // barcode data
+        else if (internal::IsLikelyBarcodeKey(key)) {
+            if (key == internal::token_BF) {
+                barcodeFile_ = std::move(value);
+                hasBarcodeFile = true;
+            } else if (key == internal::token_BH) {
+                barcodeHash_ = std::move(value);
+                hasBarcodeHash = true;
+            } else if (key == internal::token_BC) {
+                barcodeCount_ = std::stoul(value);
+                hasBarcodeCount = true;
+            } else if (key == internal::token_BM) {
+                barcodeMode_ = internal::BarcodeModeFromName(value);
+                hasBarcodeMode = true;
+            } else if (key == internal::token_BQ) {
+                barcodeQuality_ = internal::BarcodeQualityFromName(value);
+                hasBarcodeQuality = true;
+            }
+        }
+
+        // frame codecs: key has the form "<feature>:<codec>"
+        else {
+            const auto keyParts = internal::Split(key, ':');
+            if (keyParts.size() == 2) {
+                const auto& subkey = keyParts.at(0);
+                if (subkey == internal::feature_IP) {
+                    ipdCodec_ = internal::FrameCodecFromName(keyParts.at(1));
+                    features_[BaseFeature::IPD] = std::move(value);
+                } else if (subkey == internal::feature_PW) {
+                    pulseWidthCodec_ = internal::FrameCodecFromName(keyParts.at(1));
+                    features_[BaseFeature::PULSE_WIDTH] = std::move(value);
+                }
+            }
+        }
+    }
+
+    hasBarcodeData_ = (hasBarcodeFile && hasBarcodeHash && hasBarcodeCount && hasBarcodeMode &&
+                       hasBarcodeQuality);
+}
+
+/// \brief Builds the contents of the description (DS) field for this read group.
+///
+/// The result is a ';'-separated list of "key=value" items: the read type
+/// first (always written), then every base feature with a non-empty tag, the
+/// non-empty kit/version/frame-rate values, "CONTROL=TRUE" for control read
+/// groups, and the five barcode items when barcode data is present.
+/// Ipd and PulseWidth feature names get a ":<codec>" suffix.
+///
+/// \returns the encoded description
+/// \throws std::runtime_error if a stored feature, codec, barcode mode or
+///         barcode quality value has no name
+///
+std::string ReadGroupInfo::EncodeSamDescription() const
+{
+    static constexpr char SEP = ';';
+    static constexpr char COLON = ':';
+    static constexpr char EQ = '=';
+
+    std::string result{internal::token_RT + EQ + readType_};
+
+    // appends ";key=value" in place, without building temporary strings
+    const auto appendItem = [&result](const std::string& key, const std::string& value) {
+        result.push_back(SEP);
+        result.append(key);
+        result.push_back(EQ);
+        result.append(value);
+    };
+
+    for (const auto& feature : features_) {
+
+        std::string featureName = internal::BaseFeatureName(feature.first);
+        if (featureName.empty() || feature.second.empty()) continue;
+
+        // frame features also record which codec their data uses
+        if (featureName == internal::feature_IP) {
+            featureName.push_back(COLON);
+            featureName.append(internal::FrameCodecName(ipdCodec_));
+        } else if (featureName == internal::feature_PW) {
+            featureName.push_back(COLON);
+            featureName.append(internal::FrameCodecName(pulseWidthCodec_));
+        }
+        appendItem(featureName, feature.second);
+    }
+
+    if (!bindingKit_.empty()) appendItem(internal::token_BK, bindingKit_);
+    if (!sequencingKit_.empty()) appendItem(internal::token_SK, sequencingKit_);
+    if (!basecallerVersion_.empty()) appendItem(internal::token_BV, basecallerVersion_);
+    if (!frameRateHz_.empty()) appendItem(internal::token_FR, frameRateHz_);
+
+    // CONTROL is only written for control read groups, so its value is always
+    // "TRUE" (the former "FALSE" alternative could never be reached)
+    if (control_) appendItem(internal::token_CT, "TRUE");
+
+    if (hasBarcodeData_) {
+        appendItem(internal::token_BF, barcodeFile_);
+        appendItem(internal::token_BH, barcodeHash_);
+        appendItem(internal::token_BC, std::to_string(barcodeCount_));
+        appendItem(internal::token_BM, internal::BarcodeModeName(barcodeMode_));
+        appendItem(internal::token_BQ, internal::BarcodeQualityName(barcodeQuality_));
+    }
+
+    return result;
+}
+
+/// \brief Parses a SAM "@RG" header line into a ReadGroupInfo.
+///
+/// The first four characters (expected to be "@RG\t") are dropped and the rest
+/// is split on tabs into "TG:value" fields. Known tags fill the matching
+/// property, the DS field is decoded with DecodeSamDescription(), the PL field
+/// is dropped, and every other tag is kept as a custom tag.
+///
+/// \param[in] sam  full header line, including the "@RG\t" prefix
+/// \returns the parsed read group, or a default-constructed ReadGroupInfo if
+///          the line contains no fields
+/// \throws std::out_of_range if the PM value is not a known platform model
+///
+/// \note Lines too short to hold the prefix plus a field, and fields too short
+///       to hold "TG:", are ignored rather than letting std::string::substr()
+///       throw std::out_of_range.
+///
+ReadGroupInfo ReadGroupInfo::FromSam(const std::string& sam)
+{
+    // nothing beyond the 4-character '@RG\t' prefix -> nothing to parse
+    if (sam.size() <= 4) return {};
+
+    // pop off '@RG\t', then split rest of line into tokens
+    const auto tokens = internal::Split(sam.substr(4), '\t');
+    if (tokens.empty()) return {};
+
+    ReadGroupInfo rg;
+    std::map<std::string, std::string> custom;
+
+    for (const auto& token : tokens) {
+
+        // need at least the 2-character tag and its ':' separator
+        if (token.size() < 3) continue;
+
+        const auto tokenTag = token.substr(0, 2);
+        auto tokenValue = token.substr(3);
+
+        // set read group info
+        // clang-format off
+        if      (tokenTag == internal::sam_ID) rg.Id(std::move(tokenValue));
+        else if (tokenTag == internal::sam_CN) rg.SequencingCenter(std::move(tokenValue));
+        else if (tokenTag == internal::sam_DT) rg.Date(std::move(tokenValue));
+        else if (tokenTag == internal::sam_FO) rg.FlowOrder(std::move(tokenValue));
+        else if (tokenTag == internal::sam_KS) rg.KeySequence(std::move(tokenValue));
+        else if (tokenTag == internal::sam_LB) rg.Library(std::move(tokenValue));
+        else if (tokenTag == internal::sam_PG) rg.Programs(std::move(tokenValue));
+        else if (tokenTag == internal::sam_PI) rg.PredictedInsertSize(std::move(tokenValue));
+        else if (tokenTag == internal::sam_PU) rg.MovieName(std::move(tokenValue));
+        else if (tokenTag == internal::sam_SM) rg.Sample(std::move(tokenValue));
+        else if (tokenTag == internal::sam_DS) rg.DecodeSamDescription(std::move(tokenValue));
+        else if (tokenTag == internal::sam_PM) rg.PlatformModel(internal::PlatformModelFromName(std::move(tokenValue)));
+        // clang-format on
+
+        // if not platform name (always "PACBIO" for us), store as a custom tag
+        else if (tokenTag != internal::sam_PL)
+            custom[tokenTag] = std::move(tokenValue);
+    }
+    rg.CustomTags(std::move(custom));
+
+    return rg;
+}
+
+/// \brief Formats a numeric read group ID as a hexadecimal string.
+///
+/// \param[in] id  numeric read group ID
+/// \returns the ID in hexadecimal, left-padded with '0' to at least 8 characters
+///
+std::string ReadGroupInfo::IntToId(const int32_t id)
+{
+    std::ostringstream formatted;
+    formatted << std::hex;          // base 16
+    formatted << std::setfill('0');  // pad with zeros ...
+    formatted << std::setw(8);       // ... up to 8 characters
+    formatted << id;
+    return formatted.str();
+}
+
+/// \brief Sets the codec used for IPD data and registers the IPD base feature.
+///
+/// \param[in] codec  codec to store
+/// \param[in] tag    tag to register for the feature; "ip" is used if empty
+/// \returns reference to this read group
+///
+ReadGroupInfo& ReadGroupInfo::IpdCodec(FrameCodec codec, std::string tag)
+{
+    ipdCodec_ = codec;
+
+    // fall back to the default tag when none was given
+    if (tag.empty()) tag = "ip";
+    const std::string& featureTag = tag;
+    BaseFeatureTag(BaseFeature::IPD, featureTag);
+    return *this;
+}
+
+/// \brief Sets the codec used for pulse width data and registers the
+///        pulse width base feature.
+///
+/// \param[in] codec  codec to store
+/// \param[in] tag    tag to register for the feature; "pw" is used if empty
+/// \returns reference to this read group
+///
+ReadGroupInfo& ReadGroupInfo::PulseWidthCodec(FrameCodec codec, std::string tag)
+{
+    pulseWidthCodec_ = codec;
+
+    // fall back to the default tag when none was given
+    if (tag.empty()) tag = "pw";
+    const std::string& featureTag = tag;
+    BaseFeatureTag(BaseFeature::PULSE_WIDTH, featureTag);
+    return *this;
+}
+
+/// \brief Looks up the sequencing chemistry for a kit/kit/basecaller triple.
+///
+/// Only the first two '.'-separated fields of the basecaller version take part
+/// in the lookup. The table returned by internal::GetChemistryTableFromEnv()
+/// is searched first, then internal::BuiltInChemistryTable.
+///
+/// \param[in] bindingKit         binding kit
+/// \param[in] sequencingKit      sequencing kit
+/// \param[in] basecallerVersion  basecaller version, at least "<major>.<minor>"
+/// \returns the fourth column of the first matching table row
+/// \throws std::runtime_error if the version has fewer than two fields
+/// \throws InvalidSequencingChemistryException if no row matches
+///
+std::string ReadGroupInfo::SequencingChemistryFromTriple(const std::string& bindingKit,
+                                                         const std::string& sequencingKit,
+                                                         const std::string& basecallerVersion)
+{
+    const auto versionParts = internal::Split(basecallerVersion, '.');
+    if (versionParts.size() < 2) {
+        throw std::runtime_error{"basecaller version too short: " + basecallerVersion};
+    }
+    const std::string majorMinor = versionParts[0] + '.' + versionParts[1];
+
+    // a row matches when its first three columns equal the requested triple
+    const auto isMatch = [&](const auto& row) {
+        return bindingKit == row[0] && sequencingKit == row[1] && majorMinor == row[2];
+    };
+
+    // the environment-provided table (empty if there is none) takes precedence
+    for (const auto& row : internal::GetChemistryTableFromEnv()) {
+        if (isMatch(row)) return row[3];
+    }
+
+    // then fall back to the built-in table
+    for (const auto& row : internal::BuiltInChemistryTable) {
+        if (isMatch(row)) return row[3];
+    }
+
+    // not found
+    throw InvalidSequencingChemistryException{bindingKit, sequencingKit, basecallerVersion};
+}
+
+/// \brief Formats this read group as a SAM "@RG" header line.
+///
+/// ID and PL are always written first. DS, CN, DT, FO, KS, LB, PG, PI, PU and
+/// SM follow in that order when non-empty, then PM (always), then all custom
+/// tags.
+///
+/// \returns the header line (without a trailing newline)
+/// \throws std::runtime_error if a stored enum value has no name
+///
+std::string ReadGroupInfo::ToSam() const
+{
+    std::ostringstream sam;
+    sam << "@RG";
+    sam << internal::MakeSamTag(internal::sam_ID, id_);
+    sam << internal::MakeSamTag(internal::sam_PL, Platform());
+
+    // writes a field only if it holds a value
+    const auto writeIfSet = [&sam](const std::string& tag, const std::string& value) {
+        if (!value.empty()) sam << internal::MakeSamTag(tag, value);
+    };
+
+    const auto description = EncodeSamDescription();
+    writeIfSet(internal::sam_DS, description);
+    writeIfSet(internal::sam_CN, sequencingCenter_);
+    writeIfSet(internal::sam_DT, date_);
+    writeIfSet(internal::sam_FO, flowOrder_);
+    writeIfSet(internal::sam_KS, keySequence_);
+    writeIfSet(internal::sam_LB, library_);
+    writeIfSet(internal::sam_PG, programs_);
+    writeIfSet(internal::sam_PI, predictedInsertSize_);
+    writeIfSet(internal::sam_PU, movieName_);
+    writeIfSet(internal::sam_SM, sample_);
+
+    // platform model is written unconditionally
+    sam << internal::MakeSamTag(internal::sam_PM, internal::PlatformModelName(platformModel_));
+
+    // as are any custom tags
+    for (const auto& entry : custom_) {
+        sam << internal::MakeSamTag(entry.first, entry.second);
+    }
+
+    return sam.str();
+}
+
+/// \brief Generates a read group ID from movie name and read type.
+///
+/// \returns the first 8 characters of MD5Hash("<movieName>//<readType>")
+///
+std::string MakeReadGroupId(const std::string& movieName, const std::string& readType)
+{
+    const auto digest = MD5Hash(movieName + "//" + readType);
+    return digest.substr(0, 8);
+}
+
+/// \brief Member-wise equality.
+///
+/// \returns true if every stored property, the base feature map and the custom
+///          tag map are all equal to those of \p other
+///
+bool ReadGroupInfo::operator==(const ReadGroupInfo& other) const
+{
+    // plain @RG fields
+    if (!(id_ == other.id_)) return false;
+    if (!(sequencingCenter_ == other.sequencingCenter_)) return false;
+    if (!(date_ == other.date_)) return false;
+    if (!(flowOrder_ == other.flowOrder_)) return false;
+    if (!(keySequence_ == other.keySequence_)) return false;
+    if (!(library_ == other.library_)) return false;
+    if (!(programs_ == other.programs_)) return false;
+    if (!(platformModel_ == other.platformModel_)) return false;
+    if (!(predictedInsertSize_ == other.predictedInsertSize_)) return false;
+    if (!(movieName_ == other.movieName_)) return false;
+    if (!(sample_ == other.sample_)) return false;
+
+    // values carried in the description field
+    if (!(readType_ == other.readType_)) return false;
+    if (!(bindingKit_ == other.bindingKit_)) return false;
+    if (!(sequencingKit_ == other.sequencingKit_)) return false;
+    if (!(basecallerVersion_ == other.basecallerVersion_)) return false;
+    if (!(frameRateHz_ == other.frameRateHz_)) return false;
+    if (!(control_ == other.control_)) return false;
+    if (!(ipdCodec_ == other.ipdCodec_)) return false;
+    if (!(pulseWidthCodec_ == other.pulseWidthCodec_)) return false;
+
+    // barcode data
+    if (!(hasBarcodeData_ == other.hasBarcodeData_)) return false;
+    if (!(barcodeFile_ == other.barcodeFile_)) return false;
+    if (!(barcodeHash_ == other.barcodeHash_)) return false;
+    if (!(barcodeCount_ == other.barcodeCount_)) return false;
+    if (!(barcodeMode_ == other.barcodeMode_)) return false;
+    if (!(barcodeQuality_ == other.barcodeQuality_)) return false;
+
+    // containers: sizes are checked first so std::equal never reads past the
+    // end of the other container
+    if (!(features_.size() == other.features_.size())) return false;
+    if (!std::equal(features_.cbegin(), features_.cend(), other.features_.cbegin())) return false;
+    if (!(custom_.size() == other.custom_.size())) return false;
+    return std::equal(custom_.begin(), custom_.end(), other.custom_.cbegin());
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/SamTagCodec.cpp b/src/SamTagCodec.cpp

new file mode 100644 (file)

index 0000000..f74879b
--- /dev/null
+++ b/src/SamTagCodec.cpp
@@ -0,0 +1,343 @@
+// File Description
+/// \file SamTagCodec.cpp
+/// \brief Implements the SamTagCodec class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/SamTagCodec.h"
+
+#include <cstdint>
+#include <limits>
+
+#include <boost/lexical_cast.hpp>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// Appends the boost::lexical_cast<std::string> text form of \p value to \p result.
+template <typename T>
+inline void appendSamValue(const T& value, std::string& result)
+{
+    const auto text = boost::lexical_cast<std::string>(value);
+    result += text;
+}
+
+/// Appends \p value to \p result after widening it to int, so the text that
+/// is formatted is that of an int rather than of the original 8-bit type.
+template <typename T>
+inline void appendSamValue_8bit(const T& value, std::string& result)
+{
+    const int widened = static_cast<int>(value);
+    result += boost::lexical_cast<std::string>(widened);
+}
+
+/// Appends every element of \p container to \p result, each one preceded by
+/// a comma (so the output starts with ',' when the container is non-empty).
+template <typename T>
+void appendSamMultiValue(const T& container, std::string& result)
+{
+    for (const auto element : container) {
+        result += ',';
+        appendSamValue(element, result);
+    }
+}
+
+/// Same as appendSamMultiValue, but each element goes through
+/// appendSamValue_8bit (i.e. is widened to int before formatting).
+template <typename T>
+void appendSamMultiValue_8bit(const T& container, std::string& result)
+{
+    for (const auto element : container) {
+        result += ',';
+        appendSamValue_8bit(element, result);
+    }
+}
+
+/// Splits \p s on \p delim with std::getline semantics: empty fields between
+/// consecutive delimiters are kept, an empty input yields no fields.
+static std::vector<std::string> split(const std::string& s, char delim)
+{
+    std::vector<std::string> fields;
+    std::istringstream input{s};
+    for (std::string field; std::getline(input, field, delim);)
+        fields.push_back(field);
+    return fields;
+}
+
+/// Parses a run of floats where every value is preceded by a one-character
+/// separator (e.g. ",1.5,2.5"); the separator itself is skipped, not checked.
+std::vector<float> readFloatSamMultiValue(const std::string& data)
+{
+    std::vector<float> values;
+    // strtof wants a non-const end pointer; the buffer is never written to
+    char* cursor = const_cast<char*>(data.c_str());
+    const char* const last = cursor + data.length();
+    while (cursor + 1 < last) {
+        const char* const start = cursor + 1;
+        values.emplace_back(strtof(start, &cursor));
+    }
+    return values;
+}
+
+/// Parses a run of signed integers where every value is preceded by a
+/// one-character separator (e.g. ",-3,40"). Values are read with strtol in
+/// base 0 and then converted to T.
+template <typename T>
+std::vector<T> readSignedSamMultiValue(const std::string& data)
+{
+    std::vector<T> values;
+    // strtol wants a non-const end pointer; the buffer is never written to
+    char* cursor = const_cast<char*>(data.c_str());
+    const char* const last = cursor + data.length();
+    while (cursor + 1 < last) {
+        const char* const start = cursor + 1;
+        values.emplace_back(strtol(start, &cursor, 0));
+    }
+    return values;
+}
+
+/// Parses a run of unsigned integers where every value is preceded by a
+/// one-character separator (e.g. ",3,40"). Values are read with strtoul in
+/// base 0 and then converted to T.
+template <typename T>
+std::vector<T> readUnsignedSamMultiValue(const std::string& data)
+{
+    std::vector<T> values;
+    // strtoul wants a non-const end pointer; the buffer is never written to
+    char* cursor = const_cast<char*>(data.c_str());
+    const char* const last = cursor + data.length();
+    while (cursor + 1 < last) {
+        const char* const start = cursor + 1;
+        values.emplace_back(strtoul(start, &cursor, 0));
+    }
+    return values;
+}
+
+}  // namespace internal
+
+/// Parses a tab-separated SAM tag string ("TT:t:VALUE<TAB>TT:t:VALUE...")
+/// into a TagCollection.
+///
+/// Tokens shorter than 6 characters are skipped. Scalar integers are stored
+/// in the type selected by the range checks below; numeric conversion errors
+/// raised by boost::lexical_cast are not caught here.
+///
+/// \param tagString  tab-separated tag text
+/// \returns the decoded tags, keyed by their 2-character name
+/// \throws std::runtime_error on an unsupported tag type or array element type
+TagCollection SamTagCodec::Decode(const std::string& tagString)
+{
+    TagCollection tags;
+
+    const auto tokens = internal::split(tagString, '\t');
+    for (const auto& token : tokens) {
+        if (token.size() < 6)  // TT:t:X
+            continue;
+
+        const auto name = token.substr(0, 2);
+        const auto type = token.at(3);
+        const auto remainder = token.substr(5);
+        if (remainder.empty()) throw std::runtime_error{"malformatted tag: " + token};
+
+        switch (type) {
+
+            // technically only 'A' is allowed in SAM chars,
+            // but we'll be a little permissive
+            case 'A':
+            case 'a': {
+                // the modifier is a separate constructor argument; keeping it
+                // inside the cast would discard the character (comma operator)
+                tags[name] = Tag(static_cast<int8_t>(remainder[0]), TagModifier::ASCII_CHAR);
+                break;
+            }
+
+            // technically only 'i' is allowed in SAM ints, but we'll be a little
+            // permissive since SAM might be a bit more "user-edited" than BAM
+            case 'c':
+            case 'C':
+            case 's':
+            case 'S':
+            case 'i':
+            case 'I': {
+                // check out boost::numeric cast for these conversions
+
+                // negative value (force signed int)
+                if (remainder[0] == '-') {
+                    const auto x = boost::lexical_cast<int32_t>(remainder);
+                    if (x >= std::numeric_limits<int8_t>::min())
+                        tags[name] = static_cast<int8_t>(x);
+                    else if (x >= std::numeric_limits<int16_t>::min())
+                        tags[name] = static_cast<int16_t>(x);
+                    else
+                        tags[name] = x;
+                }
+
+                // unsigned int
+                else {
+                    const auto x = boost::lexical_cast<uint32_t>(remainder);
+                    if (x <= std::numeric_limits<uint8_t>::max())
+                        tags[name] = static_cast<uint8_t>(x);
+                    else if (x <= std::numeric_limits<uint16_t>::max())
+                        tags[name] = static_cast<uint16_t>(x);
+                    else
+                        tags[name] = x;
+                }
+                break;
+            }
+
+            case 'f': {
+                tags[name] = boost::lexical_cast<float>(remainder);
+                break;
+            }
+
+            case 'Z': {
+                tags[name] = remainder;
+                break;
+            }
+
+            case 'H': {
+                tags[name] = Tag(remainder, TagModifier::HEX_STRING);
+                break;
+            }
+
+            case 'B': {
+                // first character selects the element type, the rest is the
+                // separator-prefixed list of values
+                const auto elementType = remainder[0];
+                const auto arrayData = remainder.substr(1);
+                switch (elementType) {
+                    case 'c':
+                        tags[name] = internal::readSignedSamMultiValue<int8_t>(arrayData);
+                        break;
+                    case 'C':
+                        tags[name] = internal::readUnsignedSamMultiValue<uint8_t>(arrayData);
+                        break;
+                    case 's':
+                        tags[name] = internal::readSignedSamMultiValue<int16_t>(arrayData);
+                        break;
+                    case 'S':
+                        tags[name] = internal::readUnsignedSamMultiValue<uint16_t>(arrayData);
+                        break;
+                    case 'i':
+                        tags[name] = internal::readSignedSamMultiValue<int32_t>(arrayData);
+                        break;
+                    case 'I':
+                        tags[name] = internal::readUnsignedSamMultiValue<uint32_t>(arrayData);
+                        break;
+                    case 'f':
+                        tags[name] = internal::readFloatSamMultiValue(arrayData);
+                        break;
+                    default:
+                        // parentheses, not braces: braces would select the
+                        // initializer_list constructor and yield "\x01<type>"
+                        throw std::runtime_error{"unsupported array-tag-type encountered: " +
+                                                 std::string(1, elementType)};
+                }
+                break;
+            }
+
+            // unsupported SAM tag type
+            default:
+                throw std::runtime_error{"unsupported tag-type encountered: " +
+                                         std::string(1, type)};
+        }
+    }
+
+    return tags;
+}
+
+/// Serializes a TagCollection into tab-separated SAM tag text
+/// ("TT:t:VALUE<TAB>TT:t:VALUE...").
+///
+/// Null tags are skipped. All scalar integer widths are written with type
+/// code 'i', floats with 'f', strings with 'Z' (or 'H' when flagged as hex),
+/// and arrays with 'B' followed by the element type code.
+///
+/// \param tags  tags to encode
+/// \returns the encoded text (empty if nothing was written)
+/// \throws std::runtime_error if a tag name is not exactly 2 characters long
+///         or a tag holds an unsupported data type
+std::string SamTagCodec::Encode(const TagCollection& tags)
+{
+    std::string result;
+    result.reserve(1024);
+
+    for (const auto& tagIter : tags) {
+        const auto& name = tagIter.first;
+        if (name.size() != 2) throw std::runtime_error{"malformatted tag name: " + name};
+
+        const auto& tag = tagIter.second;
+        if (tag.IsNull()) continue;
+
+        // tab separator
+        if (!result.empty()) result.push_back('\t');
+
+        // "<TAG>:"
+        result.append(name);
+        result.push_back(':');
+
+        // "<TYPE>:<DATA>" for printable, ASCII char
+        if (tag.HasModifier(TagModifier::ASCII_CHAR)) {
+            const auto c = tag.ToAscii();
+            if (c != '\0') {
+                result.push_back('A');
+                result.push_back(':');
+                result.push_back(c);
+                continue;
+            }
+        }
+
+        // "<TYPE>:<DATA>" for all other data
+
+        using internal::appendSamMultiValue;
+        using internal::appendSamMultiValue_8bit;
+        using internal::appendSamValue;
+        using internal::appendSamValue_8bit;
+
+        switch (tag.Type()) {
+            case TagDataType::INT8:
+                result.push_back('i');
+                result.push_back(':');
+                appendSamValue_8bit(tag.ToInt8(), result);
+                break;
+            case TagDataType::UINT8:
+                result.push_back('i');
+                result.push_back(':');
+                appendSamValue_8bit(tag.ToUInt8(), result);
+                break;
+            case TagDataType::INT16:
+                result.push_back('i');
+                result.push_back(':');
+                appendSamValue(tag.ToInt16(), result);
+                break;
+            case TagDataType::UINT16:
+                result.push_back('i');
+                result.push_back(':');
+                appendSamValue(tag.ToUInt16(), result);
+                break;
+            case TagDataType::INT32:
+                result.push_back('i');
+                result.push_back(':');
+                appendSamValue(tag.ToInt32(), result);
+                break;
+            case TagDataType::UINT32:
+                result.push_back('i');
+                result.push_back(':');
+                appendSamValue(tag.ToUInt32(), result);
+                break;
+            case TagDataType::FLOAT:
+                // floats use type code 'f' (this is also what Decode expects);
+                // writing 'i' here would make the value unreadable as a float
+                result.push_back('f');
+                result.push_back(':');
+                appendSamValue(tag.ToFloat(), result);
+                break;
+
+            case TagDataType::STRING: {
+                result.push_back(tag.HasModifier(TagModifier::HEX_STRING) ? 'H' : 'Z');
+                result.push_back(':');
+                result.append(tag.ToString());
+                break;
+            }
+
+            case TagDataType::INT8_ARRAY:
+                result.push_back('B');
+                result.push_back(':');
+                result.push_back('c');
+                appendSamMultiValue_8bit(tag.ToInt8Array(), result);
+                break;
+            case TagDataType::UINT8_ARRAY:
+                result.push_back('B');
+                result.push_back(':');
+                result.push_back('C');
+                appendSamMultiValue_8bit(tag.ToUInt8Array(), result);
+                break;
+            case TagDataType::INT16_ARRAY:
+                result.push_back('B');
+                result.push_back(':');
+                result.push_back('s');
+                appendSamMultiValue(tag.ToInt16Array(), result);
+                break;
+            case TagDataType::UINT16_ARRAY:
+                result.push_back('B');
+                result.push_back(':');
+                result.push_back('S');
+                appendSamMultiValue(tag.ToUInt16Array(), result);
+                break;
+            case TagDataType::INT32_ARRAY:
+                result.push_back('B');
+                result.push_back(':');
+                result.push_back('i');
+                appendSamMultiValue(tag.ToInt32Array(), result);
+                break;
+            case TagDataType::UINT32_ARRAY:
+                result.push_back('B');
+                result.push_back(':');
+                result.push_back('I');
+                appendSamMultiValue(tag.ToUInt32Array(), result);
+                break;
+            case TagDataType::FLOAT_ARRAY:
+                result.push_back('B');
+                result.push_back(':');
+                result.push_back('f');
+                appendSamMultiValue(tag.ToFloatArray(), result);
+                break;
+
+            default:
+                throw std::runtime_error{"unsupported tag-type encountered: " +
+                                         std::to_string(static_cast<uint16_t>(tag.Type()))};
+        }
+    }
+
+    return result;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/SamWriter.cpp b/src/SamWriter.cpp

new file mode 100644 (file)

index 0000000..e7e3f46
--- /dev/null
+++ b/src/SamWriter.cpp
@@ -0,0 +1,92 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/SamWriter.h"
+
+#include <htslib/hfile.h>
+#include <htslib/sam.h>
+
+#include "Autovalidate.h"
+#include "FileProducer.h"
+#include "MemoryUtils.h"
+#include "pbbam/MakeUnique.h"
+#include "pbbam/Validator.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// Private implementation behind SamWriter: owns the htslib output handle and
+/// the raw header, and writes to the filename reported by TempFilename().
+class SamWriterPrivate : public internal::FileProducer
+{
+public:
+    /// Opens the output in mode "w" and immediately writes the header.
+    ///
+    /// \param filename   target filename, handed to the FileProducer base
+    /// \param rawHeader  header to write; must not be null
+    /// \throws std::runtime_error if \p rawHeader is null, the file cannot be
+    ///         opened, or the header cannot be written
+    SamWriterPrivate(std::string filename, const std::shared_ptr<bam_hdr_t> rawHeader)
+        : internal::FileProducer{std::move(filename)}, header_{rawHeader}
+    {
+        if (!header_) throw std::runtime_error{"null header"};
+
+        // open file
+        const auto& usingFilename = TempFilename();
+        const std::string mode(1, 'w');
+        file_.reset(sam_open(usingFilename.c_str(), mode.c_str()));
+        if (!file_)
+            throw std::runtime_error{"could not open file: " + usingFilename + " for writing"};
+
+        // write header
+        const auto ret = sam_hdr_write(file_.get(), header_.get());
+        if (ret != 0) throw std::runtime_error{"could not write header"};
+    }
+
+    void TryFlush();
+    void Write(const BamRecord& record);
+
+private:
+    std::unique_ptr<samFile, internal::HtslibFileDeleter> file_;  // closed via HtslibFileDeleter
+    std::shared_ptr<bam_hdr_t> header_;
+};
+
+/// Flushes buffered output to the underlying file.
+///
+/// \throws std::runtime_error if the flush fails
+void SamWriterPrivate::TryFlush()
+{
+    // hflush() returns 0 on success and EOF on failure; testing the hFILE
+    // pointer itself never flushed anything and threw for every open file
+    const auto ret = hflush(file_.get()->fp.hfile);
+    if (ret != 0) throw std::runtime_error{"could not flush output buffer contents"};
+}
+
+/// Writes one record to the SAM output, refreshing its bin number first.
+///
+/// \throws std::runtime_error if sam_write1 reports a result <= 0
+void SamWriterPrivate::Write(const BamRecord& record)
+{
+#if PBBAM_AUTOVALIDATE
+    Validator::Validate(record);
+#endif
+
+    const auto rawRecord = internal::BamRecordMemory::GetRawData(record);
+
+    // recompute the bin from the record's start/end positions
+    // (min_shift=14 & n_lvls=5 are SAM/BAM "magic numbers")
+    const auto endPosition = bam_endpos(rawRecord.get());
+    rawRecord->core.bin = hts_reg2bin(rawRecord->core.pos, endPosition, 14, 5);
+
+    // write record to file
+    if (sam_write1(file_.get(), header_.get(), rawRecord.get()) <= 0)
+        throw std::runtime_error("could not write record");
+}
+
+}  // namespace internal
+
+/// Creates the private writer for \p filename, handing it the raw header
+/// produced by BamHeaderMemory::MakeRawHeader(header).
+///
+/// When PBBAM_AUTOVALIDATE is enabled the header is validated here, i.e.
+/// after the private writer has already been constructed.
+SamWriter::SamWriter(std::string filename, const BamHeader& header)
+    : IRecordWriter()
+    , d_{std::make_unique<internal::SamWriterPrivate>(
+          std::move(filename), internal::BamHeaderMemory::MakeRawHeader(header))}
+{
+#if PBBAM_AUTOVALIDATE
+    Validator::Validate(header);
+#endif
+}
+
+// Defined out-of-line; nothing to do beyond destroying the members.
+SamWriter::~SamWriter() {}
+
+/// Forwards to the private implementation's TryFlush().
+void SamWriter::TryFlush() { d_->TryFlush(); }
+
+/// Forwards \p record to the private implementation's Write().
+void SamWriter::Write(const BamRecord& record) { d_->Write(record); }
+
+/// Wraps \p recordImpl in a temporary BamRecord and writes that.
+void SamWriter::Write(const BamRecordImpl& recordImpl) { d_->Write(BamRecord{recordImpl}); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/SequenceInfo.cpp b/src/SequenceInfo.cpp

new file mode 100644 (file)

index 0000000..4e14cc1
--- /dev/null
+++ b/src/SequenceInfo.cpp
@@ -0,0 +1,98 @@
+// File Description
+/// \file SequenceInfo.cpp
+/// \brief Implements the SequenceInfo class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/SequenceInfo.h"
+
+#include <cstdint>
+#include <limits>
+#include <sstream>
+
+#include "SequenceUtils.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Two-letter tag names recognized by SequenceInfo::FromSam and emitted by
+// SequenceInfo::ToSam; any other tag is kept as a "custom" tag.
+static const std::string token_SN{"SN"};
+static const std::string token_LN{"LN"};
+static const std::string token_AS{"AS"};
+static const std::string token_M5{"M5"};
+static const std::string token_SP{"SP"};
+static const std::string token_UR{"UR"};
+
+}  // namespace internal
+
+/// Creates a sequence entry from its name and its length; the length is
+/// stored as the given string, without any parsing at this point.
+SequenceInfo::SequenceInfo(std::string name, std::string length)
+    : name_(std::move(name)), length_(std::move(length))
+{
+}
+
+/// Builds a SequenceInfo from one "@SQ" header line.
+///
+/// The first 4 characters ("@SQ" plus the tab) are dropped and the rest is
+/// split on tabs into "XX:value" tokens. Recognized tags fill the matching
+/// attribute; every other tag is stored as a custom tag.
+///
+/// \param sam  the full header line, including the "@SQ\t" prefix
+/// \returns the parsed entry; a default-constructed entry when the line holds
+///          no tokens (including lines too short to contain the prefix)
+SequenceInfo SequenceInfo::FromSam(const std::string& sam)
+{
+    // a line that ends at (or before) the prefix has nothing to parse;
+    // checking first keeps substr(4) from throwing std::out_of_range
+    if (sam.size() <= 4) return {};
+
+    // pop off '@SQ\t', then split rest of line into tokens
+    const auto tokens = internal::Split(sam.substr(4), '\t');
+    if (tokens.empty()) return {};
+
+    SequenceInfo seq;
+    std::map<std::string, std::string> custom;
+
+    // iterate over tokens
+    for (const auto& token : tokens) {
+        // need at least "XX:" - shorter tokens (e.g. from doubled tabs)
+        // would make substr(3) throw, so they are ignored
+        if (token.size() < 3) continue;
+
+        const auto tokenTag = token.substr(0, 2);
+        auto tokenValue = token.substr(3);
+
+        // set sequence info
+        // clang-format off
+        if      (tokenTag == internal::token_SN) seq.Name(std::move(tokenValue));
+        else if (tokenTag == internal::token_LN) seq.Length(std::move(tokenValue));
+        else if (tokenTag == internal::token_AS) seq.AssemblyId(std::move(tokenValue));
+        else if (tokenTag == internal::token_M5) seq.Checksum(std::move(tokenValue));
+        else if (tokenTag == internal::token_SP) seq.Species(std::move(tokenValue));
+        else if (tokenTag == internal::token_UR) seq.Uri(std::move(tokenValue));
+        // clang-format on
+
+        // otherwise, "custom" tag
+        else
+            custom[tokenTag] = std::move(tokenValue);
+    }
+
+    seq.CustomTags(std::move(custom));
+    return seq;
+}
+
+/// A sequence entry is valid when it has a non-empty name and its length,
+/// as parsed by atol, lies within [0, INT32_MAX].
+bool SequenceInfo::IsValid() const
+{
+    if (name_.empty()) return false;
+
+    // parsed as long rather than int32_t so an over-large value can be
+    // detected instead of being truncated
+    const long parsedLength = atol(length_.c_str());
+    if (parsedLength < 0) return false;
+    return parsedLength <= std::numeric_limits<int32_t>::max();
+}
+
+/// Formats this entry as an "@SQ" header line.
+///
+/// The SN tag is always written; LN, AS, M5, SP and UR are written only when
+/// their value is non-empty, followed by all custom tags.
+///
+/// \returns the header line text
+std::string SequenceInfo::ToSam() const
+{
+    std::ostringstream out;
+    out << "@SQ" << internal::MakeSamTag(internal::token_SN, name_);
+
+    // clang-format off
+    if (!length_.empty())     out << internal::MakeSamTag(internal::token_LN, length_);
+    if (!assemblyId_.empty()) out << internal::MakeSamTag(internal::token_AS, assemblyId_);
+    if (!checksum_.empty())   out << internal::MakeSamTag(internal::token_M5, checksum_);
+    if (!species_.empty())    out << internal::MakeSamTag(internal::token_SP, species_);
+    if (!uri_.empty())        out << internal::MakeSamTag(internal::token_UR, uri_);
+    // clang-format on
+
+    // append any custom tags; this is a const method, so the entries are
+    // read-only and are passed as-is (a std::move here could only ever copy)
+    for (const auto& attribute : custom_)
+        out << internal::MakeSamTag(attribute.first, attribute.second);
+
+    return out.str();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/SequenceUtils.h b/src/SequenceUtils.h

new file mode 100644 (file)

index 0000000..230c6c3
--- /dev/null
+++ b/src/SequenceUtils.h
@@ -0,0 +1,103 @@
+// Author: Derek Barnett
+
+#ifndef SEQUENCEUTILS_H
+#define SEQUENCEUTILS_H
+
+#include <algorithm>
+#include <cctype>
+#include <cstdint>
+#include <string>
+
+#include "StringUtils.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// Returns the complement of a single (IUPAC) nucleotide code.
+///
+/// Lookup is case-insensitive and always yields an upper-case letter;
+/// '-' and '*' are returned unchanged. Characters without an entry in the
+/// table yield '\0'.
+inline char Complement(const char character)
+{
+    // indexed by (upper-case letter & 0x1f), i.e. 'A' -> 1 ... 'Z' -> 26.
+    // The mask can produce any value in [0, 31], so the table is sized to 32;
+    // the slots past 'Y' are zero-initialized.
+    static char const complementLookup[32] = {'\0', 'T',  'V', 'G',  'H', '\0', '\0', 'C',  'D',
+                                              '\0', '\0', 'M', '\0', 'K', 'N',  '\0', '\0', '\0',
+                                              'Y',  'S',  'A', 'A',  'B', 'W',  '\0', 'R'};
+    if (character == '-' || character == '*') return character;
+    // toupper requires a value representable as unsigned char
+    const int upper = toupper(static_cast<unsigned char>(character));
+    return complementLookup[upper & 0x1f];
+}
+
+//inline void Reverse(std::string& s)
+//{ std::reverse(s.begin(), s.end()); }
+
+/// Reverses the elements of \p input in place.
+template <typename T>
+void Reverse(T& input)
+{
+    std::reverse(std::begin(input), std::end(input));
+}
+
+/// Reverses \p input in place when \p reverse is set, then returns it.
+template <typename T>
+T MaybeReverse(T&& input, bool reverse)
+{
+    if (!reverse) return input;
+
+    std::reverse(input.begin(), input.end());
+    return input;
+}
+
+/// Returns a reversed copy of \p input; the argument itself is untouched.
+template <typename T>
+T Reversed(const T& input)
+{
+    T copy(input);
+    std::reverse(copy.begin(), copy.end());
+    return copy;
+}
+
+//inline std::string Reversed(const std::string& input)
+//{
+//    std::string result = input;
+//    Reverse(result);
+//    return result;
+//}
+
+/// Reverse-complements \p seq in place: each base is replaced via
+/// Complement(), then the whole sequence is reversed.
+inline void ReverseComplement(std::string& seq)
+{
+    for (auto& base : seq)
+        base = Complement(base);
+    std::reverse(seq.begin(), seq.end());
+}
+
+/// Returns \p seq, reverse-complemented first when \p reverse is set.
+inline std::string MaybeReverseComplement(std::string&& seq, bool reverse)
+{
+    if (!reverse) return seq;
+
+    ReverseComplement(seq);
+    return seq;
+}
+
+/// Reverse complement a DNA sequence case-sensitive
+///
+/// Each character is translated through a 128-entry ASCII table that keeps
+/// the case of A/C/G/T(U) and passes ' ', '*', '-' and 'N' through; every
+/// other character - including bytes outside the ASCII range - becomes the
+/// placeholder value 4. The result replaces \p seq.
+inline void ReverseComplementCaseSens(std::string& seq)
+{
+    const std::string original = seq;
+    constexpr const static int8_t rc_table[128] = {
+        4,  4, 4, 4, 4, 4, 4,  4,  4, 4, 4, 4, 4,  4,  4, 4,  4,  4, 4, 4,   4, 4,   4, 4, 4, 4,
+        4,  4, 4, 4, 4, 4, 32, 4,  4, 4, 4, 4, 4,  4,  4, 4,  42, 4, 4, 45,  4, 4,   4, 4, 4, 4,
+        4,  4, 4, 4, 4, 4, 4,  4,  4, 4, 4, 4, 4,  84, 4, 71, 4,  4, 4, 67,  4, 4,   4, 4, 4, 4,
+        78, 4, 4, 4, 4, 4, 65, 65, 4, 4, 4, 4, 4,  4,  4, 4,  4,  4, 4, 116, 4, 103, 4, 4, 4, 99,
+        4,  4, 4, 4, 4, 4, 4,  4,  4, 4, 4, 4, 97, 97, 4, 4,  4,  4, 4, 4,   4, 4,   4, 4};
+    const std::string::size_type length = original.length();
+    std::string reverseCompl(length, 'N');
+    for (std::string::size_type i = 0; i < length; ++i) {
+        // index through unsigned char: a signed view of bytes >= 0x80 would
+        // be negative and read in front of the table
+        const auto code = static_cast<unsigned char>(original[i]);
+        reverseCompl[length - i - 1] =
+            (code < 128) ? static_cast<char>(rc_table[code]) : static_cast<char>(4);
+    }
+    seq = reverseCompl;
+}
+
+/// Returns \p seq, passed through ReverseComplementCaseSens first when
+/// \p reverse is set.
+inline std::string MaybeReverseComplementCaseSens(std::string&& seq, bool reverse)
+{
+    if (!reverse) return seq;
+
+    ReverseComplementCaseSens(seq);
+    return seq;
+}
+
+/// Returns a reverse-complemented copy of \p input; the argument itself is
+/// left untouched.
+inline std::string ReverseComplemented(const std::string& input)
+{
+    std::string copy(input);
+    ReverseComplement(copy);
+    return copy;
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // SEQUENCEUTILS_H
diff --git a/src/StringUtils.h b/src/StringUtils.h

new file mode 100644 (file)

index 0000000..774d4a8
--- /dev/null
+++ b/src/StringUtils.h
@@ -0,0 +1,27 @@
+// Author: Derek Barnett
+
+#ifndef STRINGUTILS_H
+#define STRINGUTILS_H
+
+#include "pbbam/SamTagCodec.h"
+#include "pbbam/StringUtilities.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// Convenience alias inside the internal namespace: forwards both arguments
+/// to PacBio::BAM::MakeSamTag and returns its result.
+inline std::string MakeSamTag(std::string tag, std::string value)
+{
+    return PacBio::BAM::MakeSamTag(std::move(tag), std::move(value));
+}
+
+/// Convenience alias inside the internal namespace: forwards to
+/// PacBio::BAM::Split. \p delim defaults to a tab.
+inline std::vector<std::string> Split(const std::string& line, const char delim = '\t')
+{
+    return PacBio::BAM::Split(line, delim);
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // STRINGUTILS_H
diff --git a/src/SubreadLengthQuery.cpp b/src/SubreadLengthQuery.cpp

new file mode 100644 (file)

index 0000000..bcbb781
--- /dev/null
+++ b/src/SubreadLengthQuery.cpp
@@ -0,0 +1,45 @@
+// File Description
+/// \file SubreadLengthQuery.cpp
+/// \brief Implements the SubreadLengthQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/SubreadLengthQuery.h"
+
+#include <cstdint>
+
+#include "pbbam/CompositeBamReader.h"
+#include "pbbam/MakeUnique.h"
+#include "pbbam/PbiFilterTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// Private implementation of SubreadLengthQuery: just holds the reader.
+struct SubreadLengthQuery::SubreadLengthQueryPrivate
+{
+    /// Sets up the reader over \p dataset with a
+    /// PbiQueryLengthFilter(length, compareType) as its filter.
+    SubreadLengthQueryPrivate(const int32_t length, const Compare::Type compareType,
+                              const DataSet& dataset)
+        : reader_(PbiQueryLengthFilter(length, compareType), dataset)
+    {
+    }
+
+    PbiFilterCompositeBamReader<Compare::None> reader_;  // unsorted
+};
+
+/// Creates the query; all three arguments are passed straight through to the
+/// private implementation.
+SubreadLengthQuery::SubreadLengthQuery(const int32_t length, const Compare::Type compareType,
+                                       const DataSet& dataset)
+    : internal::IQuery()
+    , d_{std::make_unique<SubreadLengthQueryPrivate>(length, compareType, dataset)}
+{
+}
+
+// Defined out-of-line; nothing to do beyond destroying the members.
+SubreadLengthQuery::~SubreadLengthQuery() {}
+
+/// Forwards to the underlying reader's GetNext(r) and returns its result.
+bool SubreadLengthQuery::GetNext(BamRecord& r) { return d_->reader_.GetNext(r); }
+
+/// Forwards to the underlying reader's NumReads().
+uint32_t SubreadLengthQuery::NumReads() const { return d_->reader_.NumReads(); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/TimeUtils.h b/src/TimeUtils.h

new file mode 100644 (file)

index 0000000..0223dac
--- /dev/null
+++ b/src/TimeUtils.h
@@ -0,0 +1,69 @@
+// Author: Derek Barnett
+
+#ifndef TIMEUTILS_H
+#define TIMEUTILS_H
+
+#include <cassert>
+#include <chrono>
+#include <ctime>
+#include <stdexcept>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// Formats \p tp as a UTC timestamp "YYYY-MM-DDTHH:MM:SS[.mmm]Z".
+///
+/// The fractional part is written only when the millisecond remainder is
+/// greater than zero, and is always 3 digits wide.
+inline std::string ToIso8601(const std::chrono::system_clock::time_point& tp)
+{
+    // get time info: whole seconds, plus the milliseconds left over
+    const time_t ttime_t = std::chrono::system_clock::to_time_t(tp);
+    const std::chrono::system_clock::time_point tp_sec =
+        std::chrono::system_clock::from_time_t(ttime_t);
+    const std::chrono::milliseconds ms =
+        std::chrono::duration_cast<std::chrono::milliseconds>(tp - tp_sec);
+    const std::tm* ttm =
+        gmtime(&ttime_t);  // static obj, no free needed (may not be thread-safe though)
+
+    // format output
+    constexpr static const char date_time_format[] = "%FT%T";
+    char date_time_str[50];
+    strftime(date_time_str, sizeof(date_time_str), date_time_format, ttm);
+    std::string result(date_time_str);
+    if (ms.count() > 0) {
+        // left-pad to 3 digits: a bare "5" after the '.' would read as
+        // 500 ms rather than 5 ms
+        const std::string digits = std::to_string(ms.count());
+        result.append(".");
+        if (digits.size() < 3) result.append(3 - digits.size(), '0');
+        result.append(digits);
+    }
+    result.append("Z");
+    return result;
+}
+
+/// Formats \p tp (in UTC) as "yymmdd_HHMMSS", with the millisecond remainder
+/// appended directly - no separator, no padding - when it is greater than 0.
+inline std::string ToDataSetFormat(const std::chrono::system_clock::time_point& tp)
+{
+    using std::chrono::system_clock;
+
+    // whole seconds, plus the milliseconds left over
+    const time_t wholeSeconds = system_clock::to_time_t(tp);
+    const system_clock::time_point truncated = system_clock::from_time_t(wholeSeconds);
+    const auto millis =
+        std::chrono::duration_cast<std::chrono::milliseconds>(tp - truncated).count();
+
+    // gmtime hands back a pointer to static storage: nothing to free, but
+    // possibly not thread-safe
+    const std::tm* utc = gmtime(&wholeSeconds);
+
+    constexpr static const char date_time_format[] = "%y%m%d_%H%M%S";
+    char buffer[50];
+    strftime(buffer, sizeof(buffer), date_time_format, utc);
+
+    std::string stamp(buffer);
+    if (millis > 0) stamp += std::to_string(millis);
+    return stamp;
+}
+
+/// Returns the current time as reported by std::chrono::system_clock.
+inline std::chrono::system_clock::time_point CurrentTime()
+{
+    return std::chrono::system_clock::now();
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // TIMEUTILS_H
diff --git a/src/ValidationErrors.cpp b/src/ValidationErrors.cpp

new file mode 100644 (file)

index 0000000..3ef023a
--- /dev/null
+++ b/src/ValidationErrors.cpp
@@ -0,0 +1,76 @@
+// File Description
+/// \file ValidationErrors.cpp
+/// \brief Implements the ValidationErrors class.
+//
+// Author: Derek Barnett
+
+#include "pbbam/exception/ValidationException.h"
+
+#include "PbbamInternalConfig.h"
+
+#include <cstddef>
+#include <sstream>
+
+#include "StringUtils.h"
+#include "ValidationErrors.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Namespace-scope definition of the static constant; its value is given by
+// the initializer on the in-class declaration.
+const size_t ValidationErrors::MAX;
+
+/// Starts with an empty error set. A \p maxNumErrors of 0 is replaced by
+/// ValidationErrors::MAX.
+ValidationErrors::ValidationErrors(const size_t maxNumErrors)
+    : maxNumErrors_{maxNumErrors}, currentNumErrors_{0}
+{
+    const bool noLimitRequested = (maxNumErrors_ == 0);
+    if (noLimitRequested) {
+        maxNumErrors_ = ValidationErrors::MAX;
+    }
+}
+
+/// Records \p details under file \p fn, then updates the error count (which
+/// may throw once the configured maximum is reached).
+void ValidationErrors::AddFileError(const std::string& fn, std::string details)
+{
+    auto& errorsForFile = fileErrors_[fn];
+    errorsForFile.push_back(std::move(details));
+    OnErrorAdded();
+}
+
+/// Records \p details under read group \p rg, then updates the error count
+/// (which may throw once the configured maximum is reached).
+void ValidationErrors::AddReadGroupError(const std::string& rg, std::string details)
+{
+    auto& errorsForReadGroup = readGroupErrors_[rg];
+    errorsForReadGroup.push_back(std::move(details));
+    OnErrorAdded();
+}
+
+/// Records \p details under record \p name, then updates the error count
+/// (which may throw once the configured maximum is reached).
+void ValidationErrors::AddRecordError(const std::string& name, std::string details)
+{
+    auto& errorsForRecord = recordErrors_[name];
+    errorsForRecord.push_back(std::move(details));
+    OnErrorAdded();
+}
+
+/// Records a record-level error describing a tag whose observed length does
+/// not match the expected one.
+void ValidationErrors::AddTagLengthError(const std::string& name, const std::string& tagLabel,
+                                         const std::string& tagName, const size_t observed,
+                                         const size_t expected)
+{
+    // build the message piece by piece, then file it as a record error
+    std::ostringstream message;
+    message << tagLabel << " tag (" << tagName << ") length: " << observed;
+    message << ", does not match expected length: " << expected;
+    AddRecordError(name, message.str());
+}
+
+/// True while no error has been added.
+bool ValidationErrors::IsEmpty() const { return currentNumErrors_ == 0; }
+
+/// The error limit in effect (ValidationErrors::MAX if 0 was requested at construction).
+size_t ValidationErrors::MaxNumErrors() const { return maxNumErrors_; }
+
+/// Bumps the error count and throws the collected errors as soon as the
+/// count hits the configured maximum.
+void ValidationErrors::OnErrorAdded()
+{
+    const bool limitReached = (++currentNumErrors_ == maxNumErrors_);
+    if (limitReached) ThrowErrors();
+}
+
+/// Throws a ValidationException carrying all errors recorded so far.
+///
+/// The three error maps are moved into the exception, so this object's maps
+/// are left in a moved-from state afterwards.
+void ValidationErrors::ThrowErrors()
+{
+    throw ValidationException{std::move(fileErrors_), std::move(readGroupErrors_),
+                              std::move(recordErrors_)};
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ValidationErrors.h b/src/ValidationErrors.h

new file mode 100644 (file)

index 0000000..9e4a8cd
--- /dev/null
+++ b/src/ValidationErrors.h
@@ -0,0 +1,70 @@
+// File Description
+/// \file ValidationErrors.h
+/// \brief Defines the ValidationErrors class.
+//
+// Author: Derek Barnett
+
+#ifndef VALIDATIONERRORS_H
+#define VALIDATIONERRORS_H
+
+#include <cstddef>
+#include <limits>
+#include <map>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// The ValidationErrors class catches error messages accumulated during
+/// validation (see Validator).
+///
+/// Convenience methods are provided for different BAM components, to help
+/// format the displayed output.
+///
+/// A maximum number of errors can be provided at construction, and this class
+/// will automatically throw a ValidationException whenever that count is reached.
+/// Otherwise, the Validator will check IsEmpty() and call ThrowErrors() if any
+/// errors were recorded (i.e. if IsEmpty() returns false).
+///
+class ValidationErrors
+{
+public:
+    /// Messages collected for one file, read group, or record.
+    using ErrorList = std::vector<std::string>;
+    /// Error messages keyed by the name they were reported under.
+    using ErrorMap = std::map<std::string, ErrorList>;
+
+    /// Largest representable count; also the default error limit.
+    static const size_t MAX = std::numeric_limits<size_t>::max();
+
+    ValidationErrors(const size_t maxNumErrors = ValidationErrors::MAX);
+
+    // -- recording errors --
+    void AddFileError(const std::string& fn, std::string details);
+    void AddReadGroupError(const std::string& rg, std::string details);
+    void AddRecordError(const std::string& name, std::string details);
+    void AddTagLengthError(const std::string& name, const std::string& tagLabel,
+                           const std::string& tagName, const size_t observed,
+                           const size_t expected);
+
+    // -- inspecting / reporting --
+    bool IsEmpty() const;
+    size_t MaxNumErrors() const;
+    void ThrowErrors();
+
+private:
+    void OnErrorAdded();
+
+    size_t maxNumErrors_;
+    size_t currentNumErrors_;
+    ErrorMap fileErrors_;
+    ErrorMap readGroupErrors_;
+    ErrorMap recordErrors_;
+};
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VALIDATIONERRORS_H
diff --git a/src/ValidationException.cpp b/src/ValidationException.cpp

new file mode 100644 (file)

index 0000000..99bcfd1
--- /dev/null
+++ b/src/ValidationException.cpp
@@ -0,0 +1,74 @@
+// File Description
+/// \file ValidationException.cpp
+/// \brief Implements the ValidationException class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/exception/ValidationException.h"
+
+namespace PacBio {
+namespace BAM {
+
+/// Takes ownership of the three error maps and builds the what() message
+/// from them.
+///
+/// The std::runtime_error base is given an empty string; the text returned
+/// by what() comes from msg_, which FormatMessage() fills in.
+ValidationException::ValidationException(ErrorMap fileErrors, ErrorMap readGroupErrors,
+                                         ErrorMap recordErrors)
+    : std::runtime_error{""}
+    , fileErrors_{std::move(fileErrors)}
+    , readGroupErrors_{std::move(readGroupErrors)}
+    , recordErrors_{std::move(recordErrors)}
+{
+    FormatMessage();
+}
+
+/// Errors reported per file.
+const ValidationException::ErrorMap& ValidationException::FileErrors() const { return fileErrors_; }
+
+/// Errors reported per read group.
+const ValidationException::ErrorMap& ValidationException::ReadGroupErrors() const
+{
+    return readGroupErrors_;
+}
+
+/// Errors reported per record.
+const ValidationException::ErrorMap& ValidationException::RecordErrors() const
+{
+    return recordErrors_;
+}
+
+/// Returns the formatted message held in msg_ (not the base-class string).
+const char* ValidationException::what() const noexcept { return msg_.c_str(); }
+
+/// Builds the text returned by what(): a "Validation failed:" header followed
+/// by one section per file, read group, and record that has errors, each
+/// listing its messages on indented lines. The result is stored in msg_.
+void ValidationException::FormatMessage()
+{
+    std::ostringstream s;
+    s << "Validation failed:\n";
+
+    // file errors
+    for (const auto& fileError : fileErrors_) {
+        s << "  In file (" << fileError.first << ") : \n";
+        for (const auto& e : fileError.second)
+            s << "    " << e << '\n';
+    }
+
+    // read group errors
+    for (const auto& rgError : readGroupErrors_) {
+        s << "  In read group (" << rgError.first << ") :\n";
+        for (const auto& e : rgError.second)
+            s << "    " << e << '\n';
+    }
+
+    // record errors - these come from recordErrors_; walking the read group
+    // map here would repeat read group errors and drop the record errors
+    for (const auto& recordError : recordErrors_) {
+        s << "  In record (" << recordError.first << ") : \n";
+        for (const auto& e : recordError.second)
+            s << "    " << e << '\n';
+    }
+
+    msg_ = s.str();
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/Validator.cpp b/src/Validator.cpp

new file mode 100644 (file)

index 0000000..5b1a3bc
--- /dev/null
+++ b/src/Validator.cpp
@@ -0,0 +1,416 @@
+// File Description
+/// \file Validator.cpp
+/// \brief Implements the Validator class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/Validator.h"
+
+#include <cstddef>
+#include <iostream>
+#include <map>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/core/ignore_unused.hpp>
+
+#include "ValidationErrors.h"
+#include "Version.h"
+#include "pbbam/BamFile.h"
+#include "pbbam/BamHeader.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/EntireFileQuery.h"
+#include "pbbam/MakeUnique.h"
+#include "pbbam/ReadGroupInfo.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// Comparator usable with ordered containers; it forwards both strings to
+/// boost::ilexicographical_compare.
+struct ilexcompare_wrapper
+{
+    bool operator()(const std::string& left, const std::string& right) const
+    {
+        const bool leftOrdersFirst = boost::ilexicographical_compare(left, right);
+        return leftOrdersFirst;
+    }
+};
+
+// clang-format off
+/// Sort-order strings accepted during header validation. The set is ordered
+/// by ilexcompare_wrapper rather than the default comparison.
+static const std::set<std::string, ilexcompare_wrapper> AcceptedSortOrders = {
+    "unknown", "unsorted", "queryname", "coordinate"
+};
+
+/// Read-type strings accepted during read group validation (exact match).
+static const std::set<std::string> AcceptedReadTypes = {
+    "POLYMERASE", "HQREGION", "SUBREAD", "CCS", "SCRAP", "UNKNOWN"
+};
+// clang-format on
+
+static void ValidateReadGroup(const ReadGroupInfo& rg, std::unique_ptr<ValidationErrors>& errors)
+{
+    const std::string id = rg.Id();
+
+    // has required fields
+    if (id.empty()) errors->AddReadGroupError(id, "missing ID");
+    if (rg.MovieName().empty()) errors->AddReadGroupError(id, "missing movie name (PU tag)");
+    // 3.0.2 adds required RG:PM - do not check for now, we'll add version-aware
+    // validation down the road
+
+    // description tag has required components
+    if (rg.ReadType().empty()) errors->AddReadGroupError(id, "missing READTYPE in description");
+    if (rg.BindingKit().empty()) errors->AddReadGroupError(id, "missing BINDINGKIT in description");
+    if (rg.SequencingKit().empty())
+        errors->AddReadGroupError(id, "missing SEQUENCINGKIT in description");
+    if (rg.BasecallerVersion().empty())
+        errors->AddReadGroupError(id, "missing BASECALLERVERSION in description");
+    if (rg.FrameRateHz().empty())
+        errors->AddReadGroupError(id, "missing FRAMERATEHZ in description");
+
+    // stored ID matches expected ID (as calculated from movie & type)
+    if (!id.empty()) {
+        const auto expectedId = MakeReadGroupId(rg.MovieName(), rg.ReadType());
+        if (expectedId != id) {
+            const std::string msg{"stored ID: " + id + " does not match computed ID: " +
+                                  expectedId};
+            errors->AddReadGroupError(id, std::move(msg));
+        }
+    }
+
+    // valid read type
+    if (!rg.ReadType().empty()) {
+        if (internal::AcceptedReadTypes.find(rg.ReadType()) == internal::AcceptedReadTypes.cend())
+            errors->AddReadGroupError(id, "read type: " + rg.ReadType() + " is unknown");
+    }
+
+    // valid read chemistry (binding, sequencing, chemistry)
+    if (!rg.BindingKit().empty() && !rg.SequencingKit().empty() &&
+        !rg.BasecallerVersion().empty()) {
+        try {
+            auto chem = rg.SequencingChemistry();
+            boost::ignore_unused(chem);
+        } catch (std::exception& e) {
+            errors->AddReadGroupError(id, e.what());
+        }
+    }
+
+    // frame rate convertable to floating point
+    if (!rg.FrameRateHz().empty()) {
+        try {
+            const float frameRate = std::stof(rg.FrameRateHz());
+            boost::ignore_unused(frameRate);
+        } catch (std::exception& e) {
+            errors->AddReadGroupError(id, e.what());
+        }
+    }
+}
+
+static void ValidateHeader(const BamHeader& header, const std::string& filename,
+                           std::unique_ptr<ValidationErrors>& errors)
+{
+    const std::string& fn = filename;
+
+    // SAM/BAM version
+    try {
+        Version v(header.Version());
+        boost::ignore_unused(v);
+    } catch (std::exception& e) {
+        errors->AddFileError(fn, std::string{"SAM version (@HD:VN) failed: "} + e.what());
+    }
+
+    // sort order
+    const std::string sortOrder = header.SortOrder();
+    if (AcceptedSortOrders.find(sortOrder) == AcceptedSortOrders.end())
+        errors->AddFileError(fn, std::string{"unknown sort order: "} + sortOrder);
+
+    // PacBio version
+    try {
+        const Version v{header.PacBioBamVersion()};
+        const Version minimum{3, 0, 1};
+        if (v < minimum) {
+
+            std::string msg{"PacBioBAM version (@HD:pb) "};
+            msg += v.ToString();
+            msg += " is older than the minimum supported version (" + minimum.ToString() + ")";
+            errors->AddFileError(fn, std::move(msg));
+        }
+    } catch (std::exception& e) {
+        errors->AddFileError(
+            fn, std::string{"PacBioBAM version (@HD:pb) failed to parse: "} + e.what());
+    }
+
+    // sequences?
+
+    // read groups
+    for (const ReadGroupInfo& rg : header.ReadGroups())
+        ValidateReadGroup(rg, errors);
+}
+
+static void ValidateMetadata(const BamFile& file, std::unique_ptr<ValidationErrors>& errors)
+{
+    // filename
+    const std::string fn{file.Filename()};
+    if (fn == "-") {
+        errors->AddFileError(fn,
+                             "validation not is available for streamed BAM. Please "
+                             "write to a file and run validation on it.");
+        errors->ThrowErrors();  // quit early
+    }
+    if (boost::algorithm::ends_with(fn, ".bam") || boost::algorithm::ends_with(fn, ".bam.tmp")) {
+        errors->AddFileError(fn, "non-standard file extension");
+    }
+
+    // EOF
+    if (!file.HasEOF()) errors->AddFileError(fn, "missing end-of-file marker");
+
+    // has PBI
+    if (!file.PacBioIndexExists()) errors->AddFileError(fn, "missing PBI file");
+
+    // header
+    ValidateHeader(file.Header(), file.Filename(), errors);
+}
+
+/// Checks applied to mapped records only: both the reference start and the
+/// reference ID must be non-negative.
+void ValidateMappedRecord(const BamRecord& b, std::unique_ptr<ValidationErrors>& errors)
+{
+    const std::string recordName{b.FullName()};
+
+    const bool positionIsValid = (b.ReferenceStart() >= 0);
+    if (!positionIsValid) {
+        errors->AddRecordError(recordName, "mapped record position is invalid");
+    }
+
+    const bool referenceIdIsValid = (b.ReferenceId() >= 0);
+    if (!referenceIdIsValid) {
+        errors->AddRecordError(recordName, "mapped record reference ID is invalid");
+    }
+
+    // what else??
+}
+
+/// For records that are neither CCS nor transcript, requires queryStart to be
+/// strictly less than queryEnd.
+void ValidateRecordCore(const BamRecord& b, std::unique_ptr<ValidationErrors>& errors)
+{
+    // CCS/transcript records are exempt from the query-range check
+    if (IsCcsOrTranscript(b.Type())) return;
+
+    const auto queryStart = b.QueryStart();
+    const auto queryEnd = b.QueryEnd();
+    const bool rangeIsValid = (queryStart < queryEnd);
+    if (!rangeIsValid) {
+        errors->AddRecordError(b.FullName(), "queryStart (qs) should be < queryEnd (qe)");
+    }
+}
+
+/// Verifies that the record's read group can be fetched; an exception thrown
+/// by the lookup is recorded as a record error.
+void ValidateRecordReadGroup(const BamRecord& b, std::unique_ptr<ValidationErrors>& errors)
+{
+    try {
+        // Only the success of the lookup matters; the value itself is unused.
+        auto readGroup = b.ReadGroup();
+        boost::ignore_unused(readGroup);
+    } catch (const std::exception& e) {
+        errors->AddRecordError(b.FullName(), e.what());
+    }
+}
+
+void ValidateRecordRequiredTags(const BamRecord& b, std::unique_ptr<ValidationErrors>& errors)
+{
+    const auto name = b.FullName();
+    const auto isCcsOrTranscript = IsCcsOrTranscript(b.Type());
+    if (!isCcsOrTranscript) {
+        // qe/qs
+        const bool hasQueryStart = b.HasQueryStart();
+        const bool hasQueryEnd = b.HasQueryEnd();
+        if (hasQueryStart && hasQueryEnd) {
+            const auto qStart = b.QueryStart();
+            const auto qEnd = b.QueryEnd();
+            if (qStart >= qEnd)
+                errors->AddRecordError(name, "queryStart (qs) should be < queryEnd (qe)");
+        } else {
+            if (!hasQueryStart) errors->AddRecordError(name, "missing tag: qs (queryStart)");
+            if (!hasQueryEnd) errors->AddRecordError(name, "missing tag: qe (queryEnd)");
+        }
+    }
+
+    // zm
+    if (!b.HasHoleNumber()) errors->AddRecordError(name, "missing tag: zm (ZMW hole number)");
+
+    // np
+    if (!b.HasNumPasses())
+        errors->AddRecordError(name, "missing tag: np (num passes)");
+    else {
+        const auto numPasses = b.NumPasses();
+        if (!isCcsOrTranscript && numPasses != 1)
+            errors->AddRecordError(name, "np (numPasses) tag for non-CCS records should be 1");
+    }
+
+    // rq
+    if (!b.HasReadAccuracy()) errors->AddRecordError(name, "missing tag: rq (read accuracy)");
+
+    // sn
+    if (!b.HasSignalToNoise())
+        errors->AddRecordError(name, "missing tag: sn (signal-to-noise ratio)");
+}
+
+void ValidateRecordTagLengths(const BamRecord& b, std::unique_ptr<ValidationErrors>& errors)
+{
+    const auto name = b.FullName();
+    const size_t expectedLength =
+        (IsCcsOrTranscript(b.Type()) ? b.Sequence().size() : (b.QueryEnd() - b.QueryStart()));
+
+    // check "per-base"-type data lengths are compatible
+    if (b.Sequence().size() != expectedLength)
+        errors->AddRecordError(name, "sequence length does not match expected length");
+
+    if (b.HasDeletionQV()) {
+        if (b.DeletionQV().size() != expectedLength)
+            errors->AddTagLengthError(name, "DeletionQV", "dq", b.DeletionQV().size(),
+                                      expectedLength);
+    }
+    if (b.HasDeletionTag()) {
+        if (b.DeletionTag().size() != expectedLength)
+            errors->AddTagLengthError(name, "DeletionTag", "dt", b.DeletionTag().size(),
+                                      expectedLength);
+    }
+    if (b.HasInsertionQV()) {
+        if (b.InsertionQV().size() != expectedLength)
+            errors->AddTagLengthError(name, "InsertionQV", "iq", b.InsertionQV().size(),
+                                      expectedLength);
+    }
+    if (b.HasMergeQV()) {
+        if (b.MergeQV().size() != expectedLength)
+            errors->AddTagLengthError(name, "MergeQV", "mq", b.MergeQV().size(), expectedLength);
+    }
+    if (b.HasSubstitutionQV()) {
+        if (b.SubstitutionQV().size() != expectedLength)
+            errors->AddTagLengthError(name, "SubstitutionQV", "sq", b.SubstitutionQV().size(),
+                                      expectedLength);
+    }
+    if (b.HasSubstitutionTag()) {
+        if (b.SubstitutionTag().size() != expectedLength)
+            errors->AddTagLengthError(name, "SubstitutionTag", "st", b.SubstitutionTag().size(),
+                                      expectedLength);
+    }
+    if (b.HasIPD()) {
+        if (b.IPD().size() != expectedLength)
+            errors->AddTagLengthError(name, "IPD", "ip", b.IPD().size(), expectedLength);
+    }
+
+    // NOTE: disabling "internal" tag checks for now, only checking "standard"
+    //       PacBioBAM tags
+
+    //    if (b.HasAltLabelQV()) {
+    //        if (b.AltLabelQV().size() != expectedLength)
+    //            errors->AddTagLengthError(name, "AltLabelQV", "pv", b.AltLabelQV().size(), expectedLength);
+    //    }
+    //    if (b.HasAltLabelTag()) {
+    //        if (b.AltLabelTag().size() != expectedLength)
+    //            errors->AddTagLengthError(name, "AltLabelTag", "pt", b.AltLabelTag().size(), expectedLength);
+    //    }
+    //    if (b.HasLabelQV()) {
+    //        if (b.LabelQV().size() != expectedLength)
+    //            errors->AddTagLengthError(name, "LabelQV", "pq", b.LabelQV().size(), expectedLength);
+    //    }
+    //    if (b.HasPkmean()) {
+    //        if (b.Pkmean().size() != expectedLength)
+    //            errors->AddTagLengthError(name, "Pkmean", "pa", b.Pkmean().size(), expectedLength);
+    //    }
+    //    if (b.HasPkmean2()) {
+    //        if (b.Pkmean2().size() != expectedLength)
+    //            errors->AddTagLengthError(name, "Pkmean2", "ps", b.Pkmean2().size(), expectedLength);
+    //    }
+    //    if (b.HasPkmid()) {
+    //        if (b.Pkmid().size() != expectedLength)
+    //            errors->AddTagLengthError(name, "Pkmid", "pm", b.Pkmid().size(), expectedLength);
+    //    }
+    //    if (b.HasPkmid2()) {
+    //        if (b.Pkmid2().size() != expectedLength)
+    //            errors->AddTagLengthError(name, "Pkmid2", "pi", b.Pkmid2().size(), expectedLength);
+    //    }
+    //    if (b.HasPrePulseFrames()) {
+    //        if (b.PrePulseFrames().size() != expectedLength)
+    //            errors->AddTagLengthError(name, "PrePulseFrames", "pd", b.PrePulseFrames().size(), expectedLength);
+    //    }
+    //    if (b.HasPulseCall()) {
+    //        if (b.PulseCall().size() != expectedLength)
+    //            errors->AddTagLengthError(name, "PulseCall", "pc", b.PulseCall().size(), expectedLength);
+    //    }
+    //    if (b.HasPulseCallWidth()) {
+    //        if (b.PulseCallWidth().size() != expectedLength)
+    //            errors->AddTagLengthError(name, "PulseCallWidth", "px", b.PulseCallWidth().size(), expectedLength);
+    //    }
+    //    if (b.HasPulseMergeQV()) {
+    //        if (b.PulseMergeQV().size() != expectedLength)
+    //            errors->AddTagLengthError(name, "PulseMergeQV", "pg", b.PulseMergeQV().size(), expectedLength);
+    //    }
+    //    if (b.HasPulseWidth()) {
+    //        if (b.PulseWidth().size() != expectedLength)
+    //            errors->AddTagLengthError(name, "PulseWidth", "pw", b.PulseWidth().size(), expectedLength);
+    //    }
+}
+
+/// Checks applied to unmapped records only: reference start and reference ID
+/// must both be -1.
+void ValidateUnmappedRecord(const BamRecord& b, std::unique_ptr<ValidationErrors>& errors)
+{
+    const std::string recordName{b.FullName()};
+
+    const bool hasPosition = (b.ReferenceStart() != -1);
+    if (hasPosition) {
+        errors->AddRecordError(recordName, "unmapped record has a position");
+    }
+
+    const bool hasReferenceId = (b.ReferenceId() != -1);
+    if (hasReferenceId) {
+        errors->AddRecordError(recordName, "unmapped record has a reference ID");
+    }
+}
+
+/// Runs every per-record check on \p b: core fields, read group, required
+/// tags, tag lengths, and finally the mapped- or unmapped-specific checks.
+static void ValidateRecord(const BamRecord& b, std::unique_ptr<ValidationErrors>& errors)
+{
+    // checks common to all records
+    ValidateRecordCore(b, errors);
+    ValidateRecordReadGroup(b, errors);
+    ValidateRecordRequiredTags(b, errors);
+    ValidateRecordTagLengths(b, errors);
+
+    // checks that depend on the mapping state
+    if (b.IsMapped()) {
+        ValidateMappedRecord(b, errors);
+    } else {
+        ValidateUnmappedRecord(b, errors);
+    }
+}
+
+}  // namespace internal
+
+using internal::ValidationErrors;
+
+void Validator::Validate(const BamHeader& header, const size_t maxErrors)
+{
+    auto errors = std::make_unique<ValidationErrors>(maxErrors);
+    internal::ValidateHeader(header, "unknown", errors);
+    if (!errors->IsEmpty()) errors->ThrowErrors();
+}
+
+void Validator::Validate(const ReadGroupInfo& rg, const size_t maxErrors)
+{
+    auto errors = std::make_unique<ValidationErrors>(maxErrors);
+    internal::ValidateReadGroup(rg, errors);
+    if (!errors->IsEmpty()) errors->ThrowErrors();
+}
+
+void Validator::Validate(const BamRecord& b, const size_t maxErrors)
+{
+    auto errors = std::make_unique<ValidationErrors>(maxErrors);
+    internal::ValidateRecord(b, errors);
+    if (!errors->IsEmpty()) errors->ThrowErrors();
+}
+
+/// Validates the file's metadata and then every record returned by an
+/// EntireFileQuery over it. Calls ThrowErrors() if anything was found.
+void Validator::ValidateEntireFile(const BamFile& file, const size_t maxErrors)
+{
+    auto collected = std::make_unique<ValidationErrors>(maxErrors);
+
+    // file-level checks first
+    internal::ValidateMetadata(file, collected);
+
+    // then each record in turn
+    EntireFileQuery allRecords(file);
+    for (const BamRecord& current : allRecords) {
+        internal::ValidateRecord(current, collected);
+    }
+
+    if (!collected->IsEmpty()) {
+        collected->ThrowErrors();
+    }
+}
+
+/// Validates only the file's metadata (no records are read here).
+/// Calls ThrowErrors() if anything was found.
+void Validator::ValidateFileMetadata(const BamFile& file, const size_t maxErrors)
+{
+    auto collected = std::make_unique<ValidationErrors>(maxErrors);
+    internal::ValidateMetadata(file, collected);
+    if (!collected->IsEmpty()) {
+        collected->ThrowErrors();
+    }
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/Version.cpp b/src/Version.cpp

new file mode 100644 (file)

index 0000000..ebd5585
--- /dev/null
+++ b/src/Version.cpp
@@ -0,0 +1,53 @@
+// File Description
+/// \file Version.cpp
+/// \brief Implements the Version class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "Version.h"
+
+#include <sstream>
+#include <stdexcept>
+
+#include "SequenceUtils.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Definitions of the version constants declared in the Version class.
+const Version Version::Current{3, 0, 5};
+const Version Version::Minimum{3, 0, 1};
+
+// string must be "<major>.<minor>.<version>"
+Version::Version(const std::string& v) : major_{0}, minor_{0}, revision_{0}
+{
+    // parse string
+    try {
+        const auto fields = internal::Split(v, '.');
+        const auto numFields = fields.size();
+        if (numFields == 0) throw std::runtime_error{"invalid version number - empty string"};
+        major_ = std::stoi(fields.at(0));
+        if (numFields > 1) {
+            minor_ = std::stoi(fields.at(1));
+            if (numFields > 2) revision_ = std::stoi(fields.at(2));
+        }
+    } catch (std::exception&) {
+        throw std::runtime_error{"invalid version number (" + v + "): failed to parse"};
+    }
+
+    // ensure valid numbers
+    Check();
+}
+
+/// \returns the version formatted as "<major>.<minor>.<revision>".
+std::string Version::ToString() const
+{
+    std::ostringstream formatted;
+    formatted << major_;
+    formatted << '.';
+    formatted << minor_;
+    formatted << '.';
+    formatted << revision_;
+    return formatted.str();
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/Version.h b/src/Version.h

new file mode 100644 (file)

index 0000000..7920ba6
--- /dev/null
+++ b/src/Version.h
@@ -0,0 +1,157 @@
+// File Description
+/// \file Version.h
+/// \brief Defines the Version class.
+//
+// Author: Derek Barnett
+
+#ifndef PACBIOBAM_VERSION_H
+#define PACBIOBAM_VERSION_H
+
+#include <ostream>
+#include <stdexcept>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+class Version
+{
+public:
+    static const Version Current;
+    static const Version Minimum;
+
+public:
+    constexpr Version();
+
+    Version(int major, int minor, int revision);
+
+    // string must be "<major>.<minor>.<version>"
+    Version(const std::string& v);
+
+    Version(const Version&) = default;
+    Version(Version&&) = default;
+    Version& operator=(const Version&) = default;
+    Version& operator=(Version&&) = default;
+    ~Version() = default;
+
+public:
+    bool operator==(const Version& other) const;
+    bool operator!=(const Version& other) const;
+    bool operator<(const Version& other) const;
+    bool operator<=(const Version& other) const;
+    bool operator>(const Version& other) const;
+    bool operator>=(const Version& other) const;
+
+public:
+    std::string ToString() const;
+    operator std::string() const;
+
+public:
+    int Major() const;
+    int Minor() const;
+    int Revision() const;
+
+public:
+    Version& Major(int major);
+    Version& Minor(int minor);
+    Version& Revision(int revision);
+
+private:
+    int major_;
+    int minor_;
+    int revision_;
+
+private:
+    void Check() const;
+};
+
+inline std::ostream& operator<<(std::ostream& out, const Version& version)
+{
+    out << version.ToString();
+    return out;
+}
+
+inline constexpr Version::Version() : major_{0}, minor_{0}, revision_{0} {}
+
+inline Version::Version(int major, int minor, int revision)
+    : major_{major}, minor_{minor}, revision_{revision}
+{
+    Check();
+}
+
+inline bool Version::operator==(const Version& other) const
+{
+    return major_ == other.major_ && minor_ == other.minor_ && revision_ == other.revision_;
+}
+
+inline bool Version::operator!=(const Version& other) const { return !(*this == other); }
+
+inline bool Version::operator<(const Version& other) const
+{
+    // 2.* < 3.*
+    if (major_ < other.major_) return true;
+
+    // 3. ==  3.
+    else if (major_ == other.major_) {
+
+        // 3.1.* < 3.2.*
+        if (minor_ < other.minor_) return true;
+
+        // 3.2. == 3.2.
+        else if (minor_ == other.minor_) {
+
+            // 3.2.1 < 3.2.2
+            if (revision_ < other.revision_) return true;
+        }
+    }
+
+    // otherwise not less-than
+    return false;
+}
+inline bool Version::operator<=(const Version& other) const { return !(*this > other); }
+
+inline bool Version::operator>(const Version& other) const { return other < *this; }
+
+inline bool Version::operator>=(const Version& other) const { return !(*this < other); }
+
+inline Version::operator std::string() const { return ToString(); }
+
+inline void Version::Check() const
+{
+    if (major_ < 0 || minor_ < 0 || revision_ < 0)
+        throw std::runtime_error{"version cannot contain negative numbers"};
+}
+
+inline int Version::Major() const { return major_; }
+
+inline Version& Version::Major(int major)
+{
+    major_ = major;
+    Check();
+    return *this;
+}
+
+inline int Version::Minor() const { return minor_; }
+
+inline Version& Version::Minor(int minor)
+{
+    minor_ = minor;
+    Check();
+    return *this;
+}
+
+inline int Version::Revision() const { return revision_; }
+
+inline Version& Version::Revision(int revision)
+{
+    revision_ = revision;
+    Check();
+    return *this;
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // PACBIOBAM_VERSION_H
diff --git a/src/VirtualRegionTypeMap.cpp b/src/VirtualRegionTypeMap.cpp

new file mode 100644 (file)

index 0000000..99c99db
--- /dev/null
+++ b/src/VirtualRegionTypeMap.cpp
@@ -0,0 +1,22 @@
+// File Description
+/// \file VirtualRegionTypeMap.cpp
+/// \brief Implements the VirtualRegionTypeMap class.
+//
+// Author: Armin Töpfer
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/virtual/VirtualRegionTypeMap.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Lookup table from a single-character code to its VirtualRegionType.
+std::map<char, VirtualRegionType> VirtualRegionTypeMap::ParseChar = {
+    {'A', VirtualRegionType::ADAPTER},
+    {'B', VirtualRegionType::BARCODE},
+    {'H', VirtualRegionType::HQREGION},
+    {'F', VirtualRegionType::FILTERED},
+    {'L', VirtualRegionType::LQREGION},
+};
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/VirtualZmwBamRecord.cpp b/src/VirtualZmwBamRecord.cpp

new file mode 100644 (file)

index 0000000..ee73396
--- /dev/null
+++ b/src/VirtualZmwBamRecord.cpp
@@ -0,0 +1,276 @@
+// File Description
+/// \file VirtualZmwBamRecord.cpp
+/// \brief Implements the VirtualZmwBamRecord class.
+//
+// Author: Armin Töpfer
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/virtual/VirtualZmwBamRecord.h"
+
+#include <cstdint>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include "pbbam/MoveAppend.h"
+#include "pbbam/virtual/VirtualRegionType.h"
+#include "pbbam/virtual/VirtualRegionTypeMap.h"
+
+namespace PacBio {
+namespace BAM {
+
+VirtualZmwBamRecord::VirtualZmwBamRecord(std::vector<BamRecord> unorderedSources,
+                                         const BamHeader& header)
+    : BamRecord{header}, sources_{std::move(unorderedSources)}
+{
+    // Sort sources by queryStart
+    std::sort(sources_.begin(), sources_.end(), [](const BamRecord& l1, const BamRecord& l2) {
+        return l1.QueryStart() < l2.QueryStart();
+    });
+    StitchSources();
+}
+
+/// \returns true if virtualRegionsMap_ has an entry for \p regionType.
+bool VirtualZmwBamRecord::HasVirtualRegionType(const VirtualRegionType regionType) const
+{
+    const auto entry = virtualRegionsMap_.find(regionType);
+    const bool present = (entry != virtualRegionsMap_.end());
+    return present;
+}
+
+/// Fetches the raw IPD values for \p orientation, converts each value to
+/// uint8_t and passes the result through Frames::Decode.
+Frames VirtualZmwBamRecord::IPDV1Frames(Orientation orientation) const
+{
+    const auto rawFrames = this->IPDRaw(orientation);
+    const auto& rawValues = rawFrames.Data();
+
+    // element-wise conversion to the 8-bit codes that Decode() takes
+    const std::vector<uint8_t> codes(rawValues.begin(), rawValues.end());
+    return Frames::Decode(codes);
+}
+
+void VirtualZmwBamRecord::StitchSources()
+{
+    const auto& firstRecord = sources_[0];
+    const auto& lastRecord = sources_[sources_.size() - 1];
+
+    std::string sequence;
+    std::string deletionTag;
+    std::string substitutionTag;
+    std::string alternativeLabelTag;
+    std::string pulseCall;
+
+    QualityValues qualities;
+    QualityValues deletionQv;
+    QualityValues insertionQv;
+    QualityValues mergeQv;
+    QualityValues pulseMergeQv;
+    QualityValues substitutionQv;
+    QualityValues labelQv;
+    QualityValues alternativeLabelQv;
+
+    Frames ipd;
+    Frames pw;
+    Frames pd;
+    Frames px;
+    std::vector<float> pa;
+    std::vector<float> pm;
+    std::vector<uint32_t> sf;
+    std::vector<PacBio::BAM::PulseExclusionReason> pe;
+
+    // initialize capacity
+    const auto stitchedSize = lastRecord.QueryEnd() - firstRecord.QueryStart();
+    sequence.reserve(stitchedSize);
+    deletionTag.reserve(stitchedSize);
+    substitutionTag.reserve(stitchedSize);
+    alternativeLabelTag.reserve(stitchedSize);
+    pulseCall.reserve(stitchedSize);
+    qualities.reserve(stitchedSize);
+    deletionQv.reserve(stitchedSize);
+    insertionQv.reserve(stitchedSize);
+    mergeQv.reserve(stitchedSize);
+    pulseMergeQv.reserve(stitchedSize);
+    substitutionQv.reserve(stitchedSize);
+    labelQv.reserve(stitchedSize);
+    alternativeLabelQv.reserve(stitchedSize);
+    ipd.DataRaw().reserve(stitchedSize);
+    pw.DataRaw().reserve(stitchedSize);
+    pd.DataRaw().reserve(stitchedSize);
+    px.DataRaw().reserve(stitchedSize);
+    pa.reserve(stitchedSize);
+    pm.reserve(stitchedSize);
+    sf.reserve(stitchedSize);
+    pe.reserve(stitchedSize);
+
+    // Stitch using tmp vars
+    for (auto& b : sources_) {
+        sequence.append(b.Sequence());
+
+        MoveAppend(b.Qualities(), qualities);
+
+        if (b.HasDeletionQV()) MoveAppend(std::move(b.DeletionQV()), deletionQv);
+
+        if (b.HasInsertionQV()) MoveAppend(std::move(b.InsertionQV()), insertionQv);
+
+        if (b.HasMergeQV()) MoveAppend(std::move(b.MergeQV()), mergeQv);
+
+        if (b.HasPulseMergeQV()) MoveAppend(std::move(b.PulseMergeQV()), pulseMergeQv);
+
+        if (b.HasSubstitutionQV()) MoveAppend(std::move(b.SubstitutionQV()), substitutionQv);
+
+        if (b.HasLabelQV()) MoveAppend(std::move(b.LabelQV()), labelQv);
+
+        if (b.HasAltLabelQV()) MoveAppend(std::move(b.AltLabelQV()), alternativeLabelQv);
+
+        if (b.HasDeletionTag()) deletionTag.append(std::move(b.DeletionTag()));
+
+        if (b.HasSubstitutionTag()) substitutionTag.append(std::move(b.SubstitutionTag()));
+
+        if (b.HasAltLabelTag()) alternativeLabelTag.append(std::move(b.AltLabelTag()));
+
+        if (b.HasPulseCall()) pulseCall.append(std::move(b.PulseCall()));
+
+        if (b.HasIPD()) MoveAppend(b.IPDRaw().DataRaw(), ipd.DataRaw());
+
+        if (b.HasPulseWidth()) MoveAppend(b.PulseWidthRaw().DataRaw(), pw.DataRaw());
+
+        if (b.HasPulseCallWidth()) MoveAppend(b.PulseCallWidth().DataRaw(), px.DataRaw());
+
+        if (b.HasPrePulseFrames()) MoveAppend(b.PrePulseFrames().DataRaw(), pd.DataRaw());
+
+        if (b.HasPkmid()) MoveAppend(b.Pkmid(), pm);
+
+        if (b.HasPkmean()) MoveAppend(b.Pkmean(), pa);
+
+        if (b.HasPkmid2()) MoveAppend(b.Pkmid2(), pm);
+
+        if (b.HasPkmean2()) MoveAppend(b.Pkmean2(), pa);
+
+        if (b.HasPulseExclusion()) MoveAppend(b.PulseExclusionReason(), pe);
+
+        if (b.HasStartFrame()) MoveAppend(b.StartFrame(), sf);
+
+        if (b.HasScrapRegionType()) {
+            const VirtualRegionType regionType = b.ScrapRegionType();
+
+            if (!HasVirtualRegionType(regionType))
+                virtualRegionsMap_[regionType] = std::vector<VirtualRegion>{};
+
+            virtualRegionsMap_[regionType].emplace_back(regionType, b.QueryStart(), b.QueryEnd());
+        }
+
+        if (b.HasLocalContextFlags()) {
+            std::pair<int, int> barcodes{-1, -1};
+            if (b.HasBarcodes()) barcodes = b.Barcodes();
+
+            static constexpr const auto regionType = VirtualRegionType::SUBREAD;
+            if (!HasVirtualRegionType(regionType))
+                virtualRegionsMap_[regionType] = std::vector<VirtualRegion>{};
+
+            virtualRegionsMap_[regionType].emplace_back(regionType, b.QueryStart(), b.QueryEnd(),
+                                                        b.LocalContextFlags(), barcodes.first,
+                                                        barcodes.second);
+        }
+
+        if (b.HasBarcodes() && !this->HasBarcodes()) this->Barcodes(b.Barcodes());
+
+        if (b.HasBarcodeQuality() && !this->HasBarcodeQuality())
+            this->BarcodeQuality(b.BarcodeQuality());
+
+        if (b.HasReadAccuracy() && !this->HasReadAccuracy()) this->ReadAccuracy(b.ReadAccuracy());
+
+        if (b.HasScrapZmwType()) {
+            if (!this->HasScrapZmwType())
+                this->ScrapZmwType(b.ScrapZmwType());
+            else if (this->ScrapZmwType() != b.ScrapZmwType())
+                throw std::runtime_error{"ScrapZmwTypes do not match"};
+        }
+    }
+
+    // ReadGroup
+    this->ReadGroup(this->header_.ReadGroups()[0]);
+
+    this->NumPasses(1);
+
+    // All records should contain the same SNR and hole number
+    if (firstRecord.HasSignalToNoise()) this->SignalToNoise(firstRecord.SignalToNoise());
+    this->HoleNumber(firstRecord.HoleNumber());
+
+    // QueryStart
+    this->QueryStart(firstRecord.QueryStart());
+    this->QueryEnd(lastRecord.QueryEnd());
+    this->UpdateName();
+
+    const std::string qualitiesStr = qualities.Fastq();
+    if (sequence.size() == qualitiesStr.size())
+        this->Impl().SetSequenceAndQualities(sequence, qualitiesStr);
+    else
+        this->Impl().SetSequenceAndQualities(sequence);
+
+    // Tags as strings
+    if (!deletionTag.empty()) this->DeletionTag(deletionTag);
+    if (!substitutionTag.empty()) this->SubstitutionTag(substitutionTag);
+    if (!alternativeLabelTag.empty()) this->AltLabelTag(alternativeLabelTag);
+    if (!pulseCall.empty()) this->PulseCall(pulseCall);
+
+    // QVs
+    if (!deletionQv.empty()) this->DeletionQV(deletionQv);
+    if (!insertionQv.empty()) this->InsertionQV(insertionQv);
+    if (!mergeQv.empty()) this->MergeQV(mergeQv);
+    if (!pulseMergeQv.empty()) this->PulseMergeQV(pulseMergeQv);
+    if (!substitutionQv.empty()) this->SubstitutionQV(substitutionQv);
+    if (!labelQv.empty()) this->LabelQV(labelQv);
+    if (!alternativeLabelQv.empty()) this->AltLabelQV(alternativeLabelQv);
+
+    // PulseExclusionReason
+    if (!pe.empty()) this->PulseExclusionReason(pe);
+
+    // 16 bit arrays
+    if (!ipd.Data().empty()) this->IPD(ipd, FrameEncodingType::LOSSLESS);
+    if (!pw.Data().empty()) this->PulseWidth(pw, FrameEncodingType::LOSSLESS);
+    if (!pa.empty()) this->Pkmean(pa);
+    if (!pm.empty()) this->Pkmid(pm);
+    if (!pd.Data().empty()) this->PrePulseFrames(pd, FrameEncodingType::LOSSLESS);
+    if (!px.Data().empty()) this->PulseCallWidth(px, FrameEncodingType::LOSSLESS);
+
+    // 32 bit arrays
+    if (!sf.empty()) this->StartFrame(sf);
+
+    // Determine HQREGION bases on LQREGIONS
+    if (HasVirtualRegionType(VirtualRegionType::LQREGION)) {
+        if (virtualRegionsMap_[VirtualRegionType::LQREGION].size() == 1) {
+            const auto lq = virtualRegionsMap_[VirtualRegionType::LQREGION][0];
+            if (lq.beginPos == 0)
+                virtualRegionsMap_[VirtualRegionType::HQREGION].emplace_back(
+                    VirtualRegionType::HQREGION, lq.endPos, sequence.size());
+            else if (lq.endPos == static_cast<int>(sequence.size()))
+                virtualRegionsMap_[VirtualRegionType::HQREGION].emplace_back(
+                    VirtualRegionType::HQREGION, 0, lq.beginPos);
+            else
+                throw std::runtime_error{"Unknown HQREGION"};
+        } else {
+            int beginPos = 0;
+            for (const auto& lqregion : virtualRegionsMap_[VirtualRegionType::LQREGION]) {
+                if (lqregion.beginPos - beginPos > 0)
+                    virtualRegionsMap_[VirtualRegionType::HQREGION].emplace_back(
+                        VirtualRegionType::HQREGION, beginPos, lqregion.beginPos);
+                beginPos = lqregion.endPos;
+            }
+        }
+    } else {
+        virtualRegionsMap_[VirtualRegionType::HQREGION].emplace_back(VirtualRegionType::HQREGION, 0,
+                                                                     sequence.size());
+    }
+}
+
+/// \returns a copy of the complete region map (region type -> regions).
+auto VirtualZmwBamRecord::VirtualRegionsMap() const
+    -> std::map<VirtualRegionType, std::vector<VirtualRegion>>
+{
+    return virtualRegionsMap_;
+}
+
+/// \returns a copy of the regions stored for \p regionType, or an empty
+///          vector when the map has no entry for it.
+std::vector<VirtualRegion> VirtualZmwBamRecord::VirtualRegionsTable(
+    const VirtualRegionType regionType) const
+{
+    const auto found = virtualRegionsMap_.find(regionType);
+    if (found == virtualRegionsMap_.cend()) {
+        return {};
+    }
+    return found->second;
+}
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/VirtualZmwCompositeReader.cpp b/src/VirtualZmwCompositeReader.cpp

new file mode 100644 (file)

index 0000000..a34c5d4
--- /dev/null
+++ b/src/VirtualZmwCompositeReader.cpp
@@ -0,0 +1,109 @@
+// File Description
+/// \file VirtualZmwCompositeReader.cpp
+/// \brief Implements the VirtualZmwCompositeReader class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "VirtualZmwCompositeReader.h"
+
+#include <boost/algorithm/string.hpp>
+
+#include "pbbam/MakeUnique.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Builds the queue of (primary BAM, scraps BAM) file pairs found in the
+// dataset's external resources, then opens the first pair that has data.
+VirtualZmwCompositeReader::VirtualZmwCompositeReader(const DataSet& dataset)
+    : currentReader_(nullptr), filter_(PbiFilter::FromDataSet(dataset))
+{
+    for (const ExternalResource& resource : dataset.ExternalResources()) {
+
+        // only subread / hqregion BAMs can act as the "primary" file
+        const auto& metatype = resource.MetaType();
+        const bool isPrimary = (metatype == "PacBio.SubreadFile.SubreadBamFile") ||
+                               (metatype == "PacBio.SubreadFile.HqRegionBamFile");
+        if (!isPrimary) continue;
+
+        // resolve the path, which may have been given as a relative one
+        const std::string primaryFn = dataset.ResolvePath(resource.ResourceId());
+
+        // the first child resource that is a scraps BAM (if any) is its partner
+        std::string scrapsFn;
+        for (const auto& childResource : resource.ExternalResources()) {
+            const auto& childMetatype = childResource.MetaType();
+            const bool isScraps = (childMetatype == "PacBio.SubreadFile.ScrapsBamFile") ||
+                                  (childMetatype == "PacBio.SubreadFile.HqScrapsBamFile");
+            if (isScraps) {
+                scrapsFn = dataset.ResolvePath(childResource.ResourceId());
+                break;
+            }
+        }
+
+        // a source is only usable when both filenames are known
+        if (primaryFn.empty() || scrapsFn.empty()) continue;
+        sources_.emplace_back(std::make_pair(primaryFn, scrapsFn));
+    }
+
+    // open first available source
+    OpenNextReader();
+}
+
+// True only while a reader is open and that reader still reports more data.
+bool VirtualZmwCompositeReader::HasNext() { return (currentReader_ && currentReader_->HasNext()); }
+
+// Returns the next stitched record from the currently open reader. Once that
+// reader is drained, the next queued source is opened.
+//
+// Throws std::runtime_error if no reader is active; callers are expected to
+// check HasNext() before calling.
+VirtualZmwBamRecord VirtualZmwCompositeReader::Next()
+{
+    if (currentReader_) {
+        // Intentionally not const: a const local cannot be implicitly moved on
+        // return, which would force a full copy of the stitched record.
+        auto result = currentReader_->Next();
+        if (!currentReader_->HasNext()) OpenNextReader();
+        return result;
+    }
+
+    // no reader active
+    throw std::runtime_error{
+        "no readers active, make sure you use "
+        "VirtualZmwCompositeReader::HasNext before "
+        "requesting next record"};
+}
+
+// Returns the next group of raw (unstitched) records from the currently open
+// reader. Once that reader is drained, the next queued source is opened.
+//
+// Throws std::runtime_error if no reader is active; callers are expected to
+// check HasNext() before calling.
+std::vector<BamRecord> VirtualZmwCompositeReader::NextRaw()
+{
+    if (currentReader_) {
+        // Intentionally not const: a const local cannot be implicitly moved on
+        // return, which would copy every record in the vector.
+        auto result = currentReader_->NextRaw();
+        if (!currentReader_->HasNext()) OpenNextReader();
+        return result;
+    }
+
+    // no reader active
+    throw std::runtime_error{
+        "no readers active, make sure you use "
+        "VirtualZmwCompositeReader::HasNext before "
+        "requesting next group of records"};
+}
+
+// Closes the current reader and opens the next queued source that reports
+// data. On return, currentReader_ is either a reader whose HasNext() is true,
+// or null when no remaining source had data.
+void VirtualZmwCompositeReader::OpenNextReader()
+{
+    currentReader_.reset(nullptr);
+
+    // find next source pair with data
+    while (!sources_.empty()) {
+        // the source is dequeued before it is opened
+        const auto nextSource = sources_.front();
+        sources_.pop_front();
+
+        currentReader_ =
+            std::make_unique<VirtualZmwReader>(nextSource.first, nextSource.second, filter_);
+        if (currentReader_->HasNext()) return;
+    }
+
+    // Every remaining source was empty. Drop the last (exhausted) reader so that
+    // Next()/NextRaw() reach their "no readers active" error instead of reading
+    // from a reader that has nothing left.
+    currentReader_.reset(nullptr);
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/VirtualZmwCompositeReader.h b/src/VirtualZmwCompositeReader.h

new file mode 100644 (file)

index 0000000..3ab3d65
--- /dev/null
+++ b/src/VirtualZmwCompositeReader.h
@@ -0,0 +1,79 @@
+// File Description
+/// \file VirtualZmwCompositeReader.h
+/// \brief Defines the VirtualZmwCompositeReader class.
+//
+// Author: Derek Barnett
+
+#ifndef VIRTUALZMWCOMPOSITEREADER_H
+#define VIRTUALZMWCOMPOSITEREADER_H
+
+#include <deque>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "VirtualZmwReader.h"
+#include "pbbam/DataSet.h"
+#include "pbbam/PbiFilter.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// \brief The VirtualZmwCompositeReader provides an interface for
+///        re-stitching "virtual" polymerase reads from their constituent parts,
+///        across multiple %BAM resources from a DataSet.
+///
+/// This class is essentially a DataSet-aware wrapper around
+/// VirtualZmwReader, enabling multiple resources as input. See that
+/// class's documentation for more info.
+///
+class PBBAM_EXPORT VirtualZmwCompositeReader
+{
+public:
+    /// \name Constructors & Related Methods
+    /// \{
+
+    /// \brief Queues every primary %BAM resource of \p dataset that has an
+    ///        associated scraps %BAM, then opens the first queued pair that
+    ///        has data. The dataset's filters are applied to each reader.
+    ///
+    /// \param[in] dataset  source of the external resources and filters
+    ///
+    VirtualZmwCompositeReader(const DataSet& dataset);
+
+    VirtualZmwCompositeReader() = delete;
+    VirtualZmwCompositeReader(const VirtualZmwCompositeReader&) = delete;
+    VirtualZmwCompositeReader(VirtualZmwCompositeReader&&) = delete;
+    VirtualZmwCompositeReader& operator=(const VirtualZmwCompositeReader&) = delete;
+    VirtualZmwCompositeReader& operator=(VirtualZmwCompositeReader&&) = delete;
+    ~VirtualZmwCompositeReader() = default;
+
+    /// \}
+
+public:
+    /// \name Stitched Record Reading
+    /// \{
+
+    /// \returns true if more ZMWs/files are available for reading.
+    bool HasNext();
+
+    /// \returns the next stitched polymerase read
+    /// \throws std::runtime_error if no reader is active
+    VirtualZmwBamRecord Next();
+
+    /// \returns the next set of reads that belong to one ZMW from one %BAM
+    ///          resource (a primary %BAM and/or its scraps file). This enables
+    ///          stitching records in a distinct thread.
+    /// \throws std::runtime_error if no reader is active
+    ///
+    std::vector<BamRecord> NextRaw();
+
+    /// \}
+
+private:
+    // (primary BAM, scraps BAM) filename pairs that have not been opened yet
+    std::deque<std::pair<std::string, std::string> > sources_;
+    // reader for the pair currently being consumed; null when none is open
+    std::unique_ptr<VirtualZmwReader> currentReader_;
+    // filter built from the dataset, passed to every reader that is opened
+    PbiFilter filter_;
+
+private:
+    // replaces currentReader_ with a reader for the next queued pair with data
+    void OpenNextReader();
+};
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VIRTUALZMWCOMPOSITEREADER_H
diff --git a/src/VirtualZmwReader.cpp b/src/VirtualZmwReader.cpp

new file mode 100644 (file)

index 0000000..bd99368
--- /dev/null
+++ b/src/VirtualZmwReader.cpp
@@ -0,0 +1,106 @@
+// File Description
+/// \file VirtualZmwReader.cpp
+/// \brief Implements the VirtualZmwReader class.
+//
+// Author: Armin Töpfer
+
+#include "PbbamInternalConfig.h"
+
+#include "VirtualZmwReader.h"
+
+#include <stdexcept>
+
+#include "pbbam/ReadGroupInfo.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Unfiltered overload: delegates to the filtering constructor with a
+// default-constructed PbiFilter.
+VirtualZmwReader::VirtualZmwReader(const std::string& primaryBamFilepath,
+                                   const std::string& scrapsBamFilepath)
+    : VirtualZmwReader(primaryBamFilepath, scrapsBamFilepath, PbiFilter{})
+{
+}
+
+// Opens the primary and scraps BAM files, creates one query per file, obtains
+// the begin iterator of each query, and builds the header used for stitched
+// records.
+//
+// Throws std::runtime_error if the primary BAM header has no read groups.
+VirtualZmwReader::VirtualZmwReader(const std::string& primaryBamFilepath,
+                                   const std::string& scrapsBamFilepath, const PbiFilter& filter)
+{
+    primaryBamFile_ = std::make_unique<BamFile>(primaryBamFilepath);
+    scrapsBamFile_ = std::make_unique<BamFile>(scrapsBamFilepath);
+
+    // an empty filter means "read the entire file"; otherwise query via the filter
+    if (filter.IsEmpty()) {
+        primaryQuery_ = std::make_unique<EntireFileQuery>(*primaryBamFile_);
+        scrapsQuery_ = std::make_unique<EntireFileQuery>(*scrapsBamFile_);
+    } else {
+        primaryQuery_ = std::make_unique<PbiFilterQuery>(filter, *primaryBamFile_);
+        scrapsQuery_ = std::make_unique<PbiFilterQuery>(filter, *scrapsBamFile_);
+    }
+
+    primaryIt_ = (primaryQuery_->begin());
+    scrapsIt_ = (scrapsQuery_->begin());
+
+    // the stitched header starts from the primary header's SAM text
+    stitchedHeader_ = std::make_unique<BamHeader>(primaryBamFile_->Header().ToSam());
+
+    // update stitched read group in header:
+    // the first read group is relabelled as POLYMERASE and any further read
+    // groups are dropped
+    auto readGroups = stitchedHeader_->ReadGroups();
+    if (readGroups.empty())
+        throw std::runtime_error{"Bam header of the primary bam has no read groups."};
+    readGroups[0].ReadType("POLYMERASE");
+    readGroups[0].Id(readGroups[0].MovieName(), "POLYMERASE");
+    if (readGroups.size() > 1) {
+        std::vector<ReadGroupInfo> singleGroup;
+        singleGroup.emplace_back(std::move(readGroups[0]));
+        readGroups = std::move(singleGroup);
+        stitchedHeader_->ClearReadGroups();
+    }
+    stitchedHeader_->ReadGroups(readGroups);
+}
+
+// Nothing to release by hand: every member cleans up after itself.
+VirtualZmwReader::~VirtualZmwReader() = default;
+
+// Reports whether at least one of the two queries still has records.
+bool VirtualZmwReader::HasNext()
+{
+    // Exhausted only once *both* iterators have reached the end of their query.
+    const bool bothExhausted =
+        (primaryIt_ == primaryQuery_->end()) && (scrapsIt_ == scrapsQuery_->end());
+    return !bothExhausted;
+}
+
+// Builds one VirtualZmwBamRecord from the records returned by NextRaw(),
+// using the stitched header created in the constructor.
+// This method is not thread safe
+VirtualZmwBamRecord VirtualZmwReader::Next()
+{
+    return VirtualZmwBamRecord{NextRaw(), *stitchedHeader_};
+}
+
+// Collects every record of the next ZMW: first its records from the primary
+// BAM (subreads or hqregions), then its records from the scraps BAM.
+//
+// Returns an empty vector when both queries are already exhausted.
+std::vector<BamRecord> VirtualZmwReader::NextRaw()
+{
+    std::vector<BamRecord> bamRecordVec;
+
+    const bool primaryDone = (primaryIt_ == primaryQuery_->end());
+    const bool scrapsDone = (scrapsIt_ == scrapsQuery_->end());
+
+    // Nothing left in either file: dereferencing an end iterator would be
+    // undefined behavior, so hand back an empty group instead.
+    if (primaryDone && scrapsDone) return bamRecordVec;
+
+    // Current hole number, the smallest of scraps and primary.
+    // It can be that the next ZMW is scrap only.
+    int currentHoleNumber = 0;
+    if (primaryDone)
+        currentHoleNumber = (*scrapsIt_).HoleNumber();
+    else if (scrapsDone)
+        currentHoleNumber = (*primaryIt_).HoleNumber();
+    else
+        currentHoleNumber = std::min((*primaryIt_).HoleNumber(), (*scrapsIt_).HoleNumber());
+
+    // collect subreads or hqregions
+    while (primaryIt_ != primaryQuery_->end() && currentHoleNumber == (*primaryIt_).HoleNumber()) {
+        bamRecordVec.push_back(*primaryIt_++);
+    }
+
+    // collect scraps
+    while (scrapsIt_ != scrapsQuery_->end() && currentHoleNumber == (*scrapsIt_).HoleNumber()) {
+        bamRecordVec.push_back(*scrapsIt_++);
+    }
+
+    return bamRecordVec;
+}
+
+// Header of the primary BAM file, as reported by the opened BamFile.
+BamHeader VirtualZmwReader::PrimaryHeader() const { return primaryBamFile_->Header(); }
+
+// Header of the scraps BAM file, as reported by the opened BamFile.
+BamHeader VirtualZmwReader::ScrapsHeader() const { return scrapsBamFile_->Header(); }
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/VirtualZmwReader.h b/src/VirtualZmwReader.h

new file mode 100644 (file)

index 0000000..93457ca
--- /dev/null
+++ b/src/VirtualZmwReader.h
@@ -0,0 +1,90 @@
+// File Description
+/// \file VirtualZmwReader.h
+/// \brief Defines the VirtualZmwReader class.
+//
+// Author: Armin Töpfer
+
+#ifndef VIRTUALZMWREADER_H
+#define VIRTUALZMWREADER_H
+
+#include <memory>
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/Config.h"
+#include "pbbam/EntireFileQuery.h"
+#include "pbbam/PbiFilter.h"
+#include "pbbam/PbiFilterQuery.h"
+#include "pbbam/virtual/VirtualZmwBamRecord.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+/// \brief Reads a primary %BAM file and its scraps %BAM file side by side and
+///        returns, per ZMW, either the raw records of that ZMW or a single
+///        stitched VirtualZmwBamRecord.
+///
+class VirtualZmwReader
+{
+public:
+    /// \brief Creates a reader that will operate on a primary %BAM file (e.g.
+    ///        subread data) and a scraps file, consuming all reads.
+    ///
+    /// \param[in] primaryBamFilepath hqregion.bam or subreads.bam file path
+    /// \param[in] scrapsBamFilepath  scraps.bam file path
+    ///
+    VirtualZmwReader(const std::string& primaryBamFilepath, const std::string& scrapsBamFilepath);
+
+    /// \brief Creates a reader that will operate on a primary %BAM file (e.g.
+    ///        subread data) and a scraps file, respecting the provided PBI
+    ///        filter.
+    ///
+    /// \note All %BAM files must have a corresponding ".pbi" index file to use
+    ///       the filter. You may need to call BamFile::EnsurePacBioIndexExists
+    ///       before constructing the reader.
+    ///
+    /// \param[in] primaryBamFilepath hqregion.bam or subreads.bam file path
+    /// \param[in] scrapsBamFilepath  scraps.bam file path
+    /// \param[in] filter PBI filter criteria
+    ///
+    VirtualZmwReader(const std::string& primaryBamFilepath, const std::string& scrapsBamFilepath,
+                     const PbiFilter& filter);
+
+    VirtualZmwReader() = delete;
+    VirtualZmwReader(const VirtualZmwReader&) = delete;
+    VirtualZmwReader(VirtualZmwReader&&) = delete;
+    VirtualZmwReader& operator=(const VirtualZmwReader&) = delete;
+    VirtualZmwReader& operator=(VirtualZmwReader&&) = delete;
+    ~VirtualZmwReader();
+
+public:
+    /// \returns the BamHeader associated with this reader's "primary" %BAM file
+    BamHeader PrimaryHeader() const;
+
+    /// \returns the BamHeader associated with this reader's "scraps" %BAM file
+    BamHeader ScrapsHeader() const;
+
+public:
+    /// \returns true if more ZMWs are available for reading.
+    bool HasNext();
+
+    /// \returns the next stitched polymerase read
+    VirtualZmwBamRecord Next();
+
+    /// \returns the next set of reads that belong to one ZMW.
+    ///          This enables stitching records in a distinct thread.
+    ///
+    std::vector<BamRecord> NextRaw();
+
+private:
+    // the two input files
+    std::unique_ptr<BamFile> primaryBamFile_;
+    std::unique_ptr<BamFile> scrapsBamFile_;
+    // one query per input file (whole-file or PBI-filtered)
+    std::unique_ptr<internal::IQuery> primaryQuery_;
+    std::unique_ptr<internal::IQuery> scrapsQuery_;
+    // current read position within each query
+    internal::IQuery::iterator primaryIt_;
+    internal::IQuery::iterator scrapsIt_;
+    // header handed to every stitched record
+    std::unique_ptr<BamHeader> stitchedHeader_;
+};
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // VIRTUALZMWREADER_H
diff --git a/src/WhitelistedZmwReadStitcher.cpp b/src/WhitelistedZmwReadStitcher.cpp

new file mode 100644 (file)

index 0000000..0a6575c
--- /dev/null
+++ b/src/WhitelistedZmwReadStitcher.cpp
@@ -0,0 +1,139 @@
+// File Description
+/// \file WhitelistedZmwReadStitcher.cpp
+/// \brief Implements the WhitelistedZmwReadStitcher class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/virtual/WhitelistedZmwReadStitcher.h"
+
+#include <cassert>
+#include <cstdint>
+
+#include "VirtualZmwReader.h"
+#include "pbbam/MakeUnique.h"
+#include "pbbam/PbiIndexedBamReader.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Implementation detail of WhitelistedZmwReadStitcher: holds the two indexed
+// readers, the header used for stitched records, and the queue of requested
+// ZMWs that are still to be returned.
+struct WhitelistedZmwReadStitcher::WhitelistedZmwReadStitcherPrivate
+{
+public:
+    // Opens both BAM files with PBI-indexed readers, builds the stitched header
+    // and keeps only those requested ZMWs that occur in at least one file.
+    //
+    // Throws std::runtime_error if the primary BAM header has no read groups.
+    WhitelistedZmwReadStitcherPrivate(const std::vector<int32_t>& zmwWhitelist,
+                                      const std::string& primaryBamFilePath,
+                                      const std::string& scrapsBamFilePath)
+        : primaryBamFile_{std::make_unique<BamFile>(primaryBamFilePath)}
+        , scrapsBamFile_{std::make_unique<BamFile>(scrapsBamFilePath)}
+        , primaryReader_{std::make_unique<PbiIndexedBamReader>(*primaryBamFile_)}
+        , scrapsReader_{std::make_unique<PbiIndexedBamReader>(*scrapsBamFile_)}
+    {
+        // setup new header for stitched data: start from the primary header's
+        // SAM text, relabel the first read group as POLYMERASE and drop any
+        // further read groups
+        polyHeader_ = std::make_unique<BamHeader>(primaryBamFile_->Header().ToSam());
+        auto readGroups = polyHeader_->ReadGroups();
+        if (readGroups.empty())
+            throw std::runtime_error{"Bam header of the primary bam has no read groups."};
+        readGroups[0].ReadType("POLYMERASE");
+        readGroups[0].Id(readGroups[0].MovieName(), "POLYMERASE");
+        if (readGroups.size() > 1) {
+            std::vector<ReadGroupInfo> singleGroup;
+            singleGroup.emplace_back(std::move(readGroups[0]));
+            readGroups = std::move(singleGroup);
+            polyHeader_->ClearReadGroups();
+        }
+        polyHeader_->ReadGroups(readGroups);
+
+        // remove ZMWs up front, that are not found in either file
+        PreFilterZmws(zmwWhitelist);
+    }
+
+    // True while requested ZMWs remain in the queue.
+    bool HasNext() const { return !zmwWhitelist_.empty(); }
+
+    // Stitches the records returned by NextRaw() using the stitched header.
+    VirtualZmwBamRecord Next()
+    {
+        auto bamRecordVec = NextRaw();
+        return {std::move(bamRecordVec), *polyHeader_};
+    }
+
+    // Returns all records of the ZMW at the front of the queue (primary
+    // records first, then scraps) and removes that ZMW from the queue.
+    // Returns an empty vector when the queue is empty.
+    std::vector<BamRecord> NextRaw()
+    {
+        std::vector<BamRecord> result;
+        if (!HasNext()) return result;
+
+        // restrict both readers to the ZMW being fetched
+        const auto& zmw = zmwWhitelist_.front();
+        primaryReader_->Filter(PbiZmwFilter{zmw});
+        scrapsReader_->Filter(PbiZmwFilter{zmw});
+
+        BamRecord record;
+        while (primaryReader_->GetNext(record))
+            result.push_back(record);
+        while (scrapsReader_->GetNext(record))
+            result.push_back(record);
+
+        // pop only after the filters were set: 'zmw' refers to the front element
+        zmwWhitelist_.pop_front();
+        return result;
+    }
+
+    // Header of the primary BAM file.
+    BamHeader PrimaryHeader() const { return primaryBamFile_->Header(); }
+
+    // Header of the scraps BAM file.
+    BamHeader ScrapsHeader() const { return scrapsBamFile_->Header(); }
+
+private:
+    std::unique_ptr<BamFile> primaryBamFile_;
+    std::unique_ptr<BamFile> scrapsBamFile_;
+    std::unique_ptr<PbiIndexedBamReader> primaryReader_;
+    std::unique_ptr<PbiIndexedBamReader> scrapsReader_;
+    std::unique_ptr<BamHeader> polyHeader_;  // header used for stitched records
+    std::deque<int32_t> zmwWhitelist_;       // requested ZMWs not yet returned
+
+private:
+    // Fills zmwWhitelist_ with those requested ZMWs whose hole number appears
+    // in the PBI index of the primary or the scraps file. The order (and any
+    // duplicates) of the requested list are kept as given.
+    void PreFilterZmws(const std::vector<int32_t>& zmwWhitelist)
+    {
+        // fetch input ZMWs
+        const PbiRawData primaryIndex{primaryBamFile_->PacBioIndexFilename()};
+        const PbiRawData scrapsIndex{scrapsBamFile_->PacBioIndexFilename()};
+        const auto& primaryZmws = primaryIndex.BasicData().holeNumber_;
+        const auto& scrapsZmws = scrapsIndex.BasicData().holeNumber_;
+
+        // toss them all into a set (for uniqueness & lookup here soon)
+        std::set<int32_t> inputZmws;
+        for (const auto& zmw : primaryZmws)
+            inputZmws.insert(zmw);
+        for (const auto& zmw : scrapsZmws)
+            inputZmws.insert(zmw);
+
+        // check our requested whitelist against files' ZMWs, keep if found
+        const auto inputEnd = inputZmws.cend();
+        for (const int32_t zmw : zmwWhitelist) {
+            if (inputZmws.find(zmw) != inputEnd) zmwWhitelist_.push_back(zmw);
+        }
+    }
+};
+
+// --------------------------------
+// WhitelistedZmwReadStitcher implementation
+// --------------------------------
+
+// Creates the private implementation, which opens both files and pre-filters
+// the requested ZMW list.
+WhitelistedZmwReadStitcher::WhitelistedZmwReadStitcher(const std::vector<int32_t>& zmwWhitelist,
+                                                       const std::string& primaryBamFilePath,
+                                                       const std::string& scrapsBamFilePath)
+    : d_{std::make_unique<WhitelistedZmwReadStitcherPrivate>(zmwWhitelist, primaryBamFilePath,
+                                                             scrapsBamFilePath)}
+{
+}
+
+// Defined here, where the private implementation type is complete.
+WhitelistedZmwReadStitcher::~WhitelistedZmwReadStitcher() = default;
+
+// Forwards to the private implementation: true while requested ZMWs remain.
+bool WhitelistedZmwReadStitcher::HasNext() const { return d_->HasNext(); }
+
+// Forwards to the private implementation: next stitched record.
+VirtualZmwBamRecord WhitelistedZmwReadStitcher::Next() { return d_->Next(); }
+
+// Forwards to the private implementation: raw records of the next requested ZMW.
+std::vector<BamRecord> WhitelistedZmwReadStitcher::NextRaw() { return d_->NextRaw(); }
+
+// Forwards to the private implementation: header of the primary BAM file.
+BamHeader WhitelistedZmwReadStitcher::PrimaryHeader() const { return d_->PrimaryHeader(); }
+
+// Forwards to the private implementation: header of the scraps BAM file.
+BamHeader WhitelistedZmwReadStitcher::ScrapsHeader() const { return d_->ScrapsHeader(); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/XmlReader.cpp b/src/XmlReader.cpp

new file mode 100644 (file)

index 0000000..9d75c09
--- /dev/null
+++ b/src/XmlReader.cpp
@@ -0,0 +1,114 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "XmlReader.h"
+
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <vector>
+
+#include "StringUtils.h"
+#include "pugixml/pugixml.hpp"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Records the namespace declared by one "xmlns" / "xmlns:<name>" attribute.
+//
+// A bare "xmlns" sets the registry's default XSD; "xmlns:<name>" registers
+// <name> together with the attribute value (the URI) under the XSD type that
+// the registry associates with that URI.
+//
+// Throws std::runtime_error if the attribute name has more than one ':'.
+static void UpdateRegistry(const std::string& attributeName, const std::string& attributeValue,
+                           NamespaceRegistry& registry)
+{
+    const std::vector<std::string> parts = Split(attributeName, ':');
+    assert(!parts.empty());
+    if (parts.size() > 2)
+        throw std::runtime_error{"malformed xmlns attribute: " + attributeName};
+
+    const XsdType xsd = registry.XsdForUri(attributeValue);
+
+    // no ':' in the name -> this is the default namespace
+    if (parts.size() == 1) {
+        registry.SetDefaultXsd(xsd);
+        return;
+    }
+
+    // otherwise the part after the ':' is the namespace name
+    assert(parts.size() == 2);
+    const std::string& name = parts.at(1);
+    NamespaceInfo namespaceInfo(name, attributeValue);
+    registry.Register(xsd, namespaceInfo);
+}
+
+// Converts xmlNode (and, recursively, its children) into a DataSetElement and
+// appends it to parent. Nodes without a name are skipped.
+static void FromXml(const pugi::xml_node& xmlNode, DataSetElement& parent)
+{
+    // pugi::xml separates XML parts into more node types than we use, so
+    // anything that has no name is ignored
+    if (std::string(xmlNode.name()).empty()) return;
+
+    // label & text
+    DataSetElement element(xmlNode.name(), FromInputXml());
+    element.Text(xmlNode.text().get());
+
+    // copy over all attributes
+    for (auto attr = xmlNode.attributes_begin(), attrEnd = xmlNode.attributes_end();
+         attr != attrEnd; ++attr) {
+        element.Attribute(attr->name(), attr->value());
+    }
+
+    // build the subtree below this element before attaching it
+    for (auto child = xmlNode.begin(), childEnd = xmlNode.end(); child != childEnd; ++child)
+        FromXml(*child, element);
+
+    // add our element to its parent
+    parent.AddChild(element);
+}
+
+// Parses an XML document from the stream and converts it into a DataSetBase:
+// the root node supplies the label, attributes and namespace declarations, and
+// every child node becomes a child element.
+//
+// Throws std::runtime_error if the XML cannot be parsed or has no root node.
+std::unique_ptr<DataSetBase> XmlReader::FromStream(std::istream& in)
+{
+    pugi::xml_document doc;
+    const pugi::xml_parse_result parsed = doc.load(in);
+    if (parsed.status != pugi::status_ok)
+        throw std::runtime_error{"could not read XML file, error code:" +
+                                 std::to_string(parsed.status)};
+
+    // the document element carries the dataset's top-level information
+    pugi::xml_node rootNode = doc.document_element();
+    if (rootNode == pugi::xml_node()) throw std::runtime_error{"could not fetch XML root node"};
+
+    // create dataset matching type strings
+    std::unique_ptr<DataSetBase> dataset(new DataSetBase);
+    dataset->Label(rootNode.name());
+
+    // copy the root attributes; those starting with "xmlns" also update the
+    // dataset's namespace registry
+    const std::string xmlnsPrefix("xmlns");
+    for (auto attr = rootNode.attributes_begin(), attrEnd = rootNode.attributes_end();
+         attr != attrEnd; ++attr) {
+        const std::string name = attr->name();
+        const std::string value = attr->value();
+        dataset->Attribute(name, value);
+
+        const bool declaresNamespace = (name.compare(0, xmlnsPrefix.size(), xmlnsPrefix) == 0);
+        if (declaresNamespace) UpdateRegistry(name, value, dataset->Namespaces());
+    }
+
+    // iterate children, recursively building up subtree
+    for (auto child = rootNode.begin(), childEnd = rootNode.end(); child != childEnd; ++child)
+        internal::FromXml(*child, *dataset);
+
+    return dataset;
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/XmlReader.h b/src/XmlReader.h

new file mode 100644 (file)

index 0000000..1fdd21f
--- /dev/null
+++ b/src/XmlReader.h
@@ -0,0 +1,25 @@
+// Author: Derek Barnett
+
+#ifndef XMLREADER_H
+#define XMLREADER_H
+
+#include <iosfwd>
+#include <memory>
+
+#include "pbbam/DataSet.h"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Reads DataSet XML.
+class XmlReader
+{
+public:
+    // Parses the XML available on 'in' and returns the resulting dataset.
+    static std::unique_ptr<DataSetBase> FromStream(std::istream& in);
+};
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // XMLREADER_H
diff --git a/src/XmlWriter.cpp b/src/XmlWriter.cpp

new file mode 100644 (file)

index 0000000..5b3ef7a
--- /dev/null
+++ b/src/XmlWriter.cpp
@@ -0,0 +1,156 @@
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "XmlWriter.h"
+
+#include <cstddef>
+#include <fstream>
+#include <iostream>
+#include <map>
+
+#include "pbbam/DataSet.h"
+#include "pugixml/pugixml.hpp"
+
+namespace PacBio {
+namespace BAM {
+namespace internal {
+
+// Returns the part of 'input' before its first ':' (the namespace prefix), or
+// an empty string if there is no ':' or the ':' is the very first character.
+static std::string Prefix(const std::string& input)
+{
+    const auto colonPos = input.find(':');
+    const bool hasPrefix = (colonPos != std::string::npos) && (colonPos != 0);
+    return hasPrefix ? input.substr(0, colonPos) : std::string();
+}
+
+// Determines the (possibly namespace-qualified) name to write for 'node'.
+//
+// Labels that came from input XML, and labels that already carry a prefix, are
+// returned as their qualified name. Otherwise the namespace name registered
+// for the node's XSD type is prepended, looking the type up by element name
+// when the node itself has none.
+static std::string OutputName(const DataSetElement& node, const NamespaceRegistry& registry)
+{
+    // from input XML (respect the namespaces given), or already prefixed
+    if (node.IsVerbatimLabel() || !node.PrefixLabel().empty()) return node.QualifiedNameLabel();
+
+    // probably user-generated and without prefix: prepend the appropriate one
+    auto xsdType = node.Xsd();
+    if (xsdType == XsdType::NONE)
+        xsdType = registry.XsdForElement(node.LocalNameLabel().to_string());
+
+    const std::string localName = node.LocalNameLabel().to_string();
+    return registry.Namespace(xsdType).Name() + ":" + localName;
+}
+
+// Appends 'node' (and, recursively, its children) below parentXml.
+//
+// Every non-empty namespace prefix that gets written is recorded in
+// xsdPrefixesUsed, keyed by the node's own XSD type.
+// NOTE(review): the key is node.Xsd(), which may be XsdType::NONE even when
+// OutputName() looked the type up in the registry - confirm this is intended.
+static void ToXml(const DataSetElement& node, const NamespaceRegistry& registry,
+                  std::map<XsdType, std::string>& xsdPrefixesUsed, pugi::xml_node& parentXml)
+{
+    // create child of parent, w/ label & text
+    const std::string qualifiedName = OutputName(node, registry);
+    if (qualifiedName.empty()) return;  // error?
+    auto element = parentXml.append_child(qualifiedName.c_str());
+
+    const auto& text = node.Text();
+    if (!text.empty()) element.text().set(text.c_str());
+
+    // store XSD type for later
+    const std::string nsPrefix = Prefix(qualifiedName);
+    if (!nsPrefix.empty()) xsdPrefixesUsed[node.Xsd()] = nsPrefix;
+
+    // add attributes, skipping any without a name
+    for (const auto& attribute : node.Attributes()) {
+        const auto& attrName = attribute.first;
+        if (attrName.empty()) continue;
+        element.append_attribute(attrName.c_str()).set_value(attribute.second.c_str());
+    }
+
+    // additional stuff later? (e.g. comments)
+
+    // iterate children, recursively building up subtree
+    for (const auto& child : node.Children())
+        ToXml(child, registry, xsdPrefixesUsed, element);
+}
+
+// Serializes 'dataset' as an XML document onto 'out'.
+//
+// The root element gets the dataset's text and attributes; children are
+// written recursively. Afterwards the XML declaration is prepended and any
+// missing namespace attributes (default xmlns, xmlns:xsi, xsi:schemaLocation
+// and one xmlns:<prefix> per prefix used) are added to the root.
+//
+// Throws std::runtime_error if no output name can be determined for the
+// dataset node.
+void XmlWriter::ToStream(const DataSetBase& dataset, std::ostream& out)
+{
+    pugi::xml_document doc;
+
+    const auto& registry = dataset.Namespaces();
+
+    // create top-level dataset XML node
+    const auto label = internal::OutputName(dataset, registry);
+    if (label.empty()) throw std::runtime_error{"could not convert dataset node to XML"};
+    auto root = doc.append_child(label.c_str());
+
+    const auto& text = dataset.Text();
+    if (!text.empty()) root.text().set(text.c_str());
+
+    // add top-level attributes
+    for (const auto& attribute : dataset.Attributes()) {
+        const auto& name = attribute.first;
+        const auto& value = attribute.second;
+        if (name.empty()) continue;
+        auto attr = root.append_attribute(name.c_str());
+        attr.set_value(value.c_str());
+    }
+
+    // prefixes written so far, keyed by XSD type (filled further by ToXml)
+    std::map<XsdType, std::string> xsdPrefixesUsed;
+    xsdPrefixesUsed[dataset.Xsd()] = Prefix(label);
+
+    // iterate children, recursively building up subtree
+    for (const auto& child : dataset.Children())
+        ToXml(child, registry, xsdPrefixesUsed, root);
+
+    // write XML to stream
+    auto decl = doc.prepend_child(pugi::node_declaration);
+    decl.append_attribute("version") = "1.0";
+    decl.append_attribute("encoding") = "utf-8";
+
+    // add XSD namespace attributes (only those not already present)
+    auto xmlnsDefaultAttribute = root.attribute("xmlns");
+    if (xmlnsDefaultAttribute.empty()) {
+        xmlnsDefaultAttribute = root.append_attribute("xmlns");
+        xmlnsDefaultAttribute.set_value(registry.DefaultNamespace().Uri().c_str());
+    }
+    auto xsiAttribute = root.attribute("xmlns:xsi");
+    if (xsiAttribute.empty()) {
+        xsiAttribute = root.append_attribute("xmlns:xsi");
+        xsiAttribute.set_value("http://www.w3.org/2001/XMLSchema-instance");
+    }
+    auto xsiSchemaLocationAttribute = root.attribute("xsi:schemaLocation");
+    if (xsiSchemaLocationAttribute.empty()) {
+        xsiSchemaLocationAttribute = root.append_attribute("xsi:schemaLocation");
+        xsiSchemaLocationAttribute.set_value(registry.DefaultNamespace().Uri().c_str());
+    }
+
+    static const std::string xmlnsPrefix = "xmlns:";
+    // bind by reference: iterating by value would copy each (type, prefix) pair
+    for (const auto& prefixEntry : xsdPrefixesUsed) {
+        const auto& xsdType = prefixEntry.first;
+        const auto& prefix = prefixEntry.second;
+        if (xsdType == XsdType::NONE || prefix.empty()) continue;
+
+        const auto& nsInfo = registry.Namespace(xsdType);
+        assert(nsInfo.Name() == prefix);
+        const auto xmlnsName = xmlnsPrefix + prefix;
+        auto xmlnsAttribute = root.attribute(xmlnsName.c_str());
+        if (xmlnsAttribute.empty()) {
+            xmlnsAttribute = root.append_attribute(xmlnsName.c_str());
+            xmlnsAttribute.set_value(nsInfo.Uri().c_str());
+        }
+    }
+
+    // "no escapes" to allow explicit ">" "<" comparison operators in filter parameters
+    // we may remove this if/when comparison is separated from the value
+    doc.save(out, "\t", pugi::format_default | pugi::format_no_escapes, pugi::encoding_utf8);
+}
+
+// Convenience overload: writes the dataset owned by the pointer. The pointer
+// is dereferenced without a null check.
+void XmlWriter::ToStream(const std::unique_ptr<DataSetBase>& dataset, std::ostream& out)
+{
+    const DataSetBase& target = *dataset;
+    ToStream(target, out);
+}
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/XmlWriter.h b/src/XmlWriter.h

new file mode 100644 (file)

index 0000000..56c1398
--- /dev/null
+++ b/src/XmlWriter.h
@@ -0,0 +1,27 @@
+// Author: Derek Barnett
+
+#ifndef XMLWRITER_H
+#define XMLWRITER_H
+
+#include <iosfwd>
+#include <memory>
+
+namespace PacBio {
+namespace BAM {
+
+class DataSetBase;
+
+namespace internal {
+
+// Writes DataSet XML.
+class XmlWriter
+{
+public:
+    // Writes 'dataset' as XML onto 'out'.
+    static void ToStream(const DataSetBase& dataset, std::ostream& out);
+    // Same, for a dataset held in a std::unique_ptr.
+    static void ToStream(const std::unique_ptr<DataSetBase>& dataset, std::ostream& out);
+};
+
+}  // namespace internal
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif  // XMLWRITER_H
diff --git a/src/ZmwGroupQuery.cpp b/src/ZmwGroupQuery.cpp

new file mode 100644 (file)

index 0000000..48bd1e7
--- /dev/null
+++ b/src/ZmwGroupQuery.cpp
@@ -0,0 +1,77 @@
+// File Description
+/// \file ZmwGroupQuery.cpp
+/// \brief Implements the ZmwGroupQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ZmwGroupQuery.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <deque>
+
+#include "MemoryUtils.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/CompositeBamReader.h"
+#include "pbbam/MakeUnique.h"
+#include "pbbam/PbiFilterTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+// Implementation detail of ZmwGroupQuery: walks a sorted, de-duplicated list
+// of ZMWs and returns the records of one ZMW per GetNext() call.
+struct ZmwGroupQuery::ZmwGroupQueryPrivate
+{
+    using ReaderType = PbiFilterCompositeBamReader<Compare::Zmw>;
+
+    // Stores the requested ZMWs in ascending order without duplicates and
+    // opens a reader filtered on the first one. No reader is created when the
+    // list is empty.
+    ZmwGroupQueryPrivate(const std::vector<int32_t>& zmwWhitelist, const DataSet& dataset)
+        : whitelist_(zmwWhitelist.cbegin(), zmwWhitelist.cend())
+    {
+        std::sort(whitelist_.begin(), whitelist_.end());
+        const auto firstDuplicate = std::unique(whitelist_.begin(), whitelist_.end());
+        whitelist_.erase(firstDuplicate, whitelist_.end());
+
+        if (whitelist_.empty()) return;
+
+        reader_ = std::make_unique<ReaderType>(PbiZmwFilter{whitelist_.front()}, dataset);
+        whitelist_.pop_front();
+    }
+
+    // Replaces 'records' with all records of the current ZMW and prepares the
+    // reader for the next one. Returns false (leaving 'records' empty) once
+    // all ZMWs have been handed out; returns true otherwise, even if the
+    // current ZMW produced no records.
+    bool GetNext(std::vector<BamRecord>& records)
+    {
+        records.clear();
+        if (reader_ == nullptr) return false;
+
+        // get all records matching ZMW
+        BamRecord record;
+        while (reader_->GetNext(record))
+            records.push_back(record);
+
+        if (whitelist_.empty()) {
+            // nothing left: destroy reader, next iteration will return false
+            reader_.reset();
+        } else {
+            // set next ZMW
+            reader_->Filter(PbiZmwFilter{whitelist_.front()});
+            whitelist_.pop_front();
+        }
+
+        return true;
+    }
+
+    std::deque<int32_t> whitelist_;       // ZMWs still to be queried, ascending
+    std::unique_ptr<ReaderType> reader_;  // null once every ZMW was returned
+};
+
+// Creates the private implementation for the given ZMW list and dataset.
+ZmwGroupQuery::ZmwGroupQuery(const std::vector<int32_t>& zmwWhitelist, const DataSet& dataset)
+    : internal::IGroupQuery(), d_{std::make_unique<ZmwGroupQueryPrivate>(zmwWhitelist, dataset)}
+{
+}
+
+// Defined here, where the private implementation type is complete.
+ZmwGroupQuery::~ZmwGroupQuery() = default;
+
+// Forwards to the private implementation: fills 'records' with the next ZMW's records.
+bool ZmwGroupQuery::GetNext(std::vector<BamRecord>& records) { return d_->GetNext(records); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ZmwQuery.cpp b/src/ZmwQuery.cpp

new file mode 100644 (file)

index 0000000..ec9edd4
--- /dev/null
+++ b/src/ZmwQuery.cpp
@@ -0,0 +1,40 @@
+// File Description
+/// \file ZmwQuery.cpp
+/// \brief Implements the ZmwQuery class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ZmwQuery.h"
+
+#include <cstdint>
+
+#include "pbbam/CompositeBamReader.h"
+#include "pbbam/MakeUnique.h"
+#include "pbbam/PbiFilterTypes.h"
+
+namespace PacBio {
+namespace BAM {
+
+struct ZmwQuery::ZmwQueryPrivate  // private implementation held by ZmwQuery via d_
+{
+    ZmwQueryPrivate(std::vector<int32_t> zmwWhitelist, const DataSet& dataset)
+        : reader_{PbiZmwFilter{std::move(zmwWhitelist)}, dataset}  // moved, not copied
+    {
+    }
+    // Reader built from the whitelist filter; ZmwQuery::GetNext pulls its records from here.
+    PbiFilterCompositeBamReader<Compare::Zmw> reader_;
+};
+
+ZmwQuery::ZmwQuery(std::vector<int32_t> zmwWhitelist, const DataSet& dataset)
+    : internal::IQuery(), d_{std::make_unique<ZmwQueryPrivate>(std::move(zmwWhitelist), dataset)}
+{  // zmwWhitelist is a by-value sink parameter, so it is moved (not copied) into the impl
+}
+
+ZmwQuery::~ZmwQuery() = default;
+// Forwards straight to the reader owned by the private implementation.
+bool ZmwQuery::GetNext(BamRecord& r) { return d_->reader_.GetNext(r); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ZmwReadStitcher.cpp b/src/ZmwReadStitcher.cpp

new file mode 100644 (file)

index 0000000..62ee705
--- /dev/null
+++ b/src/ZmwReadStitcher.cpp
@@ -0,0 +1,168 @@
+// File Description
+/// \file ZmwReadStitcher.cpp
+/// \brief Implements the ZmwReadStitcher class.
+//
+// Author: Derek Barnett
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/virtual/ZmwReadStitcher.h"
+
+#include <deque>
+#include <stdexcept>
+#include <utility>
+
+#include "VirtualZmwReader.h"
+#include "pbbam/DataSet.h"
+#include "pbbam/EntireFileQuery.h"
+#include "pbbam/MakeUnique.h"
+#include "pbbam/PbiFilter.h"
+#include "pbbam/PbiFilterQuery.h"
+
+namespace PacBio {
+namespace BAM {
+
+struct ZmwReadStitcher::ZmwReadStitcherPrivate
+{
+public:
+    ZmwReadStitcherPrivate(std::string primaryBamFilePath, std::string scrapsBamFilePath,
+                           PbiFilter filter)
+        : filter_{std::move(filter)}
+    {
+        sources_.push_back({std::move(primaryBamFilePath), std::move(scrapsBamFilePath)});
+        OpenNextReader();
+    }
+
+    ZmwReadStitcherPrivate(const DataSet& dataset) : filter_{PbiFilter::FromDataSet(dataset)}
+    {
+        // set up source queue
+        std::string primaryFn;
+        std::string scrapsFn;
+        const auto& resources = dataset.ExternalResources();
+        for (const ExternalResource& resource : resources) {
+
+            primaryFn.clear();
+            scrapsFn.clear();
+
+            // if resource is possible "primary" BAM
+            const auto& metatype = resource.MetaType();
+            if (metatype == "PacBio.SubreadFile.SubreadBamFile" ||
+                metatype == "PacBio.SubreadFile.HqRegionBamFile") {
+                // possible resolve relative path
+                primaryFn = dataset.ResolvePath(resource.ResourceId());
+
+                // check for associated scraps file
+                const auto& childResources = resource.ExternalResources();
+                for (const auto& childResource : childResources) {
+                    const auto& childMetatype = childResource.MetaType();
+                    if (childMetatype == "PacBio.SubreadFile.ScrapsBamFile" ||
+                        childMetatype == "PacBio.SubreadFile.HqScrapsBamFile") {
+                        // possible resolve relative path
+                        scrapsFn = dataset.ResolvePath(childResource.ResourceId());
+                        break;
+                    }
+                }
+            }
+
+            // queue up source for later
+            if (!primaryFn.empty() && !scrapsFn.empty())
+                sources_.emplace_back(std::make_pair(primaryFn, scrapsFn));
+        }
+
+        OpenNextReader();
+    }
+
+public:
+    bool HasNext() const { return (currentReader_ && currentReader_->HasNext()); }
+
+    VirtualZmwBamRecord Next()
+    {
+        if (currentReader_) {
+            const auto result = currentReader_->Next();
+            if (!currentReader_->HasNext()) OpenNextReader();
+            return result;
+        }
+
+        // no reader active
+        throw std::runtime_error{
+            "no readers active, make sure you use "
+            "ZmwReadStitcher::HasNext before "
+            "requesting next record"};
+    }
+
+    std::vector<BamRecord> NextRaw()
+    {
+        if (currentReader_) {
+            const auto result = currentReader_->NextRaw();
+            if (!currentReader_->HasNext()) OpenNextReader();
+            return result;
+        }
+
+        // no reader active
+        throw std::runtime_error{
+            "no readers active, make sure you use "
+            "ZmwReadStitcher::HasNext before "
+            "requesting next group of records"};
+    }
+
+    BamHeader PrimaryHeader() const { return currentReader_->PrimaryHeader(); }
+
+    BamHeader ScrapsHeader() const { return currentReader_->ScrapsHeader(); }
+
+private:
+    std::deque<std::pair<std::string, std::string> > sources_;
+    std::unique_ptr<internal::VirtualZmwReader> currentReader_;
+    PbiFilter filter_;
+
+private:
+    void OpenNextReader()
+    {
+        currentReader_.reset(nullptr);
+
+        // find next source pair with data
+        while (!sources_.empty()) {
+            const auto nextSource = sources_.front();
+            sources_.pop_front();
+
+            currentReader_ = std::make_unique<internal::VirtualZmwReader>(
+                nextSource.first, nextSource.second, filter_);
+            if (currentReader_->HasNext()) return;
+        }
+    }
+};
+
+// --------------------------------
+// ZmwReadStitcher implementation
+// --------------------------------
+
+ZmwReadStitcher::ZmwReadStitcher(std::string primaryBamFilePath, std::string scrapsBamFilePath)
+    : ZmwReadStitcher{std::move(primaryBamFilePath), std::move(scrapsBamFilePath), PbiFilter{}}
+{  // delegates to the three-argument constructor with a default-constructed PbiFilter
+}
+
+ZmwReadStitcher::ZmwReadStitcher(std::string primaryBamFilePath, std::string scrapsBamFilePath,
+                                 PbiFilter filter)
+    : d_{std::make_unique<ZmwReadStitcherPrivate>(std::move(primaryBamFilePath),
+                                                  std::move(scrapsBamFilePath), std::move(filter))}
+{  // all three by-value arguments are moved into the private implementation
+}
+
+ZmwReadStitcher::ZmwReadStitcher(const DataSet& dataset)
+    : d_{std::make_unique<ZmwReadStitcherPrivate>(dataset)}
+{  // the private implementation derives both its sources and its filter from the dataset
+}
+
+ZmwReadStitcher::~ZmwReadStitcher() = default;
+
+// Iteration: each call is forwarded unchanged to the private implementation.
+bool ZmwReadStitcher::HasNext() { return d_->HasNext(); }
+VirtualZmwBamRecord ZmwReadStitcher::Next() { return d_->Next(); }
+std::vector<BamRecord> ZmwReadStitcher::NextRaw() { return d_->NextRaw(); }
+
+// Header access: callers receive a deep copy (DeepCopy()) of the header reported by the
+// private implementation.
+BamHeader ZmwReadStitcher::PrimaryHeader() const { return d_->PrimaryHeader().DeepCopy(); }
+BamHeader ZmwReadStitcher::ScrapsHeader() const { return d_->ScrapsHeader().DeepCopy(); }
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/ZmwTypeMap.cpp b/src/ZmwTypeMap.cpp

new file mode 100644 (file)

index 0000000..4c4b5b5
--- /dev/null
+++ b/src/ZmwTypeMap.cpp
@@ -0,0 +1,25 @@
+// File Description
+/// \file ZmwTypeMap.cpp
+/// \brief Implements the ZmwTypeMap class.
+//
+// Author: Armin Töpfer
+
+#include "PbbamInternalConfig.h"
+
+#include "pbbam/ZmwTypeMap.h"
+
+namespace PacBio {
+namespace BAM {
+
+// clang-format off
+std::map<char, ZmwType> ZmwTypeMap::ParseChar  // lookup table: one-letter code -> ZmwType
+{
+    { 'C' , ZmwType::CONTROL   },
+    { 'M' , ZmwType::MALFORMED },
+    { 'N' , ZmwType::NORMAL    },
+    { 'S' , ZmwType::SENTINEL  }
+};
+// clang-format on
+
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/src/files.cmake b/src/files.cmake

new file mode 100644 (file)

index 0000000..2e3f73e
--- /dev/null
+++ b/src/files.cmake
@@ -0,0 +1,263 @@
+
+# headers
+set( PacBioBAM_H
+
+    # API headers
+    ${PacBioBAM_IncludeDir}/pbbam/Accuracy.h
+    ${PacBioBAM_IncludeDir}/pbbam/AlignmentPrinter.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamFile.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamHeader.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamRecord.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamRecordBuilder.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamRecordImpl.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamRecordTag.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamRecordView.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamTagCodec.h
+    ${PacBioBAM_IncludeDir}/pbbam/BaiIndexedBamReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/CompositeBamReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/CompositeFastaReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/BamWriter.h
+    ${PacBioBAM_IncludeDir}/pbbam/BarcodeQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/Cigar.h
+    ${PacBioBAM_IncludeDir}/pbbam/CigarOperation.h
+    ${PacBioBAM_IncludeDir}/pbbam/ClipType.h
+    ${PacBioBAM_IncludeDir}/pbbam/Compare.h
+    ${PacBioBAM_IncludeDir}/pbbam/Config.h
+    ${PacBioBAM_IncludeDir}/pbbam/DataSet.h
+    ${PacBioBAM_IncludeDir}/pbbam/DataSetTypes.h
+    ${PacBioBAM_IncludeDir}/pbbam/DataSetXsd.h
+    ${PacBioBAM_IncludeDir}/pbbam/EntireFileQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/FastaReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/FastaSequence.h
+    ${PacBioBAM_IncludeDir}/pbbam/FastaSequenceQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/FastqReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/FastqSequence.h
+    ${PacBioBAM_IncludeDir}/pbbam/FrameEncodingType.h
+    ${PacBioBAM_IncludeDir}/pbbam/Frames.h
+    ${PacBioBAM_IncludeDir}/pbbam/GenomicInterval.h
+    ${PacBioBAM_IncludeDir}/pbbam/GenomicIntervalQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/IndexedBamWriter.h
+    ${PacBioBAM_IncludeDir}/pbbam/IndexedFastaReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/Interval.h
+    ${PacBioBAM_IncludeDir}/pbbam/IRecordWriter.h
+    ${PacBioBAM_IncludeDir}/pbbam/LocalContextFlags.h
+    ${PacBioBAM_IncludeDir}/pbbam/MakeUnique.h
+    ${PacBioBAM_IncludeDir}/pbbam/MD5.h
+    ${PacBioBAM_IncludeDir}/pbbam/Orientation.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiBasicTypes.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiBuilder.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiFile.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiFilter.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiFilterQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiFilterTypes.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiIndexedBamReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/PbiRawData.h
+    ${PacBioBAM_IncludeDir}/pbbam/Position.h
+    ${PacBioBAM_IncludeDir}/pbbam/ProgramInfo.h
+    ${PacBioBAM_IncludeDir}/pbbam/PulseBehavior.h
+    ${PacBioBAM_IncludeDir}/pbbam/PulseExclusionReason.h
+    ${PacBioBAM_IncludeDir}/pbbam/QNameQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/QualityValue.h
+    ${PacBioBAM_IncludeDir}/pbbam/QualityValues.h
+    ${PacBioBAM_IncludeDir}/pbbam/ReadAccuracyQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/ReadGroupInfo.h
+    ${PacBioBAM_IncludeDir}/pbbam/RecordType.h
+    ${PacBioBAM_IncludeDir}/pbbam/SamTagCodec.h
+    ${PacBioBAM_IncludeDir}/pbbam/SamWriter.h
+    ${PacBioBAM_IncludeDir}/pbbam/SequenceInfo.h
+    ${PacBioBAM_IncludeDir}/pbbam/Strand.h
+    ${PacBioBAM_IncludeDir}/pbbam/StringUtilities.h
+    ${PacBioBAM_IncludeDir}/pbbam/SubreadLengthQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/Tag.h
+    ${PacBioBAM_IncludeDir}/pbbam/TagCollection.h
+    ${PacBioBAM_IncludeDir}/pbbam/Unused.h
+    ${PacBioBAM_IncludeDir}/pbbam/Validator.h
+    ${PacBioBAM_IncludeDir}/pbbam/ZmwGroupQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/ZmwQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/ZmwType.h
+    ${PacBioBAM_IncludeDir}/pbbam/ZmwTypeMap.h
+
+    # exception headers
+    ${PacBioBAM_IncludeDir}/pbbam/exception/BundleChemistryMappingException.h
+    ${PacBioBAM_IncludeDir}/pbbam/exception/InvalidSequencingChemistryException.h
+    ${PacBioBAM_IncludeDir}/pbbam/exception/ValidationException.h
+
+    # API-internal headers & inline files
+    ${PacBioBAM_IncludeDir}/pbbam/internal/Accuracy.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/BamHeader.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/BamRecord.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/BamRecordBuilder.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/BamRecordImpl.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/BamRecordView.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/Cigar.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/CigarOperation.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/Compare.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/CompositeBamReader.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/CompositeFastaReader.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/DataSet.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetBaseTypes.h
+    ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetBaseTypes.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetElement.h
+    ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetElement.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetListElement.h
+    ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetListElement.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/DataSetTypes.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/FastaSequence.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/FastqSequence.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/Frames.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/GenomicInterval.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/Interval.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/PbiBasicTypes.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/PbiFilter.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/PbiFilterTypes.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/PbiRawData.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/ProgramInfo.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/QualityValue.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/QualityValues.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/QueryBase.h
+    ${PacBioBAM_IncludeDir}/pbbam/internal/QueryBase.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/ReadGroupInfo.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/SequenceInfo.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/Tag.inl
+    ${PacBioBAM_IncludeDir}/pbbam/internal/Validator.inl
+
+    # vcf headers
+    ${PacBioBAM_IncludeDir}/pbbam/vcf/VcfVariant.h
+    ${PacBioBAM_IncludeDir}/pbbam/vcf/VcfFile.h
+    ${PacBioBAM_IncludeDir}/pbbam/vcf/VcfFormat.h
+    ${PacBioBAM_IncludeDir}/pbbam/vcf/VcfHeader.h
+    ${PacBioBAM_IncludeDir}/pbbam/vcf/VcfHeaderTypes.h
+    ${PacBioBAM_IncludeDir}/pbbam/vcf/VcfReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/vcf/VcfSort.h
+    ${PacBioBAM_IncludeDir}/pbbam/vcf/VcfQuery.h
+    ${PacBioBAM_IncludeDir}/pbbam/vcf/VcfWriter.h
+
+    ${PacBioBAM_IncludeDir}/pbbam/vcf/internal/VcfVariant.inl
+    ${PacBioBAM_IncludeDir}/pbbam/vcf/internal/VcfFile.inl
+    ${PacBioBAM_IncludeDir}/pbbam/vcf/internal/VcfHeader.inl
+    ${PacBioBAM_IncludeDir}/pbbam/vcf/internal/VcfHeaderTypes.inl
+ 
+    # virtual headers
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualPolymeraseBamRecord.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualPolymeraseCompositeReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualPolymeraseReader.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualRegion.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualRegionType.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualRegionTypeMap.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/VirtualZmwBamRecord.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/WhitelistedZmwReadStitcher.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/ZmwReadStitcher.h
+    ${PacBioBAM_IncludeDir}/pbbam/virtual/ZmwWhitelistVirtualReader.h
+
+    # library-internal headers
+    ${PacBioBAM_SourceDir}/BamRecordTags.h
+    ${PacBioBAM_SourceDir}/ChemistryTable.h
+    ${PacBioBAM_SourceDir}/DataSetIO.h
+    ${PacBioBAM_SourceDir}/DataSetUtils.h
+    ${PacBioBAM_SourceDir}/EnumClassHash.h
+    ${PacBioBAM_SourceDir}/FileProducer.h
+    ${PacBioBAM_SourceDir}/FileUtils.h
+    ${PacBioBAM_SourceDir}/FofnReader.h
+    ${PacBioBAM_SourceDir}/MemoryUtils.h
+    ${PacBioBAM_SourceDir}/PbiIndexIO.h
+    ${PacBioBAM_SourceDir}/Pulse2BaseCache.h
+    ${PacBioBAM_SourceDir}/SequenceUtils.h
+    ${PacBioBAM_SourceDir}/StringUtils.h
+    ${PacBioBAM_SourceDir}/TimeUtils.h
+    ${PacBioBAM_SourceDir}/ValidationErrors.h
+    ${PacBioBAM_SourceDir}/Version.h
+    ${PacBioBAM_SourceDir}/VirtualZmwCompositeReader.h
+    ${PacBioBAM_SourceDir}/VirtualZmwReader.h
+    ${PacBioBAM_SourceDir}/XmlReader.h
+    ${PacBioBAM_SourceDir}/XmlWriter.h
+    ${PacBioBAM_SourceDir}/pugixml/pugiconfig.hpp
+    ${PacBioBAM_SourceDir}/pugixml/pugixml.hpp
+)
+
+# sources
+set( PacBioBAM_CPP
+
+    ${PacBioBAM_SourceDir}/Accuracy.cpp
+    ${PacBioBAM_SourceDir}/AlignmentPrinter.cpp
+    ${PacBioBAM_SourceDir}/BaiIndexedBamReader.cpp
+    ${PacBioBAM_SourceDir}/BamFile.cpp
+    ${PacBioBAM_SourceDir}/BamHeader.cpp
+    ${PacBioBAM_SourceDir}/BamReader.cpp
+    ${PacBioBAM_SourceDir}/BamRecord.cpp
+    ${PacBioBAM_SourceDir}/BamRecordBuilder.cpp
+    ${PacBioBAM_SourceDir}/BamRecordImpl.cpp
+    ${PacBioBAM_SourceDir}/BamRecordTags.cpp
+    ${PacBioBAM_SourceDir}/BamTagCodec.cpp
+    ${PacBioBAM_SourceDir}/BamWriter.cpp
+    ${PacBioBAM_SourceDir}/BarcodeQuery.cpp
+    ${PacBioBAM_SourceDir}/ChemistryTable.cpp
+    ${PacBioBAM_SourceDir}/Cigar.cpp
+    ${PacBioBAM_SourceDir}/CigarOperation.cpp
+    ${PacBioBAM_SourceDir}/Compare.cpp
+    ${PacBioBAM_SourceDir}/Config.cpp
+    ${PacBioBAM_SourceDir}/DataSet.cpp
+    ${PacBioBAM_SourceDir}/DataSetBaseTypes.cpp
+    ${PacBioBAM_SourceDir}/DataSetElement.cpp
+    ${PacBioBAM_SourceDir}/DataSetIO.cpp
+    ${PacBioBAM_SourceDir}/DataSetTypes.cpp
+    ${PacBioBAM_SourceDir}/DataSetXsd.cpp
+    ${PacBioBAM_SourceDir}/EntireFileQuery.cpp
+    ${PacBioBAM_SourceDir}/FastaReader.cpp
+    ${PacBioBAM_SourceDir}/FastaSequenceQuery.cpp
+    ${PacBioBAM_SourceDir}/FastqReader.cpp
+    ${PacBioBAM_SourceDir}/FileProducer.cpp
+    ${PacBioBAM_SourceDir}/FileUtils.cpp
+    ${PacBioBAM_SourceDir}/FofnReader.cpp
+    ${PacBioBAM_SourceDir}/Frames.cpp
+    ${PacBioBAM_SourceDir}/GenomicInterval.cpp
+    ${PacBioBAM_SourceDir}/GenomicIntervalQuery.cpp
+    ${PacBioBAM_SourceDir}/IndexedBamWriter.cpp
+    ${PacBioBAM_SourceDir}/IndexedFastaReader.cpp
+    ${PacBioBAM_SourceDir}/MD5.cpp
+    ${PacBioBAM_SourceDir}/MemoryUtils.cpp
+    ${PacBioBAM_SourceDir}/PbiBuilder.cpp
+    ${PacBioBAM_SourceDir}/PbiFile.cpp
+    ${PacBioBAM_SourceDir}/PbiFilter.cpp
+    ${PacBioBAM_SourceDir}/PbiFilterQuery.cpp
+    ${PacBioBAM_SourceDir}/PbiFilterTypes.cpp
+    ${PacBioBAM_SourceDir}/PbiIndexedBamReader.cpp
+    ${PacBioBAM_SourceDir}/PbiIndexIO.cpp
+    ${PacBioBAM_SourceDir}/PbiRawData.cpp
+    ${PacBioBAM_SourceDir}/ProgramInfo.cpp
+    ${PacBioBAM_SourceDir}/QNameQuery.cpp
+    ${PacBioBAM_SourceDir}/QualityValue.cpp
+    ${PacBioBAM_SourceDir}/ReadAccuracyQuery.cpp
+    ${PacBioBAM_SourceDir}/ReadGroupInfo.cpp
+    ${PacBioBAM_SourceDir}/SamTagCodec.cpp
+    ${PacBioBAM_SourceDir}/SamWriter.cpp
+    ${PacBioBAM_SourceDir}/SequenceInfo.cpp
+    ${PacBioBAM_SourceDir}/SubreadLengthQuery.cpp
+    ${PacBioBAM_SourceDir}/Validator.cpp
+    ${PacBioBAM_SourceDir}/ValidationErrors.cpp
+    ${PacBioBAM_SourceDir}/ValidationException.cpp
+    ${PacBioBAM_SourceDir}/Version.cpp
+    ${PacBioBAM_SourceDir}/VirtualZmwBamRecord.cpp
+    ${PacBioBAM_SourceDir}/VirtualZmwCompositeReader.cpp
+    ${PacBioBAM_SourceDir}/VirtualZmwReader.cpp
+    ${PacBioBAM_SourceDir}/VirtualRegionTypeMap.cpp
+    ${PacBioBAM_SourceDir}/XmlReader.cpp
+    ${PacBioBAM_SourceDir}/XmlWriter.cpp
+    ${PacBioBAM_SourceDir}/WhitelistedZmwReadStitcher.cpp
+    ${PacBioBAM_SourceDir}/ZmwGroupQuery.cpp
+    ${PacBioBAM_SourceDir}/ZmwReadStitcher.cpp
+    ${PacBioBAM_SourceDir}/ZmwQuery.cpp
+    ${PacBioBAM_SourceDir}/ZmwTypeMap.cpp
+
+    # vcf
+    ${PacBioBAM_SourceDir}/vcf/VcfVariant.cpp
+    ${PacBioBAM_SourceDir}/vcf/VcfFormat.cpp
+    ${PacBioBAM_SourceDir}/vcf/VcfHeader.cpp
+    ${PacBioBAM_SourceDir}/vcf/VcfReader.cpp
+    ${PacBioBAM_SourceDir}/vcf/VcfSort.cpp
+    ${PacBioBAM_SourceDir}/vcf/VcfQuery.cpp
+    ${PacBioBAM_SourceDir}/vcf/VcfWriter.cpp
+
+    # XML I/O
+    ${PacBioBAM_SourceDir}/pugixml/pugixml.cpp
+)
diff --git a/src/meson.build b/src/meson.build

new file mode 100644 (file)

index 0000000..8949c3e
--- /dev/null
+++ b/src/meson.build
@@ -0,0 +1,112 @@
+###########
+# sources #
+###########
+
+pbbam_cpp_sources = files([
+  'Accuracy.cpp',
+  'AlignmentPrinter.cpp',
+  'BaiIndexedBamReader.cpp',
+  'BamFile.cpp',
+  'BamHeader.cpp',
+  'BamReader.cpp',
+  'BamRecord.cpp',
+  'BamRecordBuilder.cpp',
+  'BamRecordImpl.cpp',
+  'BamRecordTags.cpp',
+  'BamTagCodec.cpp',
+  'BamWriter.cpp',
+  'BarcodeQuery.cpp',
+  'ChemistryTable.cpp',
+  'Cigar.cpp',
+  'CigarOperation.cpp',
+  'Compare.cpp',
+  'Config.cpp',
+  'DataSet.cpp',
+  'DataSetBaseTypes.cpp',
+  'DataSetElement.cpp',
+  'DataSetIO.cpp',
+  'DataSetTypes.cpp',
+  'DataSetXsd.cpp',
+  'EntireFileQuery.cpp',
+  'FastaReader.cpp',
+  'FastaSequenceQuery.cpp',
+  'FastqReader.cpp',
+  'FileProducer.cpp',
+  'FileUtils.cpp',
+  'FofnReader.cpp',
+  'Frames.cpp',
+  'GenomicInterval.cpp',
+  'GenomicIntervalQuery.cpp',
+  'IndexedBamWriter.cpp',
+  'IndexedFastaReader.cpp',
+  'MD5.cpp',
+  'MemoryUtils.cpp',
+  'PbiBuilder.cpp',
+  'PbiFile.cpp',
+  'PbiFilter.cpp',
+  'PbiFilterQuery.cpp',
+  'PbiFilterTypes.cpp',
+  'PbiIndexedBamReader.cpp',
+  'PbiIndexIO.cpp',
+  'PbiRawData.cpp',
+  'ProgramInfo.cpp',
+  'QNameQuery.cpp',
+  'QualityValue.cpp',
+  'ReadAccuracyQuery.cpp',
+  'ReadGroupInfo.cpp',
+  'SamTagCodec.cpp',
+  'SamWriter.cpp',
+  'SequenceInfo.cpp',
+  'SubreadLengthQuery.cpp',
+  'Validator.cpp',
+  'ValidationErrors.cpp',
+  'ValidationException.cpp',
+  'Version.cpp',
+  'VirtualZmwBamRecord.cpp',
+  'VirtualZmwCompositeReader.cpp',
+  'VirtualZmwReader.cpp',
+  'VirtualRegionTypeMap.cpp',
+  'XmlReader.cpp',
+  'XmlWriter.cpp',
+  'WhitelistedZmwReadStitcher.cpp',
+  'ZmwGroupQuery.cpp',
+  'ZmwReadStitcher.cpp',
+  'ZmwQuery.cpp',
+  'ZmwTypeMap.cpp',
+
+  # vcf
+  'vcf/VcfVariant.cpp',
+  'vcf/VcfFormat.cpp',
+  'vcf/VcfHeader.cpp',
+  'vcf/VcfReader.cpp',
+  'vcf/VcfSort.cpp',
+  'vcf/VcfQuery.cpp',
+  'vcf/VcfWriter.cpp',
+
+  # XML I/O
+  'pugixml/pugixml.cpp'
+])
+
+pbbam_extra_flags = []
+if get_option('auto-validate')
+  pbbam_extra_flags += '-DPBBAM_AUTOVALIDATE=1'
+endif
+
+# install library if
+# - either running as a proper project
+# - or using shared libraries
+pbbam_lib_install = (not meson.is_subproject()) or (get_option('default_library') == 'shared')
+
+pbbam_lib = library(
+  'pbbam',
+  pbbam_cpp_sources,
+  # Follow Boost's SONAME practice and embed the full project version:
+  #   a version mismatch then fails at load time (ld),
+  #   before you even have the chance of running
+  #   into ABI issues.
+  soversion : meson.project_version(),
+  version : meson.project_version(),
+  install : pbbam_lib_install,
+  dependencies : [pbbam_thread_dep, pbbam_boost_dep, pbbam_zlib_dep, pbbam_htslib_dep],
+  include_directories : pbbam_include_directories,
+  cpp_args : [pbbam_extra_flags, pbbam_warning_flags, pbbam_macros])
diff --git a/src/pugixml/pugiconfig.hpp b/src/pugixml/pugiconfig.hpp

new file mode 100644 (file)

index 0000000..6219dbe
--- /dev/null
+++ b/src/pugixml/pugiconfig.hpp
@@ -0,0 +1,71 @@
+/**
+ * pugixml parser - version 1.5
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
+ */
+
+#ifndef HEADER_PUGICONFIG_HPP
+#define HEADER_PUGICONFIG_HPP
+
+// Uncomment this to enable wchar_t mode
+// #define PUGIXML_WCHAR_MODE
+
+// Uncomment this to disable XPath
+// #define PUGIXML_NO_XPATH
+
+// Uncomment this to disable STL
+// #define PUGIXML_NO_STL
+
+// Uncomment this to disable exceptions
+// #define PUGIXML_NO_EXCEPTIONS
+
+// Set this to control attributes for public classes/functions, i.e.:
+// #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL
+// #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL
+// #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall
+// In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead
+
+// Tune these constants to adjust memory-related behavior
+// #define PUGIXML_MEMORY_PAGE_SIZE 32768
+// #define PUGIXML_MEMORY_OUTPUT_STACK 10240
+// #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096
+
+// Uncomment this to switch to header-only version
+// #define PUGIXML_HEADER_ONLY
+
+// Uncomment this to enable long long support
+// #define PUGIXML_HAS_LONG_LONG
+
+#endif
+
+/**
+ * Copyright (c) 2006-2014 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/src/pugixml/pugixml.cpp b/src/pugixml/pugixml.cpp

new file mode 100644 (file)

index 0000000..37bdec0
--- /dev/null
+++ b/src/pugixml/pugixml.cpp
@@ -0,0 +1,11539 @@
+/**
+ * pugixml parser - version 1.5
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
+ */
+
+#ifndef SOURCE_PUGIXML_CPP
+#define SOURCE_PUGIXML_CPP
+
+// disable all the warnings in this file
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+
+#if __GNUC__ >= 6
+#pragma GCC diagnostic ignored "-Wnull-dereference"
+#endif
+
+#if !defined(__clang__) and !defined(__INTEL_COMPILER)
+#pragma GCC diagnostic ignored "-Wuseless-cast"
+#endif
+
+#include "../PbbamInternalConfig.h"
+
+#include "pugixml.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include <cassert>
+
+#ifdef PUGIXML_WCHAR_MODE
+#      include <cwchar>
+#endif
+
+#ifndef PUGIXML_NO_XPATH
+#      include <cmath>
+#      include <cfloat>
+#      ifdef PUGIXML_NO_EXCEPTIONS
+#              include <csetjmp>
+#      endif
+#endif
+
+#ifndef PUGIXML_NO_STL
+#      include <istream>
+#      include <ostream>
+#      include <string>
+#endif
+
+// For placement new
+#include <new>
+
+#ifdef _MSC_VER
+#      pragma warning(push)
+#      pragma warning(disable: 4127) // conditional expression is constant
+#      pragma warning(disable: 4324) // structure was padded due to __declspec(align())
+#      pragma warning(disable: 4611) // interaction between '_setjmp' and C++ object destruction is non-portable
+#      pragma warning(disable: 4702) // unreachable code
+#      pragma warning(disable: 4996) // this function or variable may be unsafe
+#      pragma warning(disable: 4793) // function compiled as native: presence of '_setjmp' makes a function unmanaged
+#endif
+
+#ifdef __INTEL_COMPILER
+#      pragma warning(disable: 177) // function was declared but never referenced 
+#      pragma warning(disable: 279) // controlling expression is constant
+#      pragma warning(disable: 1478 1786) // function was declared "deprecated"
+#      pragma warning(disable: 1684) // conversion from pointer to same-sized integral type
+#endif
+
+#if defined(__BORLANDC__) && defined(PUGIXML_HEADER_ONLY)
+#      pragma warn -8080 // symbol is declared but never used; disabling this inside push/pop bracket does not make the warning go away
+#endif
+
+#ifdef __BORLANDC__
+#      pragma option push
+#      pragma warn -8008 // condition is always false
+#      pragma warn -8066 // unreachable code
+#endif
+
+#ifdef __SNC__
+// Using diag_push/diag_pop does not disable the warnings inside templates due to a compiler bug
+#      pragma diag_suppress=178 // function was declared but never referenced
+#      pragma diag_suppress=237 // controlling expression is constant
+#endif
+
+// Inlining controls
+#if defined(_MSC_VER) && _MSC_VER >= 1300
+#      define PUGI__NO_INLINE __declspec(noinline)
+#elif defined(__GNUC__)
+#      define PUGI__NO_INLINE __attribute__((noinline))
+#else
+#      define PUGI__NO_INLINE 
+#endif
+
+// Branch weight controls
+#if defined(__GNUC__)
+#      define PUGI__UNLIKELY(cond) __builtin_expect(cond, 0)
+#else
+#      define PUGI__UNLIKELY(cond) (cond)
+#endif
+
+// Simple static assertion
+#define PUGI__STATIC_ASSERT(cond) { static const char condition_failed[(cond) ? 1 : -1] = {0}; (void)condition_failed[0]; }
+
+// Digital Mars C++ bug workaround for passing char loaded from memory via stack
+#ifdef __DMC__
+#      define PUGI__DMC_VOLATILE volatile
+#else
+#      define PUGI__DMC_VOLATILE
+#endif
+
+// Borland C++ bug workaround for not defining ::memcpy depending on header include order (can't always use std::memcpy because some compilers don't have it at all)
+#if defined(__BORLANDC__) && !defined(__MEM_H_USING_LIST)
+using std::memcpy;
+using std::memmove;
+#endif
+
+// In some environments MSVC is a compiler but the CRT lacks certain MSVC-specific features
+#if defined(_MSC_VER) && !defined(__S3E__)
+#      define PUGI__MSVC_CRT_VERSION _MSC_VER
+#endif
+
+#ifdef PUGIXML_HEADER_ONLY
+#      define PUGI__NS_BEGIN namespace pugi { namespace impl {
+#      define PUGI__NS_END } }
+#      define PUGI__FN inline
+#      define PUGI__FN_NO_INLINE inline
+#else
+#      if defined(_MSC_VER) && _MSC_VER < 1300 // MSVC6 seems to have an amusing bug with anonymous namespaces inside namespaces
+#              define PUGI__NS_BEGIN namespace pugi { namespace impl {
+#              define PUGI__NS_END } }
+#      else
+#              define PUGI__NS_BEGIN namespace pugi { namespace impl { namespace {
+#              define PUGI__NS_END } } }
+#      endif
+#      define PUGI__FN
+#      define PUGI__FN_NO_INLINE PUGI__NO_INLINE
+#endif
+
+// uintptr_t
+#if !defined(_MSC_VER) || _MSC_VER >= 1600
+#      include <stdint.h>
+#else
+#      ifndef _UINTPTR_T_DEFINED
+// No native uintptr_t in MSVC6 and in some WinCE versions
+typedef size_t uintptr_t;
+#define _UINTPTR_T_DEFINED
+#      endif
+PUGI__NS_BEGIN
+       typedef unsigned __int8 uint8_t;
+       typedef unsigned __int16 uint16_t;
+       typedef unsigned __int32 uint32_t;
+PUGI__NS_END
+#endif
+
+// Memory allocation
+PUGI__NS_BEGIN
+       PUGI__FN void* default_allocate(size_t size)  // default allocator: forwards to malloc
+       {
+               return malloc(size);
+       }
+
+       PUGI__FN void default_deallocate(void* ptr)  // default deallocator: forwards to free
+       {
+               free(ptr);
+       }
+
+       template <typename T>
+       struct xml_memory_management_function_storage
+       {
+               static allocation_function allocate;
+               static deallocation_function deallocate;
+       };
+
+       // Global allocation functions are stored in class statics so that, in header-only mode, the linker deduplicates them.
+       // Without the template we would get multiple definitions of the same static; both default to the functions above.
+       template <typename T> allocation_function xml_memory_management_function_storage<T>::allocate = default_allocate;
+       template <typename T> deallocation_function xml_memory_management_function_storage<T>::deallocate = default_deallocate;
+
+       typedef xml_memory_management_function_storage<int> xml_memory;  // T is unused by the struct; <int> just names one instantiation
+PUGI__NS_END
+
+// String utilities
+PUGI__NS_BEGIN
+       // Get string length
+       PUGI__FN size_t strlength(const char_t* s)
+       {
+               assert(s);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return wcslen(s);
+       #else
+               return strlen(s);
+       #endif
+       }
+
+       // Compare two strings
+       PUGI__FN bool strequal(const char_t* src, const char_t* dst)
+       {
+               assert(src && dst);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return wcscmp(src, dst) == 0;
+       #else
+               return strcmp(src, dst) == 0;
+       #endif
+       }
+
+       // Compare lhs with [rhs_begin, rhs_end)
+       PUGI__FN bool strequalrange(const char_t* lhs, const char_t* rhs, size_t count)
+       {
+               for (size_t i = 0; i < count; ++i)
+                       if (lhs[i] != rhs[i])
+                               return false;
+       
+               return lhs[count] == 0;
+       }
+
+       // Get length of wide string, even if CRT lacks wide character support
+       PUGI__FN size_t strlength_wide(const wchar_t* s)
+       {
+               assert(s);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return wcslen(s);
+       #else
+               const wchar_t* end = s;
+               while (*end) end++;
+               return static_cast<size_t>(end - s);
+       #endif
+       }
+
+#ifdef PUGIXML_WCHAR_MODE
+       // Copy the zero-terminated narrow string source into dest, converting each char to wchar_t
+       // by plain assignment (callers are expected to pass ASCII only), then terminate dest.
+       PUGI__FN void widen_ascii(wchar_t* dest, const char* source)
+       {
+               wchar_t* out = dest;
+
+               for (const char* in = source; *in; ++in)
+               {
+                       *out = *in;
+                       ++out;
+               }
+
+               *out = 0;
+       }
+#endif
+PUGI__NS_END
+
+#if !defined(PUGIXML_NO_STL) || !defined(PUGIXML_NO_XPATH)
+// auto_ptr-like buffer holder for exception recovery
+PUGI__NS_BEGIN
+       struct buffer_holder
+       {
+               void* data;
+               void (*deleter)(void*);
+
+               buffer_holder(void* data_, void (*deleter_)(void*)): data(data_), deleter(deleter_)
+               {
+               }
+
+               ~buffer_holder()
+               {
+                       if (data) deleter(data);
+               }
+
+               void* release()
+               {
+                       void* result = data;
+                       data = 0;
+                       return result;
+               }
+       };
+PUGI__NS_END
+#endif
+
+PUGI__NS_BEGIN
+       static const size_t xml_memory_page_size = // payload bytes of a regular page, not counting the xml_memory_page header
+       #ifdef PUGIXML_MEMORY_PAGE_SIZE
+               PUGIXML_MEMORY_PAGE_SIZE // build-time override
+       #else
+               32768 // default
+       #endif
+               ;
+
+       static const uintptr_t xml_memory_page_alignment = 64; // pages start on a multiple of this, which leaves the low 6 bits of a page address free for flags
+       static const uintptr_t xml_memory_page_pointer_mask = ~(xml_memory_page_alignment - 1); // extracts the page address from a node/attribute header
+       static const uintptr_t xml_memory_page_contents_shared_mask = 32; // header flag; NOTE(review): not set or tested in this part of the file - confirm meaning at its users
+       static const uintptr_t xml_memory_page_name_allocated_mask = 16; // header flag: name was obtained from the allocator and is released on destroy
+       static const uintptr_t xml_memory_page_value_allocated_mask = 8; // header flag: value was obtained from the allocator and is released on destroy
+       static const uintptr_t xml_memory_page_type_mask = 7; // low three header bits hold (node type - 1); PUGI__NODETYPE adds the 1 back
+       static const uintptr_t xml_memory_page_name_allocated_or_shared_mask = xml_memory_page_name_allocated_mask | xml_memory_page_contents_shared_mask;
+       static const uintptr_t xml_memory_page_value_allocated_or_shared_mask = xml_memory_page_value_allocated_mask | xml_memory_page_contents_shared_mask;
+
+       #define PUGI__NODETYPE(n) static_cast<xml_node_type>(((n)->header & impl::xml_memory_page_type_mask) + 1)
+
+       struct xml_allocator;
+
+       struct xml_memory_page
+       {
+               static xml_memory_page* construct(void* memory)
+               {
+                       xml_memory_page* result = static_cast<xml_memory_page*>(memory);
+
+                       result->allocator = 0;
+                       result->prev = 0;
+                       result->next = 0;
+                       result->busy_size = 0;
+                       result->freed_size = 0;
+
+                       return result;
+               }
+
+               xml_allocator* allocator;
+
+               xml_memory_page* prev;
+               xml_memory_page* next;
+
+               size_t busy_size;
+               size_t freed_size;
+       };
+
+       struct xml_memory_string_header // bookkeeping record stored immediately before every string handed out by xml_allocator::allocate_string
+       {
+               uint16_t page_offset; // byte offset of this header from the start of the page payload (the byte right after the xml_memory_page header)
+               uint16_t full_size; // header + string size rounded up to pointer alignment; 0 if the size does not fit in 16 bits (string occupies the whole page)
+       };
+
+       struct xml_allocator // hands out objects and strings from a linked list of memory pages; _root is the page currently being filled
+       {
+               xml_allocator(xml_memory_page* root): _root(root), _busy_size(root->busy_size) // start from the fill level recorded in root
+               {
+               }
+
+               xml_memory_page* allocate_page(size_t data_size) // returns a new aligned page with data_size payload bytes, or 0 if the allocation hook returns null
+               {
+                       size_t size = sizeof(xml_memory_page) + data_size;
+
+                       // allocate the block with xml_memory_page_alignment extra bytes, leaving memory for worst-case alignment padding
+                       void* memory = xml_memory::allocate(size + xml_memory_page_alignment);
+                       if (!memory) return 0;
+
+                       // round up to the next alignment boundary strictly after memory (note: this guarantees at least 1 usable byte before the page)
+                       char* page_memory = reinterpret_cast<char*>((reinterpret_cast<uintptr_t>(memory) + xml_memory_page_alignment) & ~(xml_memory_page_alignment - 1));
+
+                       // prepare page structure (all fields zeroed)
+                       xml_memory_page* page = xml_memory_page::construct(page_memory);
+                       assert(page);
+
+                       page->allocator = _root->allocator; // new pages inherit the allocator pointer stored in the current page
+
+                       // record the distance back to the start of the block in the byte just before the page; deallocate_page reads it to free the block
+                       assert(page_memory > memory && page_memory - static_cast<char*>(memory) <= 127);
+                       page_memory[-1] = static_cast<char>(page_memory - static_cast<char*>(memory));
+
+                       return page;
+               }
+
+               static void deallocate_page(xml_memory_page* page) // frees the block behind a page created by allocate_page
+               {
+                       char* page_memory = reinterpret_cast<char*>(page);
+
+                       xml_memory::deallocate(page_memory - page_memory[-1]); // step back by the offset stored in front of the page
+               }
+
+               void* allocate_memory_oob(size_t size, xml_memory_page*& out_page); // slow path, defined after the struct: serves the request from a newly allocated page
+
+               void* allocate_memory(size_t size, xml_memory_page*& out_page) // returns size bytes and reports the page they live in through out_page; 0 on allocation failure
+               {
+                       if (_busy_size + size > xml_memory_page_size) return allocate_memory_oob(size, out_page); // does not fit in the current page
+
+                       void* buf = reinterpret_cast<char*>(_root) + sizeof(xml_memory_page) + _busy_size; // first unused payload byte of the current page
+
+                       _busy_size += size;
+
+                       out_page = _root;
+
+                       return buf;
+               }
+
+               void deallocate_memory(void* ptr, size_t size, xml_memory_page* page) // marks size bytes of page as freed; a fully freed page is reset (current page) or released (any other page)
+               {
+                       if (page == _root) page->busy_size = _busy_size; // the current page's fill level lives in _busy_size; write it back before it is inspected
+
+                       assert(ptr >= reinterpret_cast<char*>(page) + sizeof(xml_memory_page) && ptr < reinterpret_cast<char*>(page) + sizeof(xml_memory_page) + page->busy_size);
+                       (void)!ptr; // presumably keeps ptr referenced when assert is compiled out
+
+                       page->freed_size += size;
+                       assert(page->freed_size <= page->busy_size);
+
+                       if (page->freed_size == page->busy_size) // everything handed out from this page has been returned
+                       {
+                               if (page->next == 0)
+                               {
+                                       assert(_root == page);
+
+                                       // the current (last) page is never released, its counters are just reset so it can be refilled
+                                       page->busy_size = page->freed_size = 0;
+                                       _busy_size = 0;
+                               }
+                               else
+                               {
+                                       assert(_root != page);
+                                       assert(page->prev);
+
+                                       // unlink the page from the list
+                                       page->prev->next = page->next;
+                                       page->next->prev = page->prev;
+
+                                       // release its memory block
+                                       deallocate_page(page);
+                               }
+                       }
+               }
+
+               char_t* allocate_string(size_t length) // returns storage for length char_t units preceded by a hidden xml_memory_string_header, or 0 on allocation failure
+               {
+                       PUGI__STATIC_ASSERT(xml_memory_page_size <= (1 << 16)); // offsets inside a page are stored in 16 bits
+
+                       // allocate memory for string and header block
+                       size_t size = sizeof(xml_memory_string_header) + length * sizeof(char_t);
+                       
+                       // round size up to pointer alignment boundary
+                       size_t full_size = (size + (sizeof(void*) - 1)) & ~(sizeof(void*) - 1);
+
+                       xml_memory_page* page;
+                       xml_memory_string_header* header = static_cast<xml_memory_string_header*>(allocate_memory(full_size, page));
+
+                       if (!header) return 0;
+
+                       // setup header: offset of the header from the start of the page payload
+                       ptrdiff_t page_offset = reinterpret_cast<char*>(header) - reinterpret_cast<char*>(page) - sizeof(xml_memory_page);
+
+                       assert(page_offset >= 0 && page_offset < (1 << 16));
+                       header->page_offset = static_cast<uint16_t>(page_offset);
+
+                       // full_size == 0 is stored for large strings that occupy the whole page, since their size does not fit in 16 bits
+                       assert(full_size < (1 << 16) || (page->busy_size == full_size && page_offset == 0));
+                       header->full_size = static_cast<uint16_t>(full_size < (1 << 16) ? full_size : 0);
+
+                       // round-trip through void* to avoid 'cast increases required alignment of target type' warning
+                       // header is guaranteed a pointer-sized alignment, which should be enough for char_t
+                       return static_cast<char_t*>(static_cast<void*>(header + 1));
+               }
+
+               void deallocate_string(char_t* string) // releases a string previously returned by allocate_string
+               {
+                       // this function casts pointers through void* to avoid 'cast increases required alignment of target type' warnings
+                       // we're guaranteed the proper (pointer-sized) alignment on the input string if it was allocated via allocate_string
+
+                       // get header: it sits immediately before the string
+                       xml_memory_string_header* header = static_cast<xml_memory_string_header*>(static_cast<void*>(string)) - 1;
+                       assert(header);
+
+                       // locate the owning page by stepping back over the recorded offset and the page header
+                       size_t page_offset = sizeof(xml_memory_page) + header->page_offset;
+                       xml_memory_page* page = reinterpret_cast<xml_memory_page*>(static_cast<void*>(reinterpret_cast<char*>(header) - page_offset));
+
+                       // if full_size == 0 then this string occupies the whole page, so the page's busy_size is the string's size
+                       size_t full_size = header->full_size == 0 ? page->busy_size : header->full_size;
+
+                       deallocate_memory(header, full_size, page);
+               }
+
+               xml_memory_page* _root; // page that new allocations are taken from; always the last page of the list
+               size_t _busy_size; // payload bytes already handed out from _root
+       };
+
+       PUGI__FN_NO_INLINE void* xml_allocator::allocate_memory_oob(size_t size, xml_memory_page*& out_page)
+       {
+               const size_t large_allocation_threshold = xml_memory_page_size / 4;
+
+               xml_memory_page* page = allocate_page(size <= large_allocation_threshold ? xml_memory_page_size : size);
+               out_page = page;
+
+               if (!page) return 0;
+
+               if (size <= large_allocation_threshold)
+               {
+                       _root->busy_size = _busy_size;
+
+                       // insert page at the end of linked list
+                       page->prev = _root;
+                       _root->next = page;
+                       _root = page;
+
+                       _busy_size = size;
+               }
+               else
+               {
+                       // insert page before the end of linked list, so that it is deleted as soon as possible
+                       // the last page is not deleted even if it's empty (see deallocate_memory)
+                       assert(_root->prev);
+
+                       page->prev = _root->prev;
+                       page->next = _root;
+
+                       _root->prev->next = page;
+                       _root->prev = page;
+               }
+
+               // allocate inside page
+               page->busy_size = size;
+
+               return reinterpret_cast<char*>(page) + sizeof(xml_memory_page);
+       }
+PUGI__NS_END
+
+namespace pugi
+{
+       /// A 'name=value' XML attribute structure.
+       struct xml_attribute_struct
+       {
+               /// Constructs an empty attribute: header holds the address of the owning memory page, all pointers are null
+               xml_attribute_struct(impl::xml_memory_page* page): header(reinterpret_cast<uintptr_t>(page)), name(0), value(0), prev_attribute_c(0), next_attribute(0)
+               {
+               }
+
+               uintptr_t header; ///< Owning page address in the high bits, flag bits (name/value allocated, ...) in the low bits.
+
+               char_t* name;   ///< Pointer to attribute name.
+               char_t* value;  ///< Pointer to attribute value.
+
+               xml_attribute_struct* prev_attribute_c; ///< Previous attribute (cyclic list: the first attribute points to the last one)
+               xml_attribute_struct* next_attribute;   ///< Next attribute (null for the last one)
+       };
+
+       /// An XML document tree node.
+       struct xml_node_struct
+       {
+               /// Constructs a detached node: header holds the owning page address combined with (type - 1), all pointers are null
+               /// \param type - node type
+               xml_node_struct(impl::xml_memory_page* page, xml_node_type type): header(reinterpret_cast<uintptr_t>(page) | (type - 1)), parent(0), name(0), value(0), first_child(0), prev_sibling_c(0), next_sibling(0), first_attribute(0)
+               {
+               }
+
+               uintptr_t header; ///< Owning page address in the high bits; node type and flag bits in the low bits.
+
+               xml_node_struct*                parent;                                 ///< Pointer to parent
+
+               char_t*                                 name;                                   ///< Pointer to element name.
+               char_t*                                 value;                                  ///< Pointer to any associated string data.
+
+               xml_node_struct*                first_child;                    ///< First child
+               
+               xml_node_struct*                prev_sibling_c;                 ///< Previous sibling (cyclic list: the first child points to the last one)
+               xml_node_struct*                next_sibling;                   ///< Next sibling (null for the last child)
+               
+               xml_attribute_struct*   first_attribute;                ///< First attribute
+       };
+}
+
+PUGI__NS_BEGIN
+       struct xml_extra_buffer // entry of a singly-linked list of additional character buffers (head: xml_document_struct::extra_buffers)
+       {
+               char_t* buffer; // the buffer; NOTE(review): who allocates and frees it is not visible in this part of the file
+               xml_extra_buffer* next; // next list entry
+       };
+
+       struct xml_document_struct: public xml_node_struct, public xml_allocator // document root: a node_document node that is at the same time the allocator of its tree
+       {
+               xml_document_struct(xml_memory_page* page): xml_node_struct(page, node_document), xml_allocator(page), buffer(0), extra_buffers(0) // page is both the node's owning page and the allocator's first page
+               {
+               }
+
+               const char_t* buffer; // starts out null; NOTE(review): presumably the parsed source buffer - confirm against the loading code
+
+               xml_extra_buffer* extra_buffers; // head of the extra buffer list, initially empty
+       };
+
+       // Find the allocator responsible for a node via the page address stored in its header
+       inline xml_allocator& get_allocator(const xml_node_struct* node)
+       {
+               assert(node);
+
+               // strip the flag bits to recover the owning page
+               xml_memory_page* page = reinterpret_cast<xml_memory_page*>(node->header & xml_memory_page_pointer_mask);
+
+               return *page->allocator;
+       }
+
+       // Find the document that owns an object (any type with a page-tagged header member)
+       template <typename Object> inline xml_document_struct& get_document(const Object* object)
+       {
+               assert(object);
+
+               // strip the flag bits to recover the owning page
+               xml_memory_page* page = reinterpret_cast<xml_memory_page*>(object->header & xml_memory_page_pointer_mask);
+
+               // the page's allocator is treated as the allocator base of the owning xml_document_struct
+               return *static_cast<xml_document_struct*>(page->allocator);
+       }
+PUGI__NS_END
+
+// Low-level DOM operations
+PUGI__NS_BEGIN
+       // Allocate and construct an empty attribute from alloc.
+       // Returns 0 when the allocator cannot provide memory; callers such as
+       // append_new_attribute already check for that.
+       inline xml_attribute_struct* allocate_attribute(xml_allocator& alloc)
+       {
+               xml_memory_page* page;
+               void* memory = alloc.allocate_memory(sizeof(xml_attribute_struct), page);
+
+               // allocate_memory signals failure with a null pointer; placement new must never be handed one
+               if (!memory) return 0;
+
+               return new (memory) xml_attribute_struct(page);
+       }
+
+       // Allocate and construct a detached node of the given type from alloc.
+       // Returns 0 when the allocator cannot provide memory; callers such as
+       // append_new_node already check for that.
+       inline xml_node_struct* allocate_node(xml_allocator& alloc, xml_node_type type)
+       {
+               xml_memory_page* page;
+               void* memory = alloc.allocate_memory(sizeof(xml_node_struct), page);
+
+               // allocate_memory signals failure with a null pointer; placement new must never be handed one
+               if (!memory) return 0;
+
+               return new (memory) xml_node_struct(page, type);
+       }
+
+       inline void destroy_attribute(xml_attribute_struct* a, xml_allocator& alloc)
+       {
+               uintptr_t header = a->header;
+
+               if (header & impl::xml_memory_page_name_allocated_mask) alloc.deallocate_string(a->name);
+               if (header & impl::xml_memory_page_value_allocated_mask) alloc.deallocate_string(a->value);
+
+               alloc.deallocate_memory(a, sizeof(xml_attribute_struct), reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask));
+       }
+
+       inline void destroy_node(xml_node_struct* n, xml_allocator& alloc)
+       {
+               uintptr_t header = n->header;
+
+               if (header & impl::xml_memory_page_name_allocated_mask) alloc.deallocate_string(n->name);
+               if (header & impl::xml_memory_page_value_allocated_mask) alloc.deallocate_string(n->value);
+
+               for (xml_attribute_struct* attr = n->first_attribute; attr; )
+               {
+                       xml_attribute_struct* next = attr->next_attribute;
+
+                       destroy_attribute(attr, alloc);
+
+                       attr = next;
+               }
+
+               for (xml_node_struct* child = n->first_child; child; )
+               {
+                       xml_node_struct* next = child->next_sibling;
+
+                       destroy_node(child, alloc);
+
+                       child = next;
+               }
+
+               alloc.deallocate_memory(n, sizeof(xml_node_struct), reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask));
+       }
+
+       inline void append_node(xml_node_struct* child, xml_node_struct* node)
+       {
+               child->parent = node;
+
+               xml_node_struct* head = node->first_child;
+
+               if (head)
+               {
+                       xml_node_struct* tail = head->prev_sibling_c;
+
+                       tail->next_sibling = child;
+                       child->prev_sibling_c = tail;
+                       head->prev_sibling_c = child;
+               }
+               else
+               {
+                       node->first_child = child;
+                       child->prev_sibling_c = child;
+               }
+       }
+
+       inline void prepend_node(xml_node_struct* child, xml_node_struct* node)
+       {
+               child->parent = node;
+
+               xml_node_struct* head = node->first_child;
+
+               if (head)
+               {
+                       child->prev_sibling_c = head->prev_sibling_c;
+                       head->prev_sibling_c = child;
+               }
+               else
+                       child->prev_sibling_c = child;
+
+               child->next_sibling = head;
+               node->first_child = child;
+       }
+
+       inline void insert_node_after(xml_node_struct* child, xml_node_struct* node)
+       {
+               xml_node_struct* parent = node->parent;
+
+               child->parent = parent;
+
+               if (node->next_sibling)
+                       node->next_sibling->prev_sibling_c = child;
+               else
+                       parent->first_child->prev_sibling_c = child;
+
+               child->next_sibling = node->next_sibling;
+               child->prev_sibling_c = node;
+
+               node->next_sibling = child;
+       }
+
+       inline void insert_node_before(xml_node_struct* child, xml_node_struct* node)
+       {
+               xml_node_struct* parent = node->parent;
+
+               child->parent = parent;
+
+               if (node->prev_sibling_c->next_sibling)
+                       node->prev_sibling_c->next_sibling = child;
+               else
+                       parent->first_child = child;
+
+               child->prev_sibling_c = node->prev_sibling_c;
+               child->next_sibling = node;
+
+               node->prev_sibling_c = child;
+       }
+
+       inline void remove_node(xml_node_struct* node)
+       {
+               xml_node_struct* parent = node->parent;
+
+               if (node->next_sibling)
+                       node->next_sibling->prev_sibling_c = node->prev_sibling_c;
+               else
+                       parent->first_child->prev_sibling_c = node->prev_sibling_c;
+
+               if (node->prev_sibling_c->next_sibling)
+                       node->prev_sibling_c->next_sibling = node->next_sibling;
+               else
+                       parent->first_child = node->next_sibling;
+
+               node->parent = 0;
+               node->prev_sibling_c = 0;
+               node->next_sibling = 0;
+       }
+
+       inline void append_attribute(xml_attribute_struct* attr, xml_node_struct* node)
+       {
+               xml_attribute_struct* head = node->first_attribute;
+
+               if (head)
+               {
+                       xml_attribute_struct* tail = head->prev_attribute_c;
+
+                       tail->next_attribute = attr;
+                       attr->prev_attribute_c = tail;
+                       head->prev_attribute_c = attr;
+               }
+               else
+               {
+                       node->first_attribute = attr;
+                       attr->prev_attribute_c = attr;
+               }
+       }
+
+       inline void prepend_attribute(xml_attribute_struct* attr, xml_node_struct* node)
+       {
+               xml_attribute_struct* head = node->first_attribute;
+
+               if (head)
+               {
+                       attr->prev_attribute_c = head->prev_attribute_c;
+                       head->prev_attribute_c = attr;
+               }
+               else
+                       attr->prev_attribute_c = attr;
+
+               attr->next_attribute = head;
+               node->first_attribute = attr;
+       }
+
+       inline void insert_attribute_after(xml_attribute_struct* attr, xml_attribute_struct* place, xml_node_struct* node)
+       {
+               if (place->next_attribute)
+                       place->next_attribute->prev_attribute_c = attr;
+               else
+                       node->first_attribute->prev_attribute_c = attr;
+
+               attr->next_attribute = place->next_attribute;
+               attr->prev_attribute_c = place;
+               place->next_attribute = attr;
+       }
+
+       inline void insert_attribute_before(xml_attribute_struct* attr, xml_attribute_struct* place, xml_node_struct* node)
+       {
+               if (place->prev_attribute_c->next_attribute)
+                       place->prev_attribute_c->next_attribute = attr;
+               else
+                       node->first_attribute = attr;
+
+               attr->prev_attribute_c = place->prev_attribute_c;
+               attr->next_attribute = place;
+               place->prev_attribute_c = attr;
+       }
+
+       inline void remove_attribute(xml_attribute_struct* attr, xml_node_struct* node)
+       {
+               if (attr->next_attribute)
+                       attr->next_attribute->prev_attribute_c = attr->prev_attribute_c;
+               else
+                       node->first_attribute->prev_attribute_c = attr->prev_attribute_c;
+
+               if (attr->prev_attribute_c->next_attribute)
+                       attr->prev_attribute_c->next_attribute = attr->next_attribute;
+               else
+                       node->first_attribute = attr->next_attribute;
+
+               attr->prev_attribute_c = 0;
+               attr->next_attribute = 0;
+       }
+
+       PUGI__FN_NO_INLINE xml_node_struct* append_new_node(xml_node_struct* node, xml_allocator& alloc, xml_node_type type = node_element)
+       {
+               xml_node_struct* child = allocate_node(alloc, type);
+               if (!child) return 0;
+
+               append_node(child, node);
+
+               return child;
+       }
+
+       PUGI__FN_NO_INLINE xml_attribute_struct* append_new_attribute(xml_node_struct* node, xml_allocator& alloc)
+       {
+               xml_attribute_struct* attr = allocate_attribute(alloc);
+               if (!attr) return 0;
+
+               append_attribute(attr, node);
+
+               return attr;
+       }
+PUGI__NS_END
+
+// Helper classes for code generation
+PUGI__NS_BEGIN
+       struct opt_false // compile-time "off" tag; it is the default opt_swap argument of utf_decoder
+       {
+               enum { value = 0 };
+       };
+
+       struct opt_true // compile-time "on" counterpart of opt_false
+       {
+               enum { value = 1 };
+       };
+PUGI__NS_END
+
+// Unicode utilities
+PUGI__NS_BEGIN
+       // Exchange the two bytes of a 16-bit value
+       inline uint16_t endian_swap(uint16_t value)
+       {
+               const unsigned int low_byte = value & 0xff;
+               const unsigned int high_byte = value >> 8;
+
+               return static_cast<uint16_t>((low_byte << 8) | high_byte);
+       }
+
+       inline uint32_t endian_swap(uint32_t value)
+       {
+               return ((value & 0xff) << 24) | ((value & 0xff00) << 8) | ((value & 0xff0000) >> 8) | (value >> 24);
+       }
+
+       struct utf8_counter
+       {
+               typedef size_t value_type;
+
+               static value_type low(value_type result, uint32_t ch)
+               {
+                       // U+0000..U+007F
+                       if (ch < 0x80) return result + 1;
+                       // U+0080..U+07FF
+                       else if (ch < 0x800) return result + 2;
+                       // U+0800..U+FFFF
+                       else return result + 3;
+               }
+
+               static value_type high(value_type result, uint32_t)
+               {
+                       // U+10000..U+10FFFF
+                       return result + 4;
+               }
+       };
+
+       struct utf8_writer
+       {
+               typedef uint8_t* value_type;
+
+               static value_type low(value_type result, uint32_t ch)
+               {
+                       // U+0000..U+007F
+                       if (ch < 0x80)
+                       {
+                               *result = static_cast<uint8_t>(ch);
+                               return result + 1;
+                       }
+                       // U+0080..U+07FF
+                       else if (ch < 0x800)
+                       {
+                               result[0] = static_cast<uint8_t>(0xC0 | (ch >> 6));
+                               result[1] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+                               return result + 2;
+                       }
+                       // U+0800..U+FFFF
+                       else
+                       {
+                               result[0] = static_cast<uint8_t>(0xE0 | (ch >> 12));
+                               result[1] = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
+                               result[2] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+                               return result + 3;
+                       }
+               }
+
+               static value_type high(value_type result, uint32_t ch)
+               {
+                       // U+10000..U+10FFFF
+                       result[0] = static_cast<uint8_t>(0xF0 | (ch >> 18));
+                       result[1] = static_cast<uint8_t>(0x80 | ((ch >> 12) & 0x3F));
+                       result[2] = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
+                       result[3] = static_cast<uint8_t>(0x80 | (ch & 0x3F));
+                       return result + 4;
+               }
+
+               static value_type any(value_type result, uint32_t ch)
+               {
+                       return (ch < 0x10000) ? low(result, ch) : high(result, ch);
+               }
+       };
+
+       struct utf16_counter
+       {
+               typedef size_t value_type;
+
+               static value_type low(value_type result, uint32_t)
+               {
+                       return result + 1;
+               }
+
+               static value_type high(value_type result, uint32_t)
+               {
+                       return result + 2;
+               }
+       };
+
+       struct utf16_writer
+       {
+               typedef uint16_t* value_type;
+
+               static value_type low(value_type result, uint32_t ch)
+               {
+                       *result = static_cast<uint16_t>(ch);
+
+                       return result + 1;
+               }
+
+               static value_type high(value_type result, uint32_t ch)
+               {
+                       uint32_t msh = static_cast<uint32_t>(ch - 0x10000) >> 10;
+                       uint32_t lsh = static_cast<uint32_t>(ch - 0x10000) & 0x3ff;
+
+                       result[0] = static_cast<uint16_t>(0xD800 + msh);
+                       result[1] = static_cast<uint16_t>(0xDC00 + lsh);
+
+                       return result + 2;
+               }
+
+               static value_type any(value_type result, uint32_t ch)
+               {
+                       return (ch < 0x10000) ? low(result, ch) : high(result, ch);
+               }
+       };
+
+       struct utf32_counter
+       {
+               typedef size_t value_type;
+
+               static value_type low(value_type result, uint32_t)
+               {
+                       return result + 1;
+               }
+
+               static value_type high(value_type result, uint32_t)
+               {
+                       return result + 1;
+               }
+       };
+
+       struct utf32_writer
+       {
+               typedef uint32_t* value_type;
+
+               static value_type low(value_type result, uint32_t ch)
+               {
+                       *result = ch;
+
+                       return result + 1;
+               }
+
+               static value_type high(value_type result, uint32_t ch)
+               {
+                       *result = ch;
+
+                       return result + 1;
+               }
+
+               static value_type any(value_type result, uint32_t ch)
+               {
+                       *result = ch;
+
+                       return result + 1;
+               }
+       };
+
+       struct latin1_writer
+       {
+               typedef uint8_t* value_type;
+
+               static value_type low(value_type result, uint32_t ch)
+               {
+                       *result = static_cast<uint8_t>(ch > 255 ? '?' : ch);
+
+                       return result + 1;
+               }
+
+               static value_type high(value_type result, uint32_t ch)
+               {
+                       (void)ch;
+
+                       *result = '?';
+
+                       return result + 1;
+               }
+       };
+
+       template <size_t size> struct wchar_selector; // maps the byte size of wchar_t to a code unit type, counter and writer; only sizes 2 and 4 are defined
+
+       template <> struct wchar_selector<2> // 2-byte wchar_t: handled as UTF-16
+       {
+               typedef uint16_t type;
+               typedef utf16_counter counter;
+               typedef utf16_writer writer;
+       };
+
+       template <> struct wchar_selector<4> // 4-byte wchar_t: handled as UTF-32
+       {
+               typedef uint32_t type;
+               typedef utf32_counter counter;
+               typedef utf32_writer writer;
+       };
+
+       typedef wchar_selector<sizeof(wchar_t)>::counter wchar_counter; // counter matching this platform's wchar_t
+       typedef wchar_selector<sizeof(wchar_t)>::writer wchar_writer; // writer matching this platform's wchar_t
+
+       template <typename Traits, typename opt_swap = opt_false> struct utf_decoder // decodes blocks of encoded text and hands each code point to Traits::low / Traits::high; opt_swap::value turns on endian_swap of 16/32-bit input units
+       {
+               static inline typename Traits::value_type decode_utf8_block(const uint8_t* data, size_t size, typename Traits::value_type result)
+               {
+                       const uint8_t utf8_byte_mask = 0x3f; // low six bits: the payload of a 10xxxxxx continuation byte
+
+                       while (size) // size counts the input bytes still to be decoded
+                       {
+                               uint8_t lead = *data;
+
+                               // 0xxxxxxx -> U+0000..U+007F
+                               if (lead < 0x80)
+                               {
+                                       result = Traits::low(result, lead);
+                                       data += 1;
+                                       size -= 1;
+
+                                       // process aligned single-byte (ascii) blocks
+                                       if ((reinterpret_cast<uintptr_t>(data) & 3) == 0)
+                                       {
+                                               // round-trip through void* to silence 'cast increases required alignment of target type' warnings; the 0x80808080 mask tests the high bit of four bytes at once
+                                               while (size >= 4 && (*static_cast<const uint32_t*>(static_cast<const void*>(data)) & 0x80808080) == 0)
+                                               {
+                                                       result = Traits::low(result, data[0]);
+                                                       result = Traits::low(result, data[1]);
+                                                       result = Traits::low(result, data[2]);
+                                                       result = Traits::low(result, data[3]);
+                                                       data += 4;
+                                                       size -= 4;
+                                               }
+                                       }
+                               }
+                               // 110xxxxx -> U+0080..U+07FF (the unsigned subtraction is a range check: 0xC0 <= lead < 0xE0)
+                               else if (static_cast<unsigned int>(lead - 0xC0) < 0x20 && size >= 2 && (data[1] & 0xc0) == 0x80)
+                               {
+                                       result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask));
+                                       data += 2;
+                                       size -= 2;
+                               }
+                               // 1110xxxx -> U+0800-U+FFFF (range check: 0xE0 <= lead < 0xF0)
+                               else if (static_cast<unsigned int>(lead - 0xE0) < 0x10 && size >= 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80)
+                               {
+                                       result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask));
+                                       data += 3;
+                                       size -= 3;
+                               }
+                               // 11110xxx -> U+10000..U+10FFFF (range check: 0xF0 <= lead < 0xF8)
+                               else if (static_cast<unsigned int>(lead - 0xF0) < 0x08 && size >= 4 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80)
+                               {
+                                       result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask));
+                                       data += 4;
+                                       size -= 4;
+                               }
+                               // 10xxxxxx, 11111xxx, or a lead whose continuation bytes are missing or malformed -> skip one byte, emit nothing
+                               else
+                               {
+                                       data += 1;
+                                       size -= 1;
+                               }
+                       }
+
+                       return result;
+               }
+
+               static inline typename Traits::value_type decode_utf16_block(const uint16_t* data, size_t size, typename Traits::value_type result)
+               {
+                       const uint16_t* end = data + size; // size counts uint16_t units, not bytes
+
+                       while (data < end)
+                       {
+                               unsigned int lead = opt_swap::value ? endian_swap(*data) : *data;
+
+                               // U+0000..U+D7FF
+                               if (lead < 0xD800)
+                               {
+                                       result = Traits::low(result, lead);
+                                       data += 1;
+                               }
+                               // U+E000..U+FFFF
+                               else if (static_cast<unsigned int>(lead - 0xE000) < 0x2000)
+                               {
+                                       result = Traits::low(result, lead);
+                                       data += 1;
+                               }
+                               // surrogate pair lead (requires a following unit inside the block)
+                               else if (static_cast<unsigned int>(lead - 0xD800) < 0x400 && data + 1 < end)
+                               {
+                                       uint16_t next = opt_swap::value ? endian_swap(data[1]) : data[1];
+
+                                       if (static_cast<unsigned int>(next - 0xDC00) < 0x400)
+                                       {
+                                               result = Traits::high(result, 0x10000 + ((lead & 0x3ff) << 10) + (next & 0x3ff));
+                                               data += 2;
+                                       }
+                                       else
+                                       {
+                                               data += 1; // lead not followed by a trail surrogate: drop only the lead
+                                       }
+                               }
+                               else
+                               {
+                                       data += 1; // stray trail surrogate, or a lead in the last unit of the block: skip it
+                               }
+                       }
+
+                       return result;
+               }
+
+               static inline typename Traits::value_type decode_utf32_block(const uint32_t* data, size_t size, typename Traits::value_type result)
+               {
+                       const uint32_t* end = data + size; // size counts uint32_t units, not bytes
+
+                       while (data < end)
+                       {
+                               uint32_t lead = opt_swap::value ? endian_swap(*data) : *data;
+
+                               // U+0000..U+FFFF
+                               if (lead < 0x10000)
+                               {
+                                       result = Traits::low(result, lead);
+                                       data += 1;
+                               }
+                               // U+10000..U+10FFFF (every value >= 0x10000 lands here; no upper bound is checked)
+                               else
+                               {
+                                       result = Traits::high(result, lead);
+                                       data += 1;
+                               }
+                       }
+
+                       return result;
+               }
+
+               static inline typename Traits::value_type decode_latin1_block(const uint8_t* data, size_t size, typename Traits::value_type result)
+               {
+                       for (size_t i = 0; i < size; ++i)
+                       {
+                               result = Traits::low(result, data[i]); // each input byte is passed on as its own code point
+                       }
+
+                       return result;
+               }
+
+               static inline typename Traits::value_type decode_wchar_block_impl(const uint16_t* data, size_t size, typename Traits::value_type result)
+               {
+                       return decode_utf16_block(data, size, result); // overload taken when the unit type is uint16_t
+               }
+
+               static inline typename Traits::value_type decode_wchar_block_impl(const uint32_t* data, size_t size, typename Traits::value_type result)
+               {
+                       return decode_utf32_block(data, size, result); // overload taken when the unit type is uint32_t
+               }
+
+               static inline typename Traits::value_type decode_wchar_block(const wchar_t* data, size_t size, typename Traits::value_type result)
+               {
+                       return decode_wchar_block_impl(reinterpret_cast<const wchar_selector<sizeof(wchar_t)>::type*>(data), size, result); // view wchar_t storage as the same-sized unsigned type so overload resolution picks utf16 or utf32
+               }
+       };
+
+       template <typename T> PUGI__FN void convert_utf_endian_swap(T* result, const T* data, size_t length) // write endian_swap(data[i]) to result[i] for the first length elements
+       {
+               for (size_t i = 0; i < length; ++i) result[i] = endian_swap(data[i]);
+       }
+
+#ifdef PUGIXML_WCHAR_MODE
+       PUGI__FN void convert_wchar_endian_swap(wchar_t* result, const wchar_t* data, size_t length) // element-wise byte swap of wchar_t units; convert_buffer_endian_swap also calls it with result == data
+       {
+               for (size_t i = 0; i < length; ++i) result[i] = static_cast<wchar_t>(endian_swap(static_cast<wchar_selector<sizeof(wchar_t)>::type>(data[i])));
+       }
+#endif
+PUGI__NS_END
+
+PUGI__NS_BEGIN
+       enum chartype_t // bit flags combined per character in chartype_table; each comment lists the characters carrying that flag
+       {
+               ct_parse_pcdata = 1,    // \0, &, \r, <
+               ct_parse_attr = 2,              // \0, &, \r, ', "
+               ct_parse_attr_ws = 4,   // \0, &, \r, ', ", \n, tab
+               ct_space = 8,                   // \r, \n, space, tab
+               ct_parse_cdata = 16,    // \0, ], >, \r
+               ct_parse_comment = 32,  // \0, -, >, \r
+               ct_symbol = 64,                 // Any symbol > 127, a-z, A-Z, 0-9, _, :, -, .
+               ct_start_symbol = 128   // Any symbol > 127, a-z, A-Z, _, :
+       };
+
+       static const unsigned char chartype_table[256] = // one OR-ed chartype_t mask per byte value; queried through PUGI__IS_CHARTYPE
+       {
+               55,  0,   0,   0,   0,   0,   0,   0,      0,   12,  12,  0,   0,   63,  0,   0,   // 0-15
+               0,   0,   0,   0,   0,   0,   0,   0,      0,   0,   0,   0,   0,   0,   0,   0,   // 16-31
+               8,   0,   6,   0,   0,   0,   7,   6,      0,   0,   0,   0,   0,   96,  64,  0,   // 32-47
+               64,  64,  64,  64,  64,  64,  64,  64,     64,  64,  192, 0,   1,   0,   48,  0,   // 48-63
+               0,   192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 64-79
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 0,   0,   16,  0,   192, // 80-95
+               0,   192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 96-111
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 0, 0, 0, 0, 0,           // 112-127
+
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192, // 128+ (every entry is ct_symbol | ct_start_symbol)
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192,
+               192, 192, 192, 192, 192, 192, 192, 192,    192, 192, 192, 192, 192, 192, 192, 192
+       };
+
+       enum chartypex_t // bit flags combined per character in chartypex_table; each comment lists the characters carrying that flag
+       {
+               ctx_special_pcdata = 1,   // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, >
+               ctx_special_attr = 2,     // Any symbol >= 0 and < 32 (except \t), &, <, >, "
+               ctx_start_symbol = 4,     // Any symbol > 127, a-z, A-Z, _
+               ctx_digit = 8,                    // 0-9
+               ctx_symbol = 16                   // Any symbol > 127, a-z, A-Z, 0-9, _, -, .
+       };
+       
+       static const unsigned char chartypex_table[256] = // one OR-ed chartypex_t mask per byte value; queried through PUGI__IS_CHARTYPEX
+       {
+               3,  3,  3,  3,  3,  3,  3,  3,     3,  0,  2,  3,  3,  2,  3,  3,     // 0-15
+               3,  3,  3,  3,  3,  3,  3,  3,     3,  3,  3,  3,  3,  3,  3,  3,     // 16-31
+               0,  0,  2,  0,  0,  0,  3,  0,     0,  0,  0,  0,  0, 16, 16,  0,     // 32-47
+               24, 24, 24, 24, 24, 24, 24, 24,    24, 24, 0,  0,  3,  0,  3,  0,     // 48-63
+
+               0,  20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 64-79
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 0,  0,  0,  0,  20,    // 80-95
+               0,  20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 96-111
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 0,  0,  0,  0,  0,     // 112-127
+
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,    // 128+ (every entry is ctx_start_symbol | ctx_symbol)
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20,
+               20, 20, 20, 20, 20, 20, 20, 20,    20, 20, 20, 20, 20, 20, 20, 20
+       };
+       
+#ifdef PUGIXML_WCHAR_MODE // wide characters: values >= 128 all share the table[128] entry, so the 256-entry table is never indexed out of range
+       #define PUGI__IS_CHARTYPE_IMPL(c, ct, table) ((static_cast<unsigned int>(c) < 128 ? table[static_cast<unsigned int>(c)] : table[128]) & (ct))
+#else // narrow characters: the cast to unsigned char keeps the index within 0..255 even when char is signed
+       #define PUGI__IS_CHARTYPE_IMPL(c, ct, table) (table[static_cast<unsigned char>(c)] & (ct))
+#endif
+
+       #define PUGI__IS_CHARTYPE(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartype_table) /* non-zero when c carries the chartype_t flag(s) ct */
+       #define PUGI__IS_CHARTYPEX(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartypex_table) /* non-zero when c carries the chartypex_t flag(s) ct */
+
+       PUGI__FN bool is_little_endian() // runtime byte-order probe
+       {
+               unsigned int ui = 1;
+
+               return *reinterpret_cast<unsigned char*>(&ui) == 1; // true when the lowest-addressed byte holds the least significant byte
+       }
+
+       PUGI__FN xml_encoding get_wchar_encoding() // concrete utf16/utf32 encoding matching this platform's wchar_t width and byte order
+       {
+               PUGI__STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4);
+
+               if (sizeof(wchar_t) == 2)
+                       return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+               else 
+                       return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+       }
+
+       PUGI__FN xml_encoding guess_buffer_encoding(uint8_t d0, uint8_t d1, uint8_t d2, uint8_t d3) // guess the encoding from the first four bytes d0..d3 of a buffer
+       {
+               // look for BOM in first few bytes (utf32 is tested first: the utf32_le BOM ff fe 00 00 starts with the utf16_le BOM ff fe)
+               if (d0 == 0 && d1 == 0 && d2 == 0xfe && d3 == 0xff) return encoding_utf32_be;
+               if (d0 == 0xff && d1 == 0xfe && d2 == 0 && d3 == 0) return encoding_utf32_le;
+               if (d0 == 0xfe && d1 == 0xff) return encoding_utf16_be;
+               if (d0 == 0xff && d1 == 0xfe) return encoding_utf16_le;
+               if (d0 == 0xef && d1 == 0xbb && d2 == 0xbf) return encoding_utf8;
+
+               // look for <, <? or <?xm in various encodings (0x3c is '<', 0x3f is '?', 0x78 is 'x', 0x6d is 'm')
+               if (d0 == 0 && d1 == 0 && d2 == 0 && d3 == 0x3c) return encoding_utf32_be;
+               if (d0 == 0x3c && d1 == 0 && d2 == 0 && d3 == 0) return encoding_utf32_le;
+               if (d0 == 0 && d1 == 0x3c && d2 == 0 && d3 == 0x3f) return encoding_utf16_be;
+               if (d0 == 0x3c && d1 == 0 && d2 == 0x3f && d3 == 0) return encoding_utf16_le;
+               if (d0 == 0x3c && d1 == 0x3f && d2 == 0x78 && d3 == 0x6d) return encoding_utf8;
+
+               // look for utf16 < followed by node name (this may fail, but is better than utf8 since it's zero terminated so early)
+               if (d0 == 0 && d1 == 0x3c) return encoding_utf16_be;
+               if (d0 == 0x3c && d1 == 0) return encoding_utf16_le;
+
+               // no known BOM or '<' pattern detected, assume utf8
+               return encoding_utf8;
+       }
+
+       PUGI__FN xml_encoding get_buffer_encoding(xml_encoding encoding, const void* contents, size_t size) // turn a generic or auto encoding request into a concrete encoding; size is in bytes
+       {
+               // replace wchar encoding with utf implementation
+               if (encoding == encoding_wchar) return get_wchar_encoding();
+
+               // replace utf16 encoding with utf16 with specific endianness
+               if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+               // replace utf32 encoding with utf32 with specific endianness
+               if (encoding == encoding_utf32) return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+               // only do autodetection if no explicit encoding is requested
+               if (encoding != encoding_auto) return encoding;
+
+               // skip encoding autodetection if input buffer is too small (the guess below reads four bytes)
+               if (size < 4) return encoding_utf8;
+
+               // try to guess encoding (based on XML specification, Appendix F.1)
+               const uint8_t* data = static_cast<const uint8_t*>(contents);
+
+               PUGI__DMC_VOLATILE uint8_t d0 = data[0], d1 = data[1], d2 = data[2], d3 = data[3]; // NOTE(review): PUGI__DMC_VOLATILE is defined elsewhere; presumably a compiler workaround - confirm
+
+               return guess_buffer_encoding(d0, d1, d2, d3);
+       }
+
+       PUGI__FN bool get_mutable_buffer(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) // hand back a writable char_t buffer for contents; returns false only when allocation fails
+       {
+               size_t length = size / sizeof(char_t); // size is in bytes; any trailing partial char_t is dropped
+
+               if (is_mutable)
+               {
+                       out_buffer = static_cast<char_t*>(const_cast<void*>(contents)); // reuse the caller's memory in place; no terminator is written
+                       out_length = length;
+               }
+               else
+               {
+                       char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t))); // +1 for the zero terminator
+                       if (!buffer) return false;
+
+                       if (contents)
+                               memcpy(buffer, contents, length * sizeof(char_t));
+                       else
+                               assert(length == 0); // a null source is only acceptable when there is nothing to copy
+
+                       buffer[length] = 0;
+
+                       out_buffer = buffer;
+                       out_length = length + 1; // reported length includes the terminator
+               }
+
+               return true;
+       }
+
+#ifdef PUGIXML_WCHAR_MODE
+       PUGI__FN bool need_endian_swap_utf(xml_encoding le, xml_encoding re) // true when le and re are the same utf16 or utf32 family with opposite byte order
+       {
+               return (le == encoding_utf16_be && re == encoding_utf16_le) || (le == encoding_utf16_le && re == encoding_utf16_be) ||
+                          (le == encoding_utf32_be && re == encoding_utf32_le) || (le == encoding_utf32_le && re == encoding_utf32_be);
+       }
+
+       PUGI__FN bool convert_buffer_endian_swap(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) // byte-swap every char_t of contents, in place or into a new buffer; returns false only when allocation fails
+       {
+               const char_t* data = static_cast<const char_t*>(contents);
+               size_t length = size / sizeof(char_t); // size is in bytes
+
+               if (is_mutable)
+               {
+                       char_t* buffer = const_cast<char_t*>(data);
+
+                       convert_wchar_endian_swap(buffer, data, length); // source and destination are the same memory here
+
+                       out_buffer = buffer;
+                       out_length = length; // no terminator is added on the in-place path
+               }
+               else
+               {
+                       char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t))); // +1 for the zero terminator
+                       if (!buffer) return false;
+
+                       convert_wchar_endian_swap(buffer, data, length);
+                       buffer[length] = 0;
+
+                       out_buffer = buffer;
+                       out_length = length + 1; // reported length includes the terminator
+               }
+
+               return true;
+       }
+
+       PUGI__FN bool convert_buffer_utf8(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size) // decode utf8 bytes into a newly allocated, zero-terminated wchar_t buffer; returns false only when allocation fails
+       {
+               const uint8_t* data = static_cast<const uint8_t*>(contents);
+               size_t data_length = size;
+
+               // first pass: get length in wchar_t units
+               size_t length = utf_decoder<wchar_counter>::decode_utf8_block(data, data_length, 0);
+
+               // allocate buffer of suitable length (+1 for the zero terminator)
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false;
+
+               // second pass: convert utf8 input to wchar_t
+               wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer);
+               wchar_writer::value_type oend = utf_decoder<wchar_writer>::decode_utf8_block(data, data_length, obegin);
+
+               assert(oend == obegin + length); // the writer must produce exactly what the counting pass predicted
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1; // reported length includes the terminator
+
+               return true;
+       }
+
+       template <typename opt_swap> PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap) // decode utf16 input into a newly allocated, zero-terminated wchar_t buffer; the unnamed tag argument only selects opt_swap
+       {
+               const uint16_t* data = static_cast<const uint16_t*>(contents);
+               size_t data_length = size / sizeof(uint16_t); // size is in bytes; a trailing odd byte is ignored
+
+               // first pass: get length in wchar_t units
+               size_t length = utf_decoder<wchar_counter, opt_swap>::decode_utf16_block(data, data_length, 0);
+
+               // allocate buffer of suitable length (+1 for the zero terminator)
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false;
+
+               // second pass: convert utf16 input to wchar_t
+               wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer);
+               wchar_writer::value_type oend = utf_decoder<wchar_writer, opt_swap>::decode_utf16_block(data, data_length, obegin);
+
+               assert(oend == obegin + length); // the writer must produce exactly what the counting pass predicted
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1; // reported length includes the terminator
+
+               return true;
+       }
+
+       template <typename opt_swap> PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap) // decode utf32 input into a newly allocated, zero-terminated wchar_t buffer; the unnamed tag argument only selects opt_swap
+       {
+               const uint32_t* data = static_cast<const uint32_t*>(contents);
+               size_t data_length = size / sizeof(uint32_t); // size is in bytes; trailing bytes short of a full unit are ignored
+
+               // first pass: get length in wchar_t units
+               size_t length = utf_decoder<wchar_counter, opt_swap>::decode_utf32_block(data, data_length, 0);
+
+               // allocate buffer of suitable length (+1 for the zero terminator)
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false;
+
+               // second pass: convert utf32 input to wchar_t
+               wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer);
+               wchar_writer::value_type oend = utf_decoder<wchar_writer, opt_swap>::decode_utf32_block(data, data_length, obegin);
+
+               assert(oend == obegin + length); // the writer must produce exactly what the counting pass predicted
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1; // reported length includes the terminator
+
+               return true;
+       }
+
+       PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size) // widen latin1 bytes into a newly allocated, zero-terminated wchar_t buffer; returns false only when allocation fails
+       {
+               const uint8_t* data = static_cast<const uint8_t*>(contents);
+               size_t data_length = size;
+
+               // get length in wchar_t units (one output unit per input byte, so no counting pass is needed)
+               size_t length = data_length;
+
+               // allocate buffer of suitable length (+1 for the zero terminator)
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false;
+
+               // convert latin1 input to wchar_t
+               wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer);
+               wchar_writer::value_type oend = utf_decoder<wchar_writer>::decode_latin1_block(data, data_length, obegin);
+
+               assert(oend == obegin + length);
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1; // reported length includes the terminator
+
+               return true;
+       }
+
+       PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable) // wchar-mode dispatcher: convert contents from a concrete encoding to native wchar_t; is_mutable is honoured only by the first two paths
+       {
+               // get native encoding
+               xml_encoding wchar_encoding = get_wchar_encoding();
+
+               // fast path: no conversion required
+               if (encoding == wchar_encoding) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+               // only endian-swapping is required
+               if (need_endian_swap_utf(encoding, wchar_encoding)) return convert_buffer_endian_swap(out_buffer, out_length, contents, size, is_mutable);
+
+               // source encoding is utf8
+               if (encoding == encoding_utf8) return convert_buffer_utf8(out_buffer, out_length, contents, size);
+
+               // source encoding is utf16 (swap input units only when it is not in the machine's byte order)
+               if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+               {
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+                       return (native_encoding == encoding) ?
+                               convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) :
+                               convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
+               }
+
+               // source encoding is utf32 (swap input units only when it is not in the machine's byte order)
+               if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+               {
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+                       return (native_encoding == encoding) ?
+                               convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) :
+                               convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
+               }
+
+               // source encoding is latin1
+               if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size);
+
+               assert(!"Invalid encoding"); // reached only for an encoding none of the branches above handles
+               return false;
+       }
+#else
+       template <typename opt_swap> PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap) // transcode utf16 input into a newly allocated, zero-terminated utf8 buffer; the unnamed tag argument only selects opt_swap
+       {
+               const uint16_t* data = static_cast<const uint16_t*>(contents);
+               size_t data_length = size / sizeof(uint16_t); // size is in bytes; a trailing odd byte is ignored
+
+               // first pass: get length in utf8 units
+               size_t length = utf_decoder<utf8_counter, opt_swap>::decode_utf16_block(data, data_length, 0);
+
+               // allocate buffer of suitable length (+1 for the zero terminator)
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false;
+
+               // second pass: convert utf16 input to utf8
+               uint8_t* obegin = reinterpret_cast<uint8_t*>(buffer);
+               uint8_t* oend = utf_decoder<utf8_writer, opt_swap>::decode_utf16_block(data, data_length, obegin);
+
+               assert(oend == obegin + length); // the writer must produce exactly what the counting pass predicted
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1; // reported length includes the terminator
+
+               return true;
+       }
+
+       template <typename opt_swap> PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap) // transcode utf32 input into a newly allocated, zero-terminated utf8 buffer; the unnamed tag argument only selects opt_swap
+       {
+               const uint32_t* data = static_cast<const uint32_t*>(contents);
+               size_t data_length = size / sizeof(uint32_t); // size is in bytes; trailing bytes short of a full unit are ignored
+
+               // first pass: get length in utf8 units
+               size_t length = utf_decoder<utf8_counter, opt_swap>::decode_utf32_block(data, data_length, 0);
+
+               // allocate buffer of suitable length (+1 for the zero terminator)
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false;
+
+               // second pass: convert utf32 input to utf8
+               uint8_t* obegin = reinterpret_cast<uint8_t*>(buffer);
+               uint8_t* oend = utf_decoder<utf8_writer, opt_swap>::decode_utf32_block(data, data_length, obegin);
+
+               assert(oend == obegin + length); // the writer must produce exactly what the counting pass predicted
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1; // reported length includes the terminator
+
+               return true;
+       }
+
+       PUGI__FN size_t get_latin1_7bit_prefix_length(const uint8_t* data, size_t size) // number of leading bytes that are all <= 127
+       {
+               for (size_t i = 0; i < size; ++i)
+                       if (data[i] > 127)
+                               return i; // index of the first byte with the high bit set
+
+               return size; // the whole input is 7-bit
+       }
+
+       PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) // transcode latin1 input to utf8, reusing the input when every byte is 7-bit; returns false only when allocation fails
+       {
+               const uint8_t* data = static_cast<const uint8_t*>(contents);
+               size_t data_length = size;
+
+               // get size of prefix that does not need utf8 conversion
+               size_t prefix_length = get_latin1_7bit_prefix_length(data, data_length);
+               assert(prefix_length <= data_length);
+
+               const uint8_t* postfix = data + prefix_length; // first byte above 127 and everything after it
+               size_t postfix_length = data_length - prefix_length;
+
+               // if no conversion is needed, just return the original buffer (is_mutable only matters on this path)
+               if (postfix_length == 0) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+               // first pass: get length in utf8 units
+               size_t length = prefix_length + utf_decoder<utf8_counter>::decode_latin1_block(postfix, postfix_length, 0);
+
+               // allocate buffer of suitable length (+1 for the zero terminator)
+               char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+               if (!buffer) return false;
+
+               // second pass: convert latin1 input to utf8 (the 7-bit prefix is copied verbatim, only the rest is decoded)
+               memcpy(buffer, data, prefix_length);
+
+               uint8_t* obegin = reinterpret_cast<uint8_t*>(buffer);
+               uint8_t* oend = utf_decoder<utf8_writer>::decode_latin1_block(postfix, postfix_length, obegin + prefix_length);
+
+               assert(oend == obegin + length); // the writer must produce exactly what the counting pass predicted
+               *oend = 0;
+
+               out_buffer = buffer;
+               out_length = length + 1; // reported length includes the terminator
+
+               return true;
+       }
+
+       PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable) // char-mode dispatcher: convert contents from a concrete encoding to utf8
+       {
+               // fast path: no conversion required
+               if (encoding == encoding_utf8) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
+
+               // source encoding is utf16 (swap input units only when it is not in the machine's byte order)
+               if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+               {
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+                       return (native_encoding == encoding) ?
+                               convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) :
+                               convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
+               }
+
+               // source encoding is utf32 (swap input units only when it is not in the machine's byte order)
+               if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+               {
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+                       return (native_encoding == encoding) ?
+                               convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) :
+                               convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
+               }
+
+               // source encoding is latin1
+               if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size, is_mutable);
+
+               assert(!"Invalid encoding"); // reached only for an encoding none of the branches above handles
+               return false;
+       }
+#endif
+
+       // First pass of wide -> UTF-8 conversion: runs the decoder with the counting traits
+       // (starting from 0) and returns the resulting size, without writing any output.
+       PUGI__FN size_t as_utf8_begin(const wchar_t* str, size_t length)
+       {
+               const size_t utf8_size = utf_decoder<utf8_counter>::decode_wchar_block(str, length, 0);
+
+               return utf8_size;
+       }
+
+       PUGI__FN void as_utf8_end(char* buffer, size_t size, const wchar_t* str, size_t length)
+       {
+               // convert to utf8
+               uint8_t* begin = reinterpret_cast<uint8_t*>(buffer);
+               uint8_t* end = utf_decoder<utf8_writer>::decode_wchar_block(str, length, begin);
+       
+               assert(begin + size == end);
+               (void)!end;
+
+               // zero-terminate
+               buffer[size] = 0;
+       }
+       
+#ifndef PUGIXML_NO_STL
+       PUGI__FN std::string as_utf8_impl(const wchar_t* str, size_t length)
+       {
+               // first pass: get length in utf8 characters
+               size_t size = as_utf8_begin(str, length);
+
+               // allocate resulting string
+               std::string result;
+               result.resize(size);
+
+               // second pass: convert to utf8
+               if (size > 0) as_utf8_end(&result[0], size, str, length);
+
+               return result;
+       }
+
+       PUGI__FN std::basic_string<wchar_t> as_wide_impl(const char* str, size_t size)
+       {
+               const uint8_t* data = reinterpret_cast<const uint8_t*>(str);
+
+               // first pass: get length in wchar_t units
+               size_t length = utf_decoder<wchar_counter>::decode_utf8_block(data, size, 0);
+
+               // allocate resulting string
+               std::basic_string<wchar_t> result;
+               result.resize(length);
+
+               // second pass: convert to wchar_t
+               if (length > 0)
+               {
+                       wchar_writer::value_type begin = reinterpret_cast<wchar_writer::value_type>(&result[0]);
+                       wchar_writer::value_type end = utf_decoder<wchar_writer>::decode_utf8_block(data, size, begin);
+
+                       assert(begin + length == end);
+                       (void)!end;
+               }
+
+               return result;
+       }
+#endif
+
+       inline bool strcpy_insitu_allow(size_t length, uintptr_t header, uintptr_t header_mask, char_t* target)
+       {
+               // never reuse shared memory
+               if (header & xml_memory_page_contents_shared_mask) return false;
+
+               size_t target_length = strlength(target);
+
+               // always reuse document buffer memory if possible
+               if ((header & header_mask) == 0) return target_length >= length;
+
+               // reuse heap memory if waste is not too great
+               const size_t reuse_threshold = 32;
+
+               return target_length >= length && (target_length < reuse_threshold || target_length - length < target_length / 2);
+       }
+
+       PUGI__FN bool strcpy_insitu(char_t*& dest, uintptr_t& header, uintptr_t header_mask, const char_t* source)
+       {
+               assert(header);
+
+               size_t source_length = strlength(source);
+
+               if (source_length == 0)
+               {
+                       // empty string and null pointer are equivalent, so just deallocate old memory
+                       xml_allocator* alloc = reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask)->allocator;
+
+                       if (header & header_mask) alloc->deallocate_string(dest);
+                       
+                       // mark the string as not allocated
+                       dest = 0;
+                       header &= ~header_mask;
+
+                       return true;
+               }
+               else if (dest && strcpy_insitu_allow(source_length, header, header_mask, dest))
+               {
+                       // we can reuse old buffer, so just copy the new data (including zero terminator)
+                       memcpy(dest, source, (source_length + 1) * sizeof(char_t));
+                       
+                       return true;
+               }
+               else
+               {
+                       xml_allocator* alloc = reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask)->allocator;
+
+                       // allocate new buffer
+                       char_t* buf = alloc->allocate_string(source_length + 1);
+                       if (!buf) return false;
+
+                       // copy the string (including zero terminator)
+                       memcpy(buf, source, (source_length + 1) * sizeof(char_t));
+
+                       // deallocate old buffer (*after* the above to protect against overlapping memory and/or allocation failures)
+                       if (header & header_mask) alloc->deallocate_string(dest);
+                       
+                       // the string is now allocated, so set the flag
+                       dest = buf;
+                       header |= header_mask;
+
+                       return true;
+               }
+       }
+
+       struct gap
+       {
+               char_t* end;
+               size_t size;
+                       
+               gap(): end(0), size(0)
+               {
+               }
+                       
+               // Push new gap, move s count bytes further (skipping the gap).
+               // Collapse previous gap.
+               void push(char_t*& s, size_t count)
+               {
+                       if (end) // there was a gap already; collapse it
+                       {
+                               // Move [old_gap_end, new_gap_start) to [old_gap_start, ...)
+                               assert(s >= end);
+                               memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
+                       }
+                               
+                       s += count; // end of current gap
+                               
+                       // "merge" two gaps
+                       end = s;
+                       size += count;
+               }
+                       
+               // Collapse all gaps, return past-the-end pointer
+               char_t* flush(char_t* s)
+               {
+                       if (end)
+                       {
+                               // Move [old_gap_end, current_pos) to [old_gap_start, ...)
+                               assert(s >= end);
+                               memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
+
+                               return s - size;
+                       }
+                       else return s;
+               }
+       };
+       
+       // Decodes one entity/character reference in place.
+       // s: points at the introducing character of the reference (callers in this file invoke this
+       //    when *s == '&'); g: gap tracker that receives the characters freed by the replacement.
+       // Recognized forms: &#NNN; (decimal), &#xHHH; (hex), &amp; &apos; &gt; &lt; &quot;
+       // On success the decoded character(s) are written at s, the rest of the reference is pushed
+       // to g as a gap, and the position just past ';' is returned.
+       // On anything else the buffer is left unmodified and the position where matching stopped is
+       // returned, so the caller continues scanning from there.
+       PUGI__FN char_t* strconv_escape(char_t* s, gap& g)
+       {
+               // read cursor; s itself stays at the start of the reference as the write position
+               char_t* stre = s + 1;
+
+               switch (*stre)
+               {
+                       case '#':       // &#...
+                       {
+                               // code point accumulated from the digits
+                               unsigned int ucsc = 0;
+
+                               if (stre[1] == 'x') // &#x... (hex code)
+                               {
+                                       stre += 2;
+
+                                       char_t ch = *stre;
+
+                                       // "&#x;" has no digits: not a reference
+                                       if (ch == ';') return stre;
+
+                                       for (;;)
+                                       {
+                                               // the unsigned compare folds "ch >= '0' && ch <= '9'" into one test
+                                               if (static_cast<unsigned int>(ch - '0') <= 9)
+                                                       ucsc = 16 * ucsc + (ch - '0');
+                                               // (ch | ' ') maps ASCII 'A'..'F' onto 'a'..'f'
+                                               else if (static_cast<unsigned int>((ch | ' ') - 'a') <= 5)
+                                                       ucsc = 16 * ucsc + ((ch | ' ') - 'a' + 10);
+                                               else if (ch == ';')
+                                                       break;
+                                               else // cancel
+                                                       return stre;
+
+                                               ch = *++stre;
+                                       }
+                                       
+                                       // step over ';'
+                                       ++stre;
+                               }
+                               else    // &#... (dec code)
+                               {
+                                       char_t ch = *++stre;
+
+                                       // "&#;" has no digits: not a reference
+                                       if (ch == ';') return stre;
+
+                                       for (;;)
+                                       {
+                                               if (static_cast<unsigned int>(static_cast<unsigned int>(ch) - '0') <= 9)
+                                                       ucsc = 10 * ucsc + (ch - '0');
+                                               else if (ch == ';')
+                                                       break;
+                                               else // cancel
+                                                       return stre;
+
+                                               ch = *++stre;
+                                       }
+                                       
+                                       // step over ';'
+                                       ++stre;
+                               }
+
+                               // encode the code point at the write position; s advances past the encoded output
+                       #ifdef PUGIXML_WCHAR_MODE
+                               s = reinterpret_cast<char_t*>(wchar_writer::any(reinterpret_cast<wchar_writer::value_type>(s), ucsc));
+                       #else
+                               s = reinterpret_cast<char_t*>(utf8_writer::any(reinterpret_cast<uint8_t*>(s), ucsc));
+                       #endif
+                                       
+                               // whatever remains between the encoded output and the end of the reference is a gap
+                               g.push(s, stre - s);
+                               return stre;
+                       }
+
+                       case 'a':       // &a
+                       {
+                               ++stre;
+
+                               if (*stre == 'm') // &am
+                               {
+                                       if (*++stre == 'p' && *++stre == ';') // &amp;
+                                       {
+                                               *s++ = '&';
+                                               ++stre;
+                                                       
+                                               g.push(s, stre - s);
+                                               return stre;
+                                       }
+                               }
+                               else if (*stre == 'p') // &ap
+                               {
+                                       if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // &apos;
+                                       {
+                                               *s++ = '\'';
+                                               ++stre;
+
+                                               g.push(s, stre - s);
+                                               return stre;
+                                       }
+                               }
+                               break;
+                       }
+
+                       case 'g': // &g
+                       {
+                               if (*++stre == 't' && *++stre == ';') // &gt;
+                               {
+                                       *s++ = '>';
+                                       ++stre;
+                                       
+                                       g.push(s, stre - s);
+                                       return stre;
+                               }
+                               break;
+                       }
+
+                       case 'l': // &l
+                       {
+                               if (*++stre == 't' && *++stre == ';') // &lt;
+                               {
+                                       *s++ = '<';
+                                       ++stre;
+                                               
+                                       g.push(s, stre - s);
+                                       return stre;
+                               }
+                               break;
+                       }
+
+                       case 'q': // &q
+                       {
+                               if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // &quot;
+                               {
+                                       *s++ = '"';
+                                       ++stre;
+                                       
+                                       g.push(s, stre - s);
+                                       return stre;
+                               }
+                               break;
+                       }
+
+                       default:
+                               break;
+               }
+               
+               // not a recognized reference: resume scanning at the first character that did not match
+               return stre;
+       }
+
+       // Parser utilities
+       // These macros expand inside the parsing functions and refer to names of the expanding scope
+       // (s, ss, endch, optmsk, cursor, alloc, ch, error_offset, error_status) rather than taking them
+       // as arguments.
+
+       // True if c equals e, or if c is the terminator (0) and endch equals e.
+       // NOTE(review): endch presumably holds the last buffer character that was replaced by the
+       // zero terminator -- confirm against the callers that set it up.
+       #define PUGI__ENDSWITH(c, e)        ((c) == (e) || ((c) == 0 && endch == (e)))
+       // Advances s past characters classified as ct_space.
+       #define PUGI__SKIPWS()              { while (PUGI__IS_CHARTYPE(*s, ct_space)) ++s; }
+       // Non-zero if any bit of OPT is set in optmsk.
+       #define PUGI__OPTSET(OPT)           ( optmsk & (OPT) )
+       // Appends a new node of TYPE under cursor and makes it the cursor; fails with status_out_of_memory.
+       #define PUGI__PUSHNODE(TYPE)        { cursor = append_new_node(cursor, alloc, TYPE); if (!cursor) PUGI__THROW_ERROR(status_out_of_memory, s); }
+       // Moves cursor back to its parent node.
+       #define PUGI__POPNODE()             { cursor = cursor->parent; }
+       // Advances s until condition X holds or the terminator is reached.
+       #define PUGI__SCANFOR(X)            { while (*s != 0 && !(X)) ++s; }
+       // Advances s while condition X holds (X itself must stop at the terminator).
+       #define PUGI__SCANWHILE(X)          { while (X) ++s; }
+       // Same as PUGI__SCANWHILE but tests four characters per loop iteration; X must be written in
+       // terms of ss, which holds the character being tested. s ends on the first character failing X.
+       #define PUGI__SCANWHILE_UNROLL(X)   { for (;;) { char_t ss = s[0]; if (PUGI__UNLIKELY(!(X))) { break; } ss = s[1]; if (PUGI__UNLIKELY(!(X))) { s += 1; break; } ss = s[2]; if (PUGI__UNLIKELY(!(X))) { s += 2; break; } ss = s[3]; if (PUGI__UNLIKELY(!(X))) { s += 3; break; } s += 4; } }
+       // Saves the current character in ch, overwrites it with a terminator and steps past it.
+       #define PUGI__ENDSEG()              { ch = *s; *s = 0; ++s; }
+       // Records the error position m and status err, then returns a null char_t* from the expanding function.
+       #define PUGI__THROW_ERROR(err, m)   return error_offset = m, error_status = err, static_cast<char_t*>(0)
+       // Fails with err at m if s has reached the terminator.
+       #define PUGI__CHECK_ERROR(err, m)   { if (*s == 0) PUGI__THROW_ERROR(err, m); }
+
+       PUGI__FN char_t* strconv_comment(char_t* s, char_t endch)
+       {
+               gap g;
+               
+               while (true)
+               {
+                       PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_comment));
+               
+                       if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+                       {
+                               *s++ = '\n'; // replace first one with 0x0a
+                               
+                               if (*s == '\n') g.push(s, 1);
+                       }
+                       else if (s[0] == '-' && s[1] == '-' && PUGI__ENDSWITH(s[2], '>')) // comment ends here
+                       {
+                               *g.flush(s) = 0;
+                               
+                               return s + (s[2] == '>' ? 3 : 2);
+                       }
+                       else if (*s == 0)
+                       {
+                               return 0;
+                       }
+                       else ++s;
+               }
+       }
+
+       PUGI__FN char_t* strconv_cdata(char_t* s, char_t endch)
+       {
+               gap g;
+                       
+               while (true)
+               {
+                       PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_cdata));
+                       
+                       if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+                       {
+                               *s++ = '\n'; // replace first one with 0x0a
+                               
+                               if (*s == '\n') g.push(s, 1);
+                       }
+                       else if (s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>')) // CDATA ends here
+                       {
+                               *g.flush(s) = 0;
+                               
+                               return s + 1;
+                       }
+                       else if (*s == 0)
+                       {
+                               return 0;
+                       }
+                       else ++s;
+               }
+       }
+       
+       typedef char_t* (*strconv_pcdata_t)(char_t*);
+               
+       template <typename opt_trim, typename opt_eol, typename opt_escape> struct strconv_pcdata_impl
+       {
+               static char_t* parse(char_t* s)
+               {
+                       gap g;
+
+                       char_t* begin = s;
+
+                       while (true)
+                       {
+                               PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_pcdata));
+
+                               if (*s == '<') // PCDATA ends here
+                               {
+                                       char_t* end = g.flush(s);
+
+                                       if (opt_trim::value)
+                                               while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space))
+                                                       --end;
+
+                                       *end = 0;
+                                       
+                                       return s + 1;
+                               }
+                               else if (opt_eol::value && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
+                               {
+                                       *s++ = '\n'; // replace first one with 0x0a
+                                       
+                                       if (*s == '\n') g.push(s, 1);
+                               }
+                               else if (opt_escape::value && *s == '&')
+                               {
+                                       s = strconv_escape(s, g);
+                               }
+                               else if (*s == 0)
+                               {
+                                       char_t* end = g.flush(s);
+
+                                       if (opt_trim::value)
+                                               while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space))
+                                                       --end;
+
+                                       *end = 0;
+
+                                       return s;
+                               }
+                               else ++s;
+                       }
+               }
+       };
+       
+       PUGI__FN strconv_pcdata_t get_strconv_pcdata(unsigned int optmask)
+       {
+               PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_trim_pcdata == 0x0800);
+
+               switch (((optmask >> 4) & 3) | ((optmask >> 9) & 4)) // get bitmask for flags (eol escapes trim)
+               {
+               case 0: return strconv_pcdata_impl<opt_false, opt_false, opt_false>::parse;
+               case 1: return strconv_pcdata_impl<opt_false, opt_false, opt_true>::parse;
+               case 2: return strconv_pcdata_impl<opt_false, opt_true, opt_false>::parse;
+               case 3: return strconv_pcdata_impl<opt_false, opt_true, opt_true>::parse;
+               case 4: return strconv_pcdata_impl<opt_true, opt_false, opt_false>::parse;
+               case 5: return strconv_pcdata_impl<opt_true, opt_false, opt_true>::parse;
+               case 6: return strconv_pcdata_impl<opt_true, opt_true, opt_false>::parse;
+               case 7: return strconv_pcdata_impl<opt_true, opt_true, opt_true>::parse;
+               default: assert(false); return 0; // should not get here
+               }
+       }
+
+       // Signature of an attribute value converter: (value start, closing quote character).
+       typedef char_t* (*strconv_attribute_t)(char_t*, char_t);
+       
+       // In-place attribute value converters. opt_escape selects at compile time whether
+       // entity/character references are decoded. Every parse_* function zero-terminates the value
+       // at the closing quote and returns the position just past that quote, or returns 0 if the
+       // terminator is reached before the quote is found.
+       template <typename opt_escape> struct strconv_attribute_impl
+       {
+               // Whitespace normalization: leading and trailing whitespace is removed and every
+               // inner run of whitespace characters is replaced by a single space.
+               static char_t* parse_wnorm(char_t* s, char_t end_quote)
+               {
+                       gap g;
+
+                       // trim leading whitespaces
+                       if (PUGI__IS_CHARTYPE(*s, ct_space))
+                       {
+                               char_t* str = s;
+                               
+                               do ++str;
+                               while (PUGI__IS_CHARTYPE(*str, ct_space));
+                               
+                               // the whole leading run becomes a gap; s moves to the first non-space character
+                               g.push(s, str - s);
+                       }
+
+                       while (true)
+                       {
+                               PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws | ct_space));
+                               
+                               if (*s == end_quote)
+                               {
+                                       char_t* str = g.flush(s);
+                                       
+                                       // write the terminator, then keep zeroing backwards over trailing whitespace
+                                       do *str-- = 0;
+                                       while (PUGI__IS_CHARTYPE(*str, ct_space));
+                               
+                                       return s + 1;
+                               }
+                               else if (PUGI__IS_CHARTYPE(*s, ct_space))
+                               {
+                                       // first whitespace character of a run is kept as a plain space
+                                       *s++ = ' ';
+               
+                                       // the remainder of the run is dropped
+                                       if (PUGI__IS_CHARTYPE(*s, ct_space))
+                                       {
+                                               char_t* str = s + 1;
+                                               while (PUGI__IS_CHARTYPE(*str, ct_space)) ++str;
+                                               
+                                               g.push(s, str - s);
+                                       }
+                               }
+                               else if (opt_escape::value && *s == '&')
+                               {
+                                       s = strconv_escape(s, g);
+                               }
+                               else if (!*s)
+                               {
+                                       return 0;
+                               }
+                               else ++s;
+                       }
+               }
+
+               // Whitespace conversion: every whitespace character is replaced by a space;
+               // a 0x0d 0x0a pair yields a single space.
+               static char_t* parse_wconv(char_t* s, char_t end_quote)
+               {
+                       gap g;
+
+                       while (true)
+                       {
+                               PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr_ws));
+                               
+                               if (*s == end_quote)
+                               {
+                                       *g.flush(s) = 0;
+                               
+                                       return s + 1;
+                               }
+                               else if (PUGI__IS_CHARTYPE(*s, ct_space))
+                               {
+                                       if (*s == '\r')
+                                       {
+                                               *s++ = ' ';
+                               
+                                               // drop the 0x0a of a 0x0d 0x0a pair
+                                               if (*s == '\n') g.push(s, 1);
+                                       }
+                                       else *s++ = ' ';
+                               }
+                               else if (opt_escape::value && *s == '&')
+                               {
+                                       s = strconv_escape(s, g);
+                               }
+                               else if (!*s)
+                               {
+                                       return 0;
+                               }
+                               else ++s;
+                       }
+               }
+
+               // End-of-line normalization: a lone 0x0d or a 0x0d 0x0a pair becomes a single 0x0a.
+               static char_t* parse_eol(char_t* s, char_t end_quote)
+               {
+                       gap g;
+
+                       while (true)
+                       {
+                               PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr));
+                               
+                               if (*s == end_quote)
+                               {
+                                       *g.flush(s) = 0;
+                               
+                                       return s + 1;
+                               }
+                               else if (*s == '\r')
+                               {
+                                       *s++ = '\n';
+                                       
+                                       // drop the 0x0a of a 0x0d 0x0a pair
+                                       if (*s == '\n') g.push(s, 1);
+                               }
+                               else if (opt_escape::value && *s == '&')
+                               {
+                                       s = strconv_escape(s, g);
+                               }
+                               else if (!*s)
+                               {
+                                       return 0;
+                               }
+                               else ++s;
+                       }
+               }
+
+               // No whitespace or line ending processing; only references are decoded (when enabled).
+               static char_t* parse_simple(char_t* s, char_t end_quote)
+               {
+                       gap g;
+
+                       while (true)
+                       {
+                               PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPE(ss, ct_parse_attr));
+                               
+                               if (*s == end_quote)
+                               {
+                                       *g.flush(s) = 0;
+                               
+                                       return s + 1;
+                               }
+                               else if (opt_escape::value && *s == '&')
+                               {
+                                       s = strconv_escape(s, g);
+                               }
+                               else if (!*s)
+                               {
+                                       return 0;
+                               }
+                               else ++s;
+                       }
+               }
+       };
+
+       PUGI__FN strconv_attribute_t get_strconv_attribute(unsigned int optmask)
+       {
+               PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80);
+               
+               switch ((optmask >> 4) & 15) // get bitmask for flags (wconv wnorm eol escapes)
+               {
+               case 0:  return strconv_attribute_impl<opt_false>::parse_simple;
+               case 1:  return strconv_attribute_impl<opt_true>::parse_simple;
+               case 2:  return strconv_attribute_impl<opt_false>::parse_eol;
+               case 3:  return strconv_attribute_impl<opt_true>::parse_eol;
+               case 4:  return strconv_attribute_impl<opt_false>::parse_wconv;
+               case 5:  return strconv_attribute_impl<opt_true>::parse_wconv;
+               case 6:  return strconv_attribute_impl<opt_false>::parse_wconv;
+               case 7:  return strconv_attribute_impl<opt_true>::parse_wconv;
+               case 8:  return strconv_attribute_impl<opt_false>::parse_wnorm;
+               case 9:  return strconv_attribute_impl<opt_true>::parse_wnorm;
+               case 10: return strconv_attribute_impl<opt_false>::parse_wnorm;
+               case 11: return strconv_attribute_impl<opt_true>::parse_wnorm;
+               case 12: return strconv_attribute_impl<opt_false>::parse_wnorm;
+               case 13: return strconv_attribute_impl<opt_true>::parse_wnorm;
+               case 14: return strconv_attribute_impl<opt_false>::parse_wnorm;
+               case 15: return strconv_attribute_impl<opt_true>::parse_wnorm;
+               default: assert(false); return 0; // should not get here
+               }
+       }
+
+       inline xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset = 0)
+       {
+               xml_parse_result result;
+               result.status = status;
+               result.offset = offset;
+
+               return result;
+       }
+
+       struct xml_parser
+       {
+               xml_allocator alloc;
+               char_t* error_offset;
+               xml_parse_status error_status;
+               
+               // Copies the given allocator into the parser and starts with no error recorded
+               // (null error_offset, status_ok).
+               xml_parser(const xml_allocator& alloc_): alloc(alloc_), error_offset(0), error_status(status_ok)
+               {
+               }
+
+               // DOCTYPE consists of nested sections of the following possible types:
+               // <!-- ... -->, <? ... ?>, "...", '...'
+               // <![...]]>
+               // <!...>
+               // First group can not contain nested groups
+               // Second group can contain nested groups of the same type
+               // Third group can contain all other groups
+               char_t* parse_doctype_primitive(char_t* s)
+               {
+                       if (*s == '"' || *s == '\'')
+                       {
+                               // quoted string
+                               char_t ch = *s++;
+                               PUGI__SCANFOR(*s == ch);
+                               if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+                               s++;
+                       }
+                       else if (s[0] == '<' && s[1] == '?')
+                       {
+                               // <? ... ?>
+                               s += 2;
+                               PUGI__SCANFOR(s[0] == '?' && s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype
+                               if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+                               s += 2;
+                       }
+                       else if (s[0] == '<' && s[1] == '!' && s[2] == '-' && s[3] == '-')
+                       {
+                               s += 4;
+                               PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype
+                               if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
+
+                               s += 4;
+                       }
+                       else PUGI__THROW_ERROR(status_bad_doctype, s);
+
+                       return s;
+               }
+
+               char_t* parse_doctype_ignore(char_t* s)
+               {
+                       size_t depth = 0;
+
+                       assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
+                       s += 3;
+
+                       while (*s)
+                       {
+                               if (s[0] == '<' && s[1] == '!' && s[2] == '[')
+                               {
+                                       // nested ignore section
+                                       s += 3;
+                                       depth++;
+                               }
+                               else if (s[0] == ']' && s[1] == ']' && s[2] == '>')
+                               {
+                                       // ignore section end
+                                       s += 3;
+
+                                       if (depth == 0)
+                                               return s;
+
+                                       depth--;
+                               }
+                               else s++;
+                       }
+
+                       PUGI__THROW_ERROR(status_bad_doctype, s);
+               }
+
+               char_t* parse_doctype_group(char_t* s, char_t endch)
+               {
+                       size_t depth = 0;
+
+                       assert((s[0] == '<' || s[0] == 0) && s[1] == '!');
+                       s += 2;
+
+                       while (*s)
+                       {
+                               if (s[0] == '<' && s[1] == '!' && s[2] != '-')
+                               {
+                                       if (s[2] == '[')
+                                       {
+                                               // ignore
+                                               s = parse_doctype_ignore(s);
+                                               if (!s) return s;
+                                       }
+                                       else
+                                       {
+                                               // some control group
+                                               s += 2;
+                                               depth++;
+                                       }
+                               }
+                               else if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
+                               {
+                                       // unknown tag (forbidden), or some primitive group
+                                       s = parse_doctype_primitive(s);
+                                       if (!s) return s;
+                               }
+                               else if (*s == '>')
+                               {
+                                       if (depth == 0)
+                                               return s;
+
+                                       depth--;
+                                       s++;
+                               }
+                               else s++;
+                       }
+
+                       if (depth != 0 || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s);
+
+                       return s;
+               }
+
+               char_t* parse_exclamation(char_t* s, xml_node_struct* cursor, unsigned int optmsk, char_t endch) // Parses one '<!...' construct (comment, CDATA section or DOCTYPE); s points at the '!' on entry.
+               { // Returns the position just past the construct. Error exits go through PUGI__THROW_ERROR / PUGI__CHECK_ERROR (macros defined outside this chunk); parse_tree tests the result against null. cursor is a by-value copy, so reassignments of it here are not seen by the caller.
+                       // parse node contents, starting with exclamation mark
+                       ++s;
+
+                       if (*s == '-') // '<!-...'
+                       {
+                               ++s;
+
+                               if (*s == '-') // '<!--...'
+                               {
+                                       ++s;
+
+                                       if (PUGI__OPTSET(parse_comments)) // a comment node is created only when the parse_comments option is set
+                                       {
+                                               PUGI__PUSHNODE(node_comment); // Append a new node on the tree.
+                                               cursor->value = s; // Save the offset: the comment text starts right after '<!--'.
+                                       }
+
+                                       if (PUGI__OPTSET(parse_eol) && PUGI__OPTSET(parse_comments)) // text is kept and parse_eol is set: hand the text to strconv_comment
+                                       {
+                                               s = strconv_comment(s, endch);
+
+                                               if (!s) PUGI__THROW_ERROR(status_bad_comment, cursor->value); // null result is reported as a bad comment, located at the start of the comment text
+                                       }
+                                       else
+                                       {
+                                               // Scan for terminating '-->'. PUGI__ENDSWITH is defined outside this chunk; presumably it also accepts the terminator when endch is '>' - confirm.
+                                               PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && PUGI__ENDSWITH(s[2], '>'));
+                                               PUGI__CHECK_ERROR(status_bad_comment, s);
+
+                                               if (PUGI__OPTSET(parse_comments))
+                                                       *s = 0; // Zero-terminate this segment at the first terminating '-'.
+
+                                               s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'; only two characters when s[2] is not a literal '>'.
+                                       }
+                               }
+                               else PUGI__THROW_ERROR(status_bad_comment, s); // '<!-' not followed by a second '-'
+                       }
+                       else if (*s == '[')
+                       {
+                               // '<![CDATA[...' - every *++s advances s while comparing, so on success s rests on the second '['
+                               if (*++s=='C' && *++s=='D' && *++s=='A' && *++s=='T' && *++s=='A' && *++s == '[')
+                               {
+                                       ++s;
+
+                                       if (PUGI__OPTSET(parse_cdata))
+                                       {
+                                               PUGI__PUSHNODE(node_cdata); // Append a new node on the tree.
+                                               cursor->value = s; // Save the offset: the CDATA text starts right after '<![CDATA['.
+
+                                               if (PUGI__OPTSET(parse_eol))
+                                               {
+                                                       s = strconv_cdata(s, endch);
+
+                                                       if (!s) PUGI__THROW_ERROR(status_bad_cdata, cursor->value); // null result is reported as bad CDATA, located at the start of the text
+                                               }
+                                               else
+                                               {
+                                                       // Scan for terminating ']]>'.
+                                                       PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>'));
+                                                       PUGI__CHECK_ERROR(status_bad_cdata, s);
+
+                                                       *s++ = 0; // Zero-terminate this segment (overwrites the first ']') and step past it.
+                                               }
+                                       }
+                                       else // Flagged for discard, but we still have to scan for the terminator.
+                                       {
+                                               // Scan for terminating ']]>'.
+                                               PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && PUGI__ENDSWITH(s[2], '>'));
+                                               PUGI__CHECK_ERROR(status_bad_cdata, s);
+
+                                               ++s; // step past the first ']' without modifying the buffer
+                                       }
+
+                                       s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'; only one character when s[1] is not a literal '>'.
+                               }
+                               else PUGI__THROW_ERROR(status_bad_cdata, s);
+                       }
+                       else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && s[5] == 'P' && PUGI__ENDSWITH(s[6], 'E')) // '<!DOCTYPE...'
+                       {
+                               s -= 2; // back up to the '<' of '<!DOCTYPE' (s was one past the '!')
+
+                               if (cursor->parent) PUGI__THROW_ERROR(status_bad_doctype, s); // DOCTYPE is rejected when the cursor node has a parent
+
+                               char_t* mark = s + 9; // first character after the nine characters of '<!DOCTYPE'
+
+                               s = parse_doctype_group(s, endch);
+                               if (!s) return s; // propagate the null returned on a doctype error
+
+                               assert((*s == 0 && endch == '>') || *s == '>');
+                               if (*s) *s++ = 0; // overwrite the closing '>' with a terminator and step past it
+
+                               if (PUGI__OPTSET(parse_doctype))
+                               {
+                                       while (PUGI__IS_CHARTYPE(*mark, ct_space)) ++mark; // skip whitespace between 'DOCTYPE' and the doctype text
+
+                                       PUGI__PUSHNODE(node_doctype);
+
+                                       cursor->value = mark;
+                               }
+                       }
+                       else if (*s == 0 && endch == '-') PUGI__THROW_ERROR(status_bad_comment, s); // input ended right after '<!' and endch is '-': report as a bad comment
+                       else if (*s == 0 && endch == '[') PUGI__THROW_ERROR(status_bad_cdata, s); // input ended right after '<!' and endch is '[': report as bad CDATA
+                       else PUGI__THROW_ERROR(status_unrecognized_tag, s);
+
+                       return s;
+               }
+
+               char_t* parse_question(char_t* s, xml_node_struct*& ref_cursor, unsigned int optmsk, char_t endch) // Parses a '<?...' processing instruction or XML declaration; s points at the '?' on entry.
+               { // ref_cursor is read on entry and written back on the normal return path only. For a stored declaration that has content, the function returns with the cursor left on the new declaration node and s at the start of its attributes.
+                       // load into registers
+                       xml_node_struct* cursor = ref_cursor;
+                       char_t ch = 0; // compared against '?' and whitespace after PUGI__ENDSEG below (macro defined outside this chunk; presumably it stores the terminating character here)
+
+                       // parse node contents, starting with question mark
+                       ++s;
+
+                       // read PI target
+                       char_t* target = s;
+
+                       if (!PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_pi, s); // target must begin with a start-symbol character
+
+                       PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol));
+                       PUGI__CHECK_ERROR(status_bad_pi, s);
+
+                       // determine node type; stricmp / strcasecmp is not portable
+                       bool declaration = (target[0] | ' ') == 'x' && (target[1] | ' ') == 'm' && (target[2] | ' ') == 'l' && target + 3 == s; // OR-ing with ' ' (0x20) folds ASCII upper case to lower case; target + 3 == s requires the target to be exactly three characters
+
+                       if (declaration ? PUGI__OPTSET(parse_declaration) : PUGI__OPTSET(parse_pi)) // create a node only when the option for this node kind is set; otherwise just skip to '?>'
+                       {
+                               if (declaration)
+                               {
+                                       // disallow non top-level declarations
+                                       if (cursor->parent) PUGI__THROW_ERROR(status_bad_pi, s);
+
+                                       PUGI__PUSHNODE(node_declaration);
+                               }
+                               else
+                               {
+                                       PUGI__PUSHNODE(node_pi);
+                               }
+
+                               cursor->name = target;
+
+                               PUGI__ENDSEG();
+
+                               // parse value/attributes
+                               if (ch == '?')
+                               {
+                                       // empty node
+                                       if (!PUGI__ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_pi, s);
+                                       s += (*s == '>'); // step over '>' only when it is literally present
+
+                                       PUGI__POPNODE();
+                               }
+                               else if (PUGI__IS_CHARTYPE(ch, ct_space))
+                               {
+                                       PUGI__SKIPWS();
+
+                                       // scan for tag end
+                                       char_t* value = s;
+
+                                       PUGI__SCANFOR(s[0] == '?' && PUGI__ENDSWITH(s[1], '>'));
+                                       PUGI__CHECK_ERROR(status_bad_pi, s);
+
+                                       if (declaration)
+                                       {
+                                               // replace ending ? with / so that 'element' terminates properly
+                                               *s = '/';
+
+                                               // we exit from this function with cursor at node_declaration, which is a signal to parse() to go to LOC_ATTRIBUTES
+                                               s = value; // rewind to the start of the declaration's content
+                                       }
+                                       else
+                                       {
+                                               // store value and step over >
+                                               cursor->value = value;
+                                               PUGI__POPNODE();
+
+                                               PUGI__ENDSEG();
+
+                                               s += (*s == '>'); // step over '>' only when it is literally present
+                                       }
+                               }
+                               else PUGI__THROW_ERROR(status_bad_pi, s); // target followed by something other than '?' or whitespace
+                       }
+                       else
+                       {
+                               // scan for tag end
+                               PUGI__SCANFOR(s[0] == '?' && PUGI__ENDSWITH(s[1], '>'));
+                               PUGI__CHECK_ERROR(status_bad_pi, s);
+
+                               s += (s[1] == '>' ? 2 : 1); // step over '?>', or over '?' alone when s[1] is not a literal '>'
+                       }
+
+                       // store from registers
+                       ref_cursor = cursor;
+
+                       return s;
+               }
+
+               char_t* parse_tree(char_t* s, xml_node_struct* root, unsigned int optmsk, char_t endch) // Main parsing loop: walks the zero-terminated buffer at s and attaches the parsed nodes and attributes below root.
+               { // endch is the original last character of the buffer, which parse() overwrites with 0 before calling. Returns the final position, or whatever the PUGI__THROW_ERROR macro yields on failure (macro defined outside this chunk).
+                       strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk); // conversion routines are selected once from the option mask
+                       strconv_pcdata_t strconv_pcdata = get_strconv_pcdata(optmsk);
+                       
+                       char_t ch = 0; // last terminating character; see the comments on PUGI__ENDSEG below
+                       xml_node_struct* cursor = root; // node currently being filled; compared against root after the loop to detect unclosed tags
+                       char_t* mark = s; // start of the current text run, restored when leading whitespace must be kept
+
+                       while (*s != 0)
+                       {
+                               if (*s == '<')
+                               {
+                                       ++s;
+
+                               LOC_TAG: // reached with s just past a '<'; also the target of the goto at the end of the PCDATA branch
+                                       if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // '<#...'
+                                       {
+                                               PUGI__PUSHNODE(node_element); // Append a new node to the tree.
+
+                                               cursor->name = s;
+
+                                               PUGI__SCANWHILE_UNROLL(PUGI__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator. ('ss' is not declared in this function; presumably the macro introduces it.)
+                                               PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.
+
+                                               if (ch == '>')
+                                               {
+                                                       // end of tag
+                                               }
+                                               else if (PUGI__IS_CHARTYPE(ch, ct_space))
+                                               {
+                                               LOC_ATTRIBUTES: // also the target of the goto taken after parse_question leaves the cursor on a declaration node
+                                                       while (true)
+                                                       {
+                                                               PUGI__SKIPWS(); // Eat any whitespace.
+                                               
+                                                               if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // <... #...
+                                                               {
+                                                                       xml_attribute_struct* a = append_new_attribute(cursor, alloc); // Make space for this attribute.
+                                                                       if (!a) PUGI__THROW_ERROR(status_out_of_memory, s);
+
+                                                                       a->name = s; // Save the offset.
+
+                                                                       PUGI__SCANWHILE_UNROLL(PUGI__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator.
+                                                                       PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.
+
+                                                                       if (PUGI__IS_CHARTYPE(ch, ct_space))
+                                                                       {
+                                                                               PUGI__SKIPWS(); // Eat any whitespace.
+
+                                                                               ch = *s; // whitespace separated the name from the next token: fetch that token so the '=' test below sees it
+                                                                               ++s;
+                                                                       }
+                                                                       
+                                                                       if (ch == '=') // '<... #=...'
+                                                                       {
+                                                                               PUGI__SKIPWS(); // Eat any whitespace.
+
+                                                                               if (*s == '"' || *s == '\'') // '<... #="...'
+                                                                               {
+                                                                                       ch = *s; // Save quote char to avoid breaking on "''" -or- '""'.
+                                                                                       ++s; // Step over the quote.
+                                                                                       a->value = s; // Save the offset.
+
+                                                                                       s = strconv_attribute(s, ch);
+                                                                               
+                                                                                       if (!s) PUGI__THROW_ERROR(status_bad_attribute, a->value); // null result is reported as a bad attribute, located at the start of the value
+
+                                                                                       // After this line the loop continues from the start;
+                                                                                       // Whitespaces, / and > are ok, symbols and EOF are wrong,
+                                                                                       // everything else will be detected
+                                                                                       if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_attribute, s);
+                                                                               }
+                                                                               else PUGI__THROW_ERROR(status_bad_attribute, s); // attribute value is not quoted
+                                                                       }
+                                                                       else PUGI__THROW_ERROR(status_bad_attribute, s); // attribute name is not followed by '='
+                                                               }
+                                                               else if (*s == '/')
+                                                               {
+                                                                       ++s;
+                                                                       
+                                                                       if (*s == '>')
+                                                                       {
+                                                                               PUGI__POPNODE();
+                                                                               s++;
+                                                                               break;
+                                                                       }
+                                                                       else if (*s == 0 && endch == '>') // '/' is followed by the terminator and the overwritten last character was '>'
+                                                                       {
+                                                                               PUGI__POPNODE();
+                                                                               break;
+                                                                       }
+                                                                       else PUGI__THROW_ERROR(status_bad_start_element, s);
+                                                               }
+                                                               else if (*s == '>')
+                                                               {
+                                                                       ++s;
+
+                                                                       break;
+                                                               }
+                                                               else if (*s == 0 && endch == '>') // tag is closed by the overwritten last character
+                                                               {
+                                                                       break;
+                                                               }
+                                                               else PUGI__THROW_ERROR(status_bad_start_element, s);
+                                                       }
+
+                                                       // !!!
+                                               }
+                                               else if (ch == '/') // '<#.../'
+                                               {
+                                                       if (!PUGI__ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_start_element, s);
+
+                                                       PUGI__POPNODE(); // Pop.
+
+                                                       s += (*s == '>'); // step over '>' only when it is literally present
+                                               }
+                                               else if (ch == 0)
+                                               {
+                                                       // we stepped over null terminator, backtrack & handle closing tag
+                                                       --s;
+                                                       
+                                                       if (endch != '>') PUGI__THROW_ERROR(status_bad_start_element, s);
+                                               }
+                                               else PUGI__THROW_ERROR(status_bad_start_element, s);
+                                       }
+                                       else if (*s == '/') // '</...'
+                                       {
+                                               ++s;
+
+                                               char_t* name = cursor->name; // the closing tag is matched against the name of the node at the cursor
+                                               if (!name) PUGI__THROW_ERROR(status_end_element_mismatch, s); // cursor node has no name to match against
+                                               
+                                               while (PUGI__IS_CHARTYPE(*s, ct_symbol))
+                                               {
+                                                       if (*s++ != *name++) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+                                               }
+
+                                               if (*name) // the closing tag ended before the whole element name was matched
+                                               {
+                                                       if (*s == 0 && name[0] == endch && name[1] == 0) PUGI__THROW_ERROR(status_bad_end_element, s); // input ended and only the overwritten last character is missing from the name, so the tag lacks its '>'
+                                                       else PUGI__THROW_ERROR(status_end_element_mismatch, s);
+                                               }
+                                                       
+                                               PUGI__POPNODE(); // Pop.
+
+                                               PUGI__SKIPWS();
+
+                                               if (*s == 0)
+                                               {
+                                                       if (endch != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
+                                               }
+                                               else
+                                               {
+                                                       if (*s != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
+                                                       ++s;
+                                               }
+                                       }
+                                       else if (*s == '?') // '<?...'
+                                       {
+                                               s = parse_question(s, cursor, optmsk, endch);
+                                               if (!s) return s;
+
+                                               assert(cursor);
+                                               if (PUGI__NODETYPE(cursor) == node_declaration) goto LOC_ATTRIBUTES; // parse the declaration's content as attributes
+                                       }
+                                       else if (*s == '!') // '<!...'
+                                       {
+                                               s = parse_exclamation(s, cursor, optmsk, endch);
+                                               if (!s) return s;
+                                       }
+                                       else if (*s == 0 && endch == '?') PUGI__THROW_ERROR(status_bad_pi, s); // input ended right after '<' and the overwritten last character was '?'
+                                       else PUGI__THROW_ERROR(status_unrecognized_tag, s);
+                               }
+                               else
+                               {
+                                       mark = s; // Save this offset while searching for a terminator.
+
+                                       PUGI__SKIPWS(); // Eat whitespace if no genuine PCDATA here.
+
+                                       if (*s == '<' || !*s)
+                                       {
+                                               // We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one
+                                               assert(mark != s);
+
+                                               if (!PUGI__OPTSET(parse_ws_pcdata | parse_ws_pcdata_single) || PUGI__OPTSET(parse_trim_pcdata)) // whitespace-only text is dropped unless a whitespace-preserving option is set and trimming is off
+                                               {
+                                                       continue;
+                                               }
+                                               else if (PUGI__OPTSET(parse_ws_pcdata_single))
+                                               {
+                                                       if (s[0] != '<' || s[1] != '/' || cursor->first_child) continue; // keep it only when a closing tag follows immediately and the cursor node has no children
+                                               }
+                                       }
+
+                                       if (!PUGI__OPTSET(parse_trim_pcdata))
+                                               s = mark; // go back so the leading whitespace stays part of the text
+                                                       
+                                       if (cursor->parent || PUGI__OPTSET(parse_fragment)) // text at a parentless cursor is stored only when parse_fragment is set
+                                       {
+                                               PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.
+                                               cursor->value = s; // Save the offset.
+
+                                               s = strconv_pcdata(s);
+                                                               
+                                               PUGI__POPNODE(); // Pop since this is a standalone.
+                                               
+                                               if (!*s) break;
+                                       }
+                                       else
+                                       {
+                                               PUGI__SCANFOR(*s == '<'); // '...<'
+                                               if (!*s) break;
+                                               
+                                               ++s;
+                                       }
+
+                                       // We're after '<'
+                                       goto LOC_TAG;
+                               }
+                       }
+
+                       // check that last tag is closed
+                       if (cursor != root) PUGI__THROW_ERROR(status_end_element_mismatch, s);
+
+                       return s;
+               }
+
+       #ifdef PUGIXML_WCHAR_MODE
+               static char_t* parse_skip_bom(char_t* s)
+               {
+                       unsigned int bom = 0xfeff;
+                       return (s[0] == static_cast<wchar_t>(bom)) ? s + 1 : s;
+               }
+       #else
+               static char_t* parse_skip_bom(char_t* s)
+               {
+                       return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? s + 3 : s;
+               }
+       #endif
+
+               static bool has_element_node_siblings(xml_node_struct* node)
+               {
+                       while (node)
+                       {
+                               if (PUGI__NODETYPE(node) == node_element) return true;
+
+                               node = node->next_sibling;
+                       }
+
+                       return false;
+               }
+
+               static xml_parse_result parse(char_t* buffer, size_t length, xml_document_struct* xmldoc, xml_node_struct* root, unsigned int optmsk)
+               {
+                       // allocator object is a part of document object
+                       xml_allocator& alloc_ = *static_cast<xml_allocator*>(xmldoc);
+
+                       // early-out for empty documents
+                       if (length == 0)
+                               return make_parse_result(PUGI__OPTSET(parse_fragment) ? status_ok : status_no_document_element);
+
+                       // get last child of the root before parsing
+                       xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c : 0;
+       
+                       // create parser on stack
+                       xml_parser parser(alloc_);
+
+                       // save last character and make buffer zero-terminated (speeds up parsing)
+                       char_t endch = buffer[length - 1];
+                       buffer[length - 1] = 0;
+                       
+                       // skip BOM to make sure it does not end up as part of parse output
+                       char_t* buffer_data = parse_skip_bom(buffer);
+
+                       // perform actual parsing
+                       parser.parse_tree(buffer_data, root, optmsk, endch);
+
+                       // update allocator state
+                       alloc_ = parser.alloc;
+
+                       xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0);
+                       assert(result.offset >= 0 && static_cast<size_t>(result.offset) <= length);
+
+                       if (result)
+                       {
+                               // since we removed last character, we have to handle the only possible false positive (stray <)
+                               if (endch == '<')
+                                       return make_parse_result(status_unrecognized_tag, length - 1);
+
+                               // check if there are any element nodes parsed
+                               xml_node_struct* first_root_child_parsed = last_root_child ? last_root_child->next_sibling : root->first_child;
+
+                               if (!PUGI__OPTSET(parse_fragment) && !has_element_node_siblings(first_root_child_parsed))
+                                       return make_parse_result(status_no_document_element, length - 1);
+                       }
+                       else
+                       {
+                               // roll back offset if it occurs on a null terminator in the source buffer
+                               if (result.offset > 0 && static_cast<size_t>(result.offset) == length - 1 && endch == 0)
+                                       result.offset--;
+                       }
+
+                       return result;
+               }
+       };
+
+       // Output facilities
+       // Native output encoding: whatever get_wchar_encoding() reports when PUGIXML_WCHAR_MODE is defined, encoding_utf8 otherwise.
+       PUGI__FN xml_encoding get_write_native_encoding()
+       {
+       #ifndef PUGIXML_WCHAR_MODE
+               return encoding_utf8;
+       #else
+               return get_wchar_encoding();
+       #endif
+       }
+
+       PUGI__FN xml_encoding get_write_encoding(xml_encoding encoding)
+       {
+               // replace wchar encoding with utf implementation
+               if (encoding == encoding_wchar) return get_wchar_encoding();
+
+               // replace utf16 encoding with utf16 with specific endianness
+               if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+               // replace utf32 encoding with utf32 with specific endianness
+               if (encoding == encoding_utf32) return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+               // only do autodetection if no explicit encoding is requested
+               if (encoding != encoding_auto) return encoding;
+
+               // assume utf8 encoding
+               return encoding_utf8;
+       }
+
+#ifdef PUGIXML_WCHAR_MODE
+       PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
+       {
+               if (length < 1) return 0;
+
+               // discard last character if it's the lead of a surrogate pair 
+               return (sizeof(wchar_t) == 2 && static_cast<unsigned int>(static_cast<uint16_t>(data[length - 1]) - 0xD800) < 0x400) ? length - 1 : length;
+       }
+
+       PUGI__FN size_t convert_buffer_output(char_t* r_char, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding)
+       {
+               // only endian-swapping is required
+               if (need_endian_swap_utf(encoding, get_wchar_encoding()))
+               {
+                       convert_wchar_endian_swap(r_char, data, length);
+
+                       return length * sizeof(char_t);
+               }
+       
+               // convert to utf8
+               if (encoding == encoding_utf8)
+               {
+                       uint8_t* dest = r_u8;
+                       uint8_t* end = utf_decoder<utf8_writer>::decode_wchar_block(data, length, dest);
+
+                       return static_cast<size_t>(end - dest);
+               }
+
+               // convert to utf16
+               if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+               {
+                       uint16_t* dest = r_u16;
+
+                       // convert to native utf16
+                       uint16_t* end = utf_decoder<utf16_writer>::decode_wchar_block(data, length, dest);
+
+                       // swap if necessary
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+                       if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+                       return static_cast<size_t>(end - dest) * sizeof(uint16_t);
+               }
+
+               // convert to utf32
+               if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+               {
+                       uint32_t* dest = r_u32;
+
+                       // convert to native utf32
+                       uint32_t* end = utf_decoder<utf32_writer>::decode_wchar_block(data, length, dest);
+
+                       // swap if necessary
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+                       if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+                       return static_cast<size_t>(end - dest) * sizeof(uint32_t);
+               }
+
+               // convert to latin1
+               if (encoding == encoding_latin1)
+               {
+                       uint8_t* dest = r_u8;
+                       uint8_t* end = utf_decoder<latin1_writer>::decode_wchar_block(data, length, dest);
+
+                       return static_cast<size_t>(end - dest);
+               }
+
+               assert(!"Invalid encoding");
+               return 0;
+       }
+#else
+       PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
+       {
+               if (length < 5) return 0;
+
+               for (size_t i = 1; i <= 4; ++i)
+               {
+                       uint8_t ch = static_cast<uint8_t>(data[length - i]);
+
+                       // either a standalone character or a leading one
+                       if ((ch & 0xc0) != 0x80) return length - i;
+               }
+
+               // there are four non-leading characters at the end, sequence tail is broken so might as well process the whole chunk
+               return length;
+       }
+
+       PUGI__FN size_t convert_buffer_output(char_t* /* r_char */, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding)
+       {
+               if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
+               {
+                       uint16_t* dest = r_u16;
+
+                       // convert to native utf16
+                       uint16_t* end = utf_decoder<utf16_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
+
+                       // swap if necessary
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
+
+                       if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+                       return static_cast<size_t>(end - dest) * sizeof(uint16_t);
+               }
+
+               if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
+               {
+                       uint32_t* dest = r_u32;
+
+                       // convert to native utf32
+                       uint32_t* end = utf_decoder<utf32_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
+
+                       // swap if necessary
+                       xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
+
+                       if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
+
+                       return static_cast<size_t>(end - dest) * sizeof(uint32_t);
+               }
+
+               if (encoding == encoding_latin1)
+               {
+                       uint8_t* dest = r_u8;
+                       uint8_t* end = utf_decoder<latin1_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
+
+                       return static_cast<size_t>(end - dest);
+               }
+
+               assert(!"Invalid encoding");
+               return 0;
+       }
+#endif
+
+       // Accumulates char_t output in a fixed-size buffer and hands it to an xml_writer in chunks.
+       // When the target encoding differs from the native one, each chunk is converted through the scratch area first.
+       class xml_buffered_writer
+       {
+               // copying is disabled: both members are private and only declared here
+               xml_buffered_writer(const xml_buffered_writer&);
+               xml_buffered_writer& operator=(const xml_buffered_writer&);
+
+       public:
+               // user_encoding is resolved to a concrete output encoding via get_write_encoding
+               xml_buffered_writer(xml_writer& writer_, xml_encoding user_encoding): writer(writer_), bufsize(0), encoding(get_write_encoding(user_encoding))
+               {
+                       // NOTE(review): presumably guarantees enough room for the fixed-size write() overloads (up to 6 characters) - confirm
+                       PUGI__STATIC_ASSERT(bufcapacity >= 8);
+               }
+
+               // whatever is still buffered is written out on destruction
+               ~xml_buffered_writer()
+               {
+                       flush();
+               }
+
+               // Writes out the buffered characters and empties the buffer.
+               // Always returns 0 so that callers can write "offset = flush()" to restart at the beginning of the buffer.
+               size_t flush()
+               {
+                       flush(buffer, bufsize);
+                       bufsize = 0;
+                       return 0;
+               }
+
+               // Sends size characters starting at data to the underlying writer, converting them if necessary.
+               // Does not touch bufsize; size must be small enough for the converted output to fit into scratch.
+               void flush(const char_t* data, size_t size)
+               {
+                       if (size == 0) return;
+
+                       // fast path, just write data
+                       if (encoding == get_write_native_encoding())
+                               writer.write(data, size * sizeof(char_t));
+                       else
+                       {
+                               // convert chunk
+                               size_t result = convert_buffer_output(scratch.data_char, scratch.data_u8, scratch.data_u16, scratch.data_u32, data, size, encoding);
+                               assert(result <= sizeof(scratch));
+
+                               // write data
+                               // (scratch is a union, so data_u8 addresses the converted bytes regardless of which member was filled)
+                               writer.write(scratch.data_u8, result);
+                       }
+               }
+
+               // Writes length characters after flushing the buffer; used when the data does not fit next to what is buffered.
+               // Data longer than the buffer is sent to the writer directly (in chunks if conversion is needed),
+               // and whatever remains afterwards is stored in the now-empty buffer.
+               void write_direct(const char_t* data, size_t length)
+               {
+                       // flush the remaining buffer contents
+                       flush();
+
+                       // handle large chunks
+                       if (length > bufcapacity)
+                       {
+                               if (encoding == get_write_native_encoding())
+                               {
+                                       // fast path, can just write data chunk
+                                       writer.write(data, length * sizeof(char_t));
+                                       return;
+                               }
+
+                               // need to convert in suitable chunks
+                               while (length > bufcapacity)
+                               {
+                                       // get chunk size by selecting such number of characters that are guaranteed to fit into scratch buffer
+                                       // and form a complete codepoint sequence (i.e. discard start of last codepoint if necessary)
+                                       size_t chunk_size = get_valid_length(data, bufcapacity);
+                                       assert(chunk_size);
+
+                                       // convert chunk and write
+                                       flush(data, chunk_size);
+
+                                       // iterate
+                                       data += chunk_size;
+                                       length -= chunk_size;
+                               }
+
+                               // small tail is copied below
+                               bufsize = 0;
+                       }
+
+                       // at this point length <= bufcapacity and the buffer is empty
+                       memcpy(buffer + bufsize, data, length * sizeof(char_t));
+                       bufsize += length;
+               }
+
+               // Appends length characters to the buffer, falling back to write_direct when they do not fit
+               void write_buffer(const char_t* data, size_t length)
+               {
+                       size_t offset = bufsize;
+
+                       if (offset + length <= bufcapacity)
+                       {
+                               memcpy(buffer + offset, data, length * sizeof(char_t));
+                               bufsize = offset + length;
+                       }
+                       else
+                       {
+                               write_direct(data, length);
+                       }
+               }
+
+               // Appends a null-terminated string without computing its length up front
+               void write_string(const char_t* data)
+               {
+                       // write the part of the string that fits in the buffer
+                       size_t offset = bufsize;
+
+                       while (*data && offset < bufcapacity)
+                               buffer[offset++] = *data++;
+
+                       // write the rest
+                       if (offset < bufcapacity)
+                       {
+                               // the whole string fit (the loop stopped on the terminator)
+                               bufsize = offset;
+                       }
+                       else
+                       {
+                               // backtrack a bit if we have split the codepoint
+                               // length: characters copied by this call; extra: trailing ones that get_valid_length does not accept
+                               size_t length = offset - bufsize;
+                               size_t extra = length - get_valid_length(data - length, length);
+
+                               // drop the rejected characters from the buffer...
+                               bufsize = offset - extra;
+
+                               // ...and write them again together with the part of the string that was not copied yet
+                               write_direct(data - extra, strlength(data) + extra);
+                       }
+               }
+
+               // Fixed-size overloads: append 1 to 6 characters, flushing first if they do not fit into the remaining space
+               void write(char_t d0)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 1) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       bufsize = offset + 1;
+               }
+
+               void write(char_t d0, char_t d1)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 2) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       buffer[offset + 1] = d1;
+                       bufsize = offset + 2;
+               }
+
+               void write(char_t d0, char_t d1, char_t d2)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 3) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       buffer[offset + 1] = d1;
+                       buffer[offset + 2] = d2;
+                       bufsize = offset + 3;
+               }
+
+               void write(char_t d0, char_t d1, char_t d2, char_t d3)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 4) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       buffer[offset + 1] = d1;
+                       buffer[offset + 2] = d2;
+                       buffer[offset + 3] = d3;
+                       bufsize = offset + 4;
+               }
+
+               void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 5) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       buffer[offset + 1] = d1;
+                       buffer[offset + 2] = d2;
+                       buffer[offset + 3] = d3;
+                       buffer[offset + 4] = d4;
+                       bufsize = offset + 5;
+               }
+
+               void write(char_t d0, char_t d1, char_t d2, char_t d3, char_t d4, char_t d5)
+               {
+                       size_t offset = bufsize;
+                       if (offset > bufcapacity - 6) offset = flush();
+
+                       buffer[offset + 0] = d0;
+                       buffer[offset + 1] = d1;
+                       buffer[offset + 2] = d2;
+                       buffer[offset + 3] = d3;
+                       buffer[offset + 4] = d4;
+                       buffer[offset + 5] = d5;
+                       bufsize = offset + 6;
+               }
+
+               // utf8 maximum expansion: x4 (-> utf32)
+               // utf16 maximum expansion: x2 (-> utf32)
+               // utf32 maximum expansion: x1
+               enum
+               {
+                       // total byte budget for buffer + scratch; PUGIXML_MEMORY_OUTPUT_STACK overrides the default of 10240
+                       bufcapacitybytes =
+                       #ifdef PUGIXML_MEMORY_OUTPUT_STACK
+                               PUGIXML_MEMORY_OUTPUT_STACK
+                       #else
+                               10240
+                       #endif
+                       ,
+                       // every buffered character costs sizeof(char_t) bytes in buffer plus 4 bytes in scratch
+                       bufcapacity = bufcapacitybytes / (sizeof(char_t) + 4)
+               };
+
+               // pending output in native encoding; the first bufsize characters are valid
+               char_t buffer[bufcapacity];
+
+               // conversion target; the u8/u16/u32 views each cover 4 bytes per buffered character
+               union
+               {
+                       uint8_t data_u8[4 * bufcapacity];
+                       uint16_t data_u16[2 * bufcapacity];
+                       uint32_t data_u32[bufcapacity];
+                       char_t data_char[bufcapacity];
+               } scratch;
+
+               xml_writer& writer;
+               size_t bufsize;
+               xml_encoding encoding;
+       };
+
+       // Writes s replacing the characters selected by type with XML escapes:
+       // & < > " become named entities, other selected characters are written as decimal character references
+       PUGI__FN void text_output_escaped(xml_buffered_writer& writer, const char_t* s, chartypex_t type)
+       {
+               while (*s)
+               {
+                       const char_t* prev = s;
+                       
+                       // While *s is a usual symbol
+                       // NOTE(review): the macro is defined outside this view; presumably it advances s until the condition (on ss) fails - confirm
+                       PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPEX(ss, type));
+               
+                       // output the run of characters that needs no escaping in one go
+                       writer.write_buffer(prev, static_cast<size_t>(s - prev));
+
+                       // s now points either at the terminator or at a character that has to be escaped
+                       switch (*s)
+                       {
+                               case 0: break;
+                               case '&':
+                                       writer.write('&', 'a', 'm', 'p', ';');
+                                       ++s;
+                                       break;
+                               case '<':
+                                       writer.write('&', 'l', 't', ';');
+                                       ++s;
+                                       break;
+                               case '>':
+                                       writer.write('&', 'g', 't', ';');
+                                       ++s;
+                                       break;
+                               case '"':
+                                       writer.write('&', 'q', 'u', 'o', 't', ';');
+                                       ++s;
+                                       break;
+                               default: // s is not a usual symbol
+                               {
+                                       // anything else selected by type is expected to be a control character
+                                       unsigned int ch = static_cast<unsigned int>(*s++);
+                                       assert(ch < 32);
+
+                                       // values below 32 always fit in two decimal digits, e.g. tab becomes &#09;
+                                       writer.write('&', '#', static_cast<char_t>((ch / 10) + '0'), static_cast<char_t>((ch % 10) + '0'), ';');
+                               }
+                       }
+               }
+       }
+
+       PUGI__FN void text_output(xml_buffered_writer& writer, const char_t* s, chartypex_t type, unsigned int flags)
+       {
+               if (flags & format_no_escapes)
+                       writer.write_string(s);
+               else
+                       text_output_escaped(writer, s, type);
+       }
+
+       PUGI__FN void text_output_cdata(xml_buffered_writer& writer, const char_t* s)
+       {
+               do
+               {
+                       writer.write('<', '!', '[', 'C', 'D');
+                       writer.write('A', 'T', 'A', '[');
+
+                       const char_t* prev = s;
+
+                       // look for ]]> sequence - we can't output it as is since it terminates CDATA
+                       while (*s && !(s[0] == ']' && s[1] == ']' && s[2] == '>')) ++s;
+
+                       // skip ]] if we stopped at ]]>, > will go to the next CDATA section
+                       if (*s) s += 2;
+
+                       writer.write_buffer(prev, static_cast<size_t>(s - prev));
+
+                       writer.write(']', ']', '>');
+               }
+               while (*s);
+       }
+
+       PUGI__FN void text_output_indent(xml_buffered_writer& writer, const char_t* indent, size_t indent_length, unsigned int depth)
+       {
+               switch (indent_length)
+               {
+               case 1:
+               {
+                       for (unsigned int i = 0; i < depth; ++i)
+                               writer.write(indent[0]);
+                       break;
+               }
+
+               case 2:
+               {
+                       for (unsigned int i = 0; i < depth; ++i)
+                               writer.write(indent[0], indent[1]);
+                       break;
+               }
+
+               case 3:
+               {
+                       for (unsigned int i = 0; i < depth; ++i)
+                               writer.write(indent[0], indent[1], indent[2]);
+                       break;
+               }
+
+               case 4:
+               {
+                       for (unsigned int i = 0; i < depth; ++i)
+                               writer.write(indent[0], indent[1], indent[2], indent[3]);
+                       break;
+               }
+
+               default:
+               {
+                       for (unsigned int i = 0; i < depth; ++i)
+                               writer.write_buffer(indent, indent_length);
+               }
+               }
+       }
+
+       PUGI__FN void node_output_comment(xml_buffered_writer& writer, const char_t* s)
+       {
+               writer.write('<', '!', '-', '-');
+
+               while (*s)
+               {
+                       const char_t* prev = s;
+
+                       // look for -\0 or -- sequence - we can't output it since -- is illegal in comment body
+                       while (*s && !(s[0] == '-' && (s[1] == '-' || s[1] == 0))) ++s;
+
+                       writer.write_buffer(prev, static_cast<size_t>(s - prev));
+
+                       if (*s)
+                       {
+                               assert(*s == '-');
+
+                               writer.write('-', ' ');
+                               ++s;
+                       }
+               }
+
+               writer.write('-', '-', '>');
+       }
+
+       PUGI__FN void node_output_attributes(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags)
+       {
+               const char_t* default_name = PUGIXML_TEXT(":anonymous");
+
+               for (xml_attribute_struct* a = node->first_attribute; a; a = a->next_attribute)
+               {
+                       writer.write(' ');
+                       writer.write_string(a->name ? a->name : default_name);
+                       writer.write('=', '"');
+
+                       if (a->value)
+                               text_output(writer, a->value, ctx_special_attr, flags);
+
+                       writer.write('"');
+               }
+       }
+
+       PUGI__FN bool node_output_start(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags)
+       {
+               const char_t* default_name = PUGIXML_TEXT(":anonymous");
+               const char_t* name = node->name ? node->name : default_name;
+
+               writer.write('<');
+               writer.write_string(name);
+
+               if (node->first_attribute)
+                       node_output_attributes(writer, node, flags);
+
+               if (flags & format_raw)
+               {
+                       if (!node->first_child)
+                               writer.write(' ', '/', '>');
+                       else
+                       {
+                               writer.write('>');
+
+                               return true;
+                       }
+               }
+               else
+               {
+                       xml_node_struct* first = node->first_child;
+
+                       if (!first)
+                               writer.write(' ', '/', '>', '\n');
+                       else if (!first->next_sibling && (PUGI__NODETYPE(first) == node_pcdata || PUGI__NODETYPE(first) == node_cdata))
+                       {
+                               writer.write('>');
+
+                               const char_t* value = first->value ? first->value : PUGIXML_TEXT("");
+
+                               if (PUGI__NODETYPE(first) == node_pcdata)
+                                       text_output(writer, value, ctx_special_pcdata, flags);
+                               else
+                                       text_output_cdata(writer, value);
+
+                               writer.write('<', '/');
+                               writer.write_string(name);
+                               writer.write('>', '\n');
+                       }
+                       else
+                       {
+                               writer.write('>', '\n');
+
+                               return true;
+                       }
+               }
+
+               return false;
+       }
+
+       PUGI__FN void node_output_end(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags)
+       {
+               const char_t* default_name = PUGIXML_TEXT(":anonymous");
+               const char_t* name = node->name ? node->name : default_name;
+
+               writer.write('<', '/');
+               writer.write_string(name);
+
+               if (flags & format_raw)
+                       writer.write('>');
+               else
+                       writer.write('>', '\n');
+       }
+
+       PUGI__FN void node_output_simple(xml_buffered_writer& writer, xml_node_struct* node, unsigned int flags)
+       {
+               const char_t* default_name = PUGIXML_TEXT(":anonymous");
+
+               switch (PUGI__NODETYPE(node))
+               {
+                       case node_pcdata:
+                               text_output(writer, node->value ? node->value : PUGIXML_TEXT(""), ctx_special_pcdata, flags);
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       case node_cdata:
+                               text_output_cdata(writer, node->value ? node->value : PUGIXML_TEXT(""));
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       case node_comment:
+                               node_output_comment(writer, node->value ? node->value : PUGIXML_TEXT(""));
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       case node_pi:
+                               writer.write('<', '?');
+                               writer.write_string(node->name ? node->name : default_name);
+
+                               if (node->value)
+                               {
+                                       writer.write(' ');
+                                       writer.write_string(node->value);
+                               }
+
+                               writer.write('?', '>');
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       case node_declaration:
+                               writer.write('<', '?');
+                               writer.write_string(node->name ? node->name : default_name);
+                               node_output_attributes(writer, node, flags);
+                               writer.write('?', '>');
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       case node_doctype:
+                               writer.write('<', '!', 'D', 'O', 'C');
+                               writer.write('T', 'Y', 'P', 'E');
+
+                               if (node->value)
+                               {
+                                       writer.write(' ');
+                                       writer.write_string(node->value);
+                               }
+
+                               writer.write('>');
+                               if ((flags & format_raw) == 0) writer.write('\n');
+                               break;
+
+                       default:
+                               assert(!"Invalid node type");
+               }
+       }
+
+       // Writes the subtree rooted at root without recursion: the tree is walked through the
+       // first_child / next_sibling / parent links, and depth tracks the current indentation level
+       PUGI__FN void node_output(xml_buffered_writer& writer, xml_node_struct* root, const char_t* indent, unsigned int flags, unsigned int depth)
+       {
+               // indentation is only active when format_indent is set and format_raw is not
+               size_t indent_length = ((flags & (format_indent | format_raw)) == format_indent) ? strlength(indent) : 0;
+
+               xml_node_struct* node = root;
+
+               do
+               {
+                       assert(node);
+
+                       // begin writing current node
+                       if (indent_length)
+                               text_output_indent(writer, indent, indent_length, depth);
+
+                       if (PUGI__NODETYPE(node) == node_element)
+                       {
+                               // true means the element has children that are still to be written: descend one level
+                               if (node_output_start(writer, node, flags))
+                               {
+                                       node = node->first_child;
+                                       depth++;
+                                       continue;
+                               }
+                       }
+                       else if (PUGI__NODETYPE(node) == node_document)
+                       {
+                               // the document node produces no output itself and does not increase depth
+                               if (node->first_child)
+                               {
+                                       node = node->first_child;
+                                       continue;
+                               }
+                       }
+                       else
+                       {
+                               node_output_simple(writer, node, flags);
+                       }
+
+                       // continue to the next node
+                       // (move to the next sibling if there is one, otherwise climb up closing elements on the way)
+                       while (node != root)
+                       {
+                               if (node->next_sibling)
+                               {
+                                       node = node->next_sibling;
+                                       break;
+                               }
+
+                               node = node->parent;
+
+                               // write closing node
+                               if (PUGI__NODETYPE(node) == node_element)
+                               {
+                                       depth--;
+
+                                       if (indent_length)
+                                               text_output_indent(writer, indent, indent_length, depth);
+
+                                       node_output_end(writer, node, flags);
+                               }
+                       }
+               }
+               // the walk is over once it has climbed back to root
+               while (node != root);
+       }
+
+       PUGI__FN bool has_declaration(xml_node_struct* node)
+       {
+               for (xml_node_struct* child = node->first_child; child; child = child->next_sibling)
+               {
+                       xml_node_type type = PUGI__NODETYPE(child);
+
+                       if (type == node_declaration) return true;
+                       if (type == node_element) return false;
+               }
+
+               return false;
+       }
+
+       PUGI__FN bool is_attribute_of(xml_attribute_struct* attr, xml_node_struct* node)
+       {
+               for (xml_attribute_struct* a = node->first_attribute; a; a = a->next_attribute)
+                       if (a == attr)
+                               return true;
+
+               return false;
+       }
+
+       // Only elements and declarations can carry attributes
+       PUGI__FN bool allow_insert_attribute(xml_node_type parent)
+       {
+               switch (parent)
+               {
+               case node_element:
+               case node_declaration:
+                       return true;
+
+               default:
+                       return false;
+               }
+       }
+
+       PUGI__FN bool allow_insert_child(xml_node_type parent, xml_node_type child)
+       {
+               if (parent != node_document && parent != node_element) return false;
+               if (child == node_document || child == node_null) return false;
+               if (parent != node_document && (child == node_declaration || child == node_doctype)) return false;
+
+               return true;
+       }
+
+       PUGI__FN bool allow_move(xml_node parent, xml_node child)
+       {
+               // check that child can be a child of parent
+               if (!allow_insert_child(parent.type(), child.type()))
+                       return false;
+
+               // check that node is not moved between documents
+               if (parent.root() != child.root())
+                       return false;
+
+               // check that new parent is not in the child subtree
+               xml_node cur = parent;
+
+               while (cur)
+               {
+                       if (cur == child)
+                               return false;
+
+                       cur = cur.parent();
+               }
+
+               return true;
+       }
+
+       PUGI__FN void node_copy_string(char_t*& dest, uintptr_t& header, uintptr_t header_mask, char_t* source, uintptr_t& source_header, xml_allocator* alloc)
+       {
+               assert(!dest && (header & header_mask) == 0);
+
+               if (source)
+               {
+                       if (alloc && (source_header & header_mask) == 0)
+                       {
+                               dest = source;
+
+                               // since strcpy_insitu can reuse document buffer memory we need to mark both source and dest as shared
+                               header |= xml_memory_page_contents_shared_mask;
+                               source_header |= xml_memory_page_contents_shared_mask;
+                       }
+                       else
+                               strcpy_insitu(dest, header, header_mask, source);
+               }
+       }
+
+       PUGI__FN void node_copy_contents(xml_node_struct* dn, xml_node_struct* sn, xml_allocator* shared_alloc)
+       {
+               node_copy_string(dn->name, dn->header, xml_memory_page_name_allocated_mask, sn->name, sn->header, shared_alloc);
+               node_copy_string(dn->value, dn->header, xml_memory_page_value_allocated_mask, sn->value, sn->header, shared_alloc);
+
+               for (xml_attribute_struct* sa = sn->first_attribute; sa; sa = sa->next_attribute)
+               {
+                       xml_attribute_struct* da = append_new_attribute(dn, get_allocator(dn));
+
+                       if (da)
+                       {
+                               node_copy_string(da->name, da->header, xml_memory_page_name_allocated_mask, sa->name, sa->header, shared_alloc);
+                               node_copy_string(da->value, da->header, xml_memory_page_value_allocated_mask, sa->value, sa->header, shared_alloc);
+                       }
+               }
+       }
+
+       // Deep-copies the subtree rooted at sn into dn: first the contents of sn itself, then every
+       // descendant, using an iterative depth-first walk instead of recursion.
+       // sit is the source node currently visited; dit is the destination node that receives its copy as a child.
+       PUGI__FN void node_copy_tree(xml_node_struct* dn, xml_node_struct* sn)
+       {
+               xml_allocator& alloc = get_allocator(dn);
+               // string sharing is only offered when both nodes are served by the same allocator
+               xml_allocator* shared_alloc = (&alloc == &get_allocator(sn)) ? &alloc : 0;
+
+               node_copy_contents(dn, sn, shared_alloc);
+
+               xml_node_struct* dit = dn;
+               xml_node_struct* sit = sn->first_child;
+
+               while (sit && sit != sn)
+               {
+                       // the destination node itself is never copied
+                       // NOTE(review): presumably this covers dn having been inserted inside sn's own subtree - confirm
+                       if (sit != dn)
+                       {
+                               xml_node_struct* copy = append_new_node(dit, alloc, PUGI__NODETYPE(sit));
+
+                               // if the node could not be created its whole subtree is skipped
+                               if (copy)
+                               {
+                                       node_copy_contents(copy, sit, shared_alloc);
+
+                                       if (sit->first_child)
+                                       {
+                                               // descend: the fresh copy becomes the parent for the next level
+                                               dit = copy;
+                                               sit = sit->first_child;
+                                               continue;
+                                       }
+                               }
+                       }
+
+                       // continue to the next node
+                       // (take the next sibling if there is one, otherwise climb in both trees until one exists or sn is reached)
+                       do
+                       {
+                               if (sit->next_sibling)
+                               {
+                                       sit = sit->next_sibling;
+                                       break;
+                               }
+
+                               sit = sit->parent;
+                               dit = dit->parent;
+                       }
+                       while (sit != sn);
+               }
+       }
+
+       // True when the node type is node_pcdata or node_cdata.
+       inline bool is_text_node(xml_node_struct* node)
+       {
+               switch (PUGI__NODETYPE(node))
+               {
+               case node_pcdata:
+               case node_cdata:
+                       return true;
+
+               default:
+                       return false;
+               }
+       }
+
+       // get value with conversion functions
+       PUGI__FN int get_integer_base(const char_t* value)
+       {
+               const char_t* s = value;
+
+               while (PUGI__IS_CHARTYPE(*s, ct_space))
+                       s++;
+
+               if (*s == '-')
+                       s++;
+
+               return (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) ? 16 : 10;
+       }
+
+       PUGI__FN int get_value_int(const char_t* value, int def)
+       {
+               if (!value) return def;
+
+               int base = get_integer_base(value);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return static_cast<int>(wcstol(value, 0, base));
+       #else
+               return static_cast<int>(strtol(value, 0, base));
+       #endif
+       }
+
+       PUGI__FN unsigned int get_value_uint(const char_t* value, unsigned int def)
+       {
+               if (!value) return def;
+
+               int base = get_integer_base(value);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return static_cast<unsigned int>(wcstoul(value, 0, base));
+       #else
+               return static_cast<unsigned int>(strtoul(value, 0, base));
+       #endif
+       }
+
+       PUGI__FN double get_value_double(const char_t* value, double def)
+       {
+               if (!value) return def;
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return wcstod(value, 0);
+       #else
+               return strtod(value, 0);
+       #endif
+       }
+
+       PUGI__FN float get_value_float(const char_t* value, float def)
+       {
+               if (!value) return def;
+
+       #ifdef PUGIXML_WCHAR_MODE
+               return static_cast<float>(wcstod(value, 0));
+       #else
+               return static_cast<float>(strtod(value, 0));
+       #endif
+       }
+
+       PUGI__FN bool get_value_bool(const char_t* value, bool def)
+       {
+               if (!value) return def;
+
+               // only look at first char
+               char_t first = *value;
+
+               // 1*, t* (true), T* (True), y* (yes), Y* (YES)
+               return (first == '1' || first == 't' || first == 'T' || first == 'y' || first == 'Y');
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       PUGI__FN long long get_value_llong(const char_t* value, long long def)
+       {
+               if (!value) return def;
+
+               int base = get_integer_base(value);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               #ifdef PUGI__MSVC_CRT_VERSION
+                       return _wcstoi64(value, 0, base);
+               #else
+                       return wcstoll(value, 0, base);
+               #endif
+       #else
+               #ifdef PUGI__MSVC_CRT_VERSION
+                       return _strtoi64(value, 0, base);
+               #else
+                       return strtoll(value, 0, base);
+               #endif
+       #endif
+       }
+
+       PUGI__FN unsigned long long get_value_ullong(const char_t* value, unsigned long long def)
+       {
+               if (!value) return def;
+
+               int base = get_integer_base(value);
+
+       #ifdef PUGIXML_WCHAR_MODE
+               #ifdef PUGI__MSVC_CRT_VERSION
+                       return _wcstoui64(value, 0, base);
+               #else
+                       return wcstoull(value, 0, base);
+               #endif
+       #else
+               #ifdef PUGI__MSVC_CRT_VERSION
+                       return _strtoui64(value, 0, base);
+               #else
+                       return strtoull(value, 0, base);
+               #endif
+       #endif
+       }
+#endif
+
+       // set value with conversion functions
+       PUGI__FN bool set_value_buffer(char_t*& dest, uintptr_t& header, uintptr_t header_mask, char (&buf)[128])
+       {
+       #ifdef PUGIXML_WCHAR_MODE
+               char_t wbuf[128];
+               impl::widen_ascii(wbuf, buf);
+
+               return strcpy_insitu(dest, header, header_mask, wbuf);
+       #else
+               return strcpy_insitu(dest, header, header_mask, buf);
+       #endif
+       }
+
+       // set_value_convert overloads: format a value as text and store it through set_value_buffer / strcpy_insitu.
+       // Each returns whatever the storing call returns.
+
+       // int: formatted with "%d".
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, int value)
+       {
+               char buf[128];
+               sprintf(buf, "%d", value);
+       
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+
+       // unsigned int: formatted with "%u".
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, unsigned int value)
+       {
+               char buf[128];
+               sprintf(buf, "%u", value);
+
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+
+       // float: formatted with "%.9g" (9 significant digits).
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, float value)
+       {
+               char buf[128];
+               sprintf(buf, "%.9g", value);
+
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+       
+       // double: formatted with "%.17g" (17 significant digits).
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, double value)
+       {
+               char buf[128];
+               sprintf(buf, "%.17g", value);
+
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+       
+       // bool: stored as the literal text "true" or "false", no intermediate buffer needed.
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, bool value)
+       {
+               return strcpy_insitu(dest, header, header_mask, value ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false"));
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       // long long: formatted with "%lld" and stored through set_value_buffer.
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, long long value)
+       {
+               char buf[128];
+               sprintf(buf, "%lld", value);
+       
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+
+       // unsigned long long: formatted with "%llu" and stored through set_value_buffer.
+       PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, unsigned long long value)
+       {
+               char buf[128];
+               sprintf(buf, "%llu", value);
+       
+               return set_value_buffer(dest, header, header_mask, buf);
+       }
+#endif
+
+       // we need to get length of entire file to load it in memory; the only (relatively) sane way to do it is via seek/tell trick
+       // On success stores the length in out_result, leaves the file positioned at its start and returns status_ok.
+       // Returns status_io_error when the tell call reports a negative length, and status_out_of_memory when
+       // the length does not survive a round trip through size_t. out_result is not written on failure.
+       PUGI__FN xml_parse_status get_file_size(FILE* file, size_t& out_result)
+       {
+       #if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE)
+               // there are 64-bit versions of fseek/ftell, let's use them
+               typedef __int64 length_type;
+
+               _fseeki64(file, 0, SEEK_END);
+               length_type length = _ftelli64(file);
+               _fseeki64(file, 0, SEEK_SET);
+       #elif defined(__MINGW32__) && !defined(__NO_MINGW_LFS) && (!defined(__STRICT_ANSI__) || defined(__MINGW64_VERSION_MAJOR))
+               // there are 64-bit versions of fseek/ftell, let's use them
+               typedef off64_t length_type;
+
+               fseeko64(file, 0, SEEK_END);
+               length_type length = ftello64(file);
+               fseeko64(file, 0, SEEK_SET);
+       #else
+               // if this is a 32-bit OS, long is enough; if this is a unix system, long is 64-bit, which is enough; otherwise we can't do anything anyway.
+               typedef long length_type;
+
+               fseek(file, 0, SEEK_END);
+               length_type length = ftell(file);
+               fseek(file, 0, SEEK_SET);
+       #endif
+
+               // check for I/O errors
+               // (the return values of the seek calls themselves are not examined; only the tell result is)
+               if (length < 0) return status_io_error;
+               
+               // check for overflow
+               size_t result = static_cast<size_t>(length);
+
+               if (static_cast<length_type>(result) != length) return status_out_of_memory;
+
+               // finalize
+               out_result = result;
+
+               return status_ok;
+       }
+
+       PUGI__FN size_t zero_terminate_buffer(void* buffer, size_t size, xml_encoding encoding) 
+       {
+               // We only need to zero-terminate if encoding conversion does not do it for us
+       #ifdef PUGIXML_WCHAR_MODE
+               xml_encoding wchar_encoding = get_wchar_encoding();
+
+               if (encoding == wchar_encoding || need_endian_swap_utf(encoding, wchar_encoding))
+               {
+                       size_t length = size / sizeof(char_t);
+
+                       static_cast<char_t*>(buffer)[length] = 0;
+                       return (length + 1) * sizeof(char_t);
+               }
+       #else
+               if (encoding == encoding_utf8)
+               {
+                       static_cast<char*>(buffer)[size] = 0;
+                       return size + 1;
+               }
+       #endif
+
+               return size;
+       }
+
+       // Reads the whole of file into a buffer obtained from xml_memory::allocate and hands that buffer to
+       // doc.load_buffer_inplace_own. The file is closed on every path once it is known to be non-null.
+       // Returns status_file_not_found for a null file, the get_file_size status if sizing fails,
+       // status_out_of_memory if the buffer cannot be allocated, and status_io_error on a short read.
+       PUGI__FN xml_parse_result load_file_impl(xml_document& doc, FILE* file, unsigned int options, xml_encoding encoding)
+       {
+               if (!file) return make_parse_result(status_file_not_found);
+
+               // get file size (can result in I/O errors)
+               size_t size = 0;
+               xml_parse_status size_status = get_file_size(file, size);
+
+               if (size_status != status_ok)
+               {
+                       fclose(file);
+                       return make_parse_result(size_status);
+               }
+               
+               // room reserved after the payload for the terminator that zero_terminate_buffer may append
+               size_t max_suffix_size = sizeof(char_t);
+
+               // allocate buffer for the whole file
+               char* contents = static_cast<char*>(xml_memory::allocate(size + max_suffix_size));
+
+               if (!contents)
+               {
+                       fclose(file);
+                       return make_parse_result(status_out_of_memory);
+               }
+
+               // read file in memory
+               size_t read_size = fread(contents, 1, size, file);
+               fclose(file);
+
+               if (read_size != size)
+               {
+                       xml_memory::deallocate(contents);
+                       return make_parse_result(status_io_error);
+               }
+
+               xml_encoding real_encoding = get_buffer_encoding(encoding, contents, size);
+               
+               // contents is passed on here and is not freed by this function afterwards
+               return doc.load_buffer_inplace_own(contents, zero_terminate_buffer(contents, size, real_encoding), options, real_encoding);
+       }
+
+#ifndef PUGIXML_NO_STL
+       template <typename T> struct xml_stream_chunk
+       {
+               static xml_stream_chunk* create()
+               {
+                       void* memory = xml_memory::allocate(sizeof(xml_stream_chunk));
+                       
+                       return new (memory) xml_stream_chunk();
+               }
+
+               static void destroy(void* ptr)
+               {
+                       xml_stream_chunk* chunk = static_cast<xml_stream_chunk*>(ptr);
+
+                       // free chunk chain
+                       while (chunk)
+                       {
+                               xml_stream_chunk* next_ = chunk->next;
+
+                               xml_memory::deallocate(chunk);
+
+                               chunk = next_;
+                       }
+               }
+
+               xml_stream_chunk(): next(0), size(0)
+               {
+               }
+
+               xml_stream_chunk* next;
+               size_t size;
+
+               T data[xml_memory_page_size / sizeof(T)];
+       };
+
+       // Reads a stream to its end without seeking: data is collected in a chain of xml_stream_chunk
+       // objects and then copied into one contiguous buffer from xml_memory::allocate.
+       // On success *out_buffer receives that buffer and *out_size its payload size in bytes (the buffer
+       // has sizeof(char_t) spare bytes after the payload). Returns status_out_of_memory or status_io_error on failure.
+       template <typename T> PUGI__FN xml_parse_status load_stream_data_noseek(std::basic_istream<T>& stream, void** out_buffer, size_t* out_size)
+       {
+               // holder that passes its data to xml_stream_chunk<T>::destroy when it goes out of scope
+               // NOTE(review): release semantics of buffer_holder are defined elsewhere - confirm
+               buffer_holder chunks(0, xml_stream_chunk<T>::destroy);
+
+               // read file to a chunk list
+               size_t total = 0;
+               xml_stream_chunk<T>* last = 0;
+
+               while (!stream.eof())
+               {
+                       // allocate new chunk
+                       xml_stream_chunk<T>* chunk = xml_stream_chunk<T>::create();
+                       if (!chunk) return status_out_of_memory;
+
+                       // append chunk to list
+                       if (last) last = last->next = chunk;
+                       else chunks.data = last = chunk;
+
+                       // read data to chunk
+                       stream.read(chunk->data, static_cast<std::streamsize>(sizeof(chunk->data) / sizeof(T)));
+                       chunk->size = static_cast<size_t>(stream.gcount()) * sizeof(T);
+
+                       // read may set failbit | eofbit in case gcount() is less than read length, so check for other I/O errors
+                       if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error;
+
+                       // guard against huge files (chunk size is small enough to make this overflow check work)
+                       if (total + chunk->size < total) return status_out_of_memory;
+                       total += chunk->size;
+               }
+
+               // extra room after the payload for a terminator added later by the caller
+               size_t max_suffix_size = sizeof(char_t);
+
+               // copy chunk list to a contiguous buffer
+               char* buffer = static_cast<char*>(xml_memory::allocate(total + max_suffix_size));
+               if (!buffer) return status_out_of_memory;
+
+               char* write = buffer;
+
+               for (xml_stream_chunk<T>* chunk = static_cast<xml_stream_chunk<T>*>(chunks.data); chunk; chunk = chunk->next)
+               {
+                       assert(write + chunk->size <= buffer + total);
+                       memcpy(write, chunk->data, chunk->size);
+                       write += chunk->size;
+               }
+
+               assert(write == buffer + total);
+
+               // return buffer
+               *out_buffer = buffer;
+               *out_size = total;
+
+               return status_ok;
+       }
+
+       template <typename T> PUGI__FN xml_parse_status load_stream_data_seek(std::basic_istream<T>& stream, void** out_buffer, size_t* out_size)
+       {
+               // get length of remaining data in stream
+               typename std::basic_istream<T>::pos_type pos = stream.tellg();
+               stream.seekg(0, std::ios::end);
+               std::streamoff length = stream.tellg() - pos;
+               stream.seekg(pos);
+
+               if (stream.fail() || pos < 0) return status_io_error;
+
+               // guard against huge files
+               size_t read_length = static_cast<size_t>(length);
+
+               if (static_cast<std::streamsize>(read_length) != length || length < 0) return status_out_of_memory;
+
+               size_t max_suffix_size = sizeof(char_t);
+
+               // read stream data into memory (guard against stream exceptions with buffer holder)
+               buffer_holder buffer(xml_memory::allocate(read_length * sizeof(T) + max_suffix_size), xml_memory::deallocate);
+               if (!buffer.data) return status_out_of_memory;
+
+               stream.read(static_cast<T*>(buffer.data), static_cast<std::streamsize>(read_length));
+
+               // read may set failbit | eofbit in case gcount() is less than read_length (i.e. line ending conversion), so check for other I/O errors
+               if (stream.bad() || (!stream.eof() && stream.fail())) return status_io_error;
+
+               // return buffer
+               size_t actual_length = static_cast<size_t>(stream.gcount());
+               assert(actual_length <= read_length);
+               
+               *out_buffer = buffer.release();
+               *out_size = actual_length * sizeof(T);
+
+               return status_ok;
+       }
+
+       template <typename T> PUGI__FN xml_parse_result load_stream_impl(xml_document& doc, std::basic_istream<T>& stream, unsigned int options, xml_encoding encoding)
+       {
+               void* buffer = 0;
+               size_t size = 0;
+               xml_parse_status status = status_ok;
+
+               // if stream has an error bit set, bail out (otherwise tellg() can fail and we'll clear error bits)
+               if (stream.fail()) return make_parse_result(status_io_error);
+
+               // load stream to memory (using seek-based implementation if possible, since it's faster and takes less memory)
+               if (stream.tellg() < 0)
+               {
+                       stream.clear(); // clear error flags that could be set by a failing tellg
+                       status = load_stream_data_noseek(stream, &buffer, &size);
+               }
+               else
+                       status = load_stream_data_seek(stream, &buffer, &size);
+
+               if (status != status_ok) return make_parse_result(status);
+
+               xml_encoding real_encoding = get_buffer_encoding(encoding, buffer, size);
+               
+               return doc.load_buffer_inplace_own(buffer, zero_terminate_buffer(buffer, size, real_encoding), options, real_encoding);
+       }
+#endif
+
+#if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__) || (defined(__MINGW32__) && (!defined(__STRICT_ANSI__) || defined(__MINGW64_VERSION_MAJOR)))
+       // Opens a file given a wide-character path and mode by forwarding directly to _wfopen.
+       // Selected on toolchains matched by the enclosing #if (MSVC CRT, Borland, MinGW).
+       PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode)
+       {
+               return _wfopen(path, mode);
+       }
+#else
+       // Converts a wide-character string to UTF-8 in a buffer obtained from xml_memory::allocate.
+       // Returns 0 when the allocation fails; otherwise the caller must release the result with xml_memory::deallocate.
+       // str must not be null (asserted).
+       PUGI__FN char* convert_path_heap(const wchar_t* str)
+       {
+               assert(str);
+
+               // first pass: get length in utf8 characters
+               size_t length = strlength_wide(str);
+               size_t size = as_utf8_begin(str, length);
+
+               // allocate resulting string
+               // (one extra byte beyond the converted size; NOTE(review): presumably as_utf8_end writes the terminator there - confirm)
+               char* result = static_cast<char*>(xml_memory::allocate(size + 1));
+               if (!result) return 0;
+
+               // second pass: convert to utf8
+               as_utf8_end(result, size, str, length);
+
+               return result;
+       }
+
+       PUGI__FN FILE* open_file_wide(const wchar_t* path, const wchar_t* mode)
+       {
+               // there is no standard function to open wide paths, so our best bet is to try utf8 path
+               char* path_utf8 = convert_path_heap(path);
+               if (!path_utf8) return 0;
+
+               // convert mode to ASCII (we mirror _wfopen interface)
+               char mode_ascii[4] = {0};
+               for (size_t i = 0; mode[i]; ++i) mode_ascii[i] = static_cast<char>(mode[i]);
+
+               // try to open the utf8 path
+               FILE* result = fopen(path_utf8, mode_ascii);
+
+               // free dummy buffer
+               xml_memory::deallocate(path_utf8);
+
+               return result;
+       }
+#endif
+
+       PUGI__FN bool save_file_impl(const xml_document& doc, FILE* file, const char_t* indent, unsigned int flags, xml_encoding encoding)
+       {
+               if (!file) return false;
+
+               xml_writer_file writer(file);
+               doc.save(writer, indent, flags, encoding);
+
+               int result = ferror(file);
+
+               fclose(file);
+
+               return result == 0;
+       }
+
+       PUGI__FN xml_parse_result load_buffer_impl(xml_document_struct* doc, xml_node_struct* root, void* contents, size_t size, unsigned int options, xml_encoding encoding, bool is_mutable, bool own, char_t** out_buffer)
+       {
+               // check input buffer
+               assert(contents || size == 0);
+
+               // get actual encoding
+               xml_encoding buffer_encoding = impl::get_buffer_encoding(encoding, contents, size);
+
+               // get private buffer
+               char_t* buffer = 0;
+               size_t length = 0;
+
+               if (!impl::convert_buffer(buffer, length, buffer_encoding, contents, size, is_mutable)) return impl::make_parse_result(status_out_of_memory);
+               
+               // delete original buffer if we performed a conversion
+               if (own && buffer != contents && contents) impl::xml_memory::deallocate(contents);
+
+               // store buffer for offset_debug
+               doc->buffer = buffer;
+
+               // parse
+               xml_parse_result res = impl::xml_parser::parse(buffer, length, doc, root, options);
+
+               // remember encoding
+               res.encoding = buffer_encoding;
+
+               // grab onto buffer if it's our buffer, user is responsible for deallocating contents himself
+               if (own || buffer != contents) *out_buffer = buffer;
+
+               return res;
+       }
+PUGI__NS_END
+
+namespace pugi
+{
+       // Stores the file handle; it is kept as void* and cast back to FILE* when writing.
+       PUGI__FN xml_writer_file::xml_writer_file(void* file_): file(file_)
+       {
+       }
+
+       // Writes size bytes from data to the stored FILE* with fwrite.
+       // The number of bytes written is deliberately discarded; this function reports no errors.
+       PUGI__FN void xml_writer_file::write(const void* data, size_t size)
+       {
+               size_t result = fwrite(data, 1, size, static_cast<FILE*>(file));
+               (void)!result; // unfortunately we can't do proper error handling here
+       }
+
+#ifndef PUGIXML_NO_STL
+       // Binds the writer to a narrow (char) output stream; the wide stream pointer is left null.
+       PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream): narrow_stream(&stream), wide_stream(0)
+       {
+       }
+
+       // Binds the writer to a wide (wchar_t) output stream; the narrow stream pointer is left null.
+       PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream): narrow_stream(0), wide_stream(&stream)
+       {
+       }
+
+       PUGI__FN void xml_writer_stream::write(const void* data, size_t size)
+       {
+               if (narrow_stream)
+               {
+                       assert(!wide_stream);
+                       narrow_stream->write(reinterpret_cast<const char*>(data), static_cast<std::streamsize>(size));
+               }
+               else
+               {
+                       assert(wide_stream);
+                       assert(size % sizeof(wchar_t) == 0);
+
+                       wide_stream->write(reinterpret_cast<const wchar_t*>(data), static_cast<std::streamsize>(size / sizeof(wchar_t)));
+               }
+       }
+#endif
+
+       // Starts with a depth of 0.
+       PUGI__FN xml_tree_walker::xml_tree_walker(): _depth(0)
+       {
+       }
+       
+       // Nothing to release.
+       PUGI__FN xml_tree_walker::~xml_tree_walker()
+       {
+       }
+
+       // Returns the stored traversal depth.
+       PUGI__FN int xml_tree_walker::depth() const
+       {
+               return _depth;
+       }
+
+       // Default start-of-traversal callback: ignores the node and returns true.
+       PUGI__FN bool xml_tree_walker::begin(xml_node&)
+       {
+               return true;
+       }
+
+       // Default end-of-traversal callback: ignores the node and returns true.
+       PUGI__FN bool xml_tree_walker::end(xml_node&)
+       {
+               return true;
+       }
+
+       // Constructs an empty handle (null internal pointer).
+       PUGI__FN xml_attribute::xml_attribute(): _attr(0)
+       {
+       }
+
+       // Wraps an existing attribute structure; attr may be null, giving an empty handle.
+       PUGI__FN xml_attribute::xml_attribute(xml_attribute_struct* attr): _attr(attr)
+       {
+       }
+
+       // Dummy function whose address serves as the "true" value of the bool-like conversion below.
+       PUGI__FN static void unspecified_bool_xml_attribute(xml_attribute***)
+       {
+       }
+
+       // Bool-like conversion: a non-null function pointer for a non-empty handle, null otherwise.
+       PUGI__FN xml_attribute::operator xml_attribute::unspecified_bool_type() const
+       {
+               return _attr ? unspecified_bool_xml_attribute : 0;
+       }
+
+       // True when the handle is empty.
+       PUGI__FN bool xml_attribute::operator!() const
+       {
+               return !_attr;
+       }
+
+       // Comparison operators: all of them compare the internal attribute pointers, not names or values.
+       // Two handles are equal exactly when they refer to the same attribute structure (or are both empty).
+       PUGI__FN bool xml_attribute::operator==(const xml_attribute& r) const
+       {
+               return (_attr == r._attr);
+       }
+       
+       PUGI__FN bool xml_attribute::operator!=(const xml_attribute& r) const
+       {
+               return (_attr != r._attr);
+       }
+
+       // Ordering is by pointer value.
+       PUGI__FN bool xml_attribute::operator<(const xml_attribute& r) const
+       {
+               return (_attr < r._attr);
+       }
+       
+       PUGI__FN bool xml_attribute::operator>(const xml_attribute& r) const
+       {
+               return (_attr > r._attr);
+       }
+       
+       PUGI__FN bool xml_attribute::operator<=(const xml_attribute& r) const
+       {
+               return (_attr <= r._attr);
+       }
+       
+       PUGI__FN bool xml_attribute::operator>=(const xml_attribute& r) const
+       {
+               return (_attr >= r._attr);
+       }
+
+       PUGI__FN xml_attribute xml_attribute::next_attribute() const
+       {
+               return _attr ? xml_attribute(_attr->next_attribute) : xml_attribute();
+       }
+
+       PUGI__FN xml_attribute xml_attribute::previous_attribute() const
+       {
+               return _attr && _attr->prev_attribute_c->next_attribute ? xml_attribute(_attr->prev_attribute_c) : xml_attribute();
+       }
+
+       // Returns the attribute value, or def when the handle is empty or the value pointer is null.
+       PUGI__FN const char_t* xml_attribute::as_string(const char_t* def) const
+       {
+               return (_attr && _attr->value) ? _attr->value : def;
+       }
+
+       // Typed accessors: each passes the value pointer (or null for an empty handle) to the matching
+       // impl::get_value_* helper, which returns def when it is given a null pointer.
+       PUGI__FN int xml_attribute::as_int(int def) const
+       {
+               return impl::get_value_int(_attr ? _attr->value : 0, def);
+       }
+
+       PUGI__FN unsigned int xml_attribute::as_uint(unsigned int def) const
+       {
+               return impl::get_value_uint(_attr ? _attr->value : 0, def);
+       }
+
+       PUGI__FN double xml_attribute::as_double(double def) const
+       {
+               return impl::get_value_double(_attr ? _attr->value : 0, def);
+       }
+
+       PUGI__FN float xml_attribute::as_float(float def) const
+       {
+               return impl::get_value_float(_attr ? _attr->value : 0, def);
+       }
+
+       PUGI__FN bool xml_attribute::as_bool(bool def) const
+       {
+               return impl::get_value_bool(_attr ? _attr->value : 0, def);
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       // 64-bit signed accessor; def is returned for an empty handle or a null value pointer.
+       PUGI__FN long long xml_attribute::as_llong(long long def) const
+       {
+               return impl::get_value_llong(_attr ? _attr->value : 0, def);
+       }
+
+       // 64-bit unsigned accessor; def is returned for an empty handle or a null value pointer.
+       PUGI__FN unsigned long long xml_attribute::as_ullong(unsigned long long def) const
+       {
+               return impl::get_value_ullong(_attr ? _attr->value : 0, def);
+       }
+#endif
+
+       // True when the handle does not refer to an attribute.
+       PUGI__FN bool xml_attribute::empty() const
+       {
+               return !_attr;
+       }
+
+       // Returns the attribute name, or an empty string (never null) when the handle is empty or the name is unset.
+       PUGI__FN const char_t* xml_attribute::name() const
+       {
+               return (_attr && _attr->name) ? _attr->name : PUGIXML_TEXT("");
+       }
+
+       // Returns the attribute value, or an empty string (never null) when the handle is empty or the value is unset.
+       PUGI__FN const char_t* xml_attribute::value() const
+       {
+               return (_attr && _attr->value) ? _attr->value : PUGIXML_TEXT("");
+       }
+
+       // Hash derived from the internal pointer, divided by the size of the attribute structure.
+       PUGI__FN size_t xml_attribute::hash_value() const
+       {
+               return static_cast<size_t>(reinterpret_cast<uintptr_t>(_attr) / sizeof(xml_attribute_struct));
+       }
+
+       // Exposes the raw internal pointer (null for an empty handle).
+       PUGI__FN xml_attribute_struct* xml_attribute::internal_object() const
+       {
+               return _attr;
+       }
+
+       // Assignment overloads: each forwards to the matching set_value overload and returns *this.
+       // The bool result of set_value is discarded, so a failed assignment is not reported here.
+       PUGI__FN xml_attribute& xml_attribute::operator=(const char_t* rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+       
+       PUGI__FN xml_attribute& xml_attribute::operator=(int rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+
+       PUGI__FN xml_attribute& xml_attribute::operator=(unsigned int rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+
+       PUGI__FN xml_attribute& xml_attribute::operator=(double rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+       
+       PUGI__FN xml_attribute& xml_attribute::operator=(float rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+       
+       PUGI__FN xml_attribute& xml_attribute::operator=(bool rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       // 64-bit assignment overloads: forward to set_value and return *this; the set_value result is discarded.
+       PUGI__FN xml_attribute& xml_attribute::operator=(long long rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+
+       PUGI__FN xml_attribute& xml_attribute::operator=(unsigned long long rhs)
+       {
+               set_value(rhs);
+               return *this;
+       }
+#endif
+
+       // Replaces the attribute name with a copy of rhs.
+       // Returns false for an empty handle, otherwise the result of impl::strcpy_insitu.
+       PUGI__FN bool xml_attribute::set_name(const char_t* rhs)
+       {
+               if (!_attr) return false;
+               
+               return impl::strcpy_insitu(_attr->name, _attr->header, impl::xml_memory_page_name_allocated_mask, rhs);
+       }
+               
+       // Replaces the attribute value with a copy of rhs.
+       // Returns false for an empty handle, otherwise the result of impl::strcpy_insitu.
+       PUGI__FN bool xml_attribute::set_value(const char_t* rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::strcpy_insitu(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       // Numeric and boolean set_value overloads: each converts rhs to text with the matching
+       // impl::set_value_convert overload and stores it as the attribute value.
+       // All return false for an empty handle, otherwise the result of the conversion call.
+       PUGI__FN bool xml_attribute::set_value(int rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       PUGI__FN bool xml_attribute::set_value(unsigned int rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       PUGI__FN bool xml_attribute::set_value(double rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+       
+       PUGI__FN bool xml_attribute::set_value(float rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+       
+       PUGI__FN bool xml_attribute::set_value(bool rhs)
+       {
+               if (!_attr) return false;
+
+               return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       // Converts a long long with impl::set_value_convert and stores it as the attribute value.
+       // Returns false when the handle is empty.
+       PUGI__FN bool xml_attribute::set_value(long long rhs)
+       {
+               if (_attr)
+                       return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+
+               return false;
+       }
+
+       // Converts an unsigned long long with impl::set_value_convert and stores it as the attribute value.
+       // Returns false when the handle is empty.
+       PUGI__FN bool xml_attribute::set_value(unsigned long long rhs)
+       {
+               if (_attr)
+                       return impl::set_value_convert(_attr->value, _attr->header, impl::xml_memory_page_value_allocated_mask, rhs);
+
+               return false;
+       }
+#endif
+
+#ifdef __BORLANDC__
+       // Borland-only overload (compiled under __BORLANDC__): logical AND of an attribute handle,
+       // converted to bool, with a plain bool.
+       PUGI__FN bool operator&&(const xml_attribute& lhs, bool rhs)
+       {
+               return (bool)lhs && rhs;
+       }
+
+       // Borland-only overload (compiled under __BORLANDC__): logical OR of an attribute handle,
+       // converted to bool, with a plain bool.
+       PUGI__FN bool operator||(const xml_attribute& lhs, bool rhs)
+       {
+               return (bool)lhs || rhs;
+       }
+#endif
+
+       // Default constructor: creates an empty handle (_root is null).
+       PUGI__FN xml_node::xml_node(): _root(0)
+       {
+       }
+
+       // Wraps an existing node structure; p may be null, which gives an empty handle.
+       PUGI__FN xml_node::xml_node(xml_node_struct* p): _root(p)
+       {
+       }
+       
+       // Intentionally empty: only the address of this function is used, as the non-null
+       // result of xml_node's unspecified_bool_type conversion.
+       PUGI__FN static void unspecified_bool_xml_node(xml_node***)
+       {
+       }
+
+       // Boolean-like conversion: null for an empty handle, otherwise a non-null function pointer.
+       PUGI__FN xml_node::operator xml_node::unspecified_bool_type() const
+       {
+               if (!_root) return 0;
+
+               return unspecified_bool_xml_node;
+       }
+
+       // True when the handle does not refer to any node.
+       PUGI__FN bool xml_node::operator!() const
+       {
+               return _root == 0;
+       }
+
+       // Iterator positioned at the first child (or at the end position for an empty handle).
+       PUGI__FN xml_node::iterator xml_node::begin() const
+       {
+               xml_node_struct* first = 0;
+               if (_root) first = _root->first_child;
+
+               return iterator(first, _root);
+       }
+
+       // Past-the-end child iterator: a null position paired with this node as parent.
+       PUGI__FN xml_node::iterator xml_node::end() const
+       {
+               xml_node_struct* const past_end = 0;
+
+               return iterator(past_end, _root);
+       }
+       
+       // Iterator positioned at the first attribute (or at the end position for an empty handle).
+       PUGI__FN xml_node::attribute_iterator xml_node::attributes_begin() const
+       {
+               xml_attribute_struct* first = 0;
+               if (_root) first = _root->first_attribute;
+
+               return attribute_iterator(first, _root);
+       }
+
+       // Past-the-end attribute iterator: a null position paired with this node as parent.
+       PUGI__FN xml_node::attribute_iterator xml_node::attributes_end() const
+       {
+               xml_attribute_struct* const past_end = 0;
+
+               return attribute_iterator(past_end, _root);
+       }
+       
+       // Range over all children, built from begin() and end().
+       PUGI__FN xml_object_range<xml_node_iterator> xml_node::children() const
+       {
+               const xml_node_iterator first = begin();
+               const xml_node_iterator last = end();
+
+               return xml_object_range<xml_node_iterator>(first, last);
+       }
+
+       // Range over children named name_: starts at the first matching child (via child())
+       // and ends at a null position; both iterators carry this node and the name.
+       PUGI__FN xml_object_range<xml_named_node_iterator> xml_node::children(const char_t* name_) const
+       {
+               xml_named_node_iterator first(child(name_)._root, _root, name_);
+               xml_named_node_iterator last(0, _root, name_);
+
+               return xml_object_range<xml_named_node_iterator>(first, last);
+       }
+
+       // Range over all attributes, built from attributes_begin() and attributes_end().
+       PUGI__FN xml_object_range<xml_attribute_iterator> xml_node::attributes() const
+       {
+               const xml_attribute_iterator first = attributes_begin();
+               const xml_attribute_iterator last = attributes_end();
+
+               return xml_object_range<xml_attribute_iterator>(first, last);
+       }
+
+       // Handles are equal when they wrap the same node pointer.
+       PUGI__FN bool xml_node::operator==(const xml_node& r) const
+       {
+               return r._root == _root;
+       }
+
+       // Handles differ when they wrap different node pointers.
+       PUGI__FN bool xml_node::operator!=(const xml_node& r) const
+       {
+               return r._root != _root;
+       }
+
+       // Orders handles by comparing the wrapped node pointers.
+       PUGI__FN bool xml_node::operator<(const xml_node& r) const
+       {
+               return r._root > _root;
+       }
+       
+       // Orders handles by comparing the wrapped node pointers.
+       PUGI__FN bool xml_node::operator>(const xml_node& r) const
+       {
+               return r._root < _root;
+       }
+       
+       // Orders handles by comparing the wrapped node pointers.
+       PUGI__FN bool xml_node::operator<=(const xml_node& r) const
+       {
+               return r._root >= _root;
+       }
+       
+       // Orders handles by comparing the wrapped node pointers.
+       PUGI__FN bool xml_node::operator>=(const xml_node& r) const
+       {
+               return r._root <= _root;
+       }
+
+       // True when the handle does not refer to any node.
+       PUGI__FN bool xml_node::empty() const
+       {
+               return _root == 0;
+       }
+       
+       // Node name, or an empty string when the handle is empty or no name is stored.
+       PUGI__FN const char_t* xml_node::name() const
+       {
+               if (_root && _root->name) return _root->name;
+
+               return PUGIXML_TEXT("");
+       }
+
+       // Node type as given by PUGI__NODETYPE, or node_null for an empty handle.
+       PUGI__FN xml_node_type xml_node::type() const
+       {
+               if (!_root) return node_null;
+
+               return PUGI__NODETYPE(_root);
+       }
+       
+       // Node value, or an empty string when the handle is empty or no value is stored.
+       PUGI__FN const char_t* xml_node::value() const
+       {
+               if (_root && _root->value) return _root->value;
+
+               return PUGIXML_TEXT("");
+       }
+       
+       PUGI__FN xml_node xml_node::child(const char_t* name_) const
+       {
+               if (!_root) return xml_node();
+
+               for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+                       if (i->name && impl::strequal(name_, i->name)) return xml_node(i);
+
+               return xml_node();
+       }
+
+       PUGI__FN xml_attribute xml_node::attribute(const char_t* name_) const
+       {
+               if (!_root) return xml_attribute();
+
+               for (xml_attribute_struct* i = _root->first_attribute; i; i = i->next_attribute)
+                       if (i->name && impl::strequal(name_, i->name))
+                               return xml_attribute(i);
+               
+               return xml_attribute();
+       }
+       
+       PUGI__FN xml_node xml_node::next_sibling(const char_t* name_) const
+       {
+               if (!_root) return xml_node();
+               
+               for (xml_node_struct* i = _root->next_sibling; i; i = i->next_sibling)
+                       if (i->name && impl::strequal(name_, i->name)) return xml_node(i);
+
+               return xml_node();
+       }
+
+       // Following sibling, or an empty handle when this handle is empty.
+       PUGI__FN xml_node xml_node::next_sibling() const
+       {
+               if (!_root) return xml_node();
+
+               return xml_node(_root->next_sibling);
+       }
+
+       PUGI__FN xml_node xml_node::previous_sibling(const char_t* name_) const
+       {
+               if (!_root) return xml_node();
+               
+               for (xml_node_struct* i = _root->prev_sibling_c; i->next_sibling; i = i->prev_sibling_c)
+                       if (i->name && impl::strequal(name_, i->name)) return xml_node(i);
+
+               return xml_node();
+       }
+
+       // Preceding sibling; an empty handle when this handle is empty or when the node reached
+       // through prev_sibling_c has no next_sibling.
+       PUGI__FN xml_node xml_node::previous_sibling() const
+       {
+               if (!_root) return xml_node();
+
+               xml_node_struct* prev = _root->prev_sibling_c;
+
+               return prev->next_sibling ? xml_node(prev) : xml_node();
+       }
+
+       // Parent node, or an empty handle when this handle is empty.
+       PUGI__FN xml_node xml_node::parent() const
+       {
+               if (!_root) return xml_node();
+
+               return xml_node(_root->parent);
+       }
+
+       // Document node obtained via impl::get_document, or an empty handle when this handle is empty.
+       PUGI__FN xml_node xml_node::root() const
+       {
+               if (!_root) return xml_node();
+
+               return xml_node(&impl::get_document(_root));
+       }
+
+       // Returns an xml_text helper constructed over this node's underlying structure.
+       PUGI__FN xml_text xml_node::text() const
+       {
+               return xml_text(_root);
+       }
+
+       PUGI__FN const char_t* xml_node::child_value() const
+       {
+               if (!_root) return PUGIXML_TEXT("");
+               
+               for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+                       if (i->value && impl::is_text_node(i))
+                               return i->value;
+
+               return PUGIXML_TEXT("");
+       }
+
+       // child_value() of the first child named name_.
+       PUGI__FN const char_t* xml_node::child_value(const char_t* name_) const
+       {
+               const xml_node named = child(name_);
+
+               return named.child_value();
+       }
+
+       // First attribute, or an empty handle when this handle is empty.
+       PUGI__FN xml_attribute xml_node::first_attribute() const
+       {
+               if (!_root) return xml_attribute();
+
+               return xml_attribute(_root->first_attribute);
+       }
+
+       // Last attribute, read from the first attribute's prev_attribute_c link;
+       // an empty handle when there are no attributes.
+       PUGI__FN xml_attribute xml_node::last_attribute() const
+       {
+               if (!_root) return xml_attribute();
+               if (!_root->first_attribute) return xml_attribute();
+
+               return xml_attribute(_root->first_attribute->prev_attribute_c);
+       }
+
+       // First child, or an empty handle when this handle is empty.
+       PUGI__FN xml_node xml_node::first_child() const
+       {
+               if (!_root) return xml_node();
+
+               return xml_node(_root->first_child);
+       }
+
+       // Last child, read from the first child's prev_sibling_c link;
+       // an empty handle when there are no children.
+       PUGI__FN xml_node xml_node::last_child() const
+       {
+               if (!_root) return xml_node();
+               if (!_root->first_child) return xml_node();
+
+               return xml_node(_root->first_child->prev_sibling_c);
+       }
+
+       PUGI__FN bool xml_node::set_name(const char_t* rhs)
+       {
+               switch (type())
+               {
+               case node_pi:
+               case node_declaration:
+               case node_element:
+                       return impl::strcpy_insitu(_root->name, _root->header, impl::xml_memory_page_name_allocated_mask, rhs);
+
+               default:
+                       return false;
+               }
+       }
+               
+       PUGI__FN bool xml_node::set_value(const char_t* rhs)
+       {
+               switch (type())
+               {
+               case node_pi:
+               case node_cdata:
+               case node_pcdata:
+               case node_comment:
+               case node_doctype:
+                       return impl::strcpy_insitu(_root->value, _root->header, impl::xml_memory_page_value_allocated_mask, rhs);
+
+               default:
+                       return false;
+               }
+       }
+
+       PUGI__FN xml_attribute xml_node::append_attribute(const char_t* name_)
+       {
+               if (!impl::allow_insert_attribute(type())) return xml_attribute();
+               
+               xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+               if (!a) return xml_attribute();
+
+               impl::append_attribute(a._attr, _root);
+
+               a.set_name(name_);
+               
+               return a;
+       }
+
+       PUGI__FN xml_attribute xml_node::prepend_attribute(const char_t* name_)
+       {
+               if (!impl::allow_insert_attribute(type())) return xml_attribute();
+               
+               xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+               if (!a) return xml_attribute();
+
+               impl::prepend_attribute(a._attr, _root);
+
+               a.set_name(name_);
+
+               return a;
+       }
+
+       PUGI__FN xml_attribute xml_node::insert_attribute_after(const char_t* name_, const xml_attribute& attr)
+       {
+               if (!impl::allow_insert_attribute(type())) return xml_attribute();
+               if (!attr || !impl::is_attribute_of(attr._attr, _root)) return xml_attribute();
+               
+               xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+               if (!a) return xml_attribute();
+
+               impl::insert_attribute_after(a._attr, attr._attr, _root);
+
+               a.set_name(name_);
+
+               return a;
+       }
+
+       PUGI__FN xml_attribute xml_node::insert_attribute_before(const char_t* name_, const xml_attribute& attr)
+       {
+               if (!impl::allow_insert_attribute(type())) return xml_attribute();
+               if (!attr || !impl::is_attribute_of(attr._attr, _root)) return xml_attribute();
+               
+               xml_attribute a(impl::allocate_attribute(impl::get_allocator(_root)));
+               if (!a) return xml_attribute();
+
+               impl::insert_attribute_before(a._attr, attr._attr, _root);
+
+               a.set_name(name_);
+
+               return a;
+       }
+
+       // Appends a new attribute carrying proto's name and value; an empty proto yields an empty handle.
+       PUGI__FN xml_attribute xml_node::append_copy(const xml_attribute& proto)
+       {
+               if (!proto) return xml_attribute();
+
+               xml_attribute duplicate = append_attribute(proto.name());
+
+               // set_value is a no-op returning false when the insertion above failed
+               duplicate.set_value(proto.value());
+
+               return duplicate;
+       }
+
+       // Prepends a new attribute carrying proto's name and value; an empty proto yields an empty handle.
+       PUGI__FN xml_attribute xml_node::prepend_copy(const xml_attribute& proto)
+       {
+               if (!proto) return xml_attribute();
+
+               xml_attribute duplicate = prepend_attribute(proto.name());
+
+               // set_value is a no-op returning false when the insertion above failed
+               duplicate.set_value(proto.value());
+
+               return duplicate;
+       }
+
+       PUGI__FN xml_attribute xml_node::insert_copy_after(const xml_attribute& proto, const xml_attribute& attr)
+       {
+               if (!proto) return xml_attribute();
+
+               xml_attribute result = insert_attribute_after(proto.name(), attr);
+               result.set_value(proto.value());
+
+               return result;
+       }
+
+       PUGI__FN xml_attribute xml_node::insert_copy_before(const xml_attribute& proto, const xml_attribute& attr)
+       {
+               if (!proto) return xml_attribute();
+
+               xml_attribute result = insert_attribute_before(proto.name(), attr);
+               result.set_value(proto.value());
+
+               return result;
+       }
+
+       PUGI__FN xml_node xml_node::append_child(xml_node_type type_)
+       {
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::append_node(n._root, _root);
+
+               if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::prepend_child(xml_node_type type_)
+       {
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::prepend_node(n._root, _root);
+                               
+               if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::insert_child_before(xml_node_type type_, const xml_node& node)
+       {
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+       
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::insert_node_before(n._root, node._root);
+
+               if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::insert_child_after(xml_node_type type_, const xml_node& node)
+       {
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+       
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::insert_node_after(n._root, node._root);
+
+               if (type_ == node_declaration) n.set_name(PUGIXML_TEXT("xml"));
+
+               return n;
+       }
+
+       // Appends a new element child and names it name_.
+       PUGI__FN xml_node xml_node::append_child(const char_t* name_)
+       {
+               xml_node element = append_child(node_element);
+
+               // set_name returns false without effect when the insertion failed
+               element.set_name(name_);
+
+               return element;
+       }
+
+       // Prepends a new element child and names it name_.
+       PUGI__FN xml_node xml_node::prepend_child(const char_t* name_)
+       {
+               xml_node element = prepend_child(node_element);
+
+               // set_name returns false without effect when the insertion failed
+               element.set_name(name_);
+
+               return element;
+       }
+
+       // Inserts a new element child right after node and names it name_.
+       PUGI__FN xml_node xml_node::insert_child_after(const char_t* name_, const xml_node& node)
+       {
+               xml_node element = insert_child_after(node_element, node);
+
+               // set_name returns false without effect when the insertion failed
+               element.set_name(name_);
+
+               return element;
+       }
+
+       // Inserts a new element child right before node and names it name_.
+       PUGI__FN xml_node xml_node::insert_child_before(const char_t* name_, const xml_node& node)
+       {
+               xml_node element = insert_child_before(node_element, node);
+
+               // set_name returns false without effect when the insertion failed
+               element.set_name(name_);
+
+               return element;
+       }
+
+       PUGI__FN xml_node xml_node::append_copy(const xml_node& proto)
+       {
+               xml_node_type type_ = proto.type();
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::append_node(n._root, _root);
+               impl::node_copy_tree(n._root, proto._root);
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::prepend_copy(const xml_node& proto)
+       {
+               xml_node_type type_ = proto.type();
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::prepend_node(n._root, _root);
+               impl::node_copy_tree(n._root, proto._root);
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::insert_copy_after(const xml_node& proto, const xml_node& node)
+       {
+               xml_node_type type_ = proto.type();
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::insert_node_after(n._root, node._root);
+               impl::node_copy_tree(n._root, proto._root);
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::insert_copy_before(const xml_node& proto, const xml_node& node)
+       {
+               xml_node_type type_ = proto.type();
+               if (!impl::allow_insert_child(type(), type_)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+
+               xml_node n(impl::allocate_node(impl::get_allocator(_root), type_));
+               if (!n) return xml_node();
+
+               impl::insert_node_before(n._root, node._root);
+               impl::node_copy_tree(n._root, proto._root);
+
+               return n;
+       }
+
+       PUGI__FN xml_node xml_node::append_move(const xml_node& moved)
+       {
+               if (!impl::allow_move(*this, moved)) return xml_node();
+
+               // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers
+               impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask;
+
+               impl::remove_node(moved._root);
+               impl::append_node(moved._root, _root);
+
+               return moved;
+       }
+
+       PUGI__FN xml_node xml_node::prepend_move(const xml_node& moved)
+       {
+               if (!impl::allow_move(*this, moved)) return xml_node();
+
+               // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers
+               impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask;
+
+               impl::remove_node(moved._root);
+               impl::prepend_node(moved._root, _root);
+
+               return moved;
+       }
+
+       PUGI__FN xml_node xml_node::insert_move_after(const xml_node& moved, const xml_node& node)
+       {
+               if (!impl::allow_move(*this, moved)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+               if (moved._root == node._root) return xml_node();
+
+               // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers
+               impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask;
+
+               impl::remove_node(moved._root);
+               impl::insert_node_after(moved._root, node._root);
+
+               return moved;
+       }
+
+       PUGI__FN xml_node xml_node::insert_move_before(const xml_node& moved, const xml_node& node)
+       {
+               if (!impl::allow_move(*this, moved)) return xml_node();
+               if (!node._root || node._root->parent != _root) return xml_node();
+               if (moved._root == node._root) return xml_node();
+
+               // disable document_buffer_order optimization since moving nodes around changes document order without changing buffer pointers
+               impl::get_document(_root).header |= impl::xml_memory_page_contents_shared_mask;
+
+               impl::remove_node(moved._root);
+               impl::insert_node_before(moved._root, node._root);
+
+               return moved;
+       }
+
+       // Removes the first attribute named name_; false if there is none.
+       PUGI__FN bool xml_node::remove_attribute(const char_t* name_)
+       {
+               const xml_attribute target = attribute(name_);
+
+               return remove_attribute(target);
+       }
+
+       PUGI__FN bool xml_node::remove_attribute(const xml_attribute& a)
+       {
+               if (!_root || !a._attr) return false;
+               if (!impl::is_attribute_of(a._attr, _root)) return false;
+
+               impl::remove_attribute(a._attr, _root);
+               impl::destroy_attribute(a._attr, impl::get_allocator(_root));
+
+               return true;
+       }
+
+       // Removes the first child named name_; false if there is none.
+       PUGI__FN bool xml_node::remove_child(const char_t* name_)
+       {
+               const xml_node target = child(name_);
+
+               return remove_child(target);
+       }
+
+       PUGI__FN bool xml_node::remove_child(const xml_node& n)
+       {
+               if (!_root || !n._root || n._root->parent != _root) return false;
+
+               impl::remove_node(n._root);
+               impl::destroy_node(n._root, impl::get_allocator(_root));
+
+               return true;
+       }
+
+       // Parses `contents` (`size` bytes, interpreted per `options`/`encoding`) into this node
+       // through impl::load_buffer_impl and returns the resulting xml_parse_result.
+       // Returns status_append_invalid_root when this node type cannot take an element child,
+       // and status_out_of_memory when the bookkeeping record for the buffer cannot be allocated.
+       // The buffer pointer reported by the loader is linked into doc->extra_buffers.
+       PUGI__FN xml_parse_result xml_node::append_buffer(const void* contents, size_t size, unsigned int options, xml_encoding encoding)
+       {
+               // append_buffer is only valid for elements/documents
+               if (!impl::allow_insert_child(type(), node_element)) return impl::make_parse_result(status_append_invalid_root);
+
+               // get document node
+               impl::xml_document_struct* doc = &impl::get_document(_root);
+
+               // disable document_buffer_order optimization since in a document with multiple buffers comparing buffer pointers does not make sense
+               doc->header |= impl::xml_memory_page_contents_shared_mask;
+               
+               // get extra buffer element (we'll store the document fragment buffer there so that we can deallocate it later)
+               impl::xml_memory_page* page = 0;
+               impl::xml_extra_buffer* extra = static_cast<impl::xml_extra_buffer*>(doc->allocate_memory(sizeof(impl::xml_extra_buffer), page));
+               // page is only an out-parameter of allocate_memory here; the cast silences unused-variable warnings
+               (void)page;
+
+               if (!extra) return impl::make_parse_result(status_out_of_memory);
+
+               // save name; name of the root has to be NULL before parsing - otherwise closing node mismatches will not be detected at the top level
+               char_t* rootname = _root->name;
+               _root->name = 0;
+
+               // parse
+               char_t* buffer = 0;
+               xml_parse_result res = impl::load_buffer_impl(doc, _root, const_cast<void*>(contents), size, options, encoding, false, false, &buffer);
+
+               // restore name
+               _root->name = rootname;
+
+               // add extra buffer to the list
+               extra->buffer = buffer;
+               extra->next = doc->extra_buffers;
+               doc->extra_buffers = extra;
+
+               return res;
+       }
+
+       PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* name_, const char_t* attr_name, const char_t* attr_value) const
+       {
+               if (!_root) return xml_node();
+               
+               for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+                       if (i->name && impl::strequal(name_, i->name))
+                       {
+                               for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute)
+                                       if (a->name && impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value ? a->value : PUGIXML_TEXT("")))
+                                               return xml_node(i);
+                       }
+
+               return xml_node();
+       }
+
+       PUGI__FN xml_node xml_node::find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const
+       {
+               if (!_root) return xml_node();
+               
+               for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+                       for (xml_attribute_struct* a = i->first_attribute; a; a = a->next_attribute)
+                               if (a->name && impl::strequal(attr_name, a->name) && impl::strequal(attr_value, a->value ? a->value : PUGIXML_TEXT("")))
+                                       return xml_node(i);
+
+               return xml_node();
+       }
+
+#ifndef PUGIXML_NO_STL
+       PUGI__FN string_t xml_node::path(char_t delimiter) const
+       {
+               xml_node cursor = *this; // Make a copy.
+               
+               string_t result = cursor.name();
+
+               while (cursor.parent())
+               {
+                       cursor = cursor.parent();
+                       
+                       string_t temp = cursor.name();
+                       temp += delimiter;
+                       temp += result;
+                       result.swap(temp);
+               }
+
+               return result;
+       }
+#endif
+
+       // Resolves a delimiter-separated path, relative to this node or, when path_ starts with
+       // the delimiter, relative to root(). One segment is consumed per call and the rest is
+       // handled recursively:
+       //   "."   keeps the current node, ".." moves to the parent,
+       //   any other segment is matched against child names; each matching child is searched
+       //   in turn until one produces a non-empty result.
+       // Returns this node for an empty handle or a null/empty path, the current search context
+       // when no segment is left, and an empty handle when nothing matches.
+       PUGI__FN xml_node xml_node::first_element_by_path(const char_t* path_, char_t delimiter) const
+       {
+               xml_node found = *this; // Current search context.
+
+               if (!_root || !path_ || !path_[0]) return found;
+
+               if (path_[0] == delimiter)
+               {
+                       // Absolute path; e.g. '/foo/bar'
+                       found = found.root();
+                       ++path_;
+               }
+
+               const char_t* path_segment = path_;
+
+               // skip any run of delimiters in front of the segment
+               while (*path_segment == delimiter) ++path_segment;
+
+               const char_t* path_segment_end = path_segment;
+
+               // the segment extends up to the next delimiter or the end of the string
+               while (*path_segment_end && *path_segment_end != delimiter) ++path_segment_end;
+
+               // nothing but delimiters left: the current context is the answer
+               if (path_segment == path_segment_end) return found;
+
+               const char_t* next_segment = path_segment_end;
+
+               while (*next_segment == delimiter) ++next_segment;
+
+               if (*path_segment == '.' && path_segment + 1 == path_segment_end)
+                       return found.first_element_by_path(next_segment, delimiter);
+               else if (*path_segment == '.' && *(path_segment+1) == '.' && path_segment + 2 == path_segment_end)
+                       return found.parent().first_element_by_path(next_segment, delimiter);
+               else
+               {
+                       for (xml_node_struct* j = found._root->first_child; j; j = j->next_sibling)
+                       {
+                               if (j->name && impl::strequalrange(j->name, path_segment, static_cast<size_t>(path_segment_end - path_segment)))
+                               {
+                                       xml_node subsearch = xml_node(j).first_element_by_path(next_segment, delimiter);
+
+                                       if (subsearch) return subsearch;
+                               }
+                       }
+
+                       return xml_node();
+               }
+       }
+
+       // Walks the subtree below this node depth-first. walker.begin() is called once with this
+       // node, walker.for_each() once per descendant, and walker.end() once at the finish.
+       // Returns false as soon as begin() or for_each() returns false; otherwise returns the
+       // result of end(). walker._depth is -1 around begin()/end() and is 0 for direct children,
+       // increasing by one per nesting level during for_each().
+       PUGI__FN bool xml_node::traverse(xml_tree_walker& walker)
+       {
+               walker._depth = -1;
+               
+               xml_node arg_begin = *this;
+               if (!walker.begin(arg_begin)) return false;
+
+               xml_node cur = first_child();
+                               
+               if (cur)
+               {
+                       ++walker._depth;
+
+                       do 
+                       {
+                               xml_node arg_for_each = cur;
+                               if (!walker.for_each(arg_for_each))
+                                       return false;
+                                               
+                               // descend first, then move sideways, then climb back up
+                               if (cur.first_child())
+                               {
+                                       ++walker._depth;
+                                       cur = cur.first_child();
+                               }
+                               else if (cur.next_sibling())
+                                       cur = cur.next_sibling();
+                               else
+                               {
+                                       // Borland C++ workaround
+                                       while (!cur.next_sibling() && cur != *this && !cur.parent().empty())
+                                       {
+                                               --walker._depth;
+                                               cur = cur.parent();
+                                       }
+                                               
+                                       // if we climbed back to the start node the walk is over; otherwise continue with the sibling
+                                       if (cur != *this)
+                                               cur = cur.next_sibling();
+                               }
+                       }
+                       while (cur && cur != *this);
+               }
+
+               assert(walker._depth == -1);
+
+               xml_node arg_end = *this;
+               return walker.end(arg_end);
+       }
+
+       // Hash derived from the node address divided by sizeof(xml_node_struct).
+       PUGI__FN size_t xml_node::hash_value() const
+       {
+               const uintptr_t address = reinterpret_cast<uintptr_t>(_root);
+
+               return static_cast<size_t>(address / sizeof(xml_node_struct));
+       }
+
+       // Exposes the raw node structure wrapped by this handle (null for an empty handle).
+       PUGI__FN xml_node_struct* xml_node::internal_object() const
+       {
+               return _root;
+       }
+
+       PUGI__FN void xml_node::print(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const
+       {
+               if (!_root) return;
+
+               impl::xml_buffered_writer buffered_writer(writer, encoding);
+
+               impl::node_output(buffered_writer, _root, indent, flags, depth);
+       }
+
+#ifndef PUGIXML_NO_STL
+       PUGI__FN void xml_node::print(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding, unsigned int depth) const
+       {
+               xml_writer_stream writer(stream);
+
+               print(writer, indent, flags, encoding, depth);
+       }
+
+       PUGI__FN void xml_node::print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent, unsigned int flags, unsigned int depth) const
+       {
+               xml_writer_stream writer(stream);
+
+               print(writer, indent, flags, encoding_wchar, depth);
+       }
+#endif
+
+       // Returns the distance from doc.buffer to this node's name (element, declaration, pi)
+       // or value (pcdata, cdata, comment, doctype); 0 for the document node.
+       // Returns -1 when the offset cannot be determined: empty handle, no parse buffer or
+       // extra buffers present, the string is null or its allocated_or_shared header bits are
+       // set, or the node type is none of the above.
+       PUGI__FN ptrdiff_t xml_node::offset_debug() const
+       {
+               if (!_root) return -1;
+
+               impl::xml_document_struct& doc = impl::get_document(_root);
+
+               // we can determine the offset reliably only if there is exactly one parse buffer
+               if (!doc.buffer || doc.extra_buffers) return -1;
+
+               switch (type())
+               {
+               case node_document:
+                       return 0;
+
+               case node_element:
+               case node_declaration:
+               case node_pi:
+                       return _root->name && (_root->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0 ? _root->name - doc.buffer : -1;
+
+               case node_pcdata:
+               case node_cdata:
+               case node_comment:
+               case node_doctype:
+                       return _root->value && (_root->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0 ? _root->value - doc.buffer : -1;
+
+               default:
+                       return -1;
+               }
+       }
+
+#ifdef __BORLANDC__
+       // Borland-only overload (compiled under __BORLANDC__): logical AND of a node handle,
+       // converted to bool, with a plain bool.
+       PUGI__FN bool operator&&(const xml_node& lhs, bool rhs)
+       {
+               return (bool)lhs && rhs;
+       }
+
+       // Borland-only overload (compiled under __BORLANDC__): logical OR of a node handle,
+       // converted to bool, with a plain bool.
+       PUGI__FN bool operator||(const xml_node& lhs, bool rhs)
+       {
+               return (bool)lhs || rhs;
+       }
+#endif
+
+       // Wraps the given node structure; no validation is performed here.
+       PUGI__FN xml_text::xml_text(xml_node_struct* root)
+               : _root(root)
+       {
+       }
+
+       PUGI__FN xml_node_struct* xml_text::_data() const
+       {
+               if (!_root || impl::is_text_node(_root)) return _root;
+
+               for (xml_node_struct* node = _root->first_child; node; node = node->next_sibling)
+                       if (impl::is_text_node(node))
+                               return node;
+
+               return 0;
+       }
+
+       // Returns the existing text-carrying node, or appends a new node_pcdata child to
+       // _root and returns its internal object (whatever append_child yields on failure).
+       PUGI__FN xml_node_struct* xml_text::_data_new()
+       {
+               if (xml_node_struct* existing = _data()) return existing;
+
+               xml_node owner(_root);
+               return owner.append_child(node_pcdata).internal_object();
+       }
+
+       // Default-constructs an empty text handle (null root).
+       PUGI__FN xml_text::xml_text()
+               : _root(0)
+       {
+       }
+
+       // Empty function whose address serves as the non-null value of the
+       // unspecified-bool conversion of xml_text; it is never meant to do anything.
+       PUGI__FN static void unspecified_bool_xml_text(xml_text***)
+       {
+       }
+
+       // Safe-bool conversion: non-null exactly when a text-carrying node exists.
+       PUGI__FN xml_text::operator xml_text::unspecified_bool_type() const
+       {
+               if (!_data()) return 0;
+
+               return unspecified_bool_xml_text;
+       }
+
+       // True when no text-carrying node exists.
+       PUGI__FN bool xml_text::operator!() const
+       {
+               return _data() == 0;
+       }
+
+       // True when no text-carrying node exists (same condition as operator!).
+       PUGI__FN bool xml_text::empty() const
+       {
+               return !_data();
+       }
+
+       // Returns the text value, or an empty string when there is no text node or no value.
+       PUGI__FN const char_t* xml_text::get() const
+       {
+               xml_node_struct* text_node = _data();
+
+               if (text_node && text_node->value) return text_node->value;
+
+               return PUGIXML_TEXT("");
+       }
+
+       // Returns the text value, or def when there is no text node or no value.
+       PUGI__FN const char_t* xml_text::as_string(const char_t* def) const
+       {
+               xml_node_struct* text_node = _data();
+
+               if (text_node && text_node->value) return text_node->value;
+
+               return def;
+       }
+
+       // Converts the text value via impl::get_value_int; a null value pointer is passed
+       // when there is no text node, and def is forwarded as the fallback.
+       PUGI__FN int xml_text::as_int(int def) const
+       {
+               const char_t* raw = 0;
+
+               if (xml_node_struct* text_node = _data()) raw = text_node->value;
+
+               return impl::get_value_int(raw, def);
+       }
+
+       // Converts the text value via impl::get_value_uint; a null value pointer is passed
+       // when there is no text node, and def is forwarded as the fallback.
+       PUGI__FN unsigned int xml_text::as_uint(unsigned int def) const
+       {
+               const char_t* raw = 0;
+
+               if (xml_node_struct* text_node = _data()) raw = text_node->value;
+
+               return impl::get_value_uint(raw, def);
+       }
+
+       // Converts the text value via impl::get_value_double; a null value pointer is passed
+       // when there is no text node, and def is forwarded as the fallback.
+       PUGI__FN double xml_text::as_double(double def) const
+       {
+               const char_t* raw = 0;
+
+               if (xml_node_struct* text_node = _data()) raw = text_node->value;
+
+               return impl::get_value_double(raw, def);
+       }
+
+       // Converts the text value via impl::get_value_float; a null value pointer is passed
+       // when there is no text node, and def is forwarded as the fallback.
+       PUGI__FN float xml_text::as_float(float def) const
+       {
+               const char_t* raw = 0;
+
+               if (xml_node_struct* text_node = _data()) raw = text_node->value;
+
+               return impl::get_value_float(raw, def);
+       }
+
+       // Converts the text value via impl::get_value_bool; a null value pointer is passed
+       // when there is no text node, and def is forwarded as the fallback.
+       PUGI__FN bool xml_text::as_bool(bool def) const
+       {
+               const char_t* raw = 0;
+
+               if (xml_node_struct* text_node = _data()) raw = text_node->value;
+
+               return impl::get_value_bool(raw, def);
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       // Converts the text value via impl::get_value_llong; a null value pointer is passed
+       // when there is no text node, and def is forwarded as the fallback.
+       PUGI__FN long long xml_text::as_llong(long long def) const
+       {
+               const char_t* raw = 0;
+
+               if (xml_node_struct* text_node = _data()) raw = text_node->value;
+
+               return impl::get_value_llong(raw, def);
+       }
+
+       // Converts the text value via impl::get_value_ullong; a null value pointer is passed
+       // when there is no text node, and def is forwarded as the fallback.
+       PUGI__FN unsigned long long xml_text::as_ullong(unsigned long long def) const
+       {
+               const char_t* raw = 0;
+
+               if (xml_node_struct* text_node = _data()) raw = text_node->value;
+
+               return impl::get_value_ullong(raw, def);
+       }
+#endif
+
+       // Stores a string as the text value, creating the text node if needed.
+       // Returns false when no text node could be obtained, else the result of strcpy_insitu.
+       PUGI__FN bool xml_text::set(const char_t* rhs)
+       {
+               xml_node_struct* target = _data_new();
+
+               if (!target) return false;
+
+               return impl::strcpy_insitu(target->value, target->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       // Stores an int as the text value, creating the text node if needed.
+       // Returns false when no text node could be obtained, else the result of set_value_convert.
+       PUGI__FN bool xml_text::set(int rhs)
+       {
+               xml_node_struct* target = _data_new();
+
+               if (!target) return false;
+
+               return impl::set_value_convert(target->value, target->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       // Stores an unsigned int as the text value, creating the text node if needed.
+       // Returns false when no text node could be obtained, else the result of set_value_convert.
+       PUGI__FN bool xml_text::set(unsigned int rhs)
+       {
+               xml_node_struct* target = _data_new();
+
+               if (!target) return false;
+
+               return impl::set_value_convert(target->value, target->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       // Stores a float as the text value, creating the text node if needed.
+       // Returns false when no text node could be obtained, else the result of set_value_convert.
+       PUGI__FN bool xml_text::set(float rhs)
+       {
+               xml_node_struct* target = _data_new();
+
+               if (!target) return false;
+
+               return impl::set_value_convert(target->value, target->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       // Stores a double as the text value, creating the text node if needed.
+       // Returns false when no text node could be obtained, else the result of set_value_convert.
+       PUGI__FN bool xml_text::set(double rhs)
+       {
+               xml_node_struct* target = _data_new();
+
+               if (!target) return false;
+
+               return impl::set_value_convert(target->value, target->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       // Stores a bool as the text value, creating the text node if needed.
+       // Returns false when no text node could be obtained, else the result of set_value_convert.
+       PUGI__FN bool xml_text::set(bool rhs)
+       {
+               xml_node_struct* target = _data_new();
+
+               if (!target) return false;
+
+               return impl::set_value_convert(target->value, target->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       // Stores a long long as the text value, creating the text node if needed.
+       // Returns false when no text node could be obtained, else the result of set_value_convert.
+       PUGI__FN bool xml_text::set(long long rhs)
+       {
+               xml_node_struct* target = _data_new();
+
+               if (!target) return false;
+
+               return impl::set_value_convert(target->value, target->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+
+       // Stores an unsigned long long as the text value, creating the text node if needed.
+       // Returns false when no text node could be obtained, else the result of set_value_convert.
+       PUGI__FN bool xml_text::set(unsigned long long rhs)
+       {
+               xml_node_struct* target = _data_new();
+
+               if (!target) return false;
+
+               return impl::set_value_convert(target->value, target->header, impl::xml_memory_page_value_allocated_mask, rhs);
+       }
+#endif
+
+       // Assignment sugar for set(const char_t*); the success flag of set() is discarded.
+       PUGI__FN xml_text& xml_text::operator=(const char_t* rhs)
+       {
+               this->set(rhs);
+
+               return *this;
+       }
+
+       // Assignment sugar for set(int); the success flag of set() is discarded.
+       PUGI__FN xml_text& xml_text::operator=(int rhs)
+       {
+               this->set(rhs);
+
+               return *this;
+       }
+
+       // Assignment sugar for set(unsigned int); the success flag of set() is discarded.
+       PUGI__FN xml_text& xml_text::operator=(unsigned int rhs)
+       {
+               this->set(rhs);
+
+               return *this;
+       }
+
+       // Assignment sugar for set(double); the success flag of set() is discarded.
+       PUGI__FN xml_text& xml_text::operator=(double rhs)
+       {
+               this->set(rhs);
+
+               return *this;
+       }
+
+       // Assignment sugar for set(float); the success flag of set() is discarded.
+       PUGI__FN xml_text& xml_text::operator=(float rhs)
+       {
+               this->set(rhs);
+
+               return *this;
+       }
+
+       // Assignment sugar for set(bool); the success flag of set() is discarded.
+       PUGI__FN xml_text& xml_text::operator=(bool rhs)
+       {
+               this->set(rhs);
+
+               return *this;
+       }
+
+#ifdef PUGIXML_HAS_LONG_LONG
+       // Assignment sugar for set(long long); the success flag of set() is discarded.
+       PUGI__FN xml_text& xml_text::operator=(long long rhs)
+       {
+               this->set(rhs);
+
+               return *this;
+       }
+
+       // Assignment sugar for set(unsigned long long); the success flag of set() is discarded.
+       PUGI__FN xml_text& xml_text::operator=(unsigned long long rhs)
+       {
+               this->set(rhs);
+
+               return *this;
+       }
+#endif
+
+       // Returns a node handle for the text-carrying node (an empty handle when there is none).
+       PUGI__FN xml_node xml_text::data() const
+       {
+               xml_node_struct* text_node = _data();
+
+               return xml_node(text_node);
+       }
+
+#ifdef __BORLANDC__
+       // Borland-only helper: logical AND of a text handle's truth value with a plain bool.
+       PUGI__FN bool operator&&(const xml_text& lhs, bool rhs)
+       {
+               const bool lhs_value = (bool)lhs;
+               return lhs_value && rhs;
+       }
+
+       // Borland-only helper: logical OR of a text handle's truth value with a plain bool.
+       PUGI__FN bool operator||(const xml_text& lhs, bool rhs)
+       {
+               const bool lhs_value = (bool)lhs;
+               return lhs_value || rhs;
+       }
+#endif
+
+       // Default iterator: both the wrapped node and the parent are default-constructed.
+       PUGI__FN xml_node_iterator::xml_node_iterator()
+       {
+       }
+
+       // Iterator positioned at node; the parent is taken from node.parent().
+       PUGI__FN xml_node_iterator::xml_node_iterator(const xml_node& node)
+               : _wrap(node)
+               , _parent(node.parent())
+       {
+       }
+
+       // Iterator built from raw node structures: current position and its parent.
+       PUGI__FN xml_node_iterator::xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent)
+               : _wrap(ref)
+               , _parent(parent)
+       {
+       }
+
+       // Two iterators are equal when both the current node and the parent match.
+       PUGI__FN bool xml_node_iterator::operator==(const xml_node_iterator& rhs) const
+       {
+               if (_wrap._root != rhs._wrap._root) return false;
+
+               return _parent._root == rhs._parent._root;
+       }
+       
+       // Iterators differ when either the current node or the parent differs.
+       PUGI__FN bool xml_node_iterator::operator!=(const xml_node_iterator& rhs) const
+       {
+               if (_wrap._root != rhs._wrap._root) return true;
+
+               return _parent._root != rhs._parent._root;
+       }
+
+       // Dereference; asserts that the iterator points at a node (is not past-the-end).
+       PUGI__FN xml_node& xml_node_iterator::operator*() const
+       {
+               assert(_wrap._root);
+
+               xml_node& current = _wrap;
+               return current;
+       }
+
+       // Member access; asserts that the iterator points at a node (is not past-the-end).
+       PUGI__FN xml_node* xml_node_iterator::operator->() const
+       {
+               assert(_wrap._root);
+
+               xml_node* current = const_cast<xml_node*>(&_wrap); // BCC32 workaround
+               return current;
+       }
+
+       // Pre-increment: moves to the next sibling; asserts the iterator is dereferenceable.
+       PUGI__FN const xml_node_iterator& xml_node_iterator::operator++()
+       {
+               assert(_wrap._root);
+
+               xml_node_struct* successor = _wrap._root->next_sibling;
+               _wrap._root = successor;
+
+               return *this;
+       }
+
+       // Post-increment: advances and returns a copy of the previous position.
+       PUGI__FN xml_node_iterator xml_node_iterator::operator++(int)
+       {
+               xml_node_iterator snapshot(*this);
+               operator++();
+               return snapshot;
+       }
+
+       // Pre-decrement: moves to the previous sibling, or to the parent's last child
+       // when the iterator currently holds no node (past-the-end).
+       PUGI__FN const xml_node_iterator& xml_node_iterator::operator--()
+       {
+               if (_wrap._root)
+                       _wrap = _wrap.previous_sibling();
+               else
+                       _wrap = _parent.last_child();
+
+               return *this;
+       }
+
+       // Post-decrement: steps back and returns a copy of the previous position.
+       PUGI__FN xml_node_iterator xml_node_iterator::operator--(int)
+       {
+               xml_node_iterator snapshot(*this);
+               operator--();
+               return snapshot;
+       }
+
+       // Default iterator: both the wrapped attribute and the parent are default-constructed.
+       PUGI__FN xml_attribute_iterator::xml_attribute_iterator()
+       {
+       }
+
+       // Iterator positioned at attr within the given parent node.
+       PUGI__FN xml_attribute_iterator::xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent)
+               : _wrap(attr)
+               , _parent(parent)
+       {
+       }
+
+       // Iterator built from raw structures: current attribute and its owning node.
+       PUGI__FN xml_attribute_iterator::xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent)
+               : _wrap(ref)
+               , _parent(parent)
+       {
+       }
+
+       // Two iterators are equal when both the current attribute and the parent match.
+       PUGI__FN bool xml_attribute_iterator::operator==(const xml_attribute_iterator& rhs) const
+       {
+               if (_wrap._attr != rhs._wrap._attr) return false;
+
+               return _parent._root == rhs._parent._root;
+       }
+       
+       // Iterators differ when either the current attribute or the parent differs.
+       PUGI__FN bool xml_attribute_iterator::operator!=(const xml_attribute_iterator& rhs) const
+       {
+               if (_wrap._attr != rhs._wrap._attr) return true;
+
+               return _parent._root != rhs._parent._root;
+       }
+
+       // Dereference; asserts that the iterator points at an attribute (is not past-the-end).
+       PUGI__FN xml_attribute& xml_attribute_iterator::operator*() const
+       {
+               assert(_wrap._attr);
+
+               xml_attribute& current = _wrap;
+               return current;
+       }
+
+       // Member access; asserts that the iterator points at an attribute (is not past-the-end).
+       PUGI__FN xml_attribute* xml_attribute_iterator::operator->() const
+       {
+               assert(_wrap._attr);
+
+               xml_attribute* current = const_cast<xml_attribute*>(&_wrap); // BCC32 workaround
+               return current;
+       }
+
+       // Pre-increment: moves to the next attribute; asserts the iterator is dereferenceable.
+       PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator++()
+       {
+               assert(_wrap._attr);
+
+               xml_attribute_struct* successor = _wrap._attr->next_attribute;
+               _wrap._attr = successor;
+
+               return *this;
+       }
+
+       // Post-increment: advances and returns a copy of the previous position.
+       PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator++(int)
+       {
+               xml_attribute_iterator snapshot(*this);
+               operator++();
+               return snapshot;
+       }
+
+       // Pre-decrement: moves to the previous attribute, or to the parent's last attribute
+       // when the iterator currently holds no attribute (past-the-end).
+       PUGI__FN const xml_attribute_iterator& xml_attribute_iterator::operator--()
+       {
+               if (_wrap._attr)
+                       _wrap = _wrap.previous_attribute();
+               else
+                       _wrap = _parent.last_attribute();
+
+               return *this;
+       }
+
+       // Post-decrement: steps back and returns a copy of the previous position.
+       PUGI__FN xml_attribute_iterator xml_attribute_iterator::operator--(int)
+       {
+               xml_attribute_iterator snapshot(*this);
+               operator--();
+               return snapshot;
+       }
+
+       // Default iterator: no node, no parent and a null name filter.
+       PUGI__FN xml_named_node_iterator::xml_named_node_iterator()
+               : _name(0)
+       {
+       }
+
+       // Iterator positioned at node, filtering siblings by name; the parent is node.parent().
+       // Only the name pointer is stored, not a copy of the string.
+       PUGI__FN xml_named_node_iterator::xml_named_node_iterator(const xml_node& node, const char_t* name)
+               : _wrap(node)
+               , _parent(node.parent())
+               , _name(name)
+       {
+       }
+
+       // Iterator built from raw node structures plus the name filter.
+       // Only the name pointer is stored, not a copy of the string.
+       PUGI__FN xml_named_node_iterator::xml_named_node_iterator(xml_node_struct* ref, xml_node_struct* parent, const char_t* name)
+               : _wrap(ref)
+               , _parent(parent)
+               , _name(name)
+       {
+       }
+
+       // Equality compares the current node and the parent; the name filter is not compared.
+       PUGI__FN bool xml_named_node_iterator::operator==(const xml_named_node_iterator& rhs) const
+       {
+               if (_wrap._root != rhs._wrap._root) return false;
+
+               return _parent._root == rhs._parent._root;
+       }
+
+       // Inequality compares the current node and the parent; the name filter is not compared.
+       PUGI__FN bool xml_named_node_iterator::operator!=(const xml_named_node_iterator& rhs) const
+       {
+               if (_wrap._root != rhs._wrap._root) return true;
+
+               return _parent._root != rhs._parent._root;
+       }
+
+       // Dereference; asserts that the iterator points at a node (is not past-the-end).
+       PUGI__FN xml_node& xml_named_node_iterator::operator*() const
+       {
+               assert(_wrap._root);
+
+               xml_node& current = _wrap;
+               return current;
+       }
+
+       // Member access; asserts that the iterator points at a node (is not past-the-end).
+       PUGI__FN xml_node* xml_named_node_iterator::operator->() const
+       {
+               assert(_wrap._root);
+
+               xml_node* current = const_cast<xml_node*>(&_wrap); // BCC32 workaround
+               return current;
+       }
+
+       // Pre-increment: moves to the next sibling selected by next_sibling(_name);
+       // asserts the iterator is dereferenceable.
+       PUGI__FN const xml_named_node_iterator& xml_named_node_iterator::operator++()
+       {
+               assert(_wrap._root);
+
+               xml_node successor = _wrap.next_sibling(_name);
+               _wrap = successor;
+
+               return *this;
+       }
+
+       // Post-increment: advances and returns a copy of the previous position.
+       PUGI__FN xml_named_node_iterator xml_named_node_iterator::operator++(int)
+       {
+               xml_named_node_iterator snapshot(*this);
+               operator++();
+               return snapshot;
+       }
+
+       PUGI__FN const xml_named_node_iterator& xml_named_node_iterator::operator--()
+       {
+               if (_wrap._root)
+                       _wrap = _wrap.previous_sibling(_name);
+               else
+               {
+                       _wrap = _parent.last_child();
+
+                       if (!impl::strequal(_wrap.name(), _name))
+                               _wrap = _wrap.previous_sibling(_name);
+               }
+
+               return *this;
+       }
+
+       // Post-decrement: steps back and returns a copy of the previous position.
+       PUGI__FN xml_named_node_iterator xml_named_node_iterator::operator--(int)
+       {
+               xml_named_node_iterator snapshot(*this);
+               operator--();
+               return snapshot;
+       }
+
+       // A default-constructed result reports status_internal_error at offset 0 with encoding_auto.
+       PUGI__FN xml_parse_result::xml_parse_result()
+               : status(status_internal_error)
+               , offset(0)
+               , encoding(encoding_auto)
+       {
+       }
+
+       // A parse result converts to true only when its status is status_ok.
+       PUGI__FN xml_parse_result::operator bool() const
+       {
+               const bool parsed_ok = (status == status_ok);
+
+               return parsed_ok;
+       }
+
+       PUGI__FN const char* xml_parse_result::description() const
+       {
+               switch (status)
+               {
+               case status_ok: return "No error";
+
+               case status_file_not_found: return "File was not found";
+               case status_io_error: return "Error reading from file/stream";
+               case status_out_of_memory: return "Could not allocate memory";
+               case status_internal_error: return "Internal error occurred";
+
+               case status_unrecognized_tag: return "Could not determine tag type";
+
+               case status_bad_pi: return "Error parsing document declaration/processing instruction";
+               case status_bad_comment: return "Error parsing comment";
+               case status_bad_cdata: return "Error parsing CDATA section";
+               case status_bad_doctype: return "Error parsing document type declaration";
+               case status_bad_pcdata: return "Error parsing PCDATA section";
+               case status_bad_start_element: return "Error parsing start element tag";
+               case status_bad_attribute: return "Error parsing element attribute";
+               case status_bad_end_element: return "Error parsing end element tag";
+               case status_end_element_mismatch: return "Start-end tags mismatch";
+
+               case status_append_invalid_root: return "Unable to append nodes: root is not an element or document";
+
+               case status_no_document_element: return "No document element found";
+
+               default: return "Unknown error";
+               }
+       }
+
+       // Constructs an empty document: no owned buffer, then create() sets up the root.
+       PUGI__FN xml_document::xml_document()
+               : _buffer(0)
+       {
+               create();
+       }
+
+       // Releases everything the document holds by delegating to destroy().
+       PUGI__FN xml_document::~xml_document()
+       {
+               destroy();
+       }
+
+       // Discards the current contents and re-initializes the document to the empty state.
+       PUGI__FN void xml_document::reset()
+       {
+               destroy();
+
+               create();
+       }
+
+       // Replaces the contents of this document with copies of proto's top-level children.
+       // NOTE(review): reset() runs before proto is read, so if proto is this very document
+       // the loop sees an already emptied tree - confirm callers never pass *this.
+       PUGI__FN void xml_document::reset(const xml_document& proto)
+       {
+               reset();
+
+               xml_node cur = proto.first_child();
+
+               while (cur)
+               {
+                       append_copy(cur);
+                       cur = cur.next_sibling();
+               }
+       }
+
+       PUGI__FN void xml_document::create()
+       {
+               assert(!_root);
+
+               // initialize sentinel page
+               PUGI__STATIC_ASSERT(sizeof(impl::xml_memory_page) + sizeof(impl::xml_document_struct) + impl::xml_memory_page_alignment - sizeof(void*) <= sizeof(_memory));
+
+               // align upwards to page boundary
+               void* page_memory = reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(_memory) + (impl::xml_memory_page_alignment - 1)) & ~(impl::xml_memory_page_alignment - 1));
+
+               // prepare page structure
+               impl::xml_memory_page* page = impl::xml_memory_page::construct(page_memory);
+               assert(page);
+
+               page->busy_size = impl::xml_memory_page_size;
+
+               // allocate new root
+               _root = new (reinterpret_cast<char*>(page) + sizeof(impl::xml_memory_page)) impl::xml_document_struct(page);
+               _root->prev_sibling_c = _root;
+
+               // setup sentinel page
+               page->allocator = static_cast<impl::xml_document_struct*>(_root);
+
+               // verify the document allocation
+               assert(reinterpret_cast<char*>(_root) + sizeof(impl::xml_document_struct) <= _memory + sizeof(_memory));
+       }
+
+       PUGI__FN void xml_document::destroy()
+       {
+               assert(_root);
+
+               // destroy static storage
+               if (_buffer)
+               {
+                       impl::xml_memory::deallocate(_buffer);
+                       _buffer = 0;
+               }
+
+               // destroy extra buffers (note: no need to destroy linked list nodes, they're allocated using document allocator)
+               for (impl::xml_extra_buffer* extra = static_cast<impl::xml_document_struct*>(_root)->extra_buffers; extra; extra = extra->next)
+               {
+                       if (extra->buffer) impl::xml_memory::deallocate(extra->buffer);
+               }
+
+               // destroy dynamic storage, leave sentinel page (it's in static memory)
+               impl::xml_memory_page* root_page = reinterpret_cast<impl::xml_memory_page*>(_root->header & impl::xml_memory_page_pointer_mask);
+               assert(root_page && !root_page->prev);
+               assert(reinterpret_cast<char*>(root_page) >= _memory && reinterpret_cast<char*>(root_page) < _memory + sizeof(_memory));
+
+               for (impl::xml_memory_page* page = root_page->next; page; )
+               {
+                       impl::xml_memory_page* next = page->next;
+
+                       impl::xml_allocator::deallocate_page(page);
+
+                       page = next;
+               }
+
+               _root = 0;
+       }
+
+#ifndef PUGIXML_NO_STL
+       // Clears the document, then parses it from a narrow-character stream.
+       PUGI__FN xml_parse_result xml_document::load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               xml_parse_result outcome = impl::load_stream_impl(*this, stream, options, encoding);
+               return outcome;
+       }
+
+       // Clears the document, then parses it from a wide-character stream using encoding_wchar.
+       PUGI__FN xml_parse_result xml_document::load(std::basic_istream<wchar_t, std::char_traits<wchar_t> >& stream, unsigned int options)
+       {
+               reset();
+
+               xml_parse_result outcome = impl::load_stream_impl(*this, stream, options, encoding_wchar);
+               return outcome;
+       }
+#endif
+
+       PUGI__FN xml_parse_result xml_document::load_string(const char_t* contents, unsigned int options)
+       {
+               // Force native encoding (skip autodetection)
+       #ifdef PUGIXML_WCHAR_MODE
+               xml_encoding encoding = encoding_wchar;
+       #else
+               xml_encoding encoding = encoding_utf8;
+       #endif
+
+               return load_buffer(contents, impl::strlength(contents) * sizeof(char_t), options, encoding);
+       }
+
+       // Plain forwarder to load_string() with the same arguments.
+       PUGI__FN xml_parse_result xml_document::load(const char_t* contents, unsigned int options)
+       {
+               xml_parse_result outcome = load_string(contents, options);
+               return outcome;
+       }
+
+       PUGI__FN xml_parse_result xml_document::load_file(const char* path_, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               FILE* file = fopen(path_, "rb");
+
+               return impl::load_file_impl(*this, file, options, encoding);
+       }
+
+       PUGI__FN xml_parse_result xml_document::load_file(const wchar_t* path_, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               FILE* file = impl::open_file_wide(path_, L"rb");
+
+               return impl::load_file_impl(*this, file, options, encoding);
+       }
+
+       PUGI__FN xml_parse_result xml_document::load_buffer(const void* contents, size_t size, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               return impl::load_buffer_impl(static_cast<impl::xml_document_struct*>(_root), _root, const_cast<void*>(contents), size, options, encoding, false, false, &_buffer);
+       }
+
+       PUGI__FN xml_parse_result xml_document::load_buffer_inplace(void* contents, size_t size, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               return impl::load_buffer_impl(static_cast<impl::xml_document_struct*>(_root), _root, contents, size, options, encoding, true, false, &_buffer);
+       }
+
+       PUGI__FN xml_parse_result xml_document::load_buffer_inplace_own(void* contents, size_t size, unsigned int options, xml_encoding encoding)
+       {
+               reset();
+
+               return impl::load_buffer_impl(static_cast<impl::xml_document_struct*>(_root), _root, contents, size, options, encoding, true, true, &_buffer);
+       }
+
+       PUGI__FN void xml_document::save(xml_writer& writer, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+       {
+               impl::xml_buffered_writer buffered_writer(writer, encoding);
+
+               if ((flags & format_write_bom) && encoding != encoding_latin1)
+               {
+                       // BOM always represents the codepoint U+FEFF, so just write it in native encoding
+               #ifdef PUGIXML_WCHAR_MODE
+                       unsigned int bom = 0xfeff;
+                       buffered_writer.write(static_cast<wchar_t>(bom));
+               #else
+                       buffered_writer.write('\xef', '\xbb', '\xbf');
+               #endif
+               }
+
+               if (!(flags & format_no_declaration) && !impl::has_declaration(_root))
+               {
+                       buffered_writer.write_string(PUGIXML_TEXT("<?xml version=\"1.0\""));
+                       if (encoding == encoding_latin1) buffered_writer.write_string(PUGIXML_TEXT(" encoding=\"ISO-8859-1\""));
+                       buffered_writer.write('?', '>');
+                       if (!(flags & format_raw)) buffered_writer.write('\n');
+               }
+
+               impl::node_output(buffered_writer, _root, indent, flags, 0);
+       }
+
+#ifndef PUGIXML_NO_STL
+       PUGI__FN void xml_document::save(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+       {
+               xml_writer_stream writer(stream);
+
+               save(writer, indent, flags, encoding);
+       }
+
+       // Saves the document to a wide-character stream; the encoding is always encoding_wchar.
+       PUGI__FN void xml_document::save(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent, unsigned int flags) const
+       {
+               xml_writer_stream stream_writer(stream);
+               save(stream_writer, indent, flags, encoding_wchar);
+       }
+#endif
+
+       PUGI__FN bool xml_document::save_file(const char* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+       {
+               FILE* file = fopen(path_, (flags & format_save_file_text) ? "w" : "wb");
+               return impl::save_file_impl(*this, file, indent, flags, encoding);
+       }
+
+       PUGI__FN bool xml_document::save_file(const wchar_t* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const
+       {
+               FILE* file = impl::open_file_wide(path_, (flags & format_save_file_text) ? L"w" : L"wb");
+               return impl::save_file_impl(*this, file, indent, flags, encoding);
+       }
+
+       PUGI__FN xml_node xml_document::document_element() const
+       {
+               assert(_root);
+
+               for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling)
+                       if (PUGI__NODETYPE(i) == node_element)
+                               return xml_node(i);
+
+               return xml_node();
+       }
+
+#ifndef PUGIXML_NO_STL
+       // Converts a null-terminated wide string to UTF-8; str must not be null (asserted).
+       PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const wchar_t* str)
+       {
+               assert(str);
+
+               const size_t length = impl::strlength_wide(str);
+
+               return impl::as_utf8_impl(str, length);
+       }
+
+       // Converts a wide std::basic_string to UTF-8 using its own data pointer and size.
+       PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t>& str)
+       {
+               const wchar_t* chars = str.c_str();
+
+               return impl::as_utf8_impl(chars, str.size());
+       }
+       
+       // Converts a null-terminated narrow string to a wide string via impl::as_wide_impl;
+       // str must not be null (asserted).
+       PUGI__FN std::basic_string<wchar_t> PUGIXML_FUNCTION as_wide(const char* str)
+       {
+               assert(str);
+
+               const size_t length = strlen(str);
+
+               return impl::as_wide_impl(str, length);
+       }
+       
+       // Converts a std::string to a wide string using its own data pointer and size.
+       PUGI__FN std::basic_string<wchar_t> PUGIXML_FUNCTION as_wide(const std::string& str)
+       {
+               const char* chars = str.c_str();
+
+               return impl::as_wide_impl(chars, str.size());
+       }
+#endif
+
+       // Installs the global allocation/deallocation function pair stored in impl::xml_memory.
+       // No validation is performed on either pointer.
+       PUGI__FN void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate)
+       {
+               impl::xml_memory::deallocate = deallocate;
+               impl::xml_memory::allocate = allocate;
+       }
+
+       // Returns the allocation function currently stored in impl::xml_memory.
+       PUGI__FN allocation_function PUGIXML_FUNCTION get_memory_allocation_function()
+       {
+               allocation_function current = impl::xml_memory::allocate;
+
+               return current;
+       }
+
+       // Returns the deallocation function currently stored in impl::xml_memory.
+       PUGI__FN deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function()
+       {
+               deallocation_function current = impl::xml_memory::deallocate;
+
+               return current;
+       }
+}
+
+#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC))
+namespace std
+{
+       // Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier)
+       PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_node_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+
+       PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_attribute_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+
+       PUGI__FN std::bidirectional_iterator_tag _Iter_cat(const pugi::xml_named_node_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+}
+#endif
+
+#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC)
+namespace std
+{
+       // Workarounds for (non-standard) iterator category detection
+       PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_node_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+
+       PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_attribute_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+
+       PUGI__FN std::bidirectional_iterator_tag __iterator_category(const pugi::xml_named_node_iterator&)
+       {
+               return std::bidirectional_iterator_tag();
+       }
+}
+#endif
+
+#ifndef PUGIXML_NO_XPATH
+// STL replacements
+PUGI__NS_BEGIN
+       struct equal_to
+       {
+               template <typename T> bool operator()(const T& lhs, const T& rhs) const
+               {
+                       return lhs == rhs;
+               }
+       };
+
+       struct not_equal_to
+       {
+               template <typename T> bool operator()(const T& lhs, const T& rhs) const
+               {
+                       return lhs != rhs;
+               }
+       };
+
+       struct less
+       {
+               template <typename T> bool operator()(const T& lhs, const T& rhs) const
+               {
+                       return lhs < rhs;
+               }
+       };
+
+       struct less_equal
+       {
+               template <typename T> bool operator()(const T& lhs, const T& rhs) const
+               {
+                       return lhs <= rhs;
+               }
+       };
+
+       // Exchanges two values through a temporary copy (copy construction plus two assignments).
+       template <typename T> void swap(T& lhs, T& rhs)
+       {
+               T held = lhs;
+
+               lhs = rhs;
+               rhs = held;
+       }
+
+       template <typename I, typename Pred> I min_element(I begin, I end, const Pred& pred)
+       {
+               I result = begin;
+
+               for (I it = begin + 1; it != end; ++it)
+                       if (pred(*it, *result))
+                               result = it;
+
+               return result;
+       }
+
+       // Reverses [begin, end) in place by swapping elements pairwise from both ends inwards.
+       template <typename I> void reverse(I begin, I end)
+       {
+               while (end - begin > 1)
+               {
+                       --end;
+                       swap(*begin, *end);
+                       ++begin;
+               }
+       }
+
+       template <typename I> I unique(I begin, I end)
+       {
+               // fast skip head
+               while (end - begin > 1 && *begin != *(begin + 1)) begin++;
+
+               if (begin == end) return begin;
+
+               // last written element
+               I write = begin++; 
+
+               // merge unique elements
+               while (begin != end)
+               {
+                       if (*begin != *write)
+                               *++write = *begin++;
+                       else
+                               begin++;
+               }
+
+               // past-the-end (write points to live element)
+               return write + 1;
+       }
+
+       // Copies [begin, end) so that the copy ends at target, walking from the last element
+       // to the first (which makes it usable for shifting a range to the right).
+       template <typename I> void copy_backwards(I begin, I end, I target)
+       {
+               while (end != begin)
+               {
+                       --end;
+                       --target;
+                       *target = *end;
+               }
+       }
+
+       template <typename I, typename Pred, typename T> void insertion_sort(I begin, I end, const Pred& pred, T*)
+       {
+               assert(begin != end);
+
+               for (I it = begin + 1; it != end; ++it)
+               {
+                       T val = *it;
+
+                       if (pred(val, *begin))
+                       {
+                               // move to front
+                               copy_backwards(begin, it, it + 1);
+                               *begin = val;
+                       }
+                       else
+                       {
+                               I hole = it;
+
+                               // move hole backwards
+                               while (pred(val, *(hole - 1)))
+                               {
+                                       *hole = *(hole - 1);
+                                       hole--;
+                               }
+
+                               // fill hole with element
+                               *hole = val;
+                       }
+               }
+       }
+
+       // std variant for elements with ==: three-way partition of [begin, end) around the value at middle; ordering uses pred, equality uses ==, and the bounds of the run equal to the pivot are written to *out_eqbeg / *out_eqend
+       template <typename I, typename Pred> void partition(I begin, I middle, I end, const Pred& pred, I* out_eqbeg, I* out_eqend)
+       {
+               I eqbeg = middle, eqend = middle + 1; // [eqbeg, eqend) is the run equal to the pivot; starts as the pivot alone
+
+               // expand equal range over neighbours that already compare == to the pivot
+               while (eqbeg != begin && *(eqbeg - 1) == *eqbeg) --eqbeg;
+               while (eqend != end && *eqend == *eqbeg) ++eqend;
+
+               // process outer elements: [ltend, eqbeg) and [eqend, gtbeg) have already been scanned
+               I ltend = eqbeg, gtbeg = eqend;
+
+               for (;;)
+               {
+                       // find the element from the right side that belongs to the left one
+                       for (; gtbeg != end; ++gtbeg)
+                               if (!pred(*eqbeg, *gtbeg))
+                               {
+                                       if (*gtbeg == *eqbeg) swap(*gtbeg, *eqend++); // equal to pivot: absorb into the equal run
+                                       else break;
+                               }
+
+                       // find the element from the left side that belongs to the right one
+                       for (; ltend != begin; --ltend)
+                               if (!pred(*(ltend - 1), *eqbeg))
+                               {
+                                       if (*eqbeg == *(ltend - 1)) swap(*(ltend - 1), *--eqbeg); // equal to pivot: absorb into the equal run
+                                       else break;
+                               }
+
+                       // scanned all elements
+                       if (gtbeg == end && ltend == begin)
+                       {
+                               *out_eqbeg = eqbeg;
+                               *out_eqend = eqend;
+                               return;
+                       }
+
+                       // make room for elements by moving equal area
+                       if (gtbeg == end) // only a misplaced left element remains: shift the equal run one slot to the left
+                       {
+                               if (--ltend != --eqbeg) swap(*ltend, *eqbeg);
+                               swap(*eqbeg, *--eqend);
+                       }
+                       else if (ltend == begin) // only a misplaced right element remains: shift the equal run one slot to the right
+                       {
+                               if (eqend != gtbeg) swap(*eqbeg, *eqend);
+                               ++eqend;
+                               swap(*gtbeg++, *eqbeg++);
+                       }
+                       else swap(*gtbeg++, *--ltend); // one misplaced element on each side: exchange them
+               }
+       }
+
+       template <typename I, typename Pred> void median3(I first, I middle, I last, const Pred& pred)
+       {
+               if (pred(*middle, *first)) swap(*middle, *first);
+               if (pred(*last, *middle)) swap(*last, *middle);
+               if (pred(*middle, *first)) swap(*middle, *first);
+       }
+
+       template <typename I, typename Pred> void median(I first, I middle, I last, const Pred& pred)
+       {
+               if (last - first <= 40)
+               {
+                       // median of three for small chunks
+                       median3(first, middle, last, pred);
+               }
+               else
+               {
+                       // median of nine
+                       size_t step = (last - first + 1) / 8;
+
+                       median3(first, first + step, first + 2 * step, pred);
+                       median3(middle - step, middle, middle + step, pred);
+                       median3(last - 2 * step, last - step, last, pred);
+                       median3(first + step, middle, last - step, pred);
+               }
+       }
+
+       template <typename I, typename Pred> void sort(I begin, I end, const Pred& pred)
+       {
+               // sort large chunks
+               while (end - begin > 32)
+               {
+                       // find median element
+                       I middle = begin + (end - begin) / 2;
+                       median(begin, middle, end - 1, pred);
+
+                       // partition in three chunks (< = >)
+                       I eqbeg, eqend;
+                       partition(begin, middle, end, pred, &eqbeg, &eqend);
+
+                       // loop on larger half
+                       if (eqbeg - begin > end - eqend)
+                       {
+                               sort(eqend, end, pred);
+                               end = eqbeg;
+                       }
+                       else
+                       {
+                               sort(begin, eqbeg, pred);
+                               begin = eqend;
+                       }
+               }
+
+               // insertion sort small chunk
+               if (begin != end) insertion_sort(begin, end, pred, &*begin);
+       }
+PUGI__NS_END
+
+// Allocator used for AST and evaluation stacks
+PUGI__NS_BEGIN
+       // One page of memory in the chain managed by xpath_allocator
+       struct xpath_memory_block
+       {       
+               // next page in the chain (0 terminates the chain)
+               xpath_memory_block* next;
+               // number of usable bytes in data; pages allocated by xpath_allocator may be larger than sizeof(data)
+               size_t capacity;
+
+               // page payload; the size can be overridden with PUGIXML_MEMORY_XPATH_PAGE_SIZE and defaults to 4096 bytes
+               char data[
+       #ifdef PUGIXML_MEMORY_XPATH_PAGE_SIZE
+                       PUGIXML_MEMORY_XPATH_PAGE_SIZE
+       #else
+                       4096
+       #endif
+               ];
+       };
+               
+       // Allocator that hands out memory sequentially from a singly linked chain of xpath_memory_block pages.
+       // _root is the most recently added page and _root_size is the number of bytes already used in it.
+       // Copies of the object are used as lightweight state snapshots (see revert).
+       class xpath_allocator
+       {
+               xpath_memory_block* _root;
+               size_t _root_size;
+
+       public:
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               // jump target used by allocate() to report out-of-memory when exceptions are disabled
+               jmp_buf* error_handler;
+       #endif
+
+               // Starts allocating from the given page, with root_size bytes of it considered in use
+               xpath_allocator(xpath_memory_block* root, size_t root_size = 0): _root(root), _root_size(root_size)
+               {
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       error_handler = 0;
+               #endif
+               }
+               
+               // Allocates size bytes (rounded up to a multiple of sizeof(void*)).
+               // Returns 0 if a new page is needed and xml_memory::allocate fails.
+               void* allocate_nothrow(size_t size)
+               {
+                       // align size so that we're able to store pointers in subsequent blocks
+                       size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+                       if (_root_size + size <= _root->capacity)
+                       {
+                               // the request fits into the current page
+                               void* buf = _root->data + _root_size;
+                               _root_size += size;
+                               return buf;
+                       }
+                       else
+                       {
+                               // make sure we have at least 1/4th of the page free after allocation to satisfy subsequent allocation requests
+                               size_t block_capacity_base = sizeof(_root->data);
+                               size_t block_capacity_req = size + block_capacity_base / 4;
+                               size_t block_capacity = (block_capacity_base > block_capacity_req) ? block_capacity_base : block_capacity_req;
+
+                               // the page header precedes the payload
+                               size_t block_size = block_capacity + offsetof(xpath_memory_block, data);
+
+                               xpath_memory_block* block = static_cast<xpath_memory_block*>(xml_memory::allocate(block_size));
+                               if (!block) return 0;
+                               
+                               // the new page becomes the head of the chain
+                               block->next = _root;
+                               block->capacity = block_capacity;
+                               
+                               _root = block;
+                               _root_size = size;
+                               
+                               return block->data;
+                       }
+               }
+
+               // Same as allocate_nothrow, but never returns 0: on failure it throws std::bad_alloc,
+               // or performs longjmp to *error_handler if PUGIXML_NO_EXCEPTIONS is defined
+               void* allocate(size_t size)
+               {
+                       void* result = allocate_nothrow(size);
+
+                       if (!result)
+                       {
+                       #ifdef PUGIXML_NO_EXCEPTIONS
+                               assert(error_handler);
+                               longjmp(*error_handler, 1);
+                       #else
+                               throw std::bad_alloc();
+                       #endif
+                       }
+
+                       return result;
+               }
+
+               // Resizes the most recently allocated object from old_size to new_size bytes and returns its
+               // (possibly different) address; ptr may be 0, in which case this is a plain allocation.
+               // The asserts require ptr to be the last object of the current page and new_size >= old_size
+               // whenever the data has to be moved.
+               void* reallocate(void* ptr, size_t old_size, size_t new_size)
+               {
+                       // align size so that we're able to store pointers in subsequent blocks
+                       old_size = (old_size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+                       new_size = (new_size + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
+
+                       // we can only reallocate the last object
+                       assert(ptr == 0 || static_cast<char*>(ptr) + old_size == _root->data + _root_size);
+
+                       // adjust root size so that we have not allocated the object at all
+                       bool only_object = (_root_size == old_size);
+
+                       if (ptr) _root_size -= old_size;
+
+                       // allocate a new version (this will obviously reuse the memory if possible)
+                       void* result = allocate(new_size);
+                       assert(result);
+
+                       // we have a new block
+                       if (result != ptr && ptr)
+                       {
+                               // copy old data
+                               assert(new_size >= old_size);
+                               memcpy(result, ptr, old_size);
+
+                               // free the previous page if it had no other objects
+                               if (only_object)
+                               {
+                                       assert(_root->data == result);
+                                       assert(_root->next);
+
+                                       // _root->next is the page that held the old object
+                                       xpath_memory_block* next = _root->next->next;
+
+                                       if (next)
+                                       {
+                                               // deallocate the whole page, unless it was the first one
+                                               xml_memory::deallocate(_root->next);
+                                               _root->next = next;
+                                       }
+                               }
+                       }
+
+                       return result;
+               }
+
+               // Rolls the allocator back to a previously copied state, deallocating every page added since
+               void revert(const xpath_allocator& state)
+               {
+                       // free all new pages
+                       xpath_memory_block* cur = _root;
+
+                       while (cur != state._root)
+                       {
+                               xpath_memory_block* next = cur->next;
+
+                               xml_memory::deallocate(cur);
+
+                               cur = next;
+                       }
+
+                       // restore state
+                       _root = state._root;
+                       _root_size = state._root_size;
+               }
+
+               // Deallocates every page that has a successor, i.e. all pages except the last one in the chain.
+               // The last page is left alone; in xpath_stack_data it is a member array element rather than a heap page.
+               // _root is not updated, so the allocator must not be used afterwards.
+               void release()
+               {
+                       xpath_memory_block* cur = _root;
+                       assert(cur);
+
+                       while (cur->next)
+                       {
+                               xpath_memory_block* next = cur->next;
+
+                               xml_memory::deallocate(cur);
+
+                               cur = next;
+                       }
+               }
+       };
+
+       struct xpath_allocator_capture
+       {
+               xpath_allocator_capture(xpath_allocator* alloc): _target(alloc), _state(*alloc)
+               {
+               }
+
+               ~xpath_allocator_capture()
+               {
+                       _target->revert(_state);
+               }
+
+               xpath_allocator* _target;
+               xpath_allocator _state;
+       };
+
+       // Pair of allocators that is passed around together; xpath_stack_data points them
+       // at its own result and temp allocators
+       struct xpath_stack
+       {
+               xpath_allocator* result;
+               xpath_allocator* temp;
+       };
+
+       struct xpath_stack_data
+       {
+               xpath_memory_block blocks[2];
+               xpath_allocator result;
+               xpath_allocator temp;
+               xpath_stack stack;
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               jmp_buf error_handler;
+       #endif
+
+               xpath_stack_data(): result(blocks + 0), temp(blocks + 1)
+               {
+                       blocks[0].next = blocks[1].next = 0;
+                       blocks[0].capacity = blocks[1].capacity = sizeof(blocks[0].data);
+
+                       stack.result = &result;
+                       stack.temp = &temp;
+
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       result.error_handler = temp.error_handler = &error_handler;
+               #endif
+               }
+
+               ~xpath_stack_data()
+               {
+                       result.release();
+                       temp.release();
+               }
+       };
+PUGI__NS_END
+
+// String class
+PUGI__NS_BEGIN
+       class xpath_string
+       {
+               const char_t* _buffer;
+               bool _uses_heap;
+               size_t _length_heap;
+
+               static char_t* duplicate_string(const char_t* string, size_t length, xpath_allocator* alloc)
+               {
+                       char_t* result = static_cast<char_t*>(alloc->allocate((length + 1) * sizeof(char_t)));
+                       assert(result);
+
+                       memcpy(result, string, length * sizeof(char_t));
+                       result[length] = 0;
+
+                       return result;
+               }
+
+               xpath_string(const char_t* buffer, bool uses_heap_, size_t length_heap): _buffer(buffer), _uses_heap(uses_heap_), _length_heap(length_heap)
+               {
+               }
+
+       public:
+               static xpath_string from_const(const char_t* str)
+               {
+                       return xpath_string(str, false, 0);
+               }
+
+               static xpath_string from_heap_preallocated(const char_t* begin, const char_t* end)
+               {
+                       assert(begin <= end && *end == 0);
+
+                       return xpath_string(begin, true, static_cast<size_t>(end - begin));
+               }
+
+               static xpath_string from_heap(const char_t* begin, const char_t* end, xpath_allocator* alloc)
+               {
+                       assert(begin <= end);
+
+                       size_t length = static_cast<size_t>(end - begin);
+
+                       return length == 0 ? xpath_string() : xpath_string(duplicate_string(begin, length, alloc), true, length);
+               }
+
+               xpath_string(): _buffer(PUGIXML_TEXT("")), _uses_heap(false), _length_heap(0)
+               {
+               }
+
+               void append(const xpath_string& o, xpath_allocator* alloc)
+               {
+                       // skip empty sources
+                       if (!*o._buffer) return;
+
+                       // fast append for constant empty target and constant source
+                       if (!*_buffer && !_uses_heap && !o._uses_heap)
+                       {
+                               _buffer = o._buffer;
+                       }
+                       else
+                       {
+                               // need to make heap copy
+                               size_t target_length = length();
+                               size_t source_length = o.length();
+                               size_t result_length = target_length + source_length;
+
+                               // allocate new buffer
+                               char_t* result = static_cast<char_t*>(alloc->reallocate(_uses_heap ? const_cast<char_t*>(_buffer) : 0, (target_length + 1) * sizeof(char_t), (result_length + 1) * sizeof(char_t)));
+                               assert(result);
+
+                               // append first string to the new buffer in case there was no reallocation
+                               if (!_uses_heap) memcpy(result, _buffer, target_length * sizeof(char_t));
+
+                               // append second string to the new buffer
+                               memcpy(result + target_length, o._buffer, source_length * sizeof(char_t));
+                               result[result_length] = 0;
+
+                               // finalize
+                               _buffer = result;
+                               _uses_heap = true;
+                               _length_heap = result_length;
+                       }
+               }
+
+               const char_t* c_str() const
+               {
+                       return _buffer;
+               }
+
+               size_t length() const
+               {
+                       return _uses_heap ? _length_heap : strlength(_buffer);
+               }
+               
+               char_t* data(xpath_allocator* alloc)
+               {
+                       // make private heap copy
+                       if (!_uses_heap)
+                       {
+                               size_t length_ = strlength(_buffer);
+
+                               _buffer = duplicate_string(_buffer, length_, alloc);
+                               _uses_heap = true;
+                               _length_heap = length_;
+                       }
+
+                       return const_cast<char_t*>(_buffer);
+               }
+
+               bool empty() const
+               {
+                       return *_buffer == 0;
+               }
+
+               bool operator==(const xpath_string& o) const
+               {
+                       return strequal(_buffer, o._buffer);
+               }
+
+               bool operator!=(const xpath_string& o) const
+               {
+                       return !strequal(_buffer, o._buffer);
+               }
+
+               bool uses_heap() const
+               {
+                       return _uses_heap;
+               }
+       };
+PUGI__NS_END
+
+PUGI__NS_BEGIN
+       PUGI__FN bool starts_with(const char_t* string, const char_t* pattern)
+       {
+               while (*pattern && *string == *pattern)
+               {
+                       string++;
+                       pattern++;
+               }
+
+               return *pattern == 0;
+       }
+
+       // Finds the first occurrence of c in s using the C library routine that matches char_t
+       PUGI__FN const char_t* find_char(const char_t* s, char_t c)
+       {
+       #ifdef PUGIXML_WCHAR_MODE
+               const char_t* found = wcschr(s, c);
+       #else
+               const char_t* found = strchr(s, c);
+       #endif
+
+               return found;
+       }
+
+       PUGI__FN const char_t* find_substring(const char_t* s, const char_t* p)
+       {
+       #ifdef PUGIXML_WCHAR_MODE
+               // MSVC6 wcsstr bug workaround (if s is empty it always returns 0)
+               return (*p == 0) ? s : wcsstr(s, p);
+       #else
+               return strstr(s, p);
+       #endif
+       }
+
+       // Converts symbol to lower case, if it is an ASCII one
+       PUGI__FN char_t tolower_ascii(char_t ch)
+       {
+               // distance from 'A'; anything outside 'A'..'Z' wraps around to a value of 26 or more
+               unsigned int offset = static_cast<unsigned int>(ch - 'A');
+
+               if (offset < 26)
+               {
+                       // setting the 0x20 bit turns an upper case ASCII letter into the lower case one
+                       return static_cast<char_t>(ch | ' ');
+               }
+
+               return ch;
+       }
+
+       PUGI__FN xpath_string string_value(const xpath_node& na, xpath_allocator* alloc)
+       {
+               if (na.attribute())
+                       return xpath_string::from_const(na.attribute().value());
+               else
+               {
+                       xml_node n = na.node();
+
+                       switch (n.type())
+                       {
+                       case node_pcdata:
+                       case node_cdata:
+                       case node_comment:
+                       case node_pi:
+                               return xpath_string::from_const(n.value());
+                       
+                       case node_document:
+                       case node_element:
+                       {
+                               xpath_string result;
+
+                               xml_node cur = n.first_child();
+                               
+                               while (cur && cur != n)
+                               {
+                                       if (cur.type() == node_pcdata || cur.type() == node_cdata)
+                                               result.append(xpath_string::from_const(cur.value()), alloc);
+
+                                       if (cur.first_child())
+                                               cur = cur.first_child();
+                                       else if (cur.next_sibling())
+                                               cur = cur.next_sibling();
+                                       else
+                                       {
+                                               while (!cur.next_sibling() && cur != n)
+                                                       cur = cur.parent();
+
+                                               if (cur != n) cur = cur.next_sibling();
+                                       }
+                               }
+                               
+                               return result;
+                       }
+                       
+                       default:
+                               return xpath_string();
+                       }
+               }
+       }
+       
+       PUGI__FN bool node_is_before_sibling(xml_node_struct* ln, xml_node_struct* rn)
+       {
+               assert(ln->parent == rn->parent);
+
+               // there is no common ancestor (the shared parent is null), nodes are from different documents
+               if (!ln->parent) return ln < rn;
+
+               // determine sibling order
+               xml_node_struct* ls = ln;
+               xml_node_struct* rs = rn;
+
+               while (ls && rs)
+               {
+                       if (ls == rn) return true;
+                       if (rs == ln) return false;
+
+                       ls = ls->next_sibling;
+                       rs = rs->next_sibling;
+               }
+
+               // if rn sibling chain ended ln must be before rn
+               return !rs;
+       }
+       
+       PUGI__FN bool node_is_before(xml_node_struct* ln, xml_node_struct* rn)
+       {
+               // find common ancestor at the same depth, if any
+               xml_node_struct* lp = ln;
+               xml_node_struct* rp = rn;
+
+               while (lp && rp && lp->parent != rp->parent)
+               {
+                       lp = lp->parent;
+                       rp = rp->parent;
+               }
+
+               // parents are the same!
+               if (lp && rp) return node_is_before_sibling(lp, rp);
+
+               // nodes are at different depths, need to normalize heights
+               bool left_higher = !lp;
+
+               while (lp)
+               {
+                       lp = lp->parent;
+                       ln = ln->parent;
+               }
+
+               while (rp)
+               {
+                       rp = rp->parent;
+                       rn = rn->parent;
+               }
+
+               // one node is the ancestor of the other
+               if (ln == rn) return left_higher;
+
+               // find common ancestor... again
+               while (ln->parent != rn->parent)
+               {
+                       ln = ln->parent;
+                       rn = rn->parent;
+               }
+
+               return node_is_before_sibling(ln, rn);
+       }
+
+       // Returns true if parent is non-null and is either node itself or one of its ancestors
+       PUGI__FN bool node_is_ancestor(xml_node_struct* parent, xml_node_struct* node)
+       {
+               // follow the parent chain of node; a null parent can never match a live node
+               for (; node; node = node->parent)
+               {
+                       if (node == parent) return true;
+               }
+
+               return false;
+       }
+
+       PUGI__FN const void* document_buffer_order(const xpath_node& xnode)
+       {
+               xml_node_struct* node = xnode.node().internal_object();
+
+               if (node)
+               {
+                       if ((get_document(node).header & xml_memory_page_contents_shared_mask) == 0)
+                       {
+                               if (node->name && (node->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0) return node->name;
+                               if (node->value && (node->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0) return node->value;
+                       }
+
+                       return 0;
+               }
+
+               xml_attribute_struct* attr = xnode.attribute().internal_object();
+
+               if (attr)
+               {
+                       if ((get_document(attr).header & xml_memory_page_contents_shared_mask) == 0)
+                       {
+                               if ((attr->header & impl::xml_memory_page_name_allocated_or_shared_mask) == 0) return attr->name;
+                               if ((attr->header & impl::xml_memory_page_value_allocated_or_shared_mask) == 0) return attr->value;
+                       }
+
+                       return 0;
+               }
+
+               return 0;
+       }
+       
+       // Ordering predicate that returns true if lhs comes before rhs in document order
+       struct document_order_comparator
+       {
+               bool operator()(const xpath_node& lhs, const xpath_node& rhs) const
+               {
+                       // optimized document order based check
+                       // (only usable when document_buffer_order provides a key for both sides)
+                       const void* lo = document_buffer_order(lhs);
+                       const void* ro = document_buffer_order(rhs);
+
+                       if (lo && ro) return lo < ro;
+
+                       // slow comparison
+                       xml_node ln = lhs.node(), rn = rhs.node();
+
+                       // compare attributes
+                       if (lhs.attribute() && rhs.attribute())
+                       {
+                               // shared parent
+                               if (lhs.parent() == rhs.parent())
+                               {
+                                       // determine sibling order
+                                       // (rhs is reachable from lhs in the attribute list; note that this also holds when both are the same attribute)
+                                       for (xml_attribute a = lhs.attribute(); a; a = a.next_attribute())
+                                               if (a == rhs.attribute())
+                                                       return true;
+                                       
+                                       return false;
+                               }
+                               
+                               // compare attribute parents
+                               ln = lhs.parent();
+                               rn = rhs.parent();
+                       }
+                       else if (lhs.attribute())
+                       {
+                               // attributes go after the parent element
+                               if (lhs.parent() == rhs.node()) return false;
+                               
+                               // otherwise the attribute is ordered like its parent element
+                               ln = lhs.parent();
+                       }
+                       else if (rhs.attribute())
+                       {
+                               // attributes go after the parent element
+                               if (rhs.parent() == lhs.node()) return true;
+                               
+                               // otherwise the attribute is ordered like its parent element
+                               rn = rhs.parent();
+                       }
+
+                       // the same node is never before itself
+                       if (ln == rn) return false;
+
+                       // an empty node handle on either side: fall back to the handle comparison
+                       if (!ln || !rn) return ln < rn;
+                       
+                       return node_is_before(ln.internal_object(), rn.internal_object());
+               }
+       };
+
+       struct duplicate_comparator
+       {
+               bool operator()(const xpath_node& lhs, const xpath_node& rhs) const
+               {
+                       if (lhs.attribute()) return rhs.attribute() ? lhs.attribute() < rhs.attribute() : true;
+                       else return rhs.attribute() ? false : lhs.node() < rhs.node();
+               }
+       };
+       
+       // Returns a NaN value
+       PUGI__FN double gen_nan()
+       {
+       #if defined(__STDC_IEC_559__) || ((FLT_RADIX - 0 == 2) && (FLT_MAX_EXP - 0 == 128) && (FLT_MANT_DIG - 0 == 24))
+               // float matches the IEEE single precision layout: build the NaN from its bit pattern;
+               // the array size turns into -1 (a compile error) if float and uint32_t differ in size
+               union { float f; uint32_t i; } u[sizeof(float) == sizeof(uint32_t) ? 1 : -1];
+               u[0].i = 0x7fc00000;
+               return u[0].f;
+       #else
+               // fallback
+               // (0/0 computed at run time; volatile keeps the division from being folded by the compiler)
+               const volatile double zero = 0.0;
+               return zero / zero;
+       #endif
+       }
+       
+       // Returns true if value is NaN, using the CRT classification routine where one is available
+       PUGI__FN bool is_nan(double value)
+       {
+       #if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__)
+               return !!_isnan(value);
+       #elif defined(fpclassify) && defined(FP_NAN)
+               return fpclassify(value) == FP_NAN;
+       #else
+               // fallback
+               // (NaN is the only value that compares unequal to itself; volatile keeps the comparison from being optimized away)
+               const volatile double v = value;
+               return v != v;
+       #endif
+       }
+       
+       // Returns the constant string for values that need special treatment ("NaN", "Infinity",
+       // "-Infinity" and "0" for zero), or 0 if value is an ordinary number that has to be
+       // converted digit by digit
+       PUGI__FN const char_t* convert_number_to_string_special(double value)
+       {
+       #if defined(PUGI__MSVC_CRT_VERSION) || defined(__BORLANDC__)
+               if (_finite(value)) return (value == 0) ? PUGIXML_TEXT("0") : 0;
+               if (_isnan(value)) return PUGIXML_TEXT("NaN");
+               // neither finite nor NaN: an infinity, distinguished by sign
+               return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");
+       #elif defined(fpclassify) && defined(FP_NAN) && defined(FP_INFINITE) && defined(FP_ZERO)
+               switch (fpclassify(value))
+               {
+               case FP_NAN:
+                       return PUGIXML_TEXT("NaN");
+
+               case FP_INFINITE:
+                       return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");
+
+               case FP_ZERO:
+                       return PUGIXML_TEXT("0");
+
+               default:
+                       return 0;
+               }
+       #else
+               // fallback
+               const volatile double v = value;
+
+               if (v == 0) return PUGIXML_TEXT("0");
+               if (v != v) return PUGIXML_TEXT("NaN");
+               // zero was handled above, so a value that equals its own double is treated as an infinity
+               if (v * 2 == v) return value > 0 ? PUGIXML_TEXT("Infinity") : PUGIXML_TEXT("-Infinity");
+               return 0;
+       #endif
+       }
+       
+       // A number converts to true unless it is zero or NaN
+       PUGI__FN bool convert_number_to_boolean(double value)
+       {
+               if (value == 0) return false;
+
+               return !is_nan(value);
+       }
+       
+       // Cuts trailing '0' characters off [begin, end) by writing the terminator right after
+       // the last character that is kept (at end itself if nothing is removed)
+       PUGI__FN void truncate_zeros(char* begin, char* end)
+       {
+               char* tail = end;
+
+               // step back over the run of zeros at the end of the range
+               while (tail != begin && tail[-1] == '0')
+               {
+                       --tail;
+               }
+
+               *tail = 0;
+       }
+
+       // gets mantissa digits in the form of 0.xxxxx with 0. implied and the exponent
+       // Two implementations with the same contract follow. Both write the decimal digits of value
+       // (without sign, decimal point or trailing zeros) into buffer, store a pointer to the first
+       // digit in *out_mantissa and the decimal exponent in *out_exponent.
+#if defined(PUGI__MSVC_CRT_VERSION) && PUGI__MSVC_CRT_VERSION >= 1400 && !defined(_WIN32_WCE)
+       // variant based on _ecvt_s
+       PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent)
+       {
+               // get base values
+               // (sign is required by the call but not used; callers derive the sign from value)
+               int sign, exponent;
+               _ecvt_s(buffer, buffer_size, value, DBL_DIG + 1, &exponent, &sign);
+
+               // truncate redundant zeros
+               truncate_zeros(buffer, buffer + strlen(buffer));
+
+               // fill results
+               *out_mantissa = buffer;
+               *out_exponent = exponent;
+       }
+#else
+       // portable variant based on sprintf with the %e format
+       PUGI__FN void convert_number_to_mantissa_exponent(double value, char* buffer, size_t buffer_size, char** out_mantissa, int* out_exponent)
+       {
+               // get a scientific notation value with IEEE DBL_DIG decimals
+               // NOTE(review): the size check below runs only after sprintf has already written to buffer,
+               // so the caller has to supply a buffer that is large enough for any double
+               sprintf(buffer, "%.*e", DBL_DIG, value);
+               assert(strlen(buffer) < buffer_size);
+               // references buffer_size so that it counts as used when assert is compiled out
+               (void)!buffer_size;
+
+               // get the exponent (possibly negative)
+               char* exponent_string = strchr(buffer, 'e');
+               assert(exponent_string);
+
+               int exponent = atoi(exponent_string + 1);
+
+               // extract mantissa string: skip sign
+               char* mantissa = buffer[0] == '-' ? buffer + 1 : buffer;
+               assert(mantissa[0] != '0' && mantissa[1] == '.');
+
+               // divide mantissa by 10 to eliminate integer part
+               // (the leading digit overwrites the decimal point, so the digits become contiguous starting one character later)
+               mantissa[1] = mantissa[0];
+               mantissa++;
+               exponent++;
+
+               // remove extra mantissa digits and zero-terminate mantissa
+               truncate_zeros(mantissa, exponent_string);
+
+               // fill results
+               *out_mantissa = mantissa;
+               *out_exponent = exponent;
+       }
+#endif
+
+       // Converts value to its string form: the special strings for NaN, infinities and zero,
+       // otherwise plain decimal notation (optional '-', integer digits, optional '.' and fraction)
+       // without an exponent. The result buffer is taken from alloc.
+       PUGI__FN xpath_string convert_number_to_string(double value, xpath_allocator* alloc)
+       {
+               // try special number conversion
+               const char_t* special = convert_number_to_string_special(value);
+               if (special) return xpath_string::from_const(special);
+
+               // get mantissa + exponent form
+               char mantissa_buffer[32];
+
+               char* mantissa;
+               int exponent;
+               convert_number_to_mantissa_exponent(value, mantissa_buffer, sizeof(mantissa_buffer), &mantissa, &exponent);
+
+               // allocate a buffer of suitable length for the number
+               // (digits plus |exponent| padding zeros; the extra 4 presumably covers sign, leading zero, decimal point and terminator)
+               size_t result_size = strlen(mantissa_buffer) + (exponent > 0 ? exponent : -exponent) + 4;
+               char_t* result = static_cast<char_t*>(alloc->allocate(sizeof(char_t) * result_size));
+               assert(result);
+
+               // make the number!
+               char_t* s = result;
+
+               // sign
+               if (value < 0) *s++ = '-';
+
+               // integer part
+               if (exponent <= 0)
+               {
+                       *s++ = '0';
+               }
+               else
+               {
+                       // emit exponent digits, padding with zeros once the mantissa is exhausted
+                       while (exponent > 0)
+                       {
+                               assert(*mantissa == 0 || static_cast<unsigned int>(static_cast<unsigned int>(*mantissa) - '0') <= 9);
+                               *s++ = *mantissa ? *mantissa++ : '0';
+                               exponent--;
+                       }
+               }
+
+               // fractional part
+               if (*mantissa)
+               {
+                       // decimal point
+                       *s++ = '.';
+
+                       // extra zeroes from negative exponent
+                       while (exponent < 0)
+                       {
+                               *s++ = '0';
+                               exponent++;
+                       }
+
+                       // extra mantissa digits
+                       while (*mantissa)
+                       {
+                               assert(static_cast<unsigned int>(*mantissa - '0') <= 9);
+                               *s++ = *mantissa++;
+                       }
+               }
+
+               // zero-terminate
+               assert(s < result + result_size);
+               *s = 0;
+
+               return xpath_string::from_heap_preallocated(result, s);
+       }
+       
+       PUGI__FN bool check_string_to_number_format(const char_t* string)
+       {
+               // parse leading whitespace
+               while (PUGI__IS_CHARTYPE(*string, ct_space)) ++string;
+
+               // parse sign
+               if (*string == '-') ++string;
+
+               if (!*string) return false;
+
+               // if there is no integer part, there should be a decimal part with at least one digit
+               if (!PUGI__IS_CHARTYPEX(string[0], ctx_digit) && (string[0] != '.' || !PUGI__IS_CHARTYPEX(string[1], ctx_digit))) return false;
+
+               // parse integer part
+               while (PUGI__IS_CHARTYPEX(*string, ctx_digit)) ++string;
+
+               // parse decimal part
+               if (*string == '.')
+               {
+                       ++string;
+
+                       while (PUGI__IS_CHARTYPEX(*string, ctx_digit)) ++string;
+               }
+
+               // parse trailing whitespace
+               while (PUGI__IS_CHARTYPE(*string, ct_space)) ++string;
+
+               return *string == 0;
+       }
+
+       PUGI__FN double convert_string_to_number(const char_t* string)
+       {
+               // check string format
+               if (!check_string_to_number_format(string)) return gen_nan();
+
+               // parse string
+       #ifdef PUGIXML_WCHAR_MODE
+               return wcstod(string, 0);
+       #else
+               return atof(string);
+       #endif
+       }
+
+       PUGI__FN bool convert_string_to_number_scratch(char_t (&buffer)[32], const char_t* begin, const char_t* end, double* out_result)
+       {
+               size_t length = static_cast<size_t>(end - begin);
+               char_t* scratch = buffer;
+
+               if (length >= sizeof(buffer) / sizeof(buffer[0]))
+               {
+                       // need to make dummy on-heap copy
+                       scratch = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+                       if (!scratch) return false;
+               }
+
+               // copy string to zero-terminated buffer and perform conversion
+               memcpy(scratch, begin, length * sizeof(char_t));
+               scratch[length] = 0;
+
+               *out_result = convert_string_to_number(scratch);
+
+               // free dummy buffer
+               if (scratch != buffer) xml_memory::deallocate(scratch);
+
+               return true;
+       }
+       
+       // Rounds to the nearest integer by shifting up half a unit and taking the floor
+       PUGI__FN double round_nearest(double value)
+       {
+               double shifted = value + 0.5;
+
+               return floor(shifted);
+       }
+
+       PUGI__FN double round_nearest_nzero(double value)
+       {
+               // same as round_nearest, but returns -0 for [-0.5, -0]
+               // ceil is used to differentiate between +0 and -0 (we return -0 for [-0.5, -0] and +0 for +0)
+               return (value >= -0.5 && value <= 0) ? ceil(value) : floor(value + 0.5);
+       }
+       
+       // Returns the attribute's name when the xpath_node holds an attribute, the node's name otherwise
+       PUGI__FN const char_t* qualified_name(const xpath_node& node)
+       {
+               if (node.attribute())
+                       return node.attribute().name();
+
+               return node.node().name();
+       }
+       
+       PUGI__FN const char_t* local_name(const xpath_node& node)
+       {
+               const char_t* name = qualified_name(node);
+               const char_t* p = find_char(name, ':');
+               
+               return p ? p + 1 : name;
+       }
+
+       struct namespace_uri_predicate
+       {
+               const char_t* prefix;
+               size_t prefix_length;
+
+               namespace_uri_predicate(const char_t* name)
+               {
+                       const char_t* pos = find_char(name, ':');
+
+                       prefix = pos ? name : 0;
+                       prefix_length = pos ? static_cast<size_t>(pos - name) : 0;
+               }
+
+               bool operator()(xml_attribute a) const
+               {
+                       const char_t* name = a.name();
+
+                       if (!starts_with(name, PUGIXML_TEXT("xmlns"))) return false;
+
+                       return prefix ? name[5] == ':' && strequalrange(name + 6, prefix, prefix_length) : name[5] == 0;
+               }
+       };
+
+       PUGI__FN const char_t* namespace_uri(xml_node node)
+       {
+               namespace_uri_predicate pred = node.name();
+               
+               xml_node p = node;
+               
+               while (p)
+               {
+                       xml_attribute a = p.find_attribute(pred);
+                       
+                       if (a) return a.value();
+                       
+                       p = p.parent();
+               }
+               
+               return PUGIXML_TEXT("");
+       }
+
+       PUGI__FN const char_t* namespace_uri(xml_attribute attr, xml_node parent)
+       {
+               namespace_uri_predicate pred = attr.name();
+               
+               // Default namespace does not apply to attributes
+               if (!pred.prefix) return PUGIXML_TEXT("");
+               
+               xml_node p = parent;
+               
+               while (p)
+               {
+                       xml_attribute a = p.find_attribute(pred);
+                       
+                       if (a) return a.value();
+                       
+                       p = p.parent();
+               }
+               
+               return PUGIXML_TEXT("");
+       }
+
+       // Dispatches to the attribute or the node overload depending on what the xpath_node holds
+       PUGI__FN const char_t* namespace_uri(const xpath_node& node)
+       {
+               if (node.attribute())
+                       return namespace_uri(node.attribute(), node.parent());
+
+               return namespace_uri(node.node());
+       }
+
+       PUGI__FN void normalize_space(char_t* buffer)
+       {
+               char_t* write = buffer;
+
+               for (char_t* it = buffer; *it; )
+               {
+                       char_t ch = *it++;
+
+                       if (PUGI__IS_CHARTYPE(ch, ct_space))
+                       {
+                               // replace whitespace sequence with single space
+                               while (PUGI__IS_CHARTYPE(*it, ct_space)) it++;
+
+                               // avoid leading spaces
+                               if (write != buffer) *write++ = ' ';
+                       }
+                       else *write++ = ch;
+               }
+
+               // remove trailing space
+               if (write != buffer && PUGI__IS_CHARTYPE(write[-1], ct_space)) write--;
+
+               // zero-terminate
+               *write = 0;
+       }
+
+       PUGI__FN void translate(char_t* buffer, const char_t* from, const char_t* to, size_t to_length)
+       {
+               char_t* write = buffer;
+
+               while (*buffer)
+               {
+                       PUGI__DMC_VOLATILE char_t ch = *buffer++;
+
+                       const char_t* pos = find_char(from, ch);
+
+                       if (!pos)
+                               *write++ = ch; // do not process
+                       else if (static_cast<size_t>(pos - from) < to_length)
+                               *write++ = to[pos - from]; // replace
+               }
+
+               // zero-terminate
+               *write = 0;
+       }
+
+       PUGI__FN unsigned char* translate_table_generate(xpath_allocator* alloc, const char_t* from, const char_t* to)
+       {
+               unsigned char table[128] = {0};
+
+               while (*from)
+               {
+                       unsigned int fc = static_cast<unsigned int>(*from);
+                       unsigned int tc = static_cast<unsigned int>(*to);
+
+                       if (fc >= 128 || tc >= 128)
+                               return 0;
+
+                       // code=128 means "skip character"
+                       if (!table[fc])
+                               table[fc] = static_cast<unsigned char>(tc ? tc : 128);
+
+                       from++;
+                       if (tc) to++;
+               }
+
+               for (int i = 0; i < 128; ++i)
+                       if (!table[i])
+                               table[i] = static_cast<unsigned char>(i);
+
+               void* result = alloc->allocate_nothrow(sizeof(table));
+
+               if (result)
+               {
+                       memcpy(result, table, sizeof(table));
+               }
+
+               return static_cast<unsigned char*>(result);
+       }
+
+       PUGI__FN void translate_table(char_t* buffer, const unsigned char* table)
+       {
+               char_t* write = buffer;
+
+               while (*buffer)
+               {
+                       char_t ch = *buffer++;
+                       unsigned int index = static_cast<unsigned int>(ch);
+
+                       if (index < 128)
+                       {
+                               unsigned char code = table[index];
+
+                               // code=128 means "skip character" (table size is 128 so 128 can be a special value)
+                               // this code skips these characters without extra branches
+                               *write = static_cast<char_t>(code);
+                               write += 1 - (code >> 7);
+                       }
+                       else
+                       {
+                               *write++ = ch;
+                       }
+               }
+
+               // zero-terminate
+               *write = 0;
+       }
+
+       // Returns false for namespace declarations ("xmlns" or "xmlns:..."), true for any other name
+       inline bool is_xpath_attribute(const char_t* name)
+       {
+               if (!starts_with(name, PUGIXML_TEXT("xmlns"))) return true;
+
+               // starts with "xmlns": a declaration only if the name ends here or continues with ':'
+               return name[5] != 0 && name[5] != ':';
+       }
+
+       struct xpath_variable_boolean: xpath_variable
+       {
+               xpath_variable_boolean(): value(false)
+               {
+               }
+
+               bool value;
+               char_t name[1];
+       };
+
+       struct xpath_variable_number: xpath_variable
+       {
+               xpath_variable_number(): value(0)
+               {
+               }
+
+               double value;
+               char_t name[1];
+       };
+
+       struct xpath_variable_string: xpath_variable
+       {
+               xpath_variable_string(): value(0)
+               {
+               }
+
+               ~xpath_variable_string()
+               {
+                       if (value) xml_memory::deallocate(value);
+               }
+
+               char_t* value;
+               char_t name[1];
+       };
+
+       // Node-set variable. 'name' is only the first character of the name storage:
+       // new_xpath_variable allocates extra room after the object and copies the full name there.
+       struct xpath_variable_node_set: public xpath_variable
+       {
+               xpath_node_set value;
+               char_t name[1];
+       };
+
+       static const xpath_node_set dummy_node_set; // default-constructed node set; NOTE(review): its users are outside this view - presumably a fallback result, confirm
+
+       PUGI__FN unsigned int hash_string(const char_t* str)
+       {
+               // Jenkins one-at-a-time hash (http://en.wikipedia.org/wiki/Jenkins_hash_function#one-at-a-time)
+               unsigned int result = 0;
+
+               while (*str)
+               {
+                       result += static_cast<unsigned int>(*str++);
+                       result += result << 10;
+                       result ^= result >> 6;
+               }
+       
+               result += result << 3;
+               result ^= result >> 11;
+               result += result << 15;
+       
+               return result;
+       }
+
+       template <typename T> PUGI__FN T* new_xpath_variable(const char_t* name)
+       {
+               size_t length = strlength(name);
+               if (length == 0) return 0; // empty variable names are invalid
+
+               // $$ we can't use offsetof(T, name) because T is non-POD, so we just allocate additional length characters
+               void* memory = xml_memory::allocate(sizeof(T) + length * sizeof(char_t));
+               if (!memory) return 0;
+
+               T* result = new (memory) T();
+
+               memcpy(result->name, name, (length + 1) * sizeof(char_t));
+
+               return result;
+       }
+
+       PUGI__FN xpath_variable* new_xpath_variable(xpath_value_type type, const char_t* name)
+       {
+               switch (type)
+               {
+               case xpath_type_node_set:
+                       return new_xpath_variable<xpath_variable_node_set>(name);
+
+               case xpath_type_number:
+                       return new_xpath_variable<xpath_variable_number>(name);
+
+               case xpath_type_string:
+                       return new_xpath_variable<xpath_variable_string>(name);
+
+               case xpath_type_boolean:
+                       return new_xpath_variable<xpath_variable_boolean>(name);
+
+               default:
+                       return 0;
+               }
+       }
+
+       template <typename T> PUGI__FN void delete_xpath_variable(T* var) // destroys a variable of concrete type T and frees its memory
+       {
+               var->~T(); // explicit destructor call, the counterpart of the placement new in new_xpath_variable
+               xml_memory::deallocate(var); // give the raw block back to xml_memory
+       }
+
+       PUGI__FN void delete_xpath_variable(xpath_value_type type, xpath_variable* var)
+       {
+               switch (type)
+               {
+               case xpath_type_node_set:
+                       delete_xpath_variable(static_cast<xpath_variable_node_set*>(var));
+                       break;
+
+               case xpath_type_number:
+                       delete_xpath_variable(static_cast<xpath_variable_number*>(var));
+                       break;
+
+               case xpath_type_string:
+                       delete_xpath_variable(static_cast<xpath_variable_string*>(var));
+                       break;
+
+               case xpath_type_boolean:
+                       delete_xpath_variable(static_cast<xpath_variable_boolean*>(var));
+                       break;
+
+               default:
+                       assert(!"Invalid variable type");
+               }
+       }
+
+       PUGI__FN xpath_variable* get_variable_scratch(char_t (&buffer)[32], xpath_variable_set* set, const char_t* begin, const char_t* end)
+       {
+               size_t length = static_cast<size_t>(end - begin);
+               char_t* scratch = buffer;
+
+               if (length >= sizeof(buffer) / sizeof(buffer[0]))
+               {
+                       // need to make dummy on-heap copy
+                       scratch = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t)));
+                       if (!scratch) return 0;
+               }
+
+               // copy string to zero-terminated buffer and perform lookup
+               memcpy(scratch, begin, length * sizeof(char_t));
+               scratch[length] = 0;
+
+               xpath_variable* result = set->get(scratch);
+
+               // free dummy buffer
+               if (scratch != buffer) xml_memory::deallocate(scratch);
+
+               return result;
+       }
+PUGI__NS_END
+
+// Internal node set class
+PUGI__NS_BEGIN
+       PUGI__FN xpath_node_set::type_t xpath_get_order(const xpath_node* begin, const xpath_node* end)
+       {
+               if (end - begin < 2)
+                       return xpath_node_set::type_sorted;
+
+               document_order_comparator cmp;
+
+               bool first = cmp(begin[0], begin[1]);
+
+               for (const xpath_node* it = begin + 1; it + 1 < end; ++it)
+                       if (cmp(it[0], it[1]) != first)
+                               return xpath_node_set::type_unsorted;
+
+               return first ? xpath_node_set::type_sorted : xpath_node_set::type_sorted_reverse;
+       }
+
+       PUGI__FN xpath_node_set::type_t xpath_sort(xpath_node* begin, xpath_node* end, xpath_node_set::type_t type, bool rev)
+       {
+               xpath_node_set::type_t order = rev ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted;
+
+               if (type == xpath_node_set::type_unsorted)
+               {
+                       xpath_node_set::type_t sorted = xpath_get_order(begin, end);
+
+                       if (sorted == xpath_node_set::type_unsorted)
+                       {
+                               sort(begin, end, document_order_comparator());
+
+                               type = xpath_node_set::type_sorted;
+                       }
+                       else
+                               type = sorted;
+               }
+               
+               if (type != order) reverse(begin, end);
+                       
+               return order;
+       }
+
+       PUGI__FN xpath_node xpath_first(const xpath_node* begin, const xpath_node* end, xpath_node_set::type_t type)
+       {
+               if (begin == end) return xpath_node();
+
+               switch (type)
+               {
+               case xpath_node_set::type_sorted:
+                       return *begin;
+
+               case xpath_node_set::type_sorted_reverse:
+                       return *(end - 1);
+
+               case xpath_node_set::type_unsorted:
+                       return *min_element(begin, end, document_order_comparator());
+
+               default:
+                       assert(!"Invalid node set type");
+                       return xpath_node();
+               }
+       }
+
+       class xpath_node_set_raw
+       {
+               xpath_node_set::type_t _type;
+
+               xpath_node* _begin;
+               xpath_node* _end;
+               xpath_node* _eos;
+
+       public:
+               xpath_node_set_raw(): _type(xpath_node_set::type_unsorted), _begin(0), _end(0), _eos(0)
+               {
+               }
+
+               xpath_node* begin() const
+               {
+                       return _begin;
+               }
+
+               xpath_node* end() const
+               {
+                       return _end;
+               }
+
+               bool empty() const
+               {
+                       return _begin == _end;
+               }
+
+               size_t size() const
+               {
+                       return static_cast<size_t>(_end - _begin);
+               }
+
+               xpath_node first() const
+               {
+                       return xpath_first(_begin, _end, _type);
+               }
+
+               void push_back_grow(const xpath_node& node, xpath_allocator* alloc);
+
+               void push_back(const xpath_node& node, xpath_allocator* alloc)
+               {
+                       if (_end != _eos)
+                               *_end++ = node;
+                       else
+                               push_back_grow(node, alloc);
+               }
+
+               void append(const xpath_node* begin_, const xpath_node* end_, xpath_allocator* alloc)
+               {
+                       if (begin_ == end_) return;
+
+                       size_t size_ = static_cast<size_t>(_end - _begin);
+                       size_t capacity = static_cast<size_t>(_eos - _begin);
+                       size_t count = static_cast<size_t>(end_ - begin_);
+
+                       if (size_ + count > capacity)
+                       {
+                               // reallocate the old array or allocate a new one
+                               xpath_node* data = static_cast<xpath_node*>(alloc->reallocate(_begin, capacity * sizeof(xpath_node), (size_ + count) * sizeof(xpath_node)));
+                               assert(data);
+
+                               // finalize
+                               _begin = data;
+                               _end = data + size_;
+                               _eos = data + size_ + count;
+                       }
+
+                       memcpy(_end, begin_, count * sizeof(xpath_node));
+                       _end += count;
+               }
+
+               void sort_do()
+               {
+                       _type = xpath_sort(_begin, _end, _type, false);
+               }
+
+               void truncate(xpath_node* pos)
+               {
+                       assert(_begin <= pos && pos <= _end);
+
+                       _end = pos;
+               }
+
+               void remove_duplicates()
+               {
+                       if (_type == xpath_node_set::type_unsorted)
+                               sort(_begin, _end, duplicate_comparator());
+               
+                       _end = unique(_begin, _end);
+               }
+
+               xpath_node_set::type_t type() const
+               {
+                       return _type;
+               }
+
+               void set_type(xpath_node_set::type_t value)
+               {
+                       _type = value;
+               }
+       };
+
+       PUGI__FN_NO_INLINE void xpath_node_set_raw::push_back_grow(const xpath_node& node, xpath_allocator* alloc)
+       {
+               size_t capacity = static_cast<size_t>(_eos - _begin);
+
+               // get new capacity (1.5x rule)
+               size_t new_capacity = capacity + capacity / 2 + 1;
+
+               // reallocate the old array or allocate a new one
+               xpath_node* data = static_cast<xpath_node*>(alloc->reallocate(_begin, capacity * sizeof(xpath_node), new_capacity * sizeof(xpath_node)));
+               assert(data);
+
+               // finalize
+               _begin = data;
+               _end = data + capacity;
+               _eos = data + new_capacity;
+
+               // push
+               *_end++ = node;
+       }
+PUGI__NS_END
+
+PUGI__NS_BEGIN
+       struct xpath_context
+       {
+               xpath_node n;
+               size_t position, size;
+
+               xpath_context(const xpath_node& n_, size_t position_, size_t size_): n(n_), position(position_), size(size_)
+               {
+               }
+       };
+
+       enum lexeme_t // token kinds produced by xpath_lexer::next()
+       {
+               lex_none = 0, // no valid token: lone '!', lone ':', '$' without a name, unterminated quoted string
+               lex_equal, // '='
+               lex_not_equal, // '!='
+               lex_less, // '<'
+               lex_greater, // '>'
+               lex_less_or_equal, // '<='
+               lex_greater_or_equal, // '>='
+               lex_plus, // '+'
+               lex_minus, // '-'
+               lex_multiply, // '*'
+               lex_union, // '|'
+               lex_var_ref, // '$' followed by a name (optionally prefix:name); contents hold the name without '$'
+               lex_open_brace, // '('
+               lex_close_brace, // ')'
+               lex_quoted_string, // text in matching single or double quotes; contents exclude the quotes
+               lex_number, // digits with an optional fraction, or '.' followed by digits
+               lex_slash, // '/'
+               lex_double_slash, // '//'
+               lex_open_square_brace, // '['
+               lex_close_square_brace, // ']'
+               lex_string, // NOTE(review): assigned outside this view - presumably a name token, confirm
+               lex_comma, // ','
+               lex_axis_attribute, // '@'
+               lex_dot, // '.'
+               lex_double_dot, // '..'
+               lex_double_colon, // '::'
+               lex_eof // end of the query string
+       };
+
+       struct xpath_lexer_string
+       {
+               const char_t* begin;
+               const char_t* end;
+
+               xpath_lexer_string(): begin(0), end(0)
+               {
+               }
+
+               bool operator==(const char_t* other) const
+               {
+                       size_t length = static_cast<size_t>(end - begin);
+
+                       return strequalrange(other, begin, length);
+               }
+       };
+
+       class xpath_lexer
+       {
+               const char_t* _cur;
+               const char_t* _cur_lexeme_pos;
+               xpath_lexer_string _cur_lexeme_contents;
+
+               lexeme_t _cur_lexeme;
+
+       public:
+               explicit xpath_lexer(const char_t* query): _cur(query)
+               {
+                       next();
+               }
+               
+               const char_t* state() const
+               {
+                       return _cur;
+               }
+               
+               void next()
+               {
+                       const char_t* cur = _cur;
+
+                       while (PUGI__IS_CHARTYPE(*cur, ct_space)) ++cur;
+
+                       // save lexeme position for error reporting
+                       _cur_lexeme_pos = cur;
+
+                       switch (*cur)
+                       {
+                       case 0:
+                               _cur_lexeme = lex_eof;
+                               break;
+                       
+                       case '>':
+                               if (*(cur+1) == '=')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_greater_or_equal;
+                               }
+                               else
+                               {
+                                       cur += 1;
+                                       _cur_lexeme = lex_greater;
+                               }
+                               break;
+
+                       case '<':
+                               if (*(cur+1) == '=')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_less_or_equal;
+                               }
+                               else
+                               {
+                                       cur += 1;
+                                       _cur_lexeme = lex_less;
+                               }
+                               break;
+
+                       case '!':
+                               if (*(cur+1) == '=')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_not_equal;
+                               }
+                               else
+                               {
+                                       _cur_lexeme = lex_none;
+                               }
+                               break;
+
+                       case '=':
+                               cur += 1;
+                               _cur_lexeme = lex_equal;
+
+                               break;
+                       
+                       case '+':
+                               cur += 1;
+                               _cur_lexeme = lex_plus;
+
+                               break;
+
+                       case '-':
+                               cur += 1;
+                               _cur_lexeme = lex_minus;
+
+                               break;
+
+                       case '*':
+                               cur += 1;
+                               _cur_lexeme = lex_multiply;
+
+                               break;
+
+                       case '|':
+                               cur += 1;
+                               _cur_lexeme = lex_union;
+
+                               break;
+                       
+                       case '$':
+                               cur += 1;
+
+                               if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol))
+                               {
+                                       _cur_lexeme_contents.begin = cur;
+
+                                       while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+
+                                       if (cur[0] == ':' && PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) // qname
+                                       {
+                                               cur++; // :
+
+                                               while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+                                       }
+
+                                       _cur_lexeme_contents.end = cur;
+                               
+                                       _cur_lexeme = lex_var_ref;
+                               }
+                               else
+                               {
+                                       _cur_lexeme = lex_none;
+                               }
+
+                               break;
+
+                       case '(':
+                               cur += 1;
+                               _cur_lexeme = lex_open_brace;
+
+                               break;
+
+                       case ')':
+                               cur += 1;
+                               _cur_lexeme = lex_close_brace;
+
+                               break;
+                       
+                       case '[':
+                               cur += 1;
+                               _cur_lexeme = lex_open_square_brace;
+
+                               break;
+
+                       case ']':
+                               cur += 1;
+                               _cur_lexeme = lex_close_square_brace;
+
+                               break;
+
+                       case ',':
+                               cur += 1;
+                               _cur_lexeme = lex_comma;
+
+                               break;
+
+                       case '/':
+                               if (*(cur+1) == '/')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_double_slash;
+                               }
+                               else
+                               {
+                                       cur += 1;
+                                       _cur_lexeme = lex_slash;
+                               }
+                               break;
+               
+                       case '.':
+                               if (*(cur+1) == '.')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_double_dot;
+                               }
+                               else if (PUGI__IS_CHARTYPEX(*(cur+1), ctx_digit))
+                               {
+                                       _cur_lexeme_contents.begin = cur; // .
+
+                                       ++cur;
+
+                                       while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;
+
+                                       _cur_lexeme_contents.end = cur;
+                                       
+                                       _cur_lexeme = lex_number;
+                               }
+                               else
+                               {
+                                       cur += 1;
+                                       _cur_lexeme = lex_dot;
+                               }
+                               break;
+
+                       case '@':
+                               cur += 1;
+                               _cur_lexeme = lex_axis_attribute;
+
+                               break;
+
+                       case '"':
+                       case '\'':
+                       {
+                               char_t terminator = *cur;
+
+                               ++cur;
+
+                               _cur_lexeme_contents.begin = cur;
+                               while (*cur && *cur != terminator) cur++;
+                               _cur_lexeme_contents.end = cur;
+                               
+                               if (!*cur)
+                                       _cur_lexeme = lex_none;
+                               else
+                               {
+                                       cur += 1;
+                                       _cur_lexeme = lex_quoted_string;
+                               }
+
+                               break;
+                       }
+
+                       case ':':
+                               if (*(cur+1) == ':')
+                               {
+                                       cur += 2;
+                                       _cur_lexeme = lex_double_colon;
+                               }
+                               else
+                               {
+                                       _cur_lexeme = lex_none;
+                               }
+                               break;
+
+                       default:
+                               if (PUGI__IS_CHARTYPEX(*cur, ctx_digit))
+                               {
+                                       _cur_lexeme_contents.begin = cur;
+
+                                       while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;
+                               
+                                       if (*cur == '.')
+                                       {
+                                               cur++;
+
+                                               while (PUGI__IS_CHARTYPEX(*cur, ctx_digit)) cur++;
+                                       }
+
+                                       _cur_lexeme_contents.end = cur;
+
+                                       _cur_lexeme = lex_number;
+                               }
+                               else if (PUGI__IS_CHARTYPEX(*cur, ctx_start_symbol))
+                               {
+                                       _cur_lexeme_contents.begin = cur;
+
+                                       while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+
+                                       if (cur[0] == ':')
+                                       {
+                                               if (cur[1] == '*') // namespace test ncname:*
+                                               {
+                                                       cur += 2; // :*
+                                               }
+                                               else if (PUGI__IS_CHARTYPEX(cur[1], ctx_symbol)) // namespace test qname
+                                               {
+                                                       cur++; // :
+
+                                                       while (PUGI__IS_CHARTYPEX(*cur, ctx_symbol)) cur++;
+                                               }
+                                       }
+
+                                       _cur_lexeme_contents.end = cur;
+                               
+                                       _cur_lexeme = lex_string;
+                               }
+                               else
+                               {
+                                       _cur_lexeme = lex_none;
+                               }
+                       }
+
+                       _cur = cur;
+               }
+
+               // Returns the kind of the lexeme the lexer is currently positioned on (_cur_lexeme).
+               lexeme_t current() const
+               {
+                       return _cur_lexeme;
+               }
+
+               // Returns the stored position pointer for the current lexeme (_cur_lexeme_pos).
+               // NOTE(review): presumably the location in the query text where the lexeme starts;
+               // the assignment is outside this accessor - confirm against next().
+               const char_t* current_pos() const
+               {
+                       return _cur_lexeme_pos;
+               }
+
+               // Returns the text range recorded for the current lexeme.
+               // Only valid for lexemes that carry text (variable reference, number, string, quoted string);
+               // any other lexeme kind trips the assertion below.
+               const xpath_lexer_string& contents() const
+               {
+                       assert(_cur_lexeme == lex_var_ref || _cur_lexeme == lex_number || _cur_lexeme == lex_string || _cur_lexeme == lex_quoted_string);
+
+                       return _cur_lexeme_contents;
+               }
+       };
+
+       // Node kinds of the XPath expression tree. The trailing comment on each enumerator sketches the
+       // expression it stands for, written in terms of the node's left/right (and further) operands.
+       enum ast_type_t
+       {
+               ast_unknown,
+               ast_op_or,                                              // left or right
+               ast_op_and,                                             // left and right
+               ast_op_equal,                                   // left = right
+               ast_op_not_equal,                               // left != right
+               ast_op_less,                                    // left < right
+               ast_op_greater,                                 // left > right
+               ast_op_less_or_equal,                   // left <= right
+               ast_op_greater_or_equal,                // left >= right
+               ast_op_add,                                             // left + right
+               ast_op_subtract,                                // left - right
+               ast_op_multiply,                                // left * right
+               ast_op_divide,                                  // left / right
+               ast_op_mod,                                             // left % right
+               ast_op_negate,                                  // - left
+               ast_op_union,                                   // left | right
+               ast_predicate,                                  // apply predicate to set; next points to next predicate
+               ast_filter,                                             // select * from left where right
+               ast_string_constant,                    // string constant
+               ast_number_constant,                    // number constant
+               ast_variable,                                   // variable
+               ast_func_last,                                  // last()
+               ast_func_position,                              // position()
+               ast_func_count,                                 // count(left)
+               ast_func_id,                                    // id(left)
+               ast_func_local_name_0,                  // local-name()
+               ast_func_local_name_1,                  // local-name(left)
+               ast_func_namespace_uri_0,               // namespace-uri()
+               ast_func_namespace_uri_1,               // namespace-uri(left)
+               ast_func_name_0,                                // name()
+               ast_func_name_1,                                // name(left)
+               ast_func_string_0,                              // string()
+               ast_func_string_1,                              // string(left)
+               ast_func_concat,                                // concat(left, right, siblings)
+               ast_func_starts_with,                   // starts-with(left, right)
+               ast_func_contains,                              // contains(left, right)
+               ast_func_substring_before,              // substring-before(left, right)
+               ast_func_substring_after,               // substring-after(left, right)
+               ast_func_substring_2,                   // substring(left, right)
+               ast_func_substring_3,                   // substring(left, right, third)
+               ast_func_string_length_0,               // string-length()
+               ast_func_string_length_1,               // string-length(left)
+               ast_func_normalize_space_0,             // normalize-space()
+               ast_func_normalize_space_1,             // normalize-space(left)
+               ast_func_translate,                             // translate(left, right, third)
+               ast_func_boolean,                               // boolean(left)
+               ast_func_not,                                   // not(left)
+               ast_func_true,                                  // true()
+               ast_func_false,                                 // false()
+               ast_func_lang,                                  // lang(left)
+               ast_func_number_0,                              // number()
+               ast_func_number_1,                              // number(left)
+               ast_func_sum,                                   // sum(left)
+               ast_func_floor,                                 // floor(left)
+               ast_func_ceiling,                               // ceiling(left)
+               ast_func_round,                                 // round(left)
+               ast_step,                                               // process set left with step
+               ast_step_root,                                  // select root node
+
+               ast_opt_translate_table,                // translate(left, right, third) where right/third are constants
+               ast_opt_compare_attribute               // @name = 'string'
+       };
+
+       // Traversal direction of a location step. step_fill() switches on these values to decide which
+       // nodes or attributes, relative to the context node, are offered to step_push().
+       // The enumerator names follow the XPath axis names (ancestor, child, following-sibling, ...).
+       enum axis_t
+       {
+               axis_ancestor,
+               axis_ancestor_or_self,
+               axis_attribute,
+               axis_child,
+               axis_descendant,
+               axis_descendant_or_self,
+               axis_following,
+               axis_following_sibling,
+               axis_namespace,
+               axis_parent,
+               axis_preceding,
+               axis_preceding_sibling,
+               axis_self
+       };
+       
+       // Node test attached to a location step; the step_push() overloads switch on it to decide
+       // whether a candidate node or attribute is added to the result set.
+       enum nodetest_t
+       {
+               nodetest_none,                                  // not handled by step_push: no attribute matches, nodes hit the assertion
+               nodetest_name,                                  // element/attribute whose name equals the stored node test string
+               nodetest_type_node,                             // any node; any attribute accepted by is_xpath_attribute
+               nodetest_type_comment,                  // comment nodes only
+               nodetest_type_pi,                               // processing instructions, any target
+               nodetest_type_text,                             // pcdata or cdata nodes
+               nodetest_pi,                                    // processing instruction whose name equals the stored string
+               nodetest_all,                                   // any element; any attribute accepted by is_xpath_attribute
+               nodetest_all_in_namespace               // element/attribute whose name starts with the stored string
+       };
+
+       // Classification of a predicate/filter expression, stored in the node's _test field and
+       // consulted by apply_predicate() to choose an evaluation strategy.
+       enum predicate_t
+       {
+               predicate_default,                              // evaluated per node via the generic number/boolean paths
+               predicate_posinv,                               // also takes the generic paths in apply_predicate; NOTE(review): presumably "position invariant" - confirm where it is assigned
+               predicate_constant,                             // routed to apply_predicate_number_const (expression evaluated once)
+               predicate_constant_one                  // routed to apply_predicate_number_const as well; NOTE(review): presumably the constant 1 - confirm
+       };
+
+       // How much of a node set the caller needs; eval_once() turns this into an early-exit decision.
+       enum nodeset_eval_t
+       {
+               nodeset_eval_all,                               // never permits early exit
+               nodeset_eval_any,                               // always permits stopping after one matching node
+               nodeset_eval_first                              // permits stopping after one node only when the set type is type_sorted
+       };
+
+       // Tag type that carries an axis_t value in its type. step_fill() takes an instance as its last
+       // (unnamed) parameter and reads T::axis into a const local that drives its switch.
+       template <axis_t N> struct axis_to_type
+       {
+               static const axis_t axis;
+       };
+
+       // Out-of-class definition of the static member; its value is the template argument.
+       template <axis_t N> const axis_t axis_to_type<N>::axis = N;
+               
+       class xpath_ast_node
+       {
+       private:
+               // node type
+               // (_type is compared against ast_type_t values; NOTE(review): _rettype presumably backs
+               // rettype() and holds an xpath_value_type - confirm. Both are stored as char.)
+               char _type;
+               char _rettype;
+
+               // for ast_step
+               // (NOTE(review): presumably an axis_t value stored as char - confirm)
+               char _axis;
+
+               // for ast_step/ast_predicate/ast_filter
+               // (step_push() switches on it as a nodetest_t; apply_predicate() compares it against predicate_t values)
+               char _test;
+
+               // tree node structure
+               // (_next links sibling nodes, e.g. the predicate chain walked by apply_predicates())
+               xpath_ast_node* _left;
+               xpath_ast_node* _right;
+               xpath_ast_node* _next;
+
+               // Payload; which member is meaningful depends on the node type, as noted per member.
+               union
+               {
+                       // value for ast_string_constant
+                       const char_t* string;
+                       // value for ast_number_constant
+                       double number;
+                       // variable for ast_variable
+                       xpath_variable* variable;
+                       // node test for ast_step (node name/namespace/node type/pi target)
+                       const char_t* nodetest;
+                       // table for ast_opt_translate_table
+                       const unsigned char* table;
+               } _data;
+
+               // Copy construction/assignment are declared private; no definition appears here, so
+               // copying nodes is presumably disallowed.
+               xpath_ast_node(const xpath_ast_node&);
+               xpath_ast_node& operator=(const xpath_ast_node&);
+
+               // Evaluates an equality-style comparison between two sub-expressions.
+               //
+               // comp is invoked on the operands after converting them according to their return types:
+               //  - neither operand is a node set: compare as booleans if either is boolean, otherwise as
+               //    numbers if either is a number, otherwise as strings;
+               //  - both are node sets: true if comp holds for the string values of at least one
+               //    (left node, right node) pair;
+               //  - exactly one is a node set: the other operand is evaluated once and compared with each
+               //    node (as number or string), or with the boolean value of the set.
+               //
+               // Returns the comparison result; falls through to the assertion (and returns false) for a
+               // type combination none of the branches handle.
+               template <class Comp> static bool compare_eq(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp)
+               {
+                       xpath_value_type lt = lhs->rettype(), rt = rhs->rettype();
+
+                       if (lt != xpath_type_node_set && rt != xpath_type_node_set)
+                       {
+                               // two scalar operands: boolean takes precedence over number, number over string
+                               if (lt == xpath_type_boolean || rt == xpath_type_boolean)
+                                       return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack));
+                               else if (lt == xpath_type_number || rt == xpath_type_number)
+                                       return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack));
+                               else if (lt == xpath_type_string || rt == xpath_type_string)
+                               {
+                                       // scope object tied to stack.result for the temporaries created below
+                                       // (NOTE(review): presumably releases them on scope exit - confirm)
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       xpath_string ls = lhs->eval_string(c, stack);
+                                       xpath_string rs = rhs->eval_string(c, stack);
+
+                                       return comp(ls, rs);
+                               }
+                       }
+                       else if (lt == xpath_type_node_set && rt == xpath_type_node_set)
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all);
+                               xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all);
+
+                               // existential semantics: one matching pair of string values is enough
+                               for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
+                                       for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+                                       {
+                                               xpath_allocator_capture cri(stack.result);
+
+                                               if (comp(string_value(*li, stack.result), string_value(*ri, stack.result)))
+                                                       return true;
+                                       }
+
+                               return false;
+                       }
+                       else
+                       {
+                               // exactly one operand is a node set: normalize so that it is on the right.
+                               // comp then always receives the scalar first, which assumes comp is symmetric.
+                               if (lt == xpath_type_node_set)
+                               {
+                                       swap(lhs, rhs);
+                                       swap(lt, rt);
+                               }
+
+                               if (lt == xpath_type_boolean)
+                                       return comp(lhs->eval_boolean(c, stack), rhs->eval_boolean(c, stack));
+                               else if (lt == xpath_type_number)
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       double l = lhs->eval_number(c, stack);
+                                       xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all);
+
+                                       // compare the number with the numeric conversion of each node's string value
+                                       for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+                                       {
+                                               xpath_allocator_capture cri(stack.result);
+
+                                               if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
+                                                       return true;
+                                       }
+
+                                       return false;
+                               }
+                               else if (lt == xpath_type_string)
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       xpath_string l = lhs->eval_string(c, stack);
+                                       xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all);
+
+                                       // compare the string with each node's string value
+                                       for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+                                       {
+                                               xpath_allocator_capture cri(stack.result);
+
+                                               if (comp(l, string_value(*ri, stack.result)))
+                                                       return true;
+                                       }
+
+                                       return false;
+                               }
+                       }
+
+                       // reached only when no branch above recognized the operand types
+                       assert(!"Wrong types");
+                       return false;
+               }
+
+               static bool eval_once(xpath_node_set::type_t type, nodeset_eval_t eval)
+               {
+                       return type == xpath_node_set::type_sorted ? eval != nodeset_eval_all : eval == nodeset_eval_any;
+               }
+
+               // Evaluates a relational comparison between two sub-expressions.
+               //
+               // Every operand is compared as a number: scalar operands through eval_number(), nodes
+               // through the numeric conversion of their string value. With node sets involved the result
+               // is true as soon as comp holds for one node (or one pair of nodes). Unlike compare_eq the
+               // operands are never swapped, so comp always sees (left, right) in source order.
+               template <class Comp> static bool compare_rel(xpath_ast_node* lhs, xpath_ast_node* rhs, const xpath_context& c, const xpath_stack& stack, const Comp& comp)
+               {
+                       xpath_value_type lt = lhs->rettype(), rt = rhs->rettype();
+
+                       if (lt != xpath_type_node_set && rt != xpath_type_node_set)
+                               return comp(lhs->eval_number(c, stack), rhs->eval_number(c, stack));
+                       else if (lt == xpath_type_node_set && rt == xpath_type_node_set)
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all);
+                               xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all);
+
+                               for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
+                               {
+                                       xpath_allocator_capture cri(stack.result);
+
+                                       // the left node's number is computed once per outer iteration
+                                       double l = convert_string_to_number(string_value(*li, stack.result).c_str());
+
+                                       for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+                                       {
+                                               xpath_allocator_capture crii(stack.result);
+
+                                               if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
+                                                       return true;
+                                       }
+                               }
+
+                               return false;
+                       }
+                       else if (lt != xpath_type_node_set && rt == xpath_type_node_set)
+                       {
+                               // scalar on the left, node set on the right
+                               xpath_allocator_capture cr(stack.result);
+
+                               double l = lhs->eval_number(c, stack);
+                               xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all);
+
+                               for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri)
+                               {
+                                       xpath_allocator_capture cri(stack.result);
+
+                                       if (comp(l, convert_string_to_number(string_value(*ri, stack.result).c_str())))
+                                               return true;
+                               }
+
+                               return false;
+                       }
+                       else if (lt == xpath_type_node_set && rt != xpath_type_node_set)
+                       {
+                               // node set on the left, scalar on the right
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all);
+                               double r = rhs->eval_number(c, stack);
+
+                               for (const xpath_node* li = ls.begin(); li != ls.end(); ++li)
+                               {
+                                       xpath_allocator_capture cri(stack.result);
+
+                                       if (comp(convert_string_to_number(string_value(*li, stack.result).c_str()), r))
+                                               return true;
+                               }
+
+                               return false;
+                       }
+                       else
+                       {
+                               // the four branches above cover every node-set/scalar combination
+                               assert(!"Wrong types");
+                               return false;
+                       }
+               }
+
+               static void apply_predicate_boolean(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack, bool once)
+               {
+                       assert(ns.size() >= first);
+                       assert(expr->rettype() != xpath_type_number);
+
+                       size_t i = 1;
+                       size_t size = ns.size() - first;
+
+                       xpath_node* last = ns.begin() + first;
+
+                       // remove_if... or well, sort of
+                       for (xpath_node* it = last; it != ns.end(); ++it, ++i)
+                       {
+                               xpath_context c(*it, i, size);
+
+                               if (expr->eval_boolean(c, stack))
+                               {
+                                       *last++ = *it;
+
+                                       if (once) break;
+                               }
+                       }
+
+                       ns.truncate(last);
+               }
+
+               static void apply_predicate_number(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack, bool once)
+               {
+                       assert(ns.size() >= first);
+                       assert(expr->rettype() == xpath_type_number);
+
+                       size_t i = 1;
+                       size_t size = ns.size() - first;
+
+                       xpath_node* last = ns.begin() + first;
+
+                       // remove_if... or well, sort of
+                       for (xpath_node* it = last; it != ns.end(); ++it, ++i)
+                       {
+                               xpath_context c(*it, i, size);
+
+                               if (expr->eval_number(c, stack) == i)
+                               {
+                                       *last++ = *it;
+
+                                       if (once) break;
+                               }
+                       }
+
+                       ns.truncate(last);
+               }
+
+               static void apply_predicate_number_const(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack)
+               {
+                       assert(ns.size() >= first);
+                       assert(expr->rettype() == xpath_type_number);
+
+                       size_t size = ns.size() - first;
+
+                       xpath_node* last = ns.begin() + first;
+
+                       xpath_context c(xpath_node(), 1, size);
+
+                       double er = expr->eval_number(c, stack);
+
+                       if (er >= 1.0 && er <= size)
+                       {
+                               size_t eri = static_cast<size_t>(er);
+
+                               if (er == eri)
+                               {
+                                       xpath_node r = last[eri - 1];
+
+                                       *last++ = r;
+                               }
+                       }
+
+                       ns.truncate(last);
+               }
+
+               void apply_predicate(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack, bool once)
+               {
+                       if (ns.size() == first) return;
+
+                       assert(_type == ast_filter || _type == ast_predicate);
+
+                       if (_test == predicate_constant || _test == predicate_constant_one)
+                               apply_predicate_number_const(ns, first, _right, stack);
+                       else if (_right->rettype() == xpath_type_number)
+                               apply_predicate_number(ns, first, _right, stack, once);
+                       else
+                               apply_predicate_boolean(ns, first, _right, stack, once);
+               }
+
+               void apply_predicates(xpath_node_set_raw& ns, size_t first, const xpath_stack& stack, nodeset_eval_t eval)
+               {
+                       if (ns.size() == first) return;
+
+                       bool last_once = eval_once(ns.type(), eval);
+
+                       for (xpath_ast_node* pred = _right; pred; pred = pred->_next)
+                               pred->apply_predicate(ns, first, stack, !pred->_next && last_once);
+               }
+
+               bool step_push(xpath_node_set_raw& ns, xml_attribute_struct* a, xml_node_struct* parent, xpath_allocator* alloc)
+               {
+                       assert(a);
+
+                       const char_t* name = a->name ? a->name : PUGIXML_TEXT("");
+
+                       switch (_test)
+                       {
+                       case nodetest_name:
+                               if (strequal(name, _data.nodetest) && is_xpath_attribute(name))
+                               {
+                                       ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_type_node:
+                       case nodetest_all:
+                               if (is_xpath_attribute(name))
+                               {
+                                       ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_all_in_namespace:
+                               if (starts_with(name, _data.nodetest) && is_xpath_attribute(name))
+                               {
+                                       ns.push_back(xpath_node(xml_attribute(a), xml_node(parent)), alloc);
+                                       return true;
+                               }
+                               break;
+                       
+                       default:
+                               ;
+                       }
+
+                       return false;
+               }
+               
+               bool step_push(xpath_node_set_raw& ns, xml_node_struct* n, xpath_allocator* alloc)
+               {
+                       assert(n);
+
+                       xml_node_type type = PUGI__NODETYPE(n);
+
+                       switch (_test)
+                       {
+                       case nodetest_name:
+                               if (type == node_element && n->name && strequal(n->name, _data.nodetest))
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_type_node:
+                               ns.push_back(xml_node(n), alloc);
+                               return true;
+                               
+                       case nodetest_type_comment:
+                               if (type == node_comment)
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_type_text:
+                               if (type == node_pcdata || type == node_cdata)
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_type_pi:
+                               if (type == node_pi)
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                                                                       
+                       case nodetest_pi:
+                               if (type == node_pi && n->name && strequal(n->name, _data.nodetest))
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_all:
+                               if (type == node_element)
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+                               
+                       case nodetest_all_in_namespace:
+                               if (type == node_element && n->name && starts_with(n->name, _data.nodetest))
+                               {
+                                       ns.push_back(xml_node(n), alloc);
+                                       return true;
+                               }
+                               break;
+
+                       default:
+                               assert(!"Unknown axis");
+                       }
+
+                       return false;
+               }
+
+               template <class T> void step_fill(xpath_node_set_raw& ns, xml_node_struct* n, xpath_allocator* alloc, bool once, T)
+               {
+                       const axis_t axis = T::axis;
+
+                       switch (axis)
+                       {
+                       case axis_attribute:
+                       {
+                               for (xml_attribute_struct* a = n->first_attribute; a; a = a->next_attribute)
+                                       if (step_push(ns, a, n, alloc) & once)
+                                               return;
+                               
+                               break;
+                       }
+                       
+                       case axis_child:
+                       {
+                               for (xml_node_struct* c = n->first_child; c; c = c->next_sibling)
+                                       if (step_push(ns, c, alloc) & once)
+                                               return;
+                                       
+                               break;
+                       }
+                       
+                       case axis_descendant:
+                       case axis_descendant_or_self:
+                       {
+                               if (axis == axis_descendant_or_self)
+                                       if (step_push(ns, n, alloc) & once)
+                                               return;
+                                       
+                               xml_node_struct* cur = n->first_child;
+                               
+                               while (cur)
+                               {
+                                       if (step_push(ns, cur, alloc) & once)
+                                               return;
+                                       
+                                       if (cur->first_child)
+                                               cur = cur->first_child;
+                                       else
+                                       {
+                                               while (!cur->next_sibling)
+                                               {
+                                                       cur = cur->parent;
+
+                                                       if (cur == n) return;
+                                               }
+                                       
+                                               cur = cur->next_sibling;
+                                       }
+                               }
+                               
+                               break;
+                       }
+                       
+                       case axis_following_sibling:
+                       {
+                               for (xml_node_struct* c = n->next_sibling; c; c = c->next_sibling)
+                                       if (step_push(ns, c, alloc) & once)
+                                               return;
+                               
+                               break;
+                       }
+                       
+                       case axis_preceding_sibling:
+                       {
+                               for (xml_node_struct* c = n->prev_sibling_c; c->next_sibling; c = c->prev_sibling_c)
+                                       if (step_push(ns, c, alloc) & once)
+                                               return;
+                               
+                               break;
+                       }
+                       
+                       case axis_following:
+                       {
+                               xml_node_struct* cur = n;
+
+                               // exit from this node so that we don't include descendants
+                               while (!cur->next_sibling)
+                               {
+                                       cur = cur->parent;
+
+                                       if (!cur) return;
+                               }
+
+                               cur = cur->next_sibling;
+
+                               while (cur)
+                               {
+                                       if (step_push(ns, cur, alloc) & once)
+                                               return;
+
+                                       if (cur->first_child)
+                                               cur = cur->first_child;
+                                       else
+                                       {
+                                               while (!cur->next_sibling)
+                                               {
+                                                       cur = cur->parent;
+
+                                                       if (!cur) return;
+                                               }
+
+                                               cur = cur->next_sibling;
+                                       }
+                               }
+
+                               break;
+                       }
+
+                       case axis_preceding:
+                       {
+                               xml_node_struct* cur = n;
+
+                               // exit from this node so that we don't include descendants
+                               while (!cur->prev_sibling_c->next_sibling)
+                               {
+                                       cur = cur->parent;
+
+                                       if (!cur) return;
+                               }
+
+                               cur = cur->prev_sibling_c;
+
+                               while (cur)
+                               {
+                                       if (cur->first_child)
+                                               cur = cur->first_child->prev_sibling_c;
+                                       else
+                                       {
+                                               // leaf node, can't be ancestor
+                                               if (step_push(ns, cur, alloc) & once)
+                                                       return;
+
+                                               while (!cur->prev_sibling_c->next_sibling)
+                                               {
+                                                       cur = cur->parent;
+
+                                                       if (!cur) return;
+
+                                                       if (!node_is_ancestor(cur, n))
+                                                               if (step_push(ns, cur, alloc) & once)
+                                                                       return;
+                                               }
+
+                                               cur = cur->prev_sibling_c;
+                                       }
+                               }
+
+                               break;
+                       }
+                       
+                       case axis_ancestor:
+                       case axis_ancestor_or_self:
+                       {
+                               if (axis == axis_ancestor_or_self)
+                                       if (step_push(ns, n, alloc) & once)
+                                               return;
+
+                               xml_node_struct* cur = n->parent;
+                               
+                               while (cur)
+                               {
+                                       if (step_push(ns, cur, alloc) & once)
+                                               return;
+                                       
+                                       cur = cur->parent;
+                               }
+                               
+                               break;
+                       }
+
+                       case axis_self:
+                       {
+                               step_push(ns, n, alloc);
+
+                               break;
+                       }
+
+                       case axis_parent:
+                       {
+                               if (n->parent)
+                                       step_push(ns, n->parent, alloc);
+
+                               break;
+                       }
+                               
+                       default:
+                               assert(!"Unimplemented axis");
+                       }
+               }
+               
+               template <class T> void step_fill(xpath_node_set_raw& ns, xml_attribute_struct* a, xml_node_struct* p, xpath_allocator* alloc, bool once, T v)
+               {
+                       const axis_t axis = T::axis;
+
+                       switch (axis)
+                       {
+                       case axis_ancestor:
+                       case axis_ancestor_or_self:
+                       {
+                               if (axis == axis_ancestor_or_self && _test == nodetest_type_node) // reject attributes based on principal node type test
+                                       if (step_push(ns, a, p, alloc) & once)
+                                               return;
+
+                               xml_node_struct* cur = p;
+                               
+                               while (cur)
+                               {
+                                       if (step_push(ns, cur, alloc) & once)
+                                               return;
+                                       
+                                       cur = cur->parent;
+                               }
+                               
+                               break;
+                       }
+
+                       case axis_descendant_or_self:
+                       case axis_self:
+                       {
+                               if (_test == nodetest_type_node) // reject attributes based on principal node type test
+                                       step_push(ns, a, p, alloc);
+
+                               break;
+                       }
+
+                       case axis_following:
+                       {
+                               xml_node_struct* cur = p;
+                               
+                               while (cur)
+                               {
+                                       if (cur->first_child)
+                                               cur = cur->first_child;
+                                       else
+                                       {
+                                               while (!cur->next_sibling)
+                                               {
+                                                       cur = cur->parent;
+
+                                                       if (!cur) return;
+                                               }
+
+                                               cur = cur->next_sibling;
+                                       }
+
+                                       if (step_push(ns, cur, alloc) & once)
+                                               return;
+                               }
+
+                               break;
+                       }
+
+                       case axis_parent:
+                       {
+                               step_push(ns, p, alloc);
+
+                               break;
+                       }
+
+                       case axis_preceding:
+                       {
+                               // preceding:: axis does not include attribute nodes and attribute ancestors (they are the same as parent's ancestors), so we can reuse node preceding
+                               step_fill(ns, p, alloc, once, v);
+                               break;
+                       }
+                       
+                       default:
+                               assert(!"Unimplemented axis");
+                       }
+               }
+
+               template <class T> void step_fill(xpath_node_set_raw& ns, const xpath_node& xn, xpath_allocator* alloc, bool once, T v)
+               {
+                       const axis_t axis = T::axis;
+                       const bool axis_has_attributes = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_descendant_or_self || axis == axis_following || axis == axis_parent || axis == axis_preceding || axis == axis_self);
+
+                       if (xn.node())
+                               step_fill(ns, xn.node().internal_object(), alloc, once, v);
+                       else if (axis_has_attributes && xn.attribute() && xn.parent())
+                               step_fill(ns, xn.attribute().internal_object(), xn.parent().internal_object(), alloc, once, v);
+               }
+
+               template <class T> xpath_node_set_raw step_do(const xpath_context& c, const xpath_stack& stack, nodeset_eval_t eval, T v)
+               {
+                       const axis_t axis = T::axis;
+                       const bool axis_reverse = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_preceding || axis == axis_preceding_sibling);
+                       const xpath_node_set::type_t axis_type = axis_reverse ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted;
+
+                       bool once =
+                               (axis == axis_attribute && _test == nodetest_name) ||
+                               (!_right && eval_once(axis_type, eval)) ||
+                               (_right && !_right->_next && _right->_test == predicate_constant_one);
+
+                       xpath_node_set_raw ns;
+                       ns.set_type(axis_type);
+
+                       if (_left)
+                       {
+                               xpath_node_set_raw s = _left->eval_node_set(c, stack, nodeset_eval_all);
+
+                               // self axis preserves the original order
+                               if (axis == axis_self) ns.set_type(s.type());
+
+                               for (const xpath_node* it = s.begin(); it != s.end(); ++it)
+                               {
+                                       size_t size = ns.size();
+
+                                       // in general, all axes generate elements in a particular order, but there is no order guarantee if axis is applied to two nodes
+                                       if (axis != axis_self && size != 0) ns.set_type(xpath_node_set::type_unsorted);
+                                       
+                                       step_fill(ns, *it, stack.result, once, v);
+                                       if (_right) apply_predicates(ns, size, stack, eval);
+                               }
+                       }
+                       else
+                       {
+                               step_fill(ns, c.n, stack.result, once, v);
+                               if (_right) apply_predicates(ns, 0, stack, eval);
+                       }
+
+                       // child, attribute and self axes always generate unique set of nodes
+                       // for other axis, if the set stayed sorted, it stayed unique because the traversal algorithms do not visit the same node twice
+                       if (axis != axis_child && axis != axis_attribute && axis != axis_self && ns.type() == xpath_node_set::type_unsorted)
+                               ns.remove_duplicates();
+
+                       return ns;
+               }
+               
+       public:
+               xpath_ast_node(ast_type_t type, xpath_value_type rettype_, const char_t* value):
+                       _type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0)
+               {
+                       assert(type == ast_string_constant);
+                       _data.string = value;
+               }
+
+               xpath_ast_node(ast_type_t type, xpath_value_type rettype_, double value):
+                       _type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0)
+               {
+                       assert(type == ast_number_constant);
+                       _data.number = value;
+               }
+               
+               xpath_ast_node(ast_type_t type, xpath_value_type rettype_, xpath_variable* value):
+                       _type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(0), _right(0), _next(0)
+               {
+                       assert(type == ast_variable);
+                       _data.variable = value;
+               }
+               
+               xpath_ast_node(ast_type_t type, xpath_value_type rettype_, xpath_ast_node* left = 0, xpath_ast_node* right = 0):
+                       _type(static_cast<char>(type)), _rettype(static_cast<char>(rettype_)), _axis(0), _test(0), _left(left), _right(right), _next(0)
+               {
+               }
+
+               xpath_ast_node(ast_type_t type, xpath_ast_node* left, axis_t axis, nodetest_t test, const char_t* contents):
+                       _type(static_cast<char>(type)), _rettype(xpath_type_node_set), _axis(static_cast<char>(axis)), _test(static_cast<char>(test)), _left(left), _right(0), _next(0)
+               {
+                       assert(type == ast_step);
+                       _data.nodetest = contents;
+               }
+
+               xpath_ast_node(ast_type_t type, xpath_ast_node* left, xpath_ast_node* right, predicate_t test):
+                       _type(static_cast<char>(type)), _rettype(xpath_type_node_set), _axis(0), _test(static_cast<char>(test)), _left(left), _right(right), _next(0)
+               {
+                       assert(type == ast_filter || type == ast_predicate);
+               }
+
+               // Replaces the _next link of this node with value.
+               void set_next(xpath_ast_node* value)
+               {
+                       this->_next = value;
+               }
+
+               // Replaces the _right operand of this node with value.
+               void set_right(xpath_ast_node* value)
+               {
+                       this->_right = value;
+               }
+
+               bool eval_boolean(const xpath_context& c, const xpath_stack& stack)
+               {
+                       switch (_type)
+                       {
+                       case ast_op_or:
+                               return _left->eval_boolean(c, stack) || _right->eval_boolean(c, stack);
+                               
+                       case ast_op_and:
+                               return _left->eval_boolean(c, stack) && _right->eval_boolean(c, stack);
+                               
+                       case ast_op_equal:
+                               return compare_eq(_left, _right, c, stack, equal_to());
+
+                       case ast_op_not_equal:
+                               return compare_eq(_left, _right, c, stack, not_equal_to());
+       
+                       case ast_op_less:
+                               return compare_rel(_left, _right, c, stack, less());
+                       
+                       case ast_op_greater:
+                               return compare_rel(_right, _left, c, stack, less());
+
+                       case ast_op_less_or_equal:
+                               return compare_rel(_left, _right, c, stack, less_equal());
+                       
+                       case ast_op_greater_or_equal:
+                               return compare_rel(_right, _left, c, stack, less_equal());
+
+                       case ast_func_starts_with:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_string lr = _left->eval_string(c, stack);
+                               xpath_string rr = _right->eval_string(c, stack);
+
+                               return starts_with(lr.c_str(), rr.c_str());
+                       }
+
+                       case ast_func_contains:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_string lr = _left->eval_string(c, stack);
+                               xpath_string rr = _right->eval_string(c, stack);
+
+                               return find_substring(lr.c_str(), rr.c_str()) != 0;
+                       }
+
+                       case ast_func_boolean:
+                               return _left->eval_boolean(c, stack);
+                               
+                       case ast_func_not:
+                               return !_left->eval_boolean(c, stack);
+                               
+                       case ast_func_true:
+                               return true;
+                               
+                       case ast_func_false:
+                               return false;
+
+                       case ast_func_lang:
+                       {
+                               if (c.n.attribute()) return false;
+                               
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_string lang = _left->eval_string(c, stack);
+                               
+                               for (xml_node n = c.n.node(); n; n = n.parent())
+                               {
+                                       xml_attribute a = n.attribute(PUGIXML_TEXT("xml:lang"));
+                                       
+                                       if (a)
+                                       {
+                                               const char_t* value = a.value();
+                                               
+                                               // strnicmp / strncasecmp is not portable
+                                               for (const char_t* lit = lang.c_str(); *lit; ++lit)
+                                               {
+                                                       if (tolower_ascii(*lit) != tolower_ascii(*value)) return false;
+                                                       ++value;
+                                               }
+                                               
+                                               return *value == 0 || *value == '-';
+                                       }
+                               }
+                               
+                               return false;
+                       }
+
+                       case ast_opt_compare_attribute:
+                       {
+                               const char_t* value = (_right->_type == ast_string_constant) ? _right->_data.string : _right->_data.variable->get_string();
+
+                               xml_attribute attr = c.n.node().attribute(_left->_data.nodetest);
+
+                               return attr && strequal(attr.value(), value) && is_xpath_attribute(attr.name());
+                       }
+
+                       case ast_variable:
+                       {
+                               assert(_rettype == _data.variable->type());
+
+                               if (_rettype == xpath_type_boolean)
+                                       return _data.variable->get_boolean();
+                       }
+                       // fallthrough
+
+                       default:
+                       {
+                               switch (_rettype)
+                               {
+                               case xpath_type_number:
+                                       return convert_number_to_boolean(eval_number(c, stack));
+                                       
+                               case xpath_type_string:
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       return !eval_string(c, stack).empty();
+                               }
+                                       
+                               case xpath_type_node_set:                               
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       return !eval_node_set(c, stack, nodeset_eval_any).empty();
+                               }
+
+                               default:
+                                       assert(!"Wrong expression for return type boolean");
+                                       return false;
+                               }
+                       }
+                       }
+               }
+
+               double eval_number(const xpath_context& c, const xpath_stack& stack)
+               {
+                       switch (_type)
+                       {
+                       case ast_op_add:
+                               return _left->eval_number(c, stack) + _right->eval_number(c, stack);
+                               
+                       case ast_op_subtract:
+                               return _left->eval_number(c, stack) - _right->eval_number(c, stack);
+
+                       case ast_op_multiply:
+                               return _left->eval_number(c, stack) * _right->eval_number(c, stack);
+
+                       case ast_op_divide:
+                               return _left->eval_number(c, stack) / _right->eval_number(c, stack);
+
+                       case ast_op_mod:
+                               return fmod(_left->eval_number(c, stack), _right->eval_number(c, stack));
+
+                       case ast_op_negate:
+                               return -_left->eval_number(c, stack);
+
+                       case ast_number_constant:
+                               return _data.number;
+
+                       case ast_func_last:
+                               return static_cast<double>(c.size);
+                       
+                       case ast_func_position:
+                               return static_cast<double>(c.position);
+
+                       case ast_func_count:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               return static_cast<double>(_left->eval_node_set(c, stack, nodeset_eval_all).size());
+                       }
+                       
+                       case ast_func_string_length_0:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               return static_cast<double>(string_value(c.n, stack.result).length());
+                       }
+                       
+                       case ast_func_string_length_1:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               return static_cast<double>(_left->eval_string(c, stack).length());
+                       }
+                       
+                       case ast_func_number_0:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               return convert_string_to_number(string_value(c.n, stack.result).c_str());
+                       }
+                       
+                       case ast_func_number_1:
+                               return _left->eval_number(c, stack);
+
+                       case ast_func_sum:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               double r = 0;
+                               
+                               xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_all);
+                               
+                               for (const xpath_node* it = ns.begin(); it != ns.end(); ++it)
+                               {
+                                       xpath_allocator_capture cri(stack.result);
+
+                                       r += convert_string_to_number(string_value(*it, stack.result).c_str());
+                               }
+                       
+                               return r;
+                       }
+
+                       case ast_func_floor:
+                       {
+                               double r = _left->eval_number(c, stack);
+                               
+                               return r == r ? floor(r) : r;
+                       }
+
+                       case ast_func_ceiling:
+                       {
+                               double r = _left->eval_number(c, stack);
+                               
+                               return r == r ? ceil(r) : r;
+                       }
+
+                       case ast_func_round:
+                               return round_nearest_nzero(_left->eval_number(c, stack));
+                       
+                       case ast_variable:
+                       {
+                               assert(_rettype == _data.variable->type());
+
+                               if (_rettype == xpath_type_number)
+                                       return _data.variable->get_number();
+                       }
+                       // fallthrough
+
+                       default:
+                       {
+                               switch (_rettype)
+                               {
+                               case xpath_type_boolean:
+                                       return eval_boolean(c, stack) ? 1 : 0;
+                                       
+                               case xpath_type_string:
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       return convert_string_to_number(eval_string(c, stack).c_str());
+                               }
+                                       
+                               case xpath_type_node_set:
+                               {
+                                       xpath_allocator_capture cr(stack.result);
+
+                                       return convert_string_to_number(eval_string(c, stack).c_str());
+                               }
+                                       
+                               default:
+                                       assert(!"Wrong expression for return type number");
+                                       return 0;
+                               }
+                               
+                       }
+                       }
+               }
+               
+               xpath_string eval_string_concat(const xpath_context& c, const xpath_stack& stack)
+               {
+                       assert(_type == ast_func_concat);
+
+                       xpath_allocator_capture ct(stack.temp);
+
+                       // count the string number
+                       size_t count = 1;
+                       for (xpath_ast_node* nc = _right; nc; nc = nc->_next) count++;
+
+                       // gather all strings
+                       xpath_string static_buffer[4];
+                       xpath_string* buffer = static_buffer;
+
+                       // allocate on-heap for large concats
+                       if (count > sizeof(static_buffer) / sizeof(static_buffer[0]))
+                       {
+                               buffer = static_cast<xpath_string*>(stack.temp->allocate(count * sizeof(xpath_string)));
+                               assert(buffer);
+                       }
+
+                       // evaluate all strings to temporary stack
+                       xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                       buffer[0] = _left->eval_string(c, swapped_stack);
+
+                       size_t pos = 1;
+                       for (xpath_ast_node* n = _right; n; n = n->_next, ++pos) buffer[pos] = n->eval_string(c, swapped_stack);
+                       assert(pos == count);
+
+                       // get total length
+                       size_t length = 0;
+                       for (size_t i = 0; i < count; ++i) length += buffer[i].length();
+
+                       // create final string
+                       char_t* result = static_cast<char_t*>(stack.result->allocate((length + 1) * sizeof(char_t)));
+                       assert(result);
+
+                       char_t* ri = result;
+
+                       for (size_t j = 0; j < count; ++j)
+                               for (const char_t* bi = buffer[j].c_str(); *bi; ++bi)
+                                       *ri++ = *bi;
+
+                       *ri = 0;
+
+                       return xpath_string::from_heap_preallocated(result, ri);
+               }
+
+               xpath_string eval_string(const xpath_context& c, const xpath_stack& stack)
+               { // Evaluates this AST node as a string for context c: dispatches on _type, otherwise converts according to _rettype (see default).
+                       switch (_type)
+                       {
+                       case ast_string_constant:
+                               return xpath_string::from_const(_data.string);
+                       // local-name() without an argument: uses the context node c.n
+                       case ast_func_local_name_0:
+                       {
+                               xpath_node na = c.n;
+                               
+                               return xpath_string::from_const(local_name(na));
+                       }
+                       // local-name(node-set): uses the node returned by first() on the evaluated argument
+                       case ast_func_local_name_1:
+                       {
+                               xpath_allocator_capture cr(stack.result); // NOTE(review): presumably rolls stack.result back when cr leaves scope - confirm in xpath_allocator_capture
+
+                               xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first);
+                               xpath_node na = ns.first();
+                               
+                               return xpath_string::from_const(local_name(na));
+                       }
+                       // name() without an argument: qualified name of the context node c.n
+                       case ast_func_name_0:
+                       {
+                               xpath_node na = c.n;
+                               
+                               return xpath_string::from_const(qualified_name(na));
+                       }
+                       // name(node-set): same as above, for the node returned by first() on the evaluated argument
+                       case ast_func_name_1:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first);
+                               xpath_node na = ns.first();
+                               
+                               return xpath_string::from_const(qualified_name(na));
+                       }
+                       // namespace-uri() without an argument: uses the context node c.n
+                       case ast_func_namespace_uri_0:
+                       {
+                               xpath_node na = c.n;
+                               
+                               return xpath_string::from_const(namespace_uri(na));
+                       }
+                       // namespace-uri(node-set): uses the node returned by first() on the evaluated argument
+                       case ast_func_namespace_uri_1:
+                       {
+                               xpath_allocator_capture cr(stack.result);
+
+                               xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first);
+                               xpath_node na = ns.first();
+                               
+                               return xpath_string::from_const(namespace_uri(na));
+                       }
+                       // string() without an argument: string value of the context node
+                       case ast_func_string_0:
+                               return string_value(c.n, stack.result);
+                       // string(expr): delegates to the argument's own string evaluation
+                       case ast_func_string_1:
+                               return _left->eval_string(c, stack);
+
+                       case ast_func_concat:
+                               return eval_string_concat(c, stack);
+                       // substring-before(s, p): part of s in front of the match located by find_substring, or an empty string without a match
+                       case ast_func_substring_before:
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+
+                               xpath_stack swapped_stack = {stack.temp, stack.result}; // operands are evaluated with the two allocators exchanged; NOTE(review): assumes xpath_stack member order is {result, temp}
+
+                               xpath_string s = _left->eval_string(c, swapped_stack);
+                               xpath_string p = _right->eval_string(c, swapped_stack);
+
+                               const char_t* pos = find_substring(s.c_str(), p.c_str());
+                               
+                               return pos ? xpath_string::from_heap(s.c_str(), pos, stack.result) : xpath_string();
+                       }
+                       // substring-after(s, p): part of s following the match located by find_substring, or an empty string without a match
+                       case ast_func_substring_after:
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+
+                               xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                               xpath_string s = _left->eval_string(c, swapped_stack);
+                               xpath_string p = _right->eval_string(c, swapped_stack);
+                               
+                               const char_t* pos = find_substring(s.c_str(), p.c_str());
+                               if (!pos) return xpath_string();
+
+                               const char_t* rbegin = pos + p.length(); // result starts right after the matched pattern
+                               const char_t* rend = s.c_str() + s.length();
+                               // when s reports heap storage the tail is re-created via stack.result; otherwise the pointer into s is returned as a constant string
+                               return s.uses_heap() ? xpath_string::from_heap(rbegin, rend, stack.result) : xpath_string::from_const(rbegin);
+                       }
+                       // substring(s, first): tail of s starting at the 1-based position 'first'
+                       case ast_func_substring_2:
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+
+                               xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                               xpath_string s = _left->eval_string(c, swapped_stack);
+                               size_t s_length = s.length();
+
+                               double first = round_nearest(_right->eval_number(c, stack));
+                               
+                               if (is_nan(first)) return xpath_string(); // NaN
+                               else if (first >= s_length + 1) return xpath_string(); // start lies past the end of s
+                               
+                               size_t pos = first < 1 ? 1 : static_cast<size_t>(first); // start positions below 1 are clamped to 1
+                               assert(1 <= pos && pos <= s_length + 1);
+
+                               const char_t* rbegin = s.c_str() + (pos - 1); // convert the 1-based position to a 0-based offset
+                               const char_t* rend = s.c_str() + s.length();
+                               
+                               return s.uses_heap() ? xpath_string::from_heap(rbegin, rend, stack.result) : xpath_string::from_const(rbegin);
+                       }
+                       // substring(s, first, length): the third argument is reached through _right->_next
+                       case ast_func_substring_3:
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+
+                               xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                               xpath_string s = _left->eval_string(c, swapped_stack);
+                               size_t s_length = s.length();
+
+                               double first = round_nearest(_right->eval_number(c, stack));
+                               double last = first + round_nearest(_right->_next->eval_number(c, stack)); // end position = rounded start + rounded length
+                               
+                               if (is_nan(first) || is_nan(last)) return xpath_string();
+                               else if (first >= s_length + 1) return xpath_string();
+                               else if (first >= last) return xpath_string();
+                               else if (last < 1) return xpath_string();
+                               // clamp both positions into [1, s_length + 1] before converting them to offsets
+                               size_t pos = first < 1 ? 1 : static_cast<size_t>(first);
+                               size_t end = last >= s_length + 1 ? s_length + 1 : static_cast<size_t>(last);
+
+                               assert(1 <= pos && pos <= end && end <= s_length + 1);
+                               const char_t* rbegin = s.c_str() + (pos - 1);
+                               const char_t* rend = s.c_str() + (end - 1);
+                               // a constant-string result is only possible when the range runs to the end of s and s does not report heap storage
+                               return (end == s_length + 1 && !s.uses_heap()) ? xpath_string::from_const(rbegin) : xpath_string::from_heap(rbegin, rend, stack.result);
+                       }
+                       // normalize-space() without an argument: works on the string value of the context node
+                       case ast_func_normalize_space_0:
+                       {
+                               xpath_string s = string_value(c.n, stack.result);
+
+                               normalize_space(s.data(stack.result)); // NOTE(review): s.data() presumably yields a writable buffer that normalize_space edits in place - confirm
+
+                               return s;
+                       }
+                       // normalize-space(expr): works on the evaluated argument
+                       case ast_func_normalize_space_1:
+                       {
+                               xpath_string s = _left->eval_string(c, stack);
+
+                               normalize_space(s.data(stack.result));
+                       
+                               return s;
+                       }
+                       // translate(s, from, to): s uses the regular stack (it is what gets returned); from/to use the swapped stack
+                       case ast_func_translate:
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+
+                               xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                               xpath_string s = _left->eval_string(c, stack);
+                               xpath_string from = _right->eval_string(c, swapped_stack);
+                               xpath_string to = _right->_next->eval_string(c, swapped_stack);
+
+                               translate(s.data(stack.result), from.c_str(), to.c_str(), to.length());
+
+                               return s;
+                       }
+                       // translate() variant that uses the table stored in _data.table (optimize() installs this node type)
+                       case ast_opt_translate_table:
+                       {
+                               xpath_string s = _left->eval_string(c, stack);
+
+                               translate_table(s.data(stack.result), _data.table);
+
+                               return s;
+                       }
+                       // variable reference: string variables are returned directly, other types reach the conversion in default
+                       case ast_variable:
+                       {
+                               assert(_rettype == _data.variable->type());
+
+                               if (_rettype == xpath_type_string)
+                                       return xpath_string::from_const(_data.variable->get_string());
+                       }
+                       // fallthrough
+
+                       default:
+                       {
+                               switch (_rettype) // convert a non-string result to a string based on the node's return type
+                               {
+                               case xpath_type_boolean:
+                                       return xpath_string::from_const(eval_boolean(c, stack) ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false"));
+                                       
+                               case xpath_type_number:
+                                       return convert_number_to_string(eval_number(c, stack), stack.result);
+                                       
+                               case xpath_type_node_set:
+                               {
+                                       xpath_allocator_capture cr(stack.temp);
+
+                                       xpath_stack swapped_stack = {stack.temp, stack.result};
+
+                                       xpath_node_set_raw ns = eval_node_set(c, swapped_stack, nodeset_eval_first);
+                                       return ns.empty() ? xpath_string() : string_value(ns.first(), stack.result); // empty set converts to an empty string
+                               }
+                               
+                               default:
+                                       assert(!"Wrong expression for return type string");
+                                       return xpath_string();
+                               }
+                       }
+                       }
+               }
+
+               xpath_node_set_raw eval_node_set(const xpath_context& c, const xpath_stack& stack, nodeset_eval_t eval)
+               { // Evaluates this AST node as a node set for context c; 'eval' is forwarded to sub-evaluations and to step_do/eval_once.
+                       switch (_type)
+                       {
+                       case ast_op_union:
+                       {
+                               xpath_allocator_capture cr(stack.temp);
+
+                               xpath_stack swapped_stack = {stack.temp, stack.result}; // NOTE(review): assumes xpath_stack member order is {result, temp}
+
+                               xpath_node_set_raw ls = _left->eval_node_set(c, swapped_stack, eval); // left operand: evaluated with the swapped stack
+                               xpath_node_set_raw rs = _right->eval_node_set(c, stack, eval); // right operand: evaluated with the regular stack, becomes the return value
+
+                               // we can optimize merging two sorted sets, but this is a very rare operation, so don't bother
+                               rs.set_type(xpath_node_set::type_unsorted);
+
+                               rs.append(ls.begin(), ls.end(), stack.result);
+                               rs.remove_duplicates();
+
+                               return rs;
+                       }
+
+                       case ast_filter:
+                       { // a predicate classified as predicate_constant_one only requests nodeset_eval_first from the filtered expression
+                               xpath_node_set_raw set = _left->eval_node_set(c, stack, _test == predicate_constant_one ? nodeset_eval_first : nodeset_eval_all);
+
+                               // either expression is a number or it contains position() call; sort by document order
+                               if (_test != predicate_posinv) set.sort_do();
+
+                               bool once = eval_once(set.type(), eval);
+
+                               apply_predicate(set, 0, stack, once);
+                       
+                               return set;
+                       }
+                       // id(): this implementation always yields an empty node set
+                       case ast_func_id:
+                               return xpath_node_set_raw();
+                       // location step: select the step_do instantiation matching the runtime _axis value
+                       case ast_step:
+                       {
+                               switch (_axis)
+                               {
+                               case axis_ancestor:
+                                       return step_do(c, stack, eval, axis_to_type<axis_ancestor>());
+                                       
+                               case axis_ancestor_or_self:
+                                       return step_do(c, stack, eval, axis_to_type<axis_ancestor_or_self>());
+
+                               case axis_attribute:
+                                       return step_do(c, stack, eval, axis_to_type<axis_attribute>());
+
+                               case axis_child:
+                                       return step_do(c, stack, eval, axis_to_type<axis_child>());
+                               
+                               case axis_descendant:
+                                       return step_do(c, stack, eval, axis_to_type<axis_descendant>());
+
+                               case axis_descendant_or_self:
+                                       return step_do(c, stack, eval, axis_to_type<axis_descendant_or_self>());
+
+                               case axis_following:
+                                       return step_do(c, stack, eval, axis_to_type<axis_following>());
+                               
+                               case axis_following_sibling:
+                                       return step_do(c, stack, eval, axis_to_type<axis_following_sibling>());
+                               
+                               case axis_namespace:
+                                       // namespaced axis is not supported
+                                       return xpath_node_set_raw();
+                               
+                               case axis_parent:
+                                       return step_do(c, stack, eval, axis_to_type<axis_parent>());
+                               
+                               case axis_preceding:
+                                       return step_do(c, stack, eval, axis_to_type<axis_preceding>());
+
+                               case axis_preceding_sibling:
+                                       return step_do(c, stack, eval, axis_to_type<axis_preceding_sibling>());
+                               
+                               case axis_self:
+                                       return step_do(c, stack, eval, axis_to_type<axis_self>());
+
+                               default:
+                                       assert(!"Unknown axis");
+                                       return xpath_node_set_raw();
+                               }
+                       }
+                       // root step: the result holds at most one node, the root reached from the context node
+                       case ast_step_root:
+                       {
+                               assert(!_right); // root step can't have any predicates
+
+                               xpath_node_set_raw ns;
+
+                               ns.set_type(xpath_node_set::type_sorted);
+                               // element context: root of the node itself; attribute context: root of the attribute's parent; otherwise the set stays empty
+                               if (c.n.node()) ns.push_back(c.n.node().root(), stack.result);
+                               else if (c.n.attribute()) ns.push_back(c.n.parent().root(), stack.result);
+
+                               return ns;
+                       }
+                       // variable reference: node-set variables are copied into a fresh raw set, other types reach the assert in default
+                       case ast_variable:
+                       {
+                               assert(_rettype == _data.variable->type());
+
+                               if (_rettype == xpath_type_node_set)
+                               {
+                                       const xpath_node_set& s = _data.variable->get_node_set();
+
+                                       xpath_node_set_raw ns;
+
+                                       ns.set_type(s.type()); // keep the sortedness recorded on the variable's set
+                                       ns.append(s.begin(), s.end(), stack.result);
+
+                                       return ns;
+                               }
+                       }
+                       // fallthrough
+
+                       default:
+                               assert(!"Wrong expression for return type node set");
+                               return xpath_node_set_raw();
+                       }
+               }
+
+               void optimize(xpath_allocator* alloc)
+               { // Rewrites this subtree in place; children and _next siblings are processed first. alloc is only handed to translate_table_generate.
+                       if (_left) _left->optimize(alloc);
+                       if (_right) _right->optimize(alloc);
+                       if (_next) _next->optimize(alloc);
+
+                       // Rewrite [position()=expr] with [expr]
+                       // Note that this step has to go before classification to recognize [position()=1]
+                       if ((_type == ast_filter || _type == ast_predicate) &&
+                               _right->_type == ast_op_equal && _right->_left->_type == ast_func_position && _right->_right->_rettype == xpath_type_number)
+                       {
+                               _right = _right->_right; // keep only the numeric right-hand side of the comparison
+                       }
+
+                       // Classify filter/predicate ops to perform various optimizations during evaluation
+                       if (_type == ast_filter || _type == ast_predicate)
+                       {
+                               assert(_test == predicate_default);
+                               // _test keeps predicate_default when none of the three cases below applies
+                               if (_right->_type == ast_number_constant && _right->_data.number == 1.0)
+                                       _test = predicate_constant_one;
+                               else if (_right->_rettype == xpath_type_number && (_right->_type == ast_number_constant || _right->_type == ast_variable || _right->_type == ast_func_last))
+                                       _test = predicate_constant;
+                               else if (_right->_rettype != xpath_type_number && _right->is_posinv_expr())
+                                       _test = predicate_posinv;
+                       }
+
+                       // Rewrite descendant-or-self::node()/child::foo with descendant::foo
+                       // The former is a full form of //foo, the latter is much faster since it executes the node test immediately
+                       // Do a similar kind of rewrite for self/descendant/descendant-or-self axes
+                       // Note that we only rewrite positionally invariant steps (//foo[1] != /descendant::foo[1])
+                       if (_type == ast_step && (_axis == axis_child || _axis == axis_self || _axis == axis_descendant || _axis == axis_descendant_or_self) && _left &&
+                               _left->_type == ast_step && _left->_axis == axis_descendant_or_self && _left->_test == nodetest_type_node && !_left->_right &&
+                               is_posinv_step())
+                       {
+                               if (_axis == axis_child || _axis == axis_descendant)
+                                       _axis = axis_descendant;
+                               else
+                                       _axis = axis_descendant_or_self;
+
+                               _left = _left->_left; // unlink the descendant-or-self::node() step that was merged into this one
+                       }
+
+                       // Use optimized lookup table implementation for translate() with constant arguments
+                       if (_type == ast_func_translate && _right->_type == ast_string_constant && _right->_next->_type == ast_string_constant)
+                       {
+                               unsigned char* table = translate_table_generate(alloc, _right->_data.string, _right->_next->_data.string);
+
+                               if (table) // a null table leaves the node as ast_func_translate
+                               {
+                                       _type = ast_opt_translate_table;
+                                       _data.table = table;
+                               }
+                       }
+
+                       // Use optimized path for @attr = 'value' or @attr = $value
+                       if (_type == ast_op_equal &&
+                               _left->_type == ast_step && _left->_axis == axis_attribute && _left->_test == nodetest_name && !_left->_left && !_left->_right &&
+                               (_right->_type == ast_string_constant || (_right->_type == ast_variable && _right->_rettype == xpath_type_string)))
+                       {
+                               _type = ast_opt_compare_attribute;
+                       }
+               }
+               
+               bool is_posinv_expr() const
+               { // Returns false if this expression is, or contains via _left / the _right->_next chain, a position() or last() call.
+                       switch (_type)
+                       {
+                       case ast_func_position:
+                       case ast_func_last:
+                               return false;
+                       // constants and variable references are always treated as invariant
+                       case ast_string_constant:
+                       case ast_number_constant:
+                       case ast_variable:
+                               return true;
+                       // steps are treated as invariant without looking at their children
+                       case ast_step:
+                       case ast_step_root:
+                               return true;
+                       // likewise for predicates and filters
+                       case ast_predicate:
+                       case ast_filter:
+                               return true;
+
+                       default: // any other node is invariant only if all of its operands are
+                               if (_left && !_left->is_posinv_expr()) return false;
+                               
+                               for (xpath_ast_node* n = _right; n; n = n->_next)
+                                       if (!n->is_posinv_expr()) return false;
+                                       
+                               return true;
+                       }
+               }
+
+               bool is_posinv_step() const
+               { // Returns true when every predicate chained from _right via _next has _test == predicate_posinv (also true with no predicates).
+                       assert(_type == ast_step);
+
+                       for (xpath_ast_node* n = _right; n; n = n->_next)
+                       {
+                               assert(n->_type == ast_predicate);
+
+                               if (n->_test != predicate_posinv)
+                                       return false;
+                       }
+
+                       return true;
+               }
+
+               xpath_value_type rettype() const
+               { // Returns the stored _rettype converted to xpath_value_type.
+                       return static_cast<xpath_value_type>(_rettype);
+               }
+       };
+
+       struct xpath_parser
+       {
+               xpath_allocator* _alloc; // allocator used by alloc_node() and alloc_string()
+               xpath_lexer _lexer; // its current_pos() supplies the error offset in throw_error()
+
+               const char_t* _query; // start of the query text; error offsets are computed relative to it
+               xpath_variable_set* _variables;
+
+               xpath_parse_result* _result; // receives 'error' and 'offset' in throw_error()
+
+               char_t _scratch[32];
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               jmp_buf _error_handler; // longjmp target used by throw_error() in place of a C++ exception
+       #endif
+
+               void throw_error(const char* message)
+               { // Stores message and the current lexer offset in *_result, then leaves the parser: longjmp without exceptions, xpath_exception otherwise.
+                       _result->error = message;
+                       _result->offset = _lexer.current_pos() - _query; // offset of the lexer position from the start of the query
+
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       longjmp(_error_handler, 1);
+               #else
+                       throw xpath_exception(*_result);
+               #endif
+               }
+
+               void throw_error_oom()
+               { // Reports allocation failure: via throw_error("Out of memory") when PUGIXML_NO_EXCEPTIONS is defined, via std::bad_alloc otherwise.
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       throw_error("Out of memory");
+               #else
+                       throw std::bad_alloc();
+               #endif
+               }
+
+               void* alloc_node()
+               { // Returns raw storage for one xpath_ast_node from _alloc; parse_function() placement-news nodes into it.
+                       void* result = _alloc->allocate_nothrow(sizeof(xpath_ast_node));
+
+                       if (!result) throw_error_oom(); // allocation failure is reported through throw_error_oom()
+
+                       return result;
+               }
+
+               const char_t* alloc_string(const xpath_lexer_string& value)
+               { // Copies the lexer string [value.begin, value.end) into a null-terminated buffer from _alloc; returns 0 when value.begin is null.
+                       if (value.begin)
+                       {
+                               size_t length = static_cast<size_t>(value.end - value.begin);
+                               // one extra char_t is reserved for the terminating zero
+                               char_t* c = static_cast<char_t*>(_alloc->allocate_nothrow((length + 1) * sizeof(char_t)));
+                               if (!c) throw_error_oom();
+                               assert(c); // workaround for clang static analysis
+
+                               memcpy(c, value.begin, length * sizeof(char_t));
+                               c[length] = 0;
+
+                               return c;
+                       }
+                       else return 0;
+               }
+
+               xpath_ast_node* parse_function_helper(ast_type_t type0, ast_type_t type1, size_t argc, xpath_ast_node* args[2])
+               { // Builds a string-typed function node: type0 when called with no argument, type1 with one (which must be a node set).
+                       assert(argc <= 1);
+
+                       if (argc == 1 && args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
+                       // NOTE(review): args[0] is passed even when argc == 0 - presumably the caller leaves it null in that case; confirm at the call site
+                       return new (alloc_node()) xpath_ast_node(argc == 0 ? type0 : type1, xpath_type_string, args[0]);
+               }
+
+               xpath_ast_node* parse_function(const xpath_lexer_string& name, size_t argc, xpath_ast_node* args[2])
+               { // Maps a function name plus argument count to a new AST node; any unmatched combination ends in throw_error below.
+                       switch (name.begin[0]) // narrow the candidates by first character before comparing full names
+                       {
+                       case 'b':
+                               if (name == PUGIXML_TEXT("boolean") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_boolean, xpath_type_boolean, args[0]);
+                                       
+                               break;
+                       
+                       case 'c':
+                               if (name == PUGIXML_TEXT("count") && argc == 1)
+                               {
+                                       if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
+                                       return new (alloc_node()) xpath_ast_node(ast_func_count, xpath_type_number, args[0]);
+                               }
+                               else if (name == PUGIXML_TEXT("contains") && argc == 2)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_contains, xpath_type_boolean, args[0], args[1]);
+                               else if (name == PUGIXML_TEXT("concat") && argc >= 2) // NOTE(review): arguments past the second are presumably linked from args[1] via _next by the caller - confirm
+                                       return new (alloc_node()) xpath_ast_node(ast_func_concat, xpath_type_string, args[0], args[1]);
+                               else if (name == PUGIXML_TEXT("ceiling") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_ceiling, xpath_type_number, args[0]);
+                                       
+                               break;
+                       
+                       case 'f':
+                               if (name == PUGIXML_TEXT("false") && argc == 0)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_false, xpath_type_boolean);
+                               else if (name == PUGIXML_TEXT("floor") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_floor, xpath_type_number, args[0]);
+                                       
+                               break;
+                       
+                       case 'i':
+                               if (name == PUGIXML_TEXT("id") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_id, xpath_type_node_set, args[0]);
+                                       
+                               break;
+                       
+                       case 'l':
+                               if (name == PUGIXML_TEXT("last") && argc == 0)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_last, xpath_type_number);
+                               else if (name == PUGIXML_TEXT("lang") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_lang, xpath_type_boolean, args[0]);
+                               else if (name == PUGIXML_TEXT("local-name") && argc <= 1)
+                                       return parse_function_helper(ast_func_local_name_0, ast_func_local_name_1, argc, args);
+                       
+                               break;
+                       
+                       case 'n':
+                               if (name == PUGIXML_TEXT("name") && argc <= 1)
+                                       return parse_function_helper(ast_func_name_0, ast_func_name_1, argc, args);
+                               else if (name == PUGIXML_TEXT("namespace-uri") && argc <= 1)
+                                       return parse_function_helper(ast_func_namespace_uri_0, ast_func_namespace_uri_1, argc, args);
+                               else if (name == PUGIXML_TEXT("normalize-space") && argc <= 1) // unlike the helper-based cases above, the argument type is not checked here
+                                       return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_normalize_space_0 : ast_func_normalize_space_1, xpath_type_string, args[0], args[1]);
+                               else if (name == PUGIXML_TEXT("not") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_not, xpath_type_boolean, args[0]);
+                               else if (name == PUGIXML_TEXT("number") && argc <= 1)
+                                       return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_number_0 : ast_func_number_1, xpath_type_number, args[0]);
+                       
+                               break;
+                       
+                       case 'p':
+                               if (name == PUGIXML_TEXT("position") && argc == 0)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_position, xpath_type_number);
+                               
+                               break;
+                       
+                       case 'r':
+                               if (name == PUGIXML_TEXT("round") && argc == 1)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_round, xpath_type_number, args[0]);
+
+                               break;
+                       
+                       case 's':
+                               if (name == PUGIXML_TEXT("string") && argc <= 1)
+                                       return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_0 : ast_func_string_1, xpath_type_string, args[0]);
+                               else if (name == PUGIXML_TEXT("string-length") && argc <= 1)
+                                       return new (alloc_node()) xpath_ast_node(argc == 0 ? ast_func_string_length_0 : ast_func_string_length_1, xpath_type_number, args[0]);
+                               else if (name == PUGIXML_TEXT("starts-with") && argc == 2)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_starts_with, xpath_type_boolean, args[0], args[1]);
+                               else if (name == PUGIXML_TEXT("substring-before") && argc == 2)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_substring_before, xpath_type_string, args[0], args[1]);
+                               else if (name == PUGIXML_TEXT("substring-after") && argc == 2)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_substring_after, xpath_type_string, args[0], args[1]);
+                               else if (name == PUGIXML_TEXT("substring") && (argc == 2 || argc == 3)) // the evaluator reads a third argument through _right->_next
+                                       return new (alloc_node()) xpath_ast_node(argc == 2 ? ast_func_substring_2 : ast_func_substring_3, xpath_type_string, args[0], args[1]);
+                               else if (name == PUGIXML_TEXT("sum") && argc == 1)
+                               {
+                                       if (args[0]->rettype() != xpath_type_node_set) throw_error("Function has to be applied to node set");
+                                       return new (alloc_node()) xpath_ast_node(ast_func_sum, xpath_type_number, args[0]);
+                               }
+
+                               break;
+                       
+                       case 't':
+                               if (name == PUGIXML_TEXT("translate") && argc == 3) // the evaluator reads the third argument through _right->_next
+                                       return new (alloc_node()) xpath_ast_node(ast_func_translate, xpath_type_string, args[0], args[1]);
+                               else if (name == PUGIXML_TEXT("true") && argc == 0)
+                                       return new (alloc_node()) xpath_ast_node(ast_func_true, xpath_type_boolean);
+                                       
+                               break;
+
+                       default:
+                               break;
+                       }
+                       // reached when the name is unknown or the argument count does not fit
+                       throw_error("Unrecognized function or wrong parameter count");
+
+                       return 0;
+               }
+
+               axis_t parse_axis_name(const xpath_lexer_string& name, bool& specified)
+               {
+                       specified = true;
+
+                       switch (name.begin[0])
+                       {
+                       case 'a':
+                               if (name == PUGIXML_TEXT("ancestor"))
+                                       return axis_ancestor;
+                               else if (name == PUGIXML_TEXT("ancestor-or-self"))
+                                       return axis_ancestor_or_self;
+                               else if (name == PUGIXML_TEXT("attribute"))
+                                       return axis_attribute;
+                               
+                               break;
+                       
+                       case 'c':
+                               if (name == PUGIXML_TEXT("child"))
+                                       return axis_child;
+                               
+                               break;
+                       
+                       case 'd':
+                               if (name == PUGIXML_TEXT("descendant"))
+                                       return axis_descendant;
+                               else if (name == PUGIXML_TEXT("descendant-or-self"))
+                                       return axis_descendant_or_self;
+                               
+                               break;
+                       
+                       case 'f':
+                               if (name == PUGIXML_TEXT("following"))
+                                       return axis_following;
+                               else if (name == PUGIXML_TEXT("following-sibling"))
+                                       return axis_following_sibling;
+                               
+                               break;
+                       
+                       case 'n':
+                               if (name == PUGIXML_TEXT("namespace"))
+                                       return axis_namespace;
+                               
+                               break;
+                       
+                       case 'p':
+                               if (name == PUGIXML_TEXT("parent"))
+                                       return axis_parent;
+                               else if (name == PUGIXML_TEXT("preceding"))
+                                       return axis_preceding;
+                               else if (name == PUGIXML_TEXT("preceding-sibling"))
+                                       return axis_preceding_sibling;
+                               
+                               break;
+                       
+                       case 's':
+                               if (name == PUGIXML_TEXT("self"))
+                                       return axis_self;
+                               
+                               break;
+
+                       default:
+                               break;
+                       }
+
+                       specified = false;
+                       return axis_child;
+               }
+
+               nodetest_t parse_node_test_type(const xpath_lexer_string& name)
+               {
+                       switch (name.begin[0])
+                       {
+                       case 'c':
+                               if (name == PUGIXML_TEXT("comment"))
+                                       return nodetest_type_comment;
+
+                               break;
+
+                       case 'n':
+                               if (name == PUGIXML_TEXT("node"))
+                                       return nodetest_type_node;
+
+                               break;
+
+                       case 'p':
+                               if (name == PUGIXML_TEXT("processing-instruction"))
+                                       return nodetest_type_pi;
+
+                               break;
+
+                       case 't':
+                               if (name == PUGIXML_TEXT("text"))
+                                       return nodetest_type_text;
+
+                               break;
+                       
+                       default:
+                               break;
+                       }
+
+                       return nodetest_none;
+               }
+
+               // PrimaryExpr ::= VariableReference | '(' Expr ')' | Literal | Number | FunctionCall
+               xpath_ast_node* parse_primary_expression()
+               {
+                       switch (_lexer.current())
+                       {
+                       case lex_var_ref:
+                       {
+                               xpath_lexer_string name = _lexer.contents();
+
+                               if (!_variables)
+                                       throw_error("Unknown variable: variable set is not provided");
+
+                               xpath_variable* var = get_variable_scratch(_scratch, _variables, name.begin, name.end);
+
+                               if (!var)
+                                       throw_error("Unknown variable: variable set does not contain the given name");
+
+                               _lexer.next();
+
+                               return new (alloc_node()) xpath_ast_node(ast_variable, var->type(), var);
+                       }
+
+                       case lex_open_brace:
+                       {
+                               _lexer.next();
+
+                               xpath_ast_node* n = parse_expression();
+
+                               if (_lexer.current() != lex_close_brace)
+                                       throw_error("Unmatched braces");
+
+                               _lexer.next();
+
+                               return n;
+                       }
+
+                       case lex_quoted_string:
+                       {
+                               const char_t* value = alloc_string(_lexer.contents());
+
+                               xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_string_constant, xpath_type_string, value);
+                               _lexer.next();
+
+                               return n;
+                       }
+
+                       case lex_number:
+                       {
+                               double value = 0;
+
+                               if (!convert_string_to_number_scratch(_scratch, _lexer.contents().begin, _lexer.contents().end, &value))
+                                       throw_error_oom();
+
+                               xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_number_constant, xpath_type_number, value);
+                               _lexer.next();
+
+                               return n;
+                       }
+
+                       case lex_string:
+                       {
+                               xpath_ast_node* args[2] = {0};
+                               size_t argc = 0;
+                               
+                               xpath_lexer_string function = _lexer.contents();
+                               _lexer.next();
+                               
+                               xpath_ast_node* last_arg = 0;
+                               
+                               if (_lexer.current() != lex_open_brace)
+                                       throw_error("Unrecognized function call");
+                               _lexer.next();
+
+                               if (_lexer.current() != lex_close_brace)
+                                       args[argc++] = parse_expression();
+
+                               while (_lexer.current() != lex_close_brace)
+                               {
+                                       if (_lexer.current() != lex_comma)
+                                               throw_error("No comma between function arguments");
+                                       _lexer.next();
+                                       
+                                       xpath_ast_node* n = parse_expression();
+                                       
+                                       if (argc < 2) args[argc] = n;
+                                       else last_arg->set_next(n);
+
+                                       argc++;
+                                       last_arg = n;
+                               }
+                               
+                               _lexer.next();
+
+                               return parse_function(function, argc, args);
+                       }
+
+                       default:
+                               throw_error("Unrecognizable primary expression");
+
+                               return 0;
+                       }
+               }
+               
+               // FilterExpr ::= PrimaryExpr | FilterExpr Predicate
+               // Predicate ::= '[' PredicateExpr ']'
+               // PredicateExpr ::= Expr
+               xpath_ast_node* parse_filter_expression()
+               {
+                       xpath_ast_node* n = parse_primary_expression();
+
+                       while (_lexer.current() == lex_open_square_brace)
+                       {
+                               _lexer.next();
+
+                               xpath_ast_node* expr = parse_expression();
+
+                               if (n->rettype() != xpath_type_node_set) throw_error("Predicate has to be applied to node set");
+
+                               n = new (alloc_node()) xpath_ast_node(ast_filter, n, expr, predicate_default);
+
+                               if (_lexer.current() != lex_close_square_brace)
+                                       throw_error("Unmatched square brace");
+                       
+                               _lexer.next();
+                       }
+                       
+                       return n;
+               }
+               
+               // Step ::= AxisSpecifier NodeTest Predicate* | AbbreviatedStep
+               // AxisSpecifier ::= AxisName '::' | '@'?
+               // NodeTest ::= NameTest | NodeType '(' ')' | 'processing-instruction' '(' Literal ')'
+               // NameTest ::= '*' | NCName ':' '*' | QName
+               // AbbreviatedStep ::= '.' | '..'
+               xpath_ast_node* parse_step(xpath_ast_node* set)
+               {
+                       if (set && set->rettype() != xpath_type_node_set)
+                               throw_error("Step has to be applied to node set");
+
+                       bool axis_specified = false;
+                       axis_t axis = axis_child; // implied child axis
+
+                       if (_lexer.current() == lex_axis_attribute)
+                       {
+                               axis = axis_attribute;
+                               axis_specified = true;
+                               
+                               _lexer.next();
+                       }
+                       else if (_lexer.current() == lex_dot)
+                       {
+                               _lexer.next();
+                               
+                               return new (alloc_node()) xpath_ast_node(ast_step, set, axis_self, nodetest_type_node, 0);
+                       }
+                       else if (_lexer.current() == lex_double_dot)
+                       {
+                               _lexer.next();
+                               
+                               return new (alloc_node()) xpath_ast_node(ast_step, set, axis_parent, nodetest_type_node, 0);
+                       }
+               
+                       nodetest_t nt_type = nodetest_none;
+                       xpath_lexer_string nt_name;
+                       
+                       if (_lexer.current() == lex_string)
+                       {
+                               // node name test
+                               nt_name = _lexer.contents();
+                               _lexer.next();
+
+                               // was it an axis name?
+                               if (_lexer.current() == lex_double_colon)
+                               {
+                                       // parse axis name
+                                       if (axis_specified) throw_error("Two axis specifiers in one step");
+
+                                       axis = parse_axis_name(nt_name, axis_specified);
+
+                                       if (!axis_specified) throw_error("Unknown axis");
+
+                                       // read actual node test
+                                       _lexer.next();
+
+                                       if (_lexer.current() == lex_multiply)
+                                       {
+                                               nt_type = nodetest_all;
+                                               nt_name = xpath_lexer_string();
+                                               _lexer.next();
+                                       }
+                                       else if (_lexer.current() == lex_string)
+                                       {
+                                               nt_name = _lexer.contents();
+                                               _lexer.next();
+                                       }
+                                       else throw_error("Unrecognized node test");
+                               }
+                               
+                               if (nt_type == nodetest_none)
+                               {
+                                       // node type test or processing-instruction
+                                       if (_lexer.current() == lex_open_brace)
+                                       {
+                                               _lexer.next();
+                                               
+                                               if (_lexer.current() == lex_close_brace)
+                                               {
+                                                       _lexer.next();
+
+                                                       nt_type = parse_node_test_type(nt_name);
+
+                                                       if (nt_type == nodetest_none) throw_error("Unrecognized node type");
+                                                       
+                                                       nt_name = xpath_lexer_string();
+                                               }
+                                               else if (nt_name == PUGIXML_TEXT("processing-instruction"))
+                                               {
+                                                       if (_lexer.current() != lex_quoted_string)
+                                                               throw_error("Only literals are allowed as arguments to processing-instruction()");
+                                               
+                                                       nt_type = nodetest_pi;
+                                                       nt_name = _lexer.contents();
+                                                       _lexer.next();
+                                                       
+                                                       if (_lexer.current() != lex_close_brace)
+                                                               throw_error("Unmatched brace near processing-instruction()");
+                                                       _lexer.next();
+                                               }
+                                               else
+                                                       throw_error("Unmatched brace near node type test");
+
+                                       }
+                                       // QName or NCName:*
+                                       else
+                                       {
+                                               if (nt_name.end - nt_name.begin > 2 && nt_name.end[-2] == ':' && nt_name.end[-1] == '*') // NCName:*
+                                               {
+                                                       nt_name.end--; // erase *
+                                                       
+                                                       nt_type = nodetest_all_in_namespace;
+                                               }
+                                               else nt_type = nodetest_name;
+                                       }
+                               }
+                       }
+                       else if (_lexer.current() == lex_multiply)
+                       {
+                               nt_type = nodetest_all;
+                               _lexer.next();
+                       }
+                       else throw_error("Unrecognized node test");
+                       
+                       xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step, set, axis, nt_type, alloc_string(nt_name));
+                       
+                       xpath_ast_node* last = 0;
+                       
+                       while (_lexer.current() == lex_open_square_brace)
+                       {
+                               _lexer.next();
+                               
+                               xpath_ast_node* expr = parse_expression();
+
+                               xpath_ast_node* pred = new (alloc_node()) xpath_ast_node(ast_predicate, 0, expr, predicate_default);
+                               
+                               if (_lexer.current() != lex_close_square_brace)
+                                       throw_error("Unmatched square brace");
+                               _lexer.next();
+                               
+                               if (last) last->set_next(pred);
+                               else n->set_right(pred);
+                               
+                               last = pred;
+                       }
+
+                       return n;
+               }
+               
+               // RelativeLocationPath ::= Step | RelativeLocationPath '/' Step | RelativeLocationPath '//' Step
+               xpath_ast_node* parse_relative_location_path(xpath_ast_node* set)
+               {
+                       xpath_ast_node* n = parse_step(set);
+                       
+                       while (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash)
+                       {
+                               lexeme_t l = _lexer.current();
+                               _lexer.next();
+
+                               if (l == lex_double_slash)
+                                       n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+                               
+                               n = parse_step(n);
+                       }
+                       
+                       return n;
+               }
+               
+               // LocationPath ::= RelativeLocationPath | AbsoluteLocationPath
+               // AbsoluteLocationPath ::= '/' RelativeLocationPath? | '//' RelativeLocationPath
+               xpath_ast_node* parse_location_path()
+               {
+                       if (_lexer.current() == lex_slash)
+                       {
+                               _lexer.next();
+                               
+                               xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set);
+
+                               // relative location path can start from axis_attribute, dot, double_dot, multiply and string lexemes; any other lexeme means standalone root path
+                               lexeme_t l = _lexer.current();
+
+                               if (l == lex_string || l == lex_axis_attribute || l == lex_dot || l == lex_double_dot || l == lex_multiply)
+                                       return parse_relative_location_path(n);
+                               else
+                                       return n;
+                       }
+                       else if (_lexer.current() == lex_double_slash)
+                       {
+                               _lexer.next();
+                               
+                               xpath_ast_node* n = new (alloc_node()) xpath_ast_node(ast_step_root, xpath_type_node_set);
+                               n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+                               
+                               return parse_relative_location_path(n);
+                       }
+
+                       // else clause moved outside of if because of bogus warning 'control may reach end of non-void function being inlined' in gcc 4.0.1
+                       return parse_relative_location_path(0);
+               }
+               
+               // PathExpr ::= LocationPath
+               //                              | FilterExpr
+               //                              | FilterExpr '/' RelativeLocationPath
+               //                              | FilterExpr '//' RelativeLocationPath
+               // UnionExpr ::= PathExpr | UnionExpr '|' PathExpr
+               // UnaryExpr ::= UnionExpr | '-' UnaryExpr
+               xpath_ast_node* parse_path_or_unary_expression()
+               {
+                       // Clarification.
+                       // PathExpr begins with either LocationPath or FilterExpr.
+                       // FilterExpr begins with PrimaryExpr
+                       // PrimaryExpr begins with '$' in case of it being a variable reference,
+                       // '(' in case of it being an expression, string literal, number constant or
+                       // function call.
+
+                       if (_lexer.current() == lex_var_ref || _lexer.current() == lex_open_brace || 
+                               _lexer.current() == lex_quoted_string || _lexer.current() == lex_number ||
+                               _lexer.current() == lex_string)
+                       {
+                               if (_lexer.current() == lex_string)
+                               {
+                                       // This is either a function call, or not - if not, we shall proceed with location path
+                                       const char_t* state = _lexer.state();
+                                       
+                                       while (PUGI__IS_CHARTYPE(*state, ct_space)) ++state;
+                                       
+                                       if (*state != '(') return parse_location_path();
+
+                                       // This looks like a function call; however this still can be a node-test. Check it.
+                                       if (parse_node_test_type(_lexer.contents()) != nodetest_none) return parse_location_path();
+                               }
+                               
+                               xpath_ast_node* n = parse_filter_expression();
+
+                               if (_lexer.current() == lex_slash || _lexer.current() == lex_double_slash)
+                               {
+                                       lexeme_t l = _lexer.current();
+                                       _lexer.next();
+                                       
+                                       if (l == lex_double_slash)
+                                       {
+                                               if (n->rettype() != xpath_type_node_set) throw_error("Step has to be applied to node set");
+
+                                               n = new (alloc_node()) xpath_ast_node(ast_step, n, axis_descendant_or_self, nodetest_type_node, 0);
+                                       }
+       
+                                       // select from location path
+                                       return parse_relative_location_path(n);
+                               }
+
+                               return n;
+                       }
+                       else if (_lexer.current() == lex_minus)
+                       {
+                               _lexer.next();
+
+                               // precedence 7+ - only parses union expressions
+                               xpath_ast_node* expr = parse_expression_rec(parse_path_or_unary_expression(), 7);
+
+                               return new (alloc_node()) xpath_ast_node(ast_op_negate, xpath_type_number, expr);
+                       }
+                       else
+                               return parse_location_path();
+               }
+
+               struct binary_op_t
+               {
+                       ast_type_t asttype;
+                       xpath_value_type rettype;
+                       int precedence;
+
+                       binary_op_t(): asttype(ast_unknown), rettype(xpath_type_none), precedence(0)
+                       {
+                       }
+
+                       binary_op_t(ast_type_t asttype_, xpath_value_type rettype_, int precedence_): asttype(asttype_), rettype(rettype_), precedence(precedence_)
+                       {
+                       }
+
+                       static binary_op_t parse(xpath_lexer& lexer)
+                       {
+                               switch (lexer.current())
+                               {
+                               case lex_string:
+                                       if (lexer.contents() == PUGIXML_TEXT("or"))
+                                               return binary_op_t(ast_op_or, xpath_type_boolean, 1);
+                                       else if (lexer.contents() == PUGIXML_TEXT("and"))
+                                               return binary_op_t(ast_op_and, xpath_type_boolean, 2);
+                                       else if (lexer.contents() == PUGIXML_TEXT("div"))
+                                               return binary_op_t(ast_op_divide, xpath_type_number, 6);
+                                       else if (lexer.contents() == PUGIXML_TEXT("mod"))
+                                               return binary_op_t(ast_op_mod, xpath_type_number, 6);
+                                       else
+                                               return binary_op_t();
+
+                               case lex_equal:
+                                       return binary_op_t(ast_op_equal, xpath_type_boolean, 3);
+
+                               case lex_not_equal:
+                                       return binary_op_t(ast_op_not_equal, xpath_type_boolean, 3);
+
+                               case lex_less:
+                                       return binary_op_t(ast_op_less, xpath_type_boolean, 4);
+
+                               case lex_greater:
+                                       return binary_op_t(ast_op_greater, xpath_type_boolean, 4);
+
+                               case lex_less_or_equal:
+                                       return binary_op_t(ast_op_less_or_equal, xpath_type_boolean, 4);
+
+                               case lex_greater_or_equal:
+                                       return binary_op_t(ast_op_greater_or_equal, xpath_type_boolean, 4);
+
+                               case lex_plus:
+                                       return binary_op_t(ast_op_add, xpath_type_number, 5);
+
+                               case lex_minus:
+                                       return binary_op_t(ast_op_subtract, xpath_type_number, 5);
+
+                               case lex_multiply:
+                                       return binary_op_t(ast_op_multiply, xpath_type_number, 6);
+
+                               case lex_union:
+                                       return binary_op_t(ast_op_union, xpath_type_node_set, 7);
+
+                               default:
+                                       return binary_op_t();
+                               }
+                       }
+               };
+
+               xpath_ast_node* parse_expression_rec(xpath_ast_node* lhs, int limit)
+               {
+                       binary_op_t op = binary_op_t::parse(_lexer);
+
+                       while (op.asttype != ast_unknown && op.precedence >= limit)
+                       {
+                               _lexer.next();
+
+                               xpath_ast_node* rhs = parse_path_or_unary_expression();
+
+                               binary_op_t nextop = binary_op_t::parse(_lexer);
+
+                               while (nextop.asttype != ast_unknown && nextop.precedence > op.precedence)
+                               {
+                                       rhs = parse_expression_rec(rhs, nextop.precedence);
+
+                                       nextop = binary_op_t::parse(_lexer);
+                               }
+
+                               if (op.asttype == ast_op_union && (lhs->rettype() != xpath_type_node_set || rhs->rettype() != xpath_type_node_set))
+                                       throw_error("Union operator has to be applied to node sets");
+
+                               lhs = new (alloc_node()) xpath_ast_node(op.asttype, op.rettype, lhs, rhs);
+
+                               op = binary_op_t::parse(_lexer);
+                       }
+
+                       return lhs;
+               }
+
+               // Expr ::= OrExpr
+               // OrExpr ::= AndExpr | OrExpr 'or' AndExpr
+               // AndExpr ::= EqualityExpr | AndExpr 'and' EqualityExpr
+               // EqualityExpr ::= RelationalExpr
+               //                                      | EqualityExpr '=' RelationalExpr
+               //                                      | EqualityExpr '!=' RelationalExpr
+               // RelationalExpr ::= AdditiveExpr
+               //                                        | RelationalExpr '<' AdditiveExpr
+               //                                        | RelationalExpr '>' AdditiveExpr
+               //                                        | RelationalExpr '<=' AdditiveExpr
+               //                                        | RelationalExpr '>=' AdditiveExpr
+               // AdditiveExpr ::= MultiplicativeExpr
+               //                                      | AdditiveExpr '+' MultiplicativeExpr
+               //                                      | AdditiveExpr '-' MultiplicativeExpr
+               // MultiplicativeExpr ::= UnaryExpr
+               //                                                | MultiplicativeExpr '*' UnaryExpr
+               //                                                | MultiplicativeExpr 'div' UnaryExpr
+               //                                                | MultiplicativeExpr 'mod' UnaryExpr
+               // Parses a full expression: one operand followed by any number of binary operators
+               // (limit 0 accepts operators of every precedence).
+               xpath_ast_node* parse_expression()
+               {
+                       xpath_ast_node* first_operand = parse_path_or_unary_expression();
+
+                       return parse_expression_rec(first_operand, 0);
+               }
+
+               xpath_parser(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result): _alloc(alloc), _lexer(query), _query(query), _variables(variables), _result(result)
+               {
+               }
+
+               // Parses one complete expression and insists that it consumed the whole query.
+               xpath_ast_node* parse()
+               {
+                       xpath_ast_node* root = parse_expression();
+
+                       // any token left over after the expression means the query is malformed
+                       const bool fully_consumed = (_lexer.current() == lex_eof);
+                       if (!fully_consumed) throw_error("Incorrect query");
+
+                       return root;
+               }
+
+               // Convenience entry point: builds a temporary parser over `query` and runs it.
+               //
+               // With PUGIXML_NO_EXCEPTIONS defined, setjmp() is called on
+               // parser._error_handler before parsing; a non-zero setjmp() result (control
+               // came back through that jump buffer) makes the function return 0 instead of
+               // calling parse(). Without the macro, parse() is simply called and whatever
+               // it throws propagates to the caller.
+               static xpath_ast_node* parse(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result)
+               {
+                       xpath_parser parser(query, variables, alloc, result);
+
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       // setjmp() must stay in this frame: the jump buffer is only valid while
+                       // the function that called setjmp() is still active
+                       int error = setjmp(parser._error_handler);
+
+                       return (error == 0) ? parser.parse() : 0;
+               #else
+                       return parser.parse();
+               #endif
+               }
+       };
+
+       // Backing object of a compiled query: the AST root, the allocator that is handed
+       // to the parser, and the first memory block that allocator starts out with.
+       struct xpath_query_impl
+       {
+               // Allocates storage through xml_memory and constructs the object in it.
+               // Returns 0 when the allocation fails, so callers can report out-of-memory.
+               static xpath_query_impl* create()
+               {
+                       void* memory = xml_memory::allocate(sizeof(xpath_query_impl));
+
+                       // Placement new must never be given a null pointer: that is undefined
+                       // behaviour and compilers are free to skip any null check, which would
+                       // run the constructor on address 0.
+                       if (!memory) return 0;
+
+                       return new (memory) xpath_query_impl();
+               }
+
+               // Releases an object obtained from create(); a null pointer is ignored.
+               static void destroy(void* ptr)
+               {
+                       if (!ptr) return;
+
+                       // free all allocated pages
+                       static_cast<xpath_query_impl*>(ptr)->alloc.release();
+
+                       // free allocator memory (with the first page)
+                       xml_memory::deallocate(ptr);
+               }
+
+               // Starts with no AST and an allocator rooted at the embedded block, whose
+               // capacity is the size of its own data array.
+               xpath_query_impl(): root(0), alloc(&block)
+               {
+                       block.next = 0;
+                       block.capacity = sizeof(block.data);
+               }
+
+               xpath_ast_node* root;
+               xpath_allocator alloc;
+               xpath_memory_block block;
+       };
+
+       // Evaluates the root of a compiled query as a string, using `n` to build the
+       // evaluation context and `sd` for the evaluation stack.
+       //
+       // Returns a default-constructed xpath_string when `impl` is null. In
+       // PUGIXML_NO_EXCEPTIONS builds the same value is returned when control comes
+       // back to the setjmp() point through sd.error_handler.
+       PUGI__FN xpath_string evaluate_string_impl(xpath_query_impl* impl, const xpath_node& n, xpath_stack_data& sd)
+       {
+               if (!impl) return xpath_string();
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               if (setjmp(sd.error_handler)) return xpath_string();
+       #endif
+
+               // NOTE(review): the two 1s are presumably context position and size - confirm
+               // against xpath_context
+               xpath_context c(n, 1, 1);
+
+               return impl->root->eval_string(c, sd.stack);
+       }
+
+       // Returns the query root when the query yields a node set. A null `impl` gives 0.
+       // For any other return type the function returns 0 in PUGIXML_NO_EXCEPTIONS
+       // builds and throws xpath_exception otherwise.
+       PUGI__FN impl::xpath_ast_node* evaluate_node_set_prepare(xpath_query_impl* impl)
+       {
+               if (!impl) return 0;
+
+               // happy path first: node-set queries can be evaluated as they are
+               if (impl->root->rettype() == xpath_type_node_set) return impl->root;
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               return 0;
+       #else
+               xpath_parse_result failure;
+               failure.error = "Expression does not evaluate to node set";
+
+               throw xpath_exception(failure);
+       #endif
+       }
+PUGI__NS_END
+
+namespace pugi
+{
+#ifndef PUGIXML_NO_EXCEPTIONS
+       // Stores a copy of the parse result; the result must carry an error message.
+       PUGI__FN xpath_exception::xpath_exception(const xpath_parse_result& result_)
+               : _result(result_)
+       {
+               assert(_result.error);
+       }
+
+       // Error message of the stored parse result.
+       PUGI__FN const char* xpath_exception::what() const throw()
+       {
+               const char* message = _result.error;
+               return message;
+       }
+
+       // Full parse result this exception was created from.
+       PUGI__FN const xpath_parse_result& xpath_exception::result() const
+       {
+               const xpath_parse_result& stored = _result;
+               return stored;
+       }
+#endif
+       
+       // Leaves both the node and the attribute handle default-constructed.
+       PUGI__FN xpath_node::xpath_node()
+       {
+       }
+
+       // Wraps a node; the attribute handle stays default-constructed.
+       PUGI__FN xpath_node::xpath_node(const xml_node& node_)
+               : _node(node_)
+       {
+       }
+
+       // Wraps an attribute together with its parent. The parent is stored only when
+       // the attribute handle tests true; otherwise a default xml_node is kept.
+       PUGI__FN xpath_node::xpath_node(const xml_attribute& attribute_, const xml_node& parent_)
+               : _node(attribute_ ? parent_ : xml_node())
+               , _attribute(attribute_)
+       {
+       }
+
+       // Node held by this object, or a default xml_node when it holds an attribute.
+       PUGI__FN xml_node xpath_node::node() const
+       {
+               if (_attribute) return xml_node();
+
+               return _node;
+       }
+
+       // Attribute handle held by this object.
+       PUGI__FN xml_attribute xpath_node::attribute() const
+       {
+               const xml_attribute& held = _attribute;
+               return held;
+       }
+
+       // Parent of whatever this object refers to.
+       PUGI__FN xml_node xpath_node::parent() const
+       {
+               // when an attribute is held, _node is the node passed in as its parent
+               if (_attribute) return _node;
+
+               return _node.parent();
+       }
+
+       // Intentionally empty: its address is the non-null value returned by the
+       // conversion operator below.
+       PUGI__FN static void unspecified_bool_xpath_node(xpath_node***)
+       {
+       }
+
+       // Tests true when either the node or the attribute handle tests true.
+       PUGI__FN xpath_node::operator xpath_node::unspecified_bool_type() const
+       {
+               const bool engaged = _node || _attribute;
+
+               if (!engaged) return 0;
+               return unspecified_bool_xpath_node;
+       }
+
+       // Exact negation of the conversion above.
+       PUGI__FN bool xpath_node::operator!() const
+       {
+               const bool engaged = _node || _attribute;
+               return !engaged;
+       }
+
+       // Two xpath_nodes are equal when both their node and attribute handles are equal.
+       PUGI__FN bool xpath_node::operator==(const xpath_node& n) const
+       {
+               if (!(_node == n._node)) return false;
+
+               return _attribute == n._attribute;
+       }
+
+       // They differ as soon as either handle differs.
+       PUGI__FN bool xpath_node::operator!=(const xpath_node& n) const
+       {
+               if (_node != n._node) return true;
+
+               return _attribute != n._attribute;
+       }
+
+#ifdef __BORLANDC__
+       // Borland-only helper (see the enclosing #ifdef): logical AND of an xpath_node,
+       // explicitly converted to bool, with a plain bool. As an overloaded operator it
+       // evaluates both operands before it is called.
+       PUGI__FN bool operator&&(const xpath_node& lhs, bool rhs)
+       {
+               return (bool)lhs && rhs;
+       }
+
+       // Borland-only helper: logical OR counterpart of the overload above.
+       PUGI__FN bool operator||(const xpath_node& lhs, bool rhs)
+       {
+               return (bool)lhs || rhs;
+       }
+#endif
+
+       // Replaces the contents of the set with a copy of [begin_, end_).
+       // Zero or one node lives in the inline _storage slot; anything larger is copied
+       // into a buffer obtained from xml_memory. When that allocation fails the set is
+       // left untouched and the function returns (PUGIXML_NO_EXCEPTIONS) or throws
+       // std::bad_alloc.
+       PUGI__FN void xpath_node_set::_assign(const_iterator begin_, const_iterator end_)
+       {
+               assert(begin_ <= end_);
+
+               const size_t count = static_cast<size_t>(end_ - begin_);
+
+               if (count > 1)
+               {
+                       const size_t bytes = count * sizeof(xpath_node);
+
+                       xpath_node* heap = static_cast<xpath_node*>(impl::xml_memory::allocate(bytes));
+
+                       if (!heap)
+                       {
+                       #ifdef PUGIXML_NO_EXCEPTIONS
+                               return;
+                       #else
+                               throw std::bad_alloc();
+                       #endif
+                       }
+
+                       memcpy(heap, begin_, bytes);
+
+                       // the previous heap buffer, if any, is released only after the copy
+                       if (_begin != &_storage) impl::xml_memory::deallocate(_begin);
+
+                       _begin = heap;
+                       _end = heap + count;
+
+                       return;
+               }
+
+               // small case: drop any previous heap buffer and fall back to the inline slot
+               if (_begin != &_storage) impl::xml_memory::deallocate(_begin);
+
+               if (count == 1) _storage = *begin_;
+
+               _begin = &_storage;
+               _end = &_storage + count;
+       }
+
+       // Empty, unsorted set: both iterators point at the inline slot.
+       PUGI__FN xpath_node_set::xpath_node_set()
+               : _type(type_unsorted)
+               , _begin(&_storage)
+               , _end(&_storage)
+       {
+       }
+
+       // Starts out empty, then copies [begin_, end_) in; `type_` is stored as given.
+       PUGI__FN xpath_node_set::xpath_node_set(const_iterator begin_, const_iterator end_, type_t type_)
+               : _type(type_)
+               , _begin(&_storage)
+               , _end(&_storage)
+       {
+               _assign(begin_, end_);
+       }
+
+       // Only a separately allocated buffer needs releasing; the inline slot is part of
+       // the object itself.
+       PUGI__FN xpath_node_set::~xpath_node_set()
+       {
+               const bool uses_inline_slot = (_begin == &_storage);
+               if (uses_inline_slot) return;
+
+               impl::xml_memory::deallocate(_begin);
+       }
+
+       // Copy construction: takes over the type and copies the nodes of `ns`.
+       PUGI__FN xpath_node_set::xpath_node_set(const xpath_node_set& ns)
+               : _type(ns._type)
+               , _begin(&_storage)
+               , _end(&_storage)
+       {
+               _assign(ns._begin, ns._end);
+       }
+       
+       // Copy assignment; assigning a set to itself is a no-op.
+       PUGI__FN xpath_node_set& xpath_node_set::operator=(const xpath_node_set& ns)
+       {
+               if (this == &ns) return *this;
+
+               // Copy the nodes before touching _type: if _assign() throws std::bad_alloc,
+               // the set keeps its old contents together with its old type rather than
+               // old contents tagged with the new type. In PUGIXML_NO_EXCEPTIONS builds
+               // _assign() gives no failure indication, so that case behaves as before.
+               _assign(ns._begin, ns._end);
+               _type = ns._type;
+
+               return *this;
+       }
+
+       // Collection type tag currently stored for this set.
+       PUGI__FN xpath_node_set::type_t xpath_node_set::type() const
+       {
+               const type_t current = _type;
+               return current;
+       }
+
+       // Number of nodes, i.e. the distance between the two iterators.
+       PUGI__FN size_t xpath_node_set::size() const
+       {
+               return static_cast<size_t>(_end - _begin);
+       }
+
+       // True when the set holds no nodes.
+       PUGI__FN bool xpath_node_set::empty() const
+       {
+               return _end == _begin;
+       }
+
+       // Unchecked element access; an out-of-range index trips the assertion.
+       PUGI__FN const xpath_node& xpath_node_set::operator[](size_t index) const
+       {
+               assert(index < size());
+
+               return *(_begin + index);
+       }
+
+       // Iterator to the first node.
+       PUGI__FN xpath_node_set::const_iterator xpath_node_set::begin() const
+       {
+               const_iterator head = _begin;
+               return head;
+       }
+
+       // Iterator one past the last node.
+       PUGI__FN xpath_node_set::const_iterator xpath_node_set::end() const
+       {
+               const_iterator tail = _end;
+               return tail;
+       }
+
+       // Delegates to impl::xpath_sort and records the type tag it reports back.
+       PUGI__FN void xpath_node_set::sort(bool reverse)
+       {
+               const type_t resulting = impl::xpath_sort(_begin, _end, _type, reverse);
+
+               _type = resulting;
+       }
+
+       // Delegates to impl::xpath_first, passing the current range and type tag.
+       PUGI__FN xpath_node xpath_node_set::first() const
+       {
+               xpath_node leading = impl::xpath_first(_begin, _end, _type);
+               return leading;
+       }
+
+       // A fresh result reports "Internal error" at offset 0 until something overwrites it.
+       PUGI__FN xpath_parse_result::xpath_parse_result()
+               : error("Internal error")
+               , offset(0)
+       {
+       }
+
+       // Success is encoded as a null error message.
+       PUGI__FN xpath_parse_result::operator bool() const
+       {
+               return !error;
+       }
+
+       // Error message, or "No error" when none is set.
+       PUGI__FN const char* xpath_parse_result::description() const
+       {
+               if (!error) return "No error";
+
+               return error;
+       }
+
+       // A bare variable has no type and no successor in its chain.
+       PUGI__FN xpath_variable::xpath_variable()
+               : _type(xpath_type_none)
+               , _next(0)
+       {
+       }
+
+       // The name is stored in the concrete impl::xpath_variable_* object, so the
+       // object is downcast according to its type tag before the field is read.
+       PUGI__FN const char_t* xpath_variable::name() const
+       {
+               if (_type == xpath_type_node_set)
+                       return static_cast<const impl::xpath_variable_node_set*>(this)->name;
+
+               if (_type == xpath_type_number)
+                       return static_cast<const impl::xpath_variable_number*>(this)->name;
+
+               if (_type == xpath_type_string)
+                       return static_cast<const impl::xpath_variable_string*>(this)->name;
+
+               if (_type == xpath_type_boolean)
+                       return static_cast<const impl::xpath_variable_boolean*>(this)->name;
+
+               // any other tag is a programming error
+               assert(!"Invalid variable type");
+               return 0;
+       }
+
+       // Type tag of this variable.
+       PUGI__FN xpath_value_type xpath_variable::type() const
+       {
+               const xpath_value_type tag = _type;
+               return tag;
+       }
+
+       // Boolean value, or false when the variable is not a boolean.
+       PUGI__FN bool xpath_variable::get_boolean() const
+       {
+               if (_type != xpath_type_boolean) return false;
+
+               return static_cast<const impl::xpath_variable_boolean*>(this)->value;
+       }
+
+       // Numeric value, or the result of impl::gen_nan() when the variable is not a number.
+       PUGI__FN double xpath_variable::get_number() const
+       {
+               if (_type != xpath_type_number) return impl::gen_nan();
+
+               return static_cast<const impl::xpath_variable_number*>(this)->value;
+       }
+
+       // String value; an empty string stands in for a non-string variable or a null value.
+       PUGI__FN const char_t* xpath_variable::get_string() const
+       {
+               const char_t* text = 0;
+
+               if (_type == xpath_type_string)
+                       text = static_cast<const impl::xpath_variable_string*>(this)->value;
+
+               if (!text) return PUGIXML_TEXT("");
+
+               return text;
+       }
+
+       // Node-set value, or impl::dummy_node_set when the variable is not a node set.
+       PUGI__FN const xpath_node_set& xpath_variable::get_node_set() const
+       {
+               if (_type != xpath_type_node_set) return impl::dummy_node_set;
+
+               return static_cast<const impl::xpath_variable_node_set*>(this)->value;
+       }
+
+       // Stores a boolean; fails (returns false) when the variable is not a boolean.
+       PUGI__FN bool xpath_variable::set(bool value)
+       {
+               const bool type_matches = (_type == xpath_type_boolean);
+
+               if (type_matches) static_cast<impl::xpath_variable_boolean*>(this)->value = value;
+
+               return type_matches;
+       }
+
+       // Stores a number; fails (returns false) when the variable is not a number.
+       PUGI__FN bool xpath_variable::set(double value)
+       {
+               const bool type_matches = (_type == xpath_type_number);
+
+               if (type_matches) static_cast<impl::xpath_variable_number*>(this)->value = value;
+
+               return type_matches;
+       }
+
+       // Stores a private copy of `value`. Returns false when the variable is not a
+       // string or the copy cannot be allocated; the old value is then kept.
+       PUGI__FN bool xpath_variable::set(const char_t* value)
+       {
+               if (_type != xpath_type_string) return false;
+
+               // duplicate the text, terminator included
+               const size_t bytes = (impl::strlength(value) + 1) * sizeof(char_t);
+
+               char_t* duplicate = static_cast<char_t*>(impl::xml_memory::allocate(bytes));
+               if (!duplicate) return false;
+
+               memcpy(duplicate, value, bytes);
+
+               // swap the copy in, releasing whatever was stored before
+               impl::xpath_variable_string* self = static_cast<impl::xpath_variable_string*>(this);
+
+               if (self->value) impl::xml_memory::deallocate(self->value);
+               self->value = duplicate;
+
+               return true;
+       }
+
+       // Stores a copy of a node set; fails (returns false) when the variable is not a node set.
+       PUGI__FN bool xpath_variable::set(const xpath_node_set& value)
+       {
+               const bool type_matches = (_type == xpath_type_node_set);
+
+               if (type_matches) static_cast<impl::xpath_variable_node_set*>(this)->value = value;
+
+               return type_matches;
+       }
+
+       // Starts with every hash bucket empty.
+       PUGI__FN xpath_variable_set::xpath_variable_set()
+       {
+               const size_t bucket_count = sizeof(_data) / sizeof(_data[0]);
+
+               for (size_t bucket = 0; bucket != bucket_count; ++bucket)
+                       _data[bucket] = 0;
+       }
+
+       // Deletes every variable in every bucket chain.
+       PUGI__FN xpath_variable_set::~xpath_variable_set()
+       {
+               const size_t bucket_count = sizeof(_data) / sizeof(_data[0]);
+
+               for (size_t bucket = 0; bucket != bucket_count; ++bucket)
+               {
+                       // the successor is read before the current node is deleted
+                       for (xpath_variable* cur = _data[bucket]; cur; )
+                       {
+                               xpath_variable* following = cur->_next;
+
+                               impl::delete_xpath_variable(cur->_type, cur);
+
+                               cur = following;
+                       }
+               }
+       }
+
+       // Looks a variable up by name; returns 0 when the set has no such variable.
+       PUGI__FN xpath_variable* xpath_variable_set::find(const char_t* name) const
+       {
+               const size_t bucket_count = sizeof(_data) / sizeof(_data[0]);
+               const size_t bucket = impl::hash_string(name) % bucket_count;
+
+               // walk the chain of the bucket the name hashes to until a name matches
+               xpath_variable* candidate = _data[bucket];
+
+               while (candidate && !impl::strequal(candidate->name(), name))
+                       candidate = candidate->_next;
+
+               return candidate;
+       }
+
+       // Returns the variable called `name`, creating it when necessary. Yields 0 when
+       // a variable of that name exists with a different type, or when creation fails.
+       PUGI__FN xpath_variable* xpath_variable_set::add(const char_t* name, xpath_value_type type)
+       {
+               const size_t bucket_count = sizeof(_data) / sizeof(_data[0]);
+               const size_t bucket = impl::hash_string(name) % bucket_count;
+
+               // an existing variable is reused only if its type matches the request
+               for (xpath_variable* cur = _data[bucket]; cur; cur = cur->_next)
+               {
+                       if (!impl::strequal(cur->name(), name)) continue;
+
+                       if (cur->type() != type) return 0;
+
+                       return cur;
+               }
+
+               // otherwise create one and push it onto the front of the bucket's chain
+               xpath_variable* created = impl::new_xpath_variable(type, name);
+               if (!created) return 0;
+
+               created->_type = type;
+               created->_next = _data[bucket];
+
+               _data[bucket] = created;
+
+               return created;
+       }
+
+       // Each overload finds or creates a variable of the matching type and forwards
+       // the value to it; false means add() returned 0 or the variable refused the value.
+       PUGI__FN bool xpath_variable_set::set(const char_t* name, bool value)
+       {
+               xpath_variable* target = add(name, xpath_type_boolean);
+               if (!target) return false;
+
+               return target->set(value);
+       }
+
+       PUGI__FN bool xpath_variable_set::set(const char_t* name, double value)
+       {
+               xpath_variable* target = add(name, xpath_type_number);
+               if (!target) return false;
+
+               return target->set(value);
+       }
+
+       PUGI__FN bool xpath_variable_set::set(const char_t* name, const char_t* value)
+       {
+               xpath_variable* target = add(name, xpath_type_string);
+               if (!target) return false;
+
+               return target->set(value);
+       }
+
+       PUGI__FN bool xpath_variable_set::set(const char_t* name, const xpath_node_set& value)
+       {
+               xpath_variable* target = add(name, xpath_type_node_set);
+               if (!target) return false;
+
+               return target->set(value);
+       }
+
+       // Mutable lookup by name; forwards to find().
+       PUGI__FN xpath_variable* xpath_variable_set::get(const char_t* name)
+       {
+               xpath_variable* found = find(name);
+               return found;
+       }
+
+       // Read-only lookup by name; forwards to find().
+       PUGI__FN const xpath_variable* xpath_variable_set::get(const char_t* name) const
+       {
+               const xpath_variable* found = find(name);
+               return found;
+       }
+
+       // Compiles `query`. On success _impl holds the compiled query and _result.error
+       // is cleared; otherwise _impl stays 0. Allocation failure is reported through
+       // _result.error (PUGIXML_NO_EXCEPTIONS) or by throwing std::bad_alloc.
+       PUGI__FN xpath_query::xpath_query(const char_t* query, xpath_variable_set* variables): _impl(0)
+       {
+               impl::xpath_query_impl* qimpl = impl::xpath_query_impl::create();
+
+               if (!qimpl)
+               {
+               #ifdef PUGIXML_NO_EXCEPTIONS
+                       _result.error = "Out of memory";
+                       return;
+               #else
+                       throw std::bad_alloc();
+               #endif
+               }
+
+               // NOTE(review): buffer_holder presumably calls destroy on qimpl when it goes
+               // out of scope unless release() was called - confirm against its definition
+               impl::buffer_holder guard(qimpl, impl::xpath_query_impl::destroy);
+
+               qimpl->root = impl::xpath_parser::parse(query, variables, &qimpl->alloc, &_result);
+
+               // a null root means parsing failed; leave _impl at 0
+               if (!qimpl->root) return;
+
+               qimpl->root->optimize(&qimpl->alloc);
+
+               _impl = static_cast<impl::xpath_query_impl*>(guard.release());
+               _result.error = 0;
+       }
+
+       // Hands the compiled query, if any, to xpath_query_impl::destroy.
+       PUGI__FN xpath_query::~xpath_query()
+       {
+               void* owned = _impl;
+
+               impl::xpath_query_impl::destroy(owned);
+       }
+
+       // Return type of the compiled expression, or xpath_type_none without a compiled query.
+       PUGI__FN xpath_value_type xpath_query::return_type() const
+       {
+               if (_impl)
+               {
+                       impl::xpath_query_impl* compiled = static_cast<impl::xpath_query_impl*>(_impl);
+
+                       return compiled->root->rettype();
+               }
+
+               return xpath_type_none;
+       }
+
+       // Evaluates the compiled query as a boolean, building the context from `n`.
+       // Returns false when there is no compiled query and, in PUGIXML_NO_EXCEPTIONS
+       // builds, when control comes back to the setjmp() point via sd.error_handler.
+       PUGI__FN bool xpath_query::evaluate_boolean(const xpath_node& n) const
+       {
+               if (!_impl) return false;
+               
+               impl::xpath_context c(n, 1, 1);
+               impl::xpath_stack_data sd;
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               // must stay in this frame and precede the evaluation it protects
+               if (setjmp(sd.error_handler)) return false;
+       #endif
+               
+               return static_cast<impl::xpath_query_impl*>(_impl)->root->eval_boolean(c, sd.stack);
+       }
+       
+       // Evaluates the compiled query as a number, building the context from `n`.
+       // Returns impl::gen_nan() when there is no compiled query and, in
+       // PUGIXML_NO_EXCEPTIONS builds, when control comes back to the setjmp() point
+       // via sd.error_handler.
+       PUGI__FN double xpath_query::evaluate_number(const xpath_node& n) const
+       {
+               if (!_impl) return impl::gen_nan();
+               
+               impl::xpath_context c(n, 1, 1);
+               impl::xpath_stack_data sd;
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               // must stay in this frame and precede the evaluation it protects
+               if (setjmp(sd.error_handler)) return impl::gen_nan();
+       #endif
+
+               return static_cast<impl::xpath_query_impl*>(_impl)->root->eval_number(c, sd.stack);
+       }
+
+#ifndef PUGIXML_NO_STL
+       // Evaluates the query as a string and returns the text as a string_t.
+       PUGI__FN string_t xpath_query::evaluate_string(const xpath_node& n) const
+       {
+               impl::xpath_stack_data sd;
+
+               impl::xpath_query_impl* compiled = static_cast<impl::xpath_query_impl*>(_impl);
+               impl::xpath_string text = impl::evaluate_string_impl(compiled, n, sd);
+
+               // build the result from pointer + length
+               return string_t(text.c_str(), text.length());
+       }
+#endif
+
+       // Evaluates the query as a string into a caller-supplied buffer of `capacity`
+       // characters. The text is truncated to fit and always zero-terminated when
+       // capacity is non-zero. Returns the size needed for the full text, terminator
+       // included, so a capacity of 0 can be used to query the required size.
+       PUGI__FN size_t xpath_query::evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const
+       {
+               impl::xpath_stack_data sd;
+
+               impl::xpath_string text = impl::evaluate_string_impl(static_cast<impl::xpath_query_impl*>(_impl), n, sd);
+
+               const size_t full_size = text.length() + 1;
+
+               // nothing can be written into a zero-sized buffer
+               if (capacity == 0) return full_size;
+
+               // characters to write, counting the terminator
+               const size_t written = (capacity > full_size) ? full_size : capacity;
+               assert(written > 0);
+
+               memcpy(buffer, text.c_str(), (written - 1) * sizeof(char_t));
+               buffer[written - 1] = 0;
+
+               return full_size;
+       }
+
+       // Evaluates the compiled query as a node set, building the context from `n`.
+       // An empty xpath_node_set is returned when evaluate_node_set_prepare() yields
+       // no root and, in PUGIXML_NO_EXCEPTIONS builds, when control comes back to the
+       // setjmp() point via sd.error_handler. The raw result is copied into the
+       // returned xpath_node_set.
+       PUGI__FN xpath_node_set xpath_query::evaluate_node_set(const xpath_node& n) const
+       {
+               impl::xpath_ast_node* root = impl::evaluate_node_set_prepare(static_cast<impl::xpath_query_impl*>(_impl));
+               if (!root) return xpath_node_set();
+
+               impl::xpath_context c(n, 1, 1);
+               impl::xpath_stack_data sd;
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               // must stay in this frame and precede the evaluation it protects
+               if (setjmp(sd.error_handler)) return xpath_node_set();
+       #endif
+
+               impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack, impl::nodeset_eval_all);
+
+               return xpath_node_set(r.begin(), r.end(), r.type());
+       }
+
+       // Same preparation and error handling as evaluate_node_set(), but evaluates
+       // with impl::nodeset_eval_first and returns r.first(); failures yield a
+       // default-constructed xpath_node.
+       PUGI__FN xpath_node xpath_query::evaluate_node(const xpath_node& n) const
+       {
+               impl::xpath_ast_node* root = impl::evaluate_node_set_prepare(static_cast<impl::xpath_query_impl*>(_impl));
+               if (!root) return xpath_node();
+
+               impl::xpath_context c(n, 1, 1);
+               impl::xpath_stack_data sd;
+
+       #ifdef PUGIXML_NO_EXCEPTIONS
+               // must stay in this frame and precede the evaluation it protects
+               if (setjmp(sd.error_handler)) return xpath_node();
+       #endif
+
+               impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack, impl::nodeset_eval_first);
+
+               return r.first();
+       }
+
+       // Parse result recorded by the constructor.
+       PUGI__FN const xpath_parse_result& xpath_query::result() const
+       {
+               const xpath_parse_result& outcome = _result;
+               return outcome;
+       }
+
+       // Intentionally empty: its address is the non-null value returned by the
+       // conversion operator below.
+       PUGI__FN static void unspecified_bool_xpath_query(xpath_query***)
+       {
+       }
+
+       // Tests true when a compiled query is held.
+       PUGI__FN xpath_query::operator xpath_query::unspecified_bool_type() const
+       {
+               if (!_impl) return 0;
+
+               return unspecified_bool_xpath_query;
+       }
+
+       // True when no compiled query is held.
+       PUGI__FN bool xpath_query::operator!() const
+       {
+               return _impl == 0;
+       }
+
+       // String overloads compile the expression into a local xpath_query and defer to
+       // the matching query overload; query overloads evaluate with this node as context.
+       PUGI__FN xpath_node xml_node::select_node(const char_t* query, xpath_variable_set* variables) const
+       {
+               xpath_query compiled(query, variables);
+
+               return select_node(compiled);
+       }
+
+       PUGI__FN xpath_node xml_node::select_node(const xpath_query& query) const
+       {
+               const xml_node& context = *this;
+
+               return query.evaluate_node(context);
+       }
+
+       PUGI__FN xpath_node_set xml_node::select_nodes(const char_t* query, xpath_variable_set* variables) const
+       {
+               xpath_query compiled(query, variables);
+
+               return select_nodes(compiled);
+       }
+
+       PUGI__FN xpath_node_set xml_node::select_nodes(const xpath_query& query) const
+       {
+               const xml_node& context = *this;
+
+               return query.evaluate_node_set(context);
+       }
+
+       // select_single_node: same behaviour as select_node, kept under the older name.
+       PUGI__FN xpath_node xml_node::select_single_node(const char_t* query, xpath_variable_set* variables) const
+       {
+               xpath_query compiled(query, variables);
+
+               return select_single_node(compiled);
+       }
+
+       PUGI__FN xpath_node xml_node::select_single_node(const xpath_query& query) const
+       {
+               const xml_node& context = *this;
+
+               return query.evaluate_node(context);
+       }
+}
+
+#endif
+
+#ifdef __BORLANDC__
+#      pragma option pop
+#endif
+
+// Intel C++ does not properly keep warning state for function templates,
+// so popping warning state at the end of translation unit leads to warnings in the middle.
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#      pragma warning(pop)
+#endif
+
+// Undefine all local macros (makes sure we're not leaking macros in header-only mode)
+#undef PUGI__NO_INLINE
+#undef PUGI__UNLIKELY
+#undef PUGI__STATIC_ASSERT
+#undef PUGI__DMC_VOLATILE
+#undef PUGI__MSVC_CRT_VERSION
+#undef PUGI__NS_BEGIN
+#undef PUGI__NS_END
+#undef PUGI__FN
+#undef PUGI__FN_NO_INLINE
+#undef PUGI__NODETYPE
+#undef PUGI__IS_CHARTYPE_IMPL
+#undef PUGI__IS_CHARTYPE
+#undef PUGI__IS_CHARTYPEX
+#undef PUGI__ENDSWITH
+#undef PUGI__SKIPWS
+#undef PUGI__OPTSET
+#undef PUGI__PUSHNODE
+#undef PUGI__POPNODE
+#undef PUGI__SCANFOR
+#undef PUGI__SCANWHILE
+#undef PUGI__SCANWHILE_UNROLL
+#undef PUGI__ENDSEG
+#undef PUGI__THROW_ERROR
+#undef PUGI__CHECK_ERROR
+
+#endif
+
+/**
+ * Copyright (c) 2006-2014 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma GCC diagnostic pop
diff --git a/src/pugixml/pugixml.hpp b/src/pugixml/pugixml.hpp

new file mode 100644 (file)

index 0000000..d4d5a62
--- /dev/null
+++ b/src/pugixml/pugixml.hpp
@@ -0,0 +1,1366 @@
+/**
+ * pugixml parser - version 1.5
+ * --------------------------------------------------------
+ * Copyright (C) 2006-2014, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at http://pugixml.org/
+ *
+ * This library is distributed under the MIT License. See notice at the end
+ * of this file.
+ *
+ * This work is based on the pugxml parser, which is:
+ * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
+ */
+
+#ifndef PUGIXML_VERSION
+// Define version macro; evaluates to major * 100 + minor * 10 (i.e. 150 for version 1.5) so that it's safe to use in less-than comparisons
+#      define PUGIXML_VERSION 150
+#endif
+
+// Include user configuration file (this can define various configuration macros)
+#include "pugiconfig.hpp"
+
+#ifndef HEADER_PUGIXML_HPP
+#define HEADER_PUGIXML_HPP
+
+// Include <cstddef> for size_t and ptrdiff_t
+#include <cstddef>
+
+// Include exception header for XPath
+#if !defined(PUGIXML_NO_XPATH) && !defined(PUGIXML_NO_EXCEPTIONS)
+#      include <exception>
+#endif
+
+// Include STL headers
+#ifndef PUGIXML_NO_STL
+#      include <iterator>
+#      include <iosfwd>
+#      include <string>
+#endif
+
+// Macro for marking deprecated features; maps to the GCC or MSVC deprecation attribute and expands to nothing on other compilers
+#ifndef PUGIXML_DEPRECATED
+#      if defined(__GNUC__)
+#              define PUGIXML_DEPRECATED __attribute__((deprecated))
+#      elif defined(_MSC_VER) && _MSC_VER >= 1300
+#              define PUGIXML_DEPRECATED __declspec(deprecated)
+#      else
+#              define PUGIXML_DEPRECATED
+#      endif
+#endif
+
+// If no API is defined, assume default (PUGIXML_API expands to nothing)
+#ifndef PUGIXML_API
+#      define PUGIXML_API
+#endif
+
+// If no API for classes is defined, fall back to PUGIXML_API
+#ifndef PUGIXML_CLASS
+#      define PUGIXML_CLASS PUGIXML_API
+#endif
+
+// If no API for functions is defined, fall back to PUGIXML_API
+#ifndef PUGIXML_FUNCTION
+#      define PUGIXML_FUNCTION PUGIXML_API
+#endif
+
+// If the platform is known to have long long support (C++11 or later, or _MSC_VER >= 1400), enable long long functions
+#ifndef PUGIXML_HAS_LONG_LONG
+#      if defined(__cplusplus) && __cplusplus >= 201103
+#              define PUGIXML_HAS_LONG_LONG
+#      elif defined(_MSC_VER) && _MSC_VER >= 1400
+#              define PUGIXML_HAS_LONG_LONG
+#      endif
+#endif
+
+// Character interface macros: wchar_t with L-prefixed literals when PUGIXML_WCHAR_MODE is defined, plain char otherwise
+#ifdef PUGIXML_WCHAR_MODE
+#      define PUGIXML_TEXT(t) L ## t
+#      define PUGIXML_CHAR wchar_t
+#else
+#      define PUGIXML_TEXT(t) t
+#      define PUGIXML_CHAR char
+#endif
+
+namespace pugi
+{
+       // Character type used for all internal storage and operations; this is PUGIXML_CHAR (wchar_t in PUGIXML_WCHAR_MODE, char otherwise)
+       typedef PUGIXML_CHAR char_t;
+
+#ifndef PUGIXML_NO_STL
+       // String type used for operations that work with STL string; a std::basic_string over PUGIXML_CHAR, so it also depends on PUGIXML_WCHAR_MODE
+       typedef std::basic_string<PUGIXML_CHAR, std::char_traits<PUGIXML_CHAR>, std::allocator<PUGIXML_CHAR> > string_t;
+#endif
+}
+
+// The PugiXML namespace
+namespace pugi
+{
+       // Tree node types (as reported by xml_node::type())
+       enum xml_node_type
+       {
+               node_null,                      // Empty (null) node handle
+               node_document,          // A document tree's absolute root
+               node_element,           // Element tag, e.g. '<node/>'
+               node_pcdata,            // Plain character data, e.g. 'text'
+               node_cdata,                     // Character data, e.g. '<![CDATA[text]]>'
+               node_comment,           // Comment tag, e.g. '<!-- text -->'
+               node_pi,                        // Processing instruction, e.g. '<?name?>'
+               node_declaration,       // Document declaration, e.g. '<?xml version="1.0"?>'
+               node_doctype            // Document type declaration, e.g. '<!DOCTYPE doc>'
+       };
+
+       // Parsing options (bit flags; combine them with bitwise OR, as parse_default and parse_full do below)
+
+       // Minimal parsing mode (equivalent to turning all other flags off).
+       // Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed.
+       const unsigned int parse_minimal = 0x0000;
+
+       // This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is off by default.
+       const unsigned int parse_pi = 0x0001;
+
+       // This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by default.
+       const unsigned int parse_comments = 0x0002;
+
+       // This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by default.
+       const unsigned int parse_cdata = 0x0004;
+
+       // This flag determines if plain character data (node_pcdata) that consist only of whitespace are added to the DOM tree.
+       // This flag is off by default; turning it on usually results in slower parsing and more memory consumption.
+       const unsigned int parse_ws_pcdata = 0x0008;
+
+       // This flag determines if character and entity references are expanded during parsing. This flag is on by default.
+       const unsigned int parse_escapes = 0x0010;
+
+       // This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default.
+       const unsigned int parse_eol = 0x0020;
+       
+       // This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default.
+       const unsigned int parse_wconv_attribute = 0x0040;
+
+       // This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default.
+       const unsigned int parse_wnorm_attribute = 0x0080;
+       
+       // This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default.
+       const unsigned int parse_declaration = 0x0100;
+
+       // This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default.
+       const unsigned int parse_doctype = 0x0200;
+
+       // This flag determines if plain character data (node_pcdata) that is the only child of the parent node and that consists only
+       // of whitespace is added to the DOM tree.
+       // This flag is off by default; turning it on may result in slower parsing and more memory consumption.
+       const unsigned int parse_ws_pcdata_single = 0x0400;
+
+       // This flag determines if leading and trailing whitespace is to be removed from plain character data. This flag is off by default.
+       const unsigned int parse_trim_pcdata = 0x0800;
+
+       // This flag determines if plain character data that does not have a parent node is added to the DOM tree, and if an empty document
+       // is a valid document. This flag is off by default.
+       const unsigned int parse_fragment = 0x1000;
+
+       // The default parsing mode.
+       // Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
+       // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
+       const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol;
+
+       // The full parsing mode (parse_default plus parse_pi, parse_comments, parse_declaration and parse_doctype).
+       // Nodes of all types are added to the DOM tree, character/reference entities are expanded,
+       // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
+       const unsigned int parse_full = parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype;
+
+       // These values select the encoding of input data for XML document (an enumeration, not combinable bit flags)
+       enum xml_encoding
+       {
+               encoding_auto,          // Auto-detect input encoding using BOM or < / <? detection; use UTF8 if BOM is not found
+               encoding_utf8,          // UTF8 encoding
+               encoding_utf16_le,      // Little-endian UTF16
+               encoding_utf16_be,      // Big-endian UTF16
+               encoding_utf16,         // UTF16 with native endianness
+               encoding_utf32_le,      // Little-endian UTF32
+               encoding_utf32_be,      // Big-endian UTF32
+               encoding_utf32,         // UTF32 with native endianness
+               encoding_wchar,         // The same encoding wchar_t has (either UTF16 or UTF32)
+               encoding_latin1         // Latin-1 (ISO-8859-1) encoding
+       };
+
+       // Formatting flags (bit flags; each constant below occupies a distinct bit)
+       
+       // Indent each node written to the output stream with one indentation string per level of its depth in the DOM tree. This flag is on by default.
+       const unsigned int format_indent = 0x01;
+       
+       // Write encoding-specific BOM to the output stream. This flag is off by default.
+       const unsigned int format_write_bom = 0x02;
+
+       // Use raw output mode (no indentation and no line breaks are written). This flag is off by default.
+       const unsigned int format_raw = 0x04;
+       
+       // Omit default XML declaration even if there is no declaration in the document. This flag is off by default.
+       const unsigned int format_no_declaration = 0x08;
+
+       // Don't escape attribute values and PCDATA contents. This flag is off by default.
+       const unsigned int format_no_escapes = 0x10;
+
+       // Open file using text mode in xml_document::save_file. This enables special character (i.e. new-line) conversions on some systems. This flag is off by default.
+       const unsigned int format_save_file_text = 0x20;
+
+       // The default set of formatting flags (only format_indent is set).
+       // Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none.
+       const unsigned int format_default = format_indent;
+               
+       // Forward declarations (the XPath types are only declared when PUGIXML_NO_XPATH is not defined)
+       struct xml_attribute_struct;
+       struct xml_node_struct;
+
+       class xml_node_iterator;
+       class xml_attribute_iterator;
+       class xml_named_node_iterator;
+
+       class xml_tree_walker;
+
+       struct xml_parse_result;
+
+       class xml_node;
+
+       class xml_text;
+       
+       #ifndef PUGIXML_NO_XPATH
+       class xpath_node;
+       class xpath_node_set;
+       class xpath_query;
+       class xpath_variable_set;
+       #endif
+
+       // Range-based for loop support: holds a pair of iterators of type It and exposes them through begin()/end()
+       template <typename It> class xml_object_range
+       {
+       public:
+               typedef It const_iterator;
+               typedef It iterator;
+
+               xml_object_range(It b, It e): _begin(b), _end(e)
+               {
+               }
+
+               It begin() const { return _begin; }
+               It end() const { return _end; }
+
+       private:
+               It _begin, _end;
+       };
+
+       // Abstract writer interface for node printing (see xml_node::print); derived classes implement write()
+       class PUGIXML_CLASS xml_writer
+       {
+       public:
+               virtual ~xml_writer() {}
+
+               // Write a memory chunk (data pointer plus size) to the underlying stream, file or other destination
+               virtual void write(const void* data, size_t size) = 0;
+       };
+
+       // xml_writer implementation for FILE*
+       class PUGIXML_CLASS xml_writer_file: public xml_writer
+       {
+       public:
+               // Construct writer from a FILE* object; it is passed as void* so that this header does not depend on stdio
+               xml_writer_file(void* file);
+
+               virtual void write(const void* data, size_t size);
+
+       private:
+               void* file;
+       };
+
+       #ifndef PUGIXML_NO_STL
+       // xml_writer implementation for streams (only available when PUGIXML_NO_STL is not defined)
+       class PUGIXML_CLASS xml_writer_stream: public xml_writer
+       {
+       public:
+               // Construct writer from a narrow (char) or wide (wchar_t) output stream object
+               xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream);
+               xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream);
+
+               virtual void write(const void* data, size_t size);
+
+       private:
+               std::basic_ostream<char, std::char_traits<char> >* narrow_stream;
+               std::basic_ostream<wchar_t, std::char_traits<wchar_t> >* wide_stream;
+       };
+       #endif
+
+       // A light-weight handle for manipulating attributes in DOM tree (wraps a single xml_attribute_struct pointer)
+       class PUGIXML_CLASS xml_attribute
+       {
+               friend class xml_attribute_iterator;
+               friend class xml_node;
+
+       private:
+               xml_attribute_struct* _attr;
+       
+               typedef void (*unspecified_bool_type)(xml_attribute***); // Result type of the safe bool conversion operator below
+
+       public:
+               // Default constructor. Constructs an empty attribute.
+               xml_attribute();
+               
+               // Constructs attribute from internal pointer
+               explicit xml_attribute(xml_attribute_struct* attr);
+
+               // Safe bool conversion operator
+               operator unspecified_bool_type() const;
+
+               // Borland C++ workaround
+               bool operator!() const;
+
+               // Comparison operators (compares wrapped attribute pointers)
+               bool operator==(const xml_attribute& r) const;
+               bool operator!=(const xml_attribute& r) const;
+               bool operator<(const xml_attribute& r) const;
+               bool operator>(const xml_attribute& r) const;
+               bool operator<=(const xml_attribute& r) const;
+               bool operator>=(const xml_attribute& r) const;
+
+               // Check if attribute is empty
+               bool empty() const;
+
+               // Get attribute name/value, or "" if attribute is empty
+               const char_t* name() const;
+               const char_t* value() const;
+
+               // Get attribute value, or the default value if attribute is empty
+               const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
+
+               // Get attribute value as a number, or the default value if conversion did not succeed or attribute is empty
+               int as_int(int def = 0) const;
+               unsigned int as_uint(unsigned int def = 0) const;
+               double as_double(double def = 0) const;
+               float as_float(float def = 0) const;
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               long long as_llong(long long def = 0) const;
+               unsigned long long as_ullong(unsigned long long def = 0) const;
+       #endif
+
+               // Get attribute value as bool (returns true if first character is in '1tTyY' set), or the default value if attribute is empty
+               bool as_bool(bool def = false) const;
+
+               // Set attribute name/value (returns false if attribute is empty or there is not enough memory)
+               bool set_name(const char_t* rhs);
+               bool set_value(const char_t* rhs);
+
+               // Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
+               bool set_value(int rhs);
+               bool set_value(unsigned int rhs);
+               bool set_value(double rhs);
+               bool set_value(float rhs);
+               bool set_value(bool rhs);
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               bool set_value(long long rhs);
+               bool set_value(unsigned long long rhs);
+       #endif
+
+               // Set attribute value (equivalent to set_value without error checking); returns a reference to this handle
+               xml_attribute& operator=(const char_t* rhs);
+               xml_attribute& operator=(int rhs);
+               xml_attribute& operator=(unsigned int rhs);
+               xml_attribute& operator=(double rhs);
+               xml_attribute& operator=(float rhs);
+               xml_attribute& operator=(bool rhs);
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               xml_attribute& operator=(long long rhs);
+               xml_attribute& operator=(unsigned long long rhs);
+       #endif
+
+               // Get next/previous attribute in the attribute list of the parent node
+               xml_attribute next_attribute() const;
+               xml_attribute previous_attribute() const;
+
+               // Get hash value (unique for handles to the same object)
+               size_t hash_value() const;
+
+               // Get internal pointer
+               xml_attribute_struct* internal_object() const;
+       };
+
+#ifdef __BORLANDC__
+       // Borland C++ workaround: explicit logical operators combining an xml_attribute with a bool
+       bool PUGIXML_FUNCTION operator&&(const xml_attribute& lhs, bool rhs);
+       bool PUGIXML_FUNCTION operator||(const xml_attribute& lhs, bool rhs);
+#endif
+
+       // A light-weight handle for manipulating nodes in DOM tree (wraps a single xml_node_struct pointer)
+       class PUGIXML_CLASS xml_node
+       {
+               friend class xml_attribute_iterator;
+               friend class xml_node_iterator;
+               friend class xml_named_node_iterator;
+
+       protected:
+               xml_node_struct* _root;
+
+               typedef void (*unspecified_bool_type)(xml_node***); // Result type of the safe bool conversion operator below
+
+       public:
+               // Default constructor. Constructs an empty node.
+               xml_node();
+
+               // Constructs node from internal pointer
+               explicit xml_node(xml_node_struct* p);
+
+               // Safe bool conversion operator
+               operator unspecified_bool_type() const;
+
+               // Borland C++ workaround
+               bool operator!() const;
+       
+               // Comparison operators (compares wrapped node pointers)
+               bool operator==(const xml_node& r) const;
+               bool operator!=(const xml_node& r) const;
+               bool operator<(const xml_node& r) const;
+               bool operator>(const xml_node& r) const;
+               bool operator<=(const xml_node& r) const;
+               bool operator>=(const xml_node& r) const;
+
+               // Check if node is empty.
+               bool empty() const;
+
+               // Get node type
+               xml_node_type type() const;
+
+               // Get node name, or "" if node is empty or it has no name
+               const char_t* name() const;
+
+               // Get node value, or "" if node is empty or it has no value
+               // Note: For <node>text</node> node.value() does not return "text"! Use child_value() or text() methods to access text inside nodes.
+               const char_t* value() const;
+       
+               // Get attribute list
+               xml_attribute first_attribute() const;
+               xml_attribute last_attribute() const;
+
+               // Get children list
+               xml_node first_child() const;
+               xml_node last_child() const;
+
+               // Get next/previous sibling in the children list of the parent node
+               xml_node next_sibling() const;
+               xml_node previous_sibling() const;
+               
+               // Get parent node
+               xml_node parent() const;
+
+               // Get root of DOM tree this node belongs to
+               xml_node root() const;
+
+               // Get text object for the current node
+               xml_text text() const;
+
+               // Get child, attribute or next/previous sibling with the specified name
+               xml_node child(const char_t* name) const;
+               xml_attribute attribute(const char_t* name) const;
+               xml_node next_sibling(const char_t* name) const;
+               xml_node previous_sibling(const char_t* name) const;
+
+               // Get child value of current node; that is, value of the first child node of type PCDATA/CDATA
+               const char_t* child_value() const;
+
+               // Get child value of child with specified name. Equivalent to child(name).child_value().
+               const char_t* child_value(const char_t* name) const;
+
+               // Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value)
+               bool set_name(const char_t* rhs);
+               bool set_value(const char_t* rhs);
+               
+               // Add attribute with specified name. Returns added attribute, or empty attribute on errors.
+               xml_attribute append_attribute(const char_t* name);
+               xml_attribute prepend_attribute(const char_t* name);
+               xml_attribute insert_attribute_after(const char_t* name, const xml_attribute& attr);
+               xml_attribute insert_attribute_before(const char_t* name, const xml_attribute& attr);
+
+               // Add a copy of the specified attribute. Returns added attribute, or empty attribute on errors.
+               xml_attribute append_copy(const xml_attribute& proto);
+               xml_attribute prepend_copy(const xml_attribute& proto);
+               xml_attribute insert_copy_after(const xml_attribute& proto, const xml_attribute& attr);
+               xml_attribute insert_copy_before(const xml_attribute& proto, const xml_attribute& attr);
+
+               // Add child node with specified type. Returns added node, or empty node on errors.
+               xml_node append_child(xml_node_type type = node_element);
+               xml_node prepend_child(xml_node_type type = node_element);
+               xml_node insert_child_after(xml_node_type type, const xml_node& node);
+               xml_node insert_child_before(xml_node_type type, const xml_node& node);
+
+               // Add child element with specified name. Returns added node, or empty node on errors.
+               xml_node append_child(const char_t* name);
+               xml_node prepend_child(const char_t* name);
+               xml_node insert_child_after(const char_t* name, const xml_node& node);
+               xml_node insert_child_before(const char_t* name, const xml_node& node);
+
+               // Add a copy of the specified node as a child. Returns added node, or empty node on errors.
+               xml_node append_copy(const xml_node& proto);
+               xml_node prepend_copy(const xml_node& proto);
+               xml_node insert_copy_after(const xml_node& proto, const xml_node& node);
+               xml_node insert_copy_before(const xml_node& proto, const xml_node& node);
+
+               // Move the specified node to become a child of this node. Returns moved node, or empty node on errors.
+               xml_node append_move(const xml_node& moved);
+               xml_node prepend_move(const xml_node& moved);
+               xml_node insert_move_after(const xml_node& moved, const xml_node& node);
+               xml_node insert_move_before(const xml_node& moved, const xml_node& node);
+
+               // Remove specified attribute
+               bool remove_attribute(const xml_attribute& a);
+               bool remove_attribute(const char_t* name);
+
+               // Remove specified child
+               bool remove_child(const xml_node& n);
+               bool remove_child(const char_t* name);
+
+               // Parses buffer as an XML document fragment and appends all nodes as children of the current node.
+               // Copies/converts the buffer, so it may be deleted or changed after the function returns.
+               // Note: append_buffer allocates memory that has the lifetime of the owning document; removing the appended nodes does not immediately reclaim that memory.
+               xml_parse_result append_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+               // Find attribute using predicate. Returns first attribute for which predicate returned true, or an empty attribute if there is none.
+               template <typename Predicate> xml_attribute find_attribute(Predicate pred) const
+               {
+                       if (!_root) return xml_attribute();
+                       
+                       for (xml_attribute attrib = first_attribute(); attrib; attrib = attrib.next_attribute())
+                               if (pred(attrib))
+                                       return attrib;
+               
+                       return xml_attribute();
+               }
+
+               // Find child node using predicate. Returns first child for which predicate returned true, or an empty node if there is none.
+               template <typename Predicate> xml_node find_child(Predicate pred) const
+               {
+                       if (!_root) return xml_node();
+       
+                       for (xml_node node = first_child(); node; node = node.next_sibling())
+                               if (pred(node))
+                                       return node;
+               
+                       return xml_node();
+               }
+
+               // Find node from subtree using predicate. Returns first node from subtree (depth-first), for which predicate returned true, or an empty node if there is none.
+               template <typename Predicate> xml_node find_node(Predicate pred) const
+               {
+                       if (!_root) return xml_node();
+
+                       xml_node cur = first_child();
+                       
+                       while (cur._root && cur._root != _root)
+                       {
+                               if (pred(cur)) return cur;
+
+                               if (cur.first_child()) cur = cur.first_child();
+                               else if (cur.next_sibling()) cur = cur.next_sibling();
+                               else
+                               {
+                                       while (!cur.next_sibling() && cur._root != _root) cur = cur.parent();
+
+                                       if (cur._root != _root) cur = cur.next_sibling();
+                               }
+                       }
+
+                       return xml_node();
+               }
+
+               // Find child node by attribute name/value
+               xml_node find_child_by_attribute(const char_t* name, const char_t* attr_name, const char_t* attr_value) const;
+               xml_node find_child_by_attribute(const char_t* attr_name, const char_t* attr_value) const;
+
+       #ifndef PUGIXML_NO_STL
+               // Get the absolute node path from root as a text string.
+               string_t path(char_t delimiter = '/') const;
+       #endif
+
+               // Search for a node by path consisting of node names and . or .. elements.
+               xml_node first_element_by_path(const char_t* path, char_t delimiter = '/') const;
+
+               // Recursively traverse subtree with xml_tree_walker
+               bool traverse(xml_tree_walker& walker);
+       
+       #ifndef PUGIXML_NO_XPATH
+               // Select single node by evaluating XPath query. Returns first node from the resulting node set.
+               xpath_node select_node(const char_t* query, xpath_variable_set* variables = 0) const;
+               xpath_node select_node(const xpath_query& query) const;
+
+               // Select node set by evaluating XPath query
+               xpath_node_set select_nodes(const char_t* query, xpath_variable_set* variables = 0) const;
+               xpath_node_set select_nodes(const xpath_query& query) const;
+
+               // (deprecated: use select_node instead) Select single node by evaluating XPath query.
+               xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
+               xpath_node select_single_node(const xpath_query& query) const;
+
+       #endif
+               
+               // Print subtree using a writer object
+               void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
+
+       #ifndef PUGIXML_NO_STL
+               // Print subtree to stream (the wide-stream overload takes no encoding parameter)
+               void print(std::basic_ostream<char, std::char_traits<char> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
+               void print(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const;
+       #endif
+
+               // Child nodes iterators
+               typedef xml_node_iterator iterator;
+
+               iterator begin() const;
+               iterator end() const;
+
+               // Attribute iterators
+               typedef xml_attribute_iterator attribute_iterator;
+
+               attribute_iterator attributes_begin() const;
+               attribute_iterator attributes_end() const;
+
+               // Range-based for support
+               xml_object_range<xml_node_iterator> children() const;
+               xml_object_range<xml_named_node_iterator> children(const char_t* name) const;
+               xml_object_range<xml_attribute_iterator> attributes() const;
+
+               // Get node offset in parsed file/string (in char_t units) for debugging purposes
+               ptrdiff_t offset_debug() const;
+
+               // Get hash value (unique for handles to the same object)
+               size_t hash_value() const;
+
+               // Get internal pointer
+               xml_node_struct* internal_object() const;
+       };
+
+#ifdef __BORLANDC__
+       // Borland C++ workaround: explicit logical operators combining an xml_node with a bool
+       bool PUGIXML_FUNCTION operator&&(const xml_node& lhs, bool rhs);
+       bool PUGIXML_FUNCTION operator||(const xml_node& lhs, bool rhs);
+#endif
+
+       // A helper for working with text inside PCDATA/CDATA nodes (obtained via xml_node::text())
+       class PUGIXML_CLASS xml_text
+       {
+               friend class xml_node;
+
+               xml_node_struct* _root;
+
+               typedef void (*unspecified_bool_type)(xml_text***); // Result type of the safe bool conversion operator below
+
+               explicit xml_text(xml_node_struct* root);
+
+               xml_node_struct* _data_new();
+               xml_node_struct* _data() const;
+
+       public:
+               // Default constructor. Constructs an empty object.
+               xml_text();
+
+               // Safe bool conversion operator
+               operator unspecified_bool_type() const;
+
+               // Borland C++ workaround
+               bool operator!() const;
+
+               // Check if text object is empty
+               bool empty() const;
+
+               // Get text, or "" if object is empty
+               const char_t* get() const;
+
+               // Get text, or the default value if object is empty
+               const char_t* as_string(const char_t* def = PUGIXML_TEXT("")) const;
+
+               // Get text as a number, or the default value if conversion did not succeed or object is empty
+               int as_int(int def = 0) const;
+               unsigned int as_uint(unsigned int def = 0) const;
+               double as_double(double def = 0) const;
+               float as_float(float def = 0) const;
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               long long as_llong(long long def = 0) const;
+               unsigned long long as_ullong(unsigned long long def = 0) const;
+       #endif
+
+               // Get text as bool (returns true if first character is in '1tTyY' set), or the default value if object is empty
+               bool as_bool(bool def = false) const;
+
+               // Set text (returns false if object is empty or there is not enough memory)
+               bool set(const char_t* rhs);
+
+               // Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
+               bool set(int rhs);
+               bool set(unsigned int rhs);
+               bool set(double rhs);
+               bool set(float rhs);
+               bool set(bool rhs);
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               bool set(long long rhs);
+               bool set(unsigned long long rhs);
+       #endif
+
+               // Set text (equivalent to set without error checking); returns a reference to this object
+               xml_text& operator=(const char_t* rhs);
+               xml_text& operator=(int rhs);
+               xml_text& operator=(unsigned int rhs);
+               xml_text& operator=(double rhs);
+               xml_text& operator=(float rhs);
+               xml_text& operator=(bool rhs);
+
+       #ifdef PUGIXML_HAS_LONG_LONG
+               xml_text& operator=(long long rhs);
+               xml_text& operator=(unsigned long long rhs);
+       #endif
+
+               // Get the data node (node_pcdata or node_cdata) for this object
+               xml_node data() const;
+       };
+
+#ifdef __BORLANDC__
+       // Borland C++ workaround: free logical operators taking an xml_text on the left-hand side and a bool on the right
+       bool PUGIXML_FUNCTION operator&&(const xml_text& lhs, bool rhs);
+       bool PUGIXML_FUNCTION operator||(const xml_text& lhs, bool rhs);
+#endif
+
+       // Child node iterator (a bidirectional iterator over a collection of xml_node); value_type is xml_node
+       class PUGIXML_CLASS xml_node_iterator
+       {
+               friend class xml_node;
+
+       private:
+               mutable xml_node _wrap; // NOTE(review): presumably the node exposed by the const operator*/operator-> - confirm in pugixml.cpp
+               xml_node _parent;
+
+               xml_node_iterator(xml_node_struct* ref, xml_node_struct* parent); // private; xml_node can call it through the friend declaration above
+
+       public:
+               // Iterator traits (iterator_category is declared only when PUGIXML_NO_STL is not defined)
+               typedef ptrdiff_t difference_type;
+               typedef xml_node value_type;
+               typedef xml_node* pointer;
+               typedef xml_node& reference;
+
+       #ifndef PUGIXML_NO_STL
+               typedef std::bidirectional_iterator_tag iterator_category;
+       #endif
+
+               // Default constructor
+               xml_node_iterator();
+
+               // Construct an iterator which points to the specified node
+               xml_node_iterator(const xml_node& node);
+
+               // Iterator operators: equality comparison, dereference, pre/post increment and pre/post decrement
+               bool operator==(const xml_node_iterator& rhs) const;
+               bool operator!=(const xml_node_iterator& rhs) const;
+
+               xml_node& operator*() const;
+               xml_node* operator->() const;
+
+               const xml_node_iterator& operator++();
+               xml_node_iterator operator++(int);
+
+               const xml_node_iterator& operator--();
+               xml_node_iterator operator--(int);
+       };
+
+       // Attribute iterator (a bidirectional iterator over a collection of xml_attribute); value_type is xml_attribute
+       class PUGIXML_CLASS xml_attribute_iterator
+       {
+               friend class xml_node;
+
+       private:
+               mutable xml_attribute _wrap; // NOTE(review): presumably the attribute exposed by the const operator*/operator-> - confirm in pugixml.cpp
+               xml_node _parent;
+
+               xml_attribute_iterator(xml_attribute_struct* ref, xml_node_struct* parent); // private; xml_node can call it through the friend declaration above
+
+       public:
+               // Iterator traits (iterator_category is declared only when PUGIXML_NO_STL is not defined)
+               typedef ptrdiff_t difference_type;
+               typedef xml_attribute value_type;
+               typedef xml_attribute* pointer;
+               typedef xml_attribute& reference;
+
+       #ifndef PUGIXML_NO_STL
+               typedef std::bidirectional_iterator_tag iterator_category;
+       #endif
+
+               // Default constructor
+               xml_attribute_iterator();
+
+               // Construct an iterator which points to the specified attribute; the owning node is passed separately as parent
+               xml_attribute_iterator(const xml_attribute& attr, const xml_node& parent);
+
+               // Iterator operators: equality comparison, dereference, pre/post increment and pre/post decrement
+               bool operator==(const xml_attribute_iterator& rhs) const;
+               bool operator!=(const xml_attribute_iterator& rhs) const;
+
+               xml_attribute& operator*() const;
+               xml_attribute* operator->() const;
+
+               const xml_attribute_iterator& operator++();
+               xml_attribute_iterator operator++(int);
+
+               const xml_attribute_iterator& operator--();
+               xml_attribute_iterator operator--(int);
+       };
+
+       // Named node range helper: an iterator over xml_node that additionally stores a name (see _name below)
+       class PUGIXML_CLASS xml_named_node_iterator
+       {
+               friend class xml_node;
+
+       public:
+               // Iterator traits (iterator_category is declared only when PUGIXML_NO_STL is not defined)
+               typedef ptrdiff_t difference_type;
+               typedef xml_node value_type;
+               typedef xml_node* pointer;
+               typedef xml_node& reference;
+
+       #ifndef PUGIXML_NO_STL
+               typedef std::bidirectional_iterator_tag iterator_category;
+       #endif
+
+               // Default constructor
+               xml_named_node_iterator();
+
+               // Construct an iterator which points to the specified node; name is taken as a raw const char_t* pointer
+               xml_named_node_iterator(const xml_node& node, const char_t* name);
+
+               // Iterator operators: equality comparison, dereference, pre/post increment and pre/post decrement
+               bool operator==(const xml_named_node_iterator& rhs) const;
+               bool operator!=(const xml_named_node_iterator& rhs) const;
+
+               xml_node& operator*() const;
+               xml_node* operator->() const;
+
+               const xml_named_node_iterator& operator++();
+               xml_named_node_iterator operator++(int);
+
+               const xml_named_node_iterator& operator--();
+               xml_named_node_iterator operator--(int);
+
+       private:
+               mutable xml_node _wrap; // NOTE(review): presumably the node exposed by the const operator*/operator-> - confirm in pugixml.cpp
+               xml_node _parent;
+               const char_t* _name; // raw pointer; NOTE(review): presumably not owned, so the name must outlive the iterator - confirm
+
+               xml_named_node_iterator(xml_node_struct* ref, xml_node_struct* parent, const char_t* name); // private; available to xml_node (friend)
+       };
+
+       // Abstract tree walker class (see xml_node::traverse); abstract because for_each is pure virtual
+       class PUGIXML_CLASS xml_tree_walker
+       {
+               friend class xml_node;
+
+       private:
+               int _depth; // private; xml_node has access through the friend declaration above
+       
+       protected:
+               // Get current traversal depth (protected: intended for use from derived walkers)
+               int depth() const;
+       
+       public:
+               xml_tree_walker();
+               virtual ~xml_tree_walker();
+
+               // Callback that is called when traversal begins (virtual, not pure: overriding is optional)
+               virtual bool begin(xml_node& node);
+
+               // Callback that is called for each node traversed (pure virtual: derived classes must implement it)
+               virtual bool for_each(xml_node& node) = 0;
+
+               // Callback that is called when traversal ends (virtual, not pure: overriding is optional)
+               virtual bool end(xml_node& node);
+       };
+
+       // Parsing status, returned as part of xml_parse_result object (status_ok is explicitly 0; the remaining values follow sequentially)
+       enum xml_parse_status
+       {
+               status_ok = 0,                          // No error
+
+               status_file_not_found,          // File was not found during load_file()
+               status_io_error,                        // Error reading from file/stream
+               status_out_of_memory,           // Could not allocate memory
+               status_internal_error,          // Internal error occurred
+
+               status_unrecognized_tag,        // Parser could not determine tag type
+
+               status_bad_pi,                          // Parsing error occurred while parsing document declaration/processing instruction
+               status_bad_comment,                     // Parsing error occurred while parsing comment
+               status_bad_cdata,                       // Parsing error occurred while parsing CDATA section
+               status_bad_doctype,                     // Parsing error occurred while parsing document type declaration
+               status_bad_pcdata,                      // Parsing error occurred while parsing PCDATA section
+               status_bad_start_element,       // Parsing error occurred while parsing start element tag
+               status_bad_attribute,           // Parsing error occurred while parsing element attribute
+               status_bad_end_element,         // Parsing error occurred while parsing end element tag
+               status_end_element_mismatch,// There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag)
+
+               status_append_invalid_root,     // Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer)
+
+               status_no_document_element      // Parsing resulted in a document without element nodes
+       };
+
+       // Parsing result: status code, last parsed offset and source encoding of a load operation
+       struct PUGIXML_CLASS xml_parse_result
+       {
+               // Parsing status (see xml_parse_status)
+               xml_parse_status status;
+
+               // Last parsed offset (in char_t units from start of input data)
+               ptrdiff_t offset;
+
+               // Source document encoding
+               xml_encoding encoding;
+
+               // Default constructor, initializes object to failed state
+               xml_parse_result();
+
+               // Cast to bool operator (implicit; NOTE(review): presumably true when status is status_ok - confirm in pugixml.cpp)
+               operator bool() const;
+
+               // Get error description
+               const char* description() const;
+       };
+
+       // Document class (DOM tree root); publicly derives from xml_node and is non-copyable
+       class PUGIXML_CLASS xml_document: public xml_node
+       {
+       private:
+               char_t* _buffer;
+
+               char _memory[192]; // fixed 192-byte raw storage; NOTE(review): its use is defined in pugixml.cpp, not visible here
+               
+               // Non-copyable semantics: copy constructor and copy assignment are declared private
+               xml_document(const xml_document&);
+               const xml_document& operator=(const xml_document&);
+
+               void create();
+               void destroy();
+
+       public:
+               // Default constructor, makes empty document
+               xml_document();
+
+               // Destructor, invalidates all node/attribute handles to this document
+               ~xml_document();
+
+               // Removes all nodes, leaving the empty document
+               void reset();
+
+               // Removes all nodes, then copies the entire contents of the specified document
+               void reset(const xml_document& proto);
+
+       #ifndef PUGIXML_NO_STL
+               // Load document from stream (char and wchar_t stream overloads; the wchar_t overload takes no encoding argument).
+               xml_parse_result load(std::basic_istream<char, std::char_traits<char> >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+               xml_parse_result load(std::basic_istream<wchar_t, std::char_traits<wchar_t> >& stream, unsigned int options = parse_default);
+       #endif
+
+               // (deprecated: use load_string instead) Load document from zero-terminated string. No encoding conversions are applied.
+               xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
+
+               // Load document from zero-terminated string. No encoding conversions are applied.
+               xml_parse_result load_string(const char_t* contents, unsigned int options = parse_default);
+
+               // Load document from file (overloads for narrow-character and wide-character paths)
+               xml_parse_result load_file(const char* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+               xml_parse_result load_file(const wchar_t* path, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+               // Load document from buffer. Copies/converts the buffer, so it may be deleted or changed after the function returns.
+               xml_parse_result load_buffer(const void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+               // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
+               // You should ensure that buffer data will persist throughout the document's lifetime, and free the buffer memory manually once document is destroyed.
+               xml_parse_result load_buffer_inplace(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+               // Load document from buffer, using the buffer for in-place parsing (the buffer is modified and used for storage of document data).
+               // You should allocate the buffer with pugixml allocation function; document will free the buffer when it is no longer needed (you can't use it anymore).
+               xml_parse_result load_buffer_inplace_own(void* contents, size_t size, unsigned int options = parse_default, xml_encoding encoding = encoding_auto);
+
+               // Save XML document to writer (semantics is slightly different from xml_node::print, see documentation for details).
+               void save(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+
+       #ifndef PUGIXML_NO_STL
+               // Save XML document to stream (semantics is slightly different from xml_node::print, see documentation for details).
+               void save(std::basic_ostream<char, std::char_traits<char> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+               void save(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const;
+       #endif
+
+               // Save XML to file (overloads for narrow-character and wide-character paths)
+               bool save_file(const char* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+               bool save_file(const wchar_t* path, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const;
+
+               // Get document element
+               xml_node document_element() const;
+       };
+
+#ifndef PUGIXML_NO_XPATH
+       // XPath query return type (also used as the type tag of xpath_variable)
+       enum xpath_value_type
+       {
+               xpath_type_none,          // Unknown type (query failed to compile)
+               xpath_type_node_set,  // Node set (xpath_node_set)
+               xpath_type_number,        // Number
+               xpath_type_string,        // String
+               xpath_type_boolean        // Boolean
+       };
+
+       // XPath parsing result: error message pointer plus the last parsed offset
+       struct PUGIXML_CLASS xpath_parse_result
+       {
+               // Error message (0 if no error)
+               const char* error;
+
+               // Last parsed offset (in char_t units from string start)
+               ptrdiff_t offset;
+
+               // Default constructor, initializes object to failed state
+               xpath_parse_result();
+
+               // Cast to bool operator (implicit; NOTE(review): presumably true when error is 0 - confirm in pugixml.cpp)
+               operator bool() const;
+
+               // Get error description
+               const char* description() const;
+       };
+
+       // A single XPath variable; the constructor is protected and xpath_variable_set is a friend
+       class PUGIXML_CLASS xpath_variable
+       {
+               friend class xpath_variable_set;
+
+       protected:
+               xpath_value_type _type;
+               xpath_variable* _next; // pointer to another xpath_variable; NOTE(review): presumably a singly linked chain managed by xpath_variable_set - confirm
+
+               xpath_variable();
+
+               // Non-copyable semantics (copy constructor and copy assignment are declared protected)
+               xpath_variable(const xpath_variable&);
+               xpath_variable& operator=(const xpath_variable&);
+               
+       public:
+               // Get variable name
+               const char_t* name() const;
+
+               // Get variable type
+               xpath_value_type type() const;
+
+               // Get variable value; no type conversion is performed, default value (false, NaN, empty string, empty node set) is returned on type mismatch error
+               bool get_boolean() const;
+               double get_number() const;
+               const char_t* get_string() const;
+               const xpath_node_set& get_node_set() const;
+
+               // Set variable value; no type conversion is performed, false is returned on type mismatch error
+               bool set(bool value);
+               bool set(double value);
+               bool set(const char_t* value);
+               bool set(const xpath_node_set& value);
+       };
+
+       // A set of XPath variables, looked up by name; non-copyable
+       class PUGIXML_CLASS xpath_variable_set
+       {
+       private:
+               xpath_variable* _data[64]; // 64 xpath_variable pointers; NOTE(review): presumably hash buckets chained via xpath_variable::_next - confirm
+
+               // Non-copyable semantics (copy constructor and copy assignment are declared private)
+               xpath_variable_set(const xpath_variable_set&);
+               xpath_variable_set& operator=(const xpath_variable_set&);
+
+               xpath_variable* find(const char_t* name) const;
+
+       public:
+               // Default constructor/destructor
+               xpath_variable_set();
+               ~xpath_variable_set();
+
+               // Add a new variable or get the existing one, if the types match
+               xpath_variable* add(const char_t* name, xpath_value_type type);
+
+               // Set value of an existing variable; no type conversion is performed, false is returned if there is no such variable or if types mismatch
+               bool set(const char_t* name, bool value);
+               bool set(const char_t* name, double value);
+               bool set(const char_t* name, const char_t* value);
+               bool set(const char_t* name, const xpath_node_set& value);
+
+               // Get existing variable by name (non-const and const overloads)
+               xpath_variable* get(const char_t* name);
+               const xpath_variable* get(const char_t* name) const;
+       };
+
+       // A compiled XPath query object; non-copyable
+       class PUGIXML_CLASS xpath_query
+       {
+       private:
+               void* _impl;
+               xpath_parse_result _result;
+
+               typedef void (*unspecified_bool_type)(xpath_query***); // result type of the safe bool conversion operator below
+
+               // Non-copyable semantics (copy constructor and copy assignment are declared private)
+               xpath_query(const xpath_query&);
+               xpath_query& operator=(const xpath_query&);
+
+       public:
+               // Construct a compiled object from XPath expression.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on compilation errors.
+               explicit xpath_query(const char_t* query, xpath_variable_set* variables = 0);
+
+               // Destructor
+               ~xpath_query();
+
+               // Get query expression return type
+               xpath_value_type return_type() const;
+               
+               // Evaluate expression as boolean value in the specified context; performs type conversion if necessary.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+               bool evaluate_boolean(const xpath_node& n) const;
+               
+               // Evaluate expression as double value in the specified context; performs type conversion if necessary.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+               double evaluate_number(const xpath_node& n) const;
+               
+       #ifndef PUGIXML_NO_STL
+               // Evaluate expression as string value in the specified context; performs type conversion if necessary.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+               string_t evaluate_string(const xpath_node& n) const;
+       #endif
+               
+               // Evaluate expression as string value in the specified context; performs type conversion if necessary.
+               // At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero).
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
+               // If PUGIXML_NO_EXCEPTIONS is defined, an empty string is produced instead (NOTE(review): this line used to read "empty  set"; confirm against pugixml.cpp).
+               size_t evaluate_string(char_t* buffer, size_t capacity, const xpath_node& n) const;
+
+               // Evaluate expression as node set in the specified context.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors.
+               // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node set instead.
+               xpath_node_set evaluate_node_set(const xpath_node& n) const;
+
+               // Evaluate expression as node set in the specified context.
+               // Return first node in document order, or empty node if node set is empty.
+               // If PUGIXML_NO_EXCEPTIONS is not defined, throws xpath_exception on type mismatch and std::bad_alloc on out of memory errors.
+               // If PUGIXML_NO_EXCEPTIONS is defined, returns empty node instead.
+               xpath_node evaluate_node(const xpath_node& n) const;
+
+               // Get parsing result (used to get compilation errors in PUGIXML_NO_EXCEPTIONS mode)
+               const xpath_parse_result& result() const;
+
+               // Safe bool conversion operator
+               operator unspecified_bool_type() const;
+
+               // Borland C++ workaround
+               bool operator!() const;
+       };
+       
+       #ifndef PUGIXML_NO_EXCEPTIONS
+       // XPath exception class (only declared when PUGIXML_NO_EXCEPTIONS is not defined); wraps an xpath_parse_result
+       class PUGIXML_CLASS xpath_exception: public std::exception
+       {
+       private:
+               xpath_parse_result _result;
+
+       public:
+               // Construct exception from parse result
+               explicit xpath_exception(const xpath_parse_result& result);
+
+               // Get error message (virtual what() with the std::exception signature; declared throw())
+               virtual const char* what() const throw();
+
+               // Get parse result
+               const xpath_parse_result& result() const;
+       };
+       #endif
+       
+       // XPath node class (either xml_node or xml_attribute); holds one member of each type
+       class PUGIXML_CLASS xpath_node
+       {
+       private:
+               xml_node _node;
+               xml_attribute _attribute;
+       
+               typedef void (*unspecified_bool_type)(xpath_node***); // result type of the safe bool conversion operator below
+
+       public:
+               // Default constructor; constructs empty XPath node
+               xpath_node();
+               
+               // Construct XPath node from XML node/attribute (the attribute form also takes the attribute's parent node)
+               xpath_node(const xml_node& node);
+               xpath_node(const xml_attribute& attribute, const xml_node& parent);
+
+               // Get node/attribute, if any
+               xml_node node() const;
+               xml_attribute attribute() const;
+               
+               // Get parent of contained node/attribute
+               xml_node parent() const;
+
+               // Safe bool conversion operator
+               operator unspecified_bool_type() const;
+               
+               // Borland C++ workaround
+               bool operator!() const;
+
+               // Comparison operators (equality and inequality only)
+               bool operator==(const xpath_node& n) const;
+               bool operator!=(const xpath_node& n) const;
+       };
+
+#ifdef __BORLANDC__
+       // Borland C++ workaround: free logical operators taking an xpath_node on the left-hand side and a bool on the right
+       bool PUGIXML_FUNCTION operator&&(const xpath_node& lhs, bool rhs);
+       bool PUGIXML_FUNCTION operator||(const xpath_node& lhs, bool rhs);
+#endif
+
+       // A fixed-size collection of XPath nodes; copyable (public copy constructor and assignment operator)
+       class PUGIXML_CLASS xpath_node_set
+       {
+       public:
+               // Collection type
+               enum type_t
+               {
+                       type_unsorted,                  // Not ordered
+                       type_sorted,                    // Sorted by document order (ascending)
+                       type_sorted_reverse             // Sorted by document order (descending)
+               };
+               
+               // Constant iterator type (a plain pointer to const xpath_node)
+               typedef const xpath_node* const_iterator;
+
+               // We define non-constant iterator to be the same as constant iterator so that various generic algorithms (i.e. boost foreach) work
+               typedef const xpath_node* iterator;
+       
+               // Default constructor. Constructs empty set.
+               xpath_node_set();
+
+               // Constructs a set from iterator range; data is not checked for duplicates and is not sorted according to provided type, so be careful
+               xpath_node_set(const_iterator begin, const_iterator end, type_t type = type_unsorted);
+
+               // Destructor
+               ~xpath_node_set();
+               
+               // Copy constructor/assignment operator
+               xpath_node_set(const xpath_node_set& ns);
+               xpath_node_set& operator=(const xpath_node_set& ns);
+
+               // Get collection type
+               type_t type() const;
+               
+               // Get collection size
+               size_t size() const;
+
+               // Indexing operator (returns a const reference; NOTE(review): bounds handling is not visible here - confirm in pugixml.cpp)
+               const xpath_node& operator[](size_t index) const;
+               
+               // Collection iterators
+               const_iterator begin() const;
+               const_iterator end() const;
+
+               // Sort the collection in ascending/descending order by document order (reverse defaults to false)
+               void sort(bool reverse = false);
+               
+               // Get first node in the collection by document order
+               xpath_node first() const;
+               
+               // Check if collection is empty
+               bool empty() const;
+       
+       private:
+               type_t _type;
+               
+               xpath_node _storage; // one xpath_node held by value; NOTE(review): presumably inline storage for single-element sets - confirm
+               
+               xpath_node* _begin;
+               xpath_node* _end;
+
+               void _assign(const_iterator begin, const_iterator end);
+       };
+#endif
+
+#ifndef PUGIXML_NO_STL
+       // Convert wide string to UTF8 (overloads for a wchar_t pointer and for a basic_string<wchar_t>); result is a basic_string<char>
+       std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const wchar_t* str);
+       std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >& str);
+       
+       // Convert UTF8 to wide string (overloads for a char pointer and for a basic_string<char>); result is a basic_string<wchar_t>
+       std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const char* str);
+       std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const std::basic_string<char, std::char_traits<char>, std::allocator<char> >& str);
+#endif
+
+       // Memory allocation function interface; takes the requested size and returns pointer to allocated memory or NULL on failure
+       typedef void* (*allocation_function)(size_t size);
+       
+       // Memory deallocation function interface; takes the pointer to release and returns nothing
+       typedef void (*deallocation_function)(void* ptr);
+
+       // Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions.
+       void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate);
+       
+       // Get current memory management functions (separate getters for the allocation and the deallocation function)
+       allocation_function PUGIXML_FUNCTION get_memory_allocation_function();
+       deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function();
+}
+
+#if !defined(PUGIXML_NO_STL) && (defined(_MSC_VER) || defined(__ICC))
+namespace std
+{
+       // Workarounds for (non-standard) iterator category detection for older versions (MSVC7/IC8 and earlier); one _Iter_cat overload per pugi iterator type
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_node_iterator&);
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_attribute_iterator&);
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION _Iter_cat(const pugi::xml_named_node_iterator&);
+}
+#endif
+
+#if !defined(PUGIXML_NO_STL) && defined(__SUNPRO_CC)
+namespace std
+{
+       // Workarounds for (non-standard) iterator category detection when __SUNPRO_CC is defined; one __iterator_category overload per pugi iterator type
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_node_iterator&);
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_attribute_iterator&);
+       std::bidirectional_iterator_tag PUGIXML_FUNCTION __iterator_category(const pugi::xml_named_node_iterator&);
+}
+#endif
+
+#endif
+
+// Make sure implementation is included in header-only mode (PUGIXML_HEADER_ONLY defined and PUGIXML_SOURCE not yet defined)
+// Use macro expansion in #include to work around QMake (QTBUG-11923)
+#if defined(PUGIXML_HEADER_ONLY) && !defined(PUGIXML_SOURCE)
+#      define PUGIXML_SOURCE "pugixml.cpp"
+#      include PUGIXML_SOURCE
+#endif
+
+/**
+ * Copyright (c) 2006-2014 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/src/vcf/VcfFormat.cpp b/src/vcf/VcfFormat.cpp

new file mode 100644 (file)

index 0000000..2e2ddd8
--- /dev/null
+++ b/src/vcf/VcfFormat.cpp
@@ -0,0 +1,572 @@
+// Author: Derek Barnett
+
+#include <pbbam/vcf/VcfFormat.h>
+
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#include <htslib/vcf.h>
+
+#include <pbbam/StringUtilities.h>
+#include <pbbam/vcf/VcfHeader.h>
+
+namespace PacBio {
+namespace VCF {
+
+namespace {  // anonymous
+
+// VCF version given to newly created headers; using htslib's current version for better compatibility
+static constexpr const char current_version[] = "VCFv4.2";
+
+namespace Tokens {  // string constants shared by the header formatters and parsers in this file
+
+static constexpr const char file_format[] = "fileformat";  // ID of the definition that must come first
+
+static constexpr const char double_hash[] = "##";  // prefix of every general definition line
+static constexpr const char contig_lead[] = "##contig=<";  // start of a contig definition line
+static constexpr const char filter_lead[] = "##FILTER=<";  // start of a FILTER definition line
+static constexpr const char format_lead[] = "##FORMAT=<";  // start of a FORMAT definition line
+static constexpr const char info_lead[] = "##INFO=<";  // start of an INFO definition line
+static constexpr const char chrom_lead[] = "#CHROM";  // start of the column header line
+
+static constexpr const char id[] = "ID";  // keys recognized inside the <...> definitions
+static constexpr const char number[] = "Number";
+static constexpr const char type[] = "Type";
+static constexpr const char description[] = "Description";  // value is double-quoted
+static constexpr const char source[] = "Source";  // value is double-quoted (INFO only)
+static constexpr const char version[] = "Version";  // value is double-quoted (INFO only)
+
+}  // namespace Tokens
+
+// Returns 'd' wrapped in a pair of double quotes.
+std::string QuotedText(const std::string& d)
+{
+    std::string quoted;
+    quoted.reserve(d.size() + 2);
+    quoted += '"';
+    quoted += d;
+    quoted += '"';
+    return quoted;
+}
+
+// Strips the enclosing double quotes from 'd'.
+//
+// Throws std::runtime_error if 'd' is not wrapped in a pair of double quotes.
+std::string UnquotedText(const std::string& d)
+{
+    // size check comes first so front()/back() are never called on an empty string
+    const bool isQuoted = (d.size() >= 2) && (d.front() == '"') && (d.back() == '"');
+    if (!isQuoted)
+        throw std::runtime_error{"VCF format error: description text not quoted: " + d};
+
+    return std::string(d.begin() + 1, d.end() - 1);
+}
+
+}  // namespace anonymous
+
+const char* VcfFormat::CurrentVersion() { return current_version; }  // points at static storage ("VCFv4.2")
+
+std::string VcfFormat::FormattedContigDefinition(const ContigDefinition& def)
+{
+    std::ostringstream text;
+
+    // ID
+    text << Tokens::contig_lead << Tokens::id << '=' << def.Id();
+
+    // attributes
+    if (!def.Attributes().empty()) {
+        text << ',';
+        bool first = true;
+        for (const auto& attr : def.Attributes()) {
+            if (!first) text << ',';
+            text << attr.first << '=' << attr.second;
+            first = false;
+        }
+    }
+    text << '>';
+    return text.str();
+}
+
+// Renders a filter definition as "##FILTER=<ID=...,Description="...">".
+std::string VcfFormat::FormattedFilterDefinition(const FilterDefinition& def)
+{
+    std::ostringstream out;
+    out << Tokens::filter_lead;
+    out << Tokens::id << '=' << def.Id();
+    out << ',' << Tokens::description << '=' << QuotedText(def.Description());
+    out << '>';
+    return out.str();
+}
+
+std::string VcfFormat::FormattedFormatDefinition(const FormatDefinition& def)
+{
+    std::ostringstream text;
+    text << Tokens::format_lead << Tokens::id << '=' << def.Id() << ',' << Tokens::number << '='
+         << def.Number() << ',' << Tokens::type << '=' << def.Type() << ',' << Tokens::description
+         << '=' << QuotedText(def.Description()) << '>';
+    return text.str();
+}
+
+// Renders a general definition as "##<id>=<text>".
+std::string VcfFormat::FormattedGeneralDefinition(const GeneralDefinition& def)
+{
+    std::ostringstream out;
+    out << Tokens::double_hash;
+    out << def.Id();
+    out << '=';
+    out << def.Text();
+    return out.str();
+}
+
+std::string VcfFormat::FormattedInfoDefinition(const InfoDefinition& def)
+{
+    std::ostringstream text;
+    text << Tokens::info_lead << Tokens::id << '=' << def.Id() << ',' << Tokens::number << '='
+         << def.Number() << ',' << Tokens::type << '=' << def.Type() << ',' << Tokens::description
+         << '=' << QuotedText(def.Description());
+
+    if (def.Source().is_initialized() && !def.Source().get().empty())
+        text << ',' << Tokens::source << '=' << QuotedText(def.Source().get());
+
+    if (def.Version().is_initialized() && !def.Version().get().empty())
+        text << ',' << Tokens::version << '=' << QuotedText(def.Version().get());
+
+    text << '>';
+    return text.str();
+}
+
+std::string VcfFormat::FormattedHeader(const VcfHeader& header)
+{
+    std::ostringstream out;
+
+    const auto& fileformat = header.GeneralDefinition(Tokens::file_format);
+    out << FormattedGeneralDefinition(fileformat) << '\n';
+
+    // remaining general definiitions
+    for (const auto& def : header.GeneralDefinitions()) {
+        if (def.Id() != Tokens::file_format) out << FormattedGeneralDefinition(def) << '\n';
+    }
+
+    // ##contig
+    for (const auto& contig : header.ContigDefinitions())
+        out << FormattedContigDefinition(contig) << '\n';
+
+    // ##FILTER
+    for (const auto& filter : header.FilterDefinitions())
+        out << FormattedFilterDefinition(filter) << '\n';
+
+    // ##INFO
+    for (const auto& info : header.InfoDefinitions())
+        out << FormattedInfoDefinition(info) << '\n';
+
+    // ##FORMAT
+    for (const auto& format : header.FormatDefinitions())
+        out << FormattedFormatDefinition(format) << '\n';
+
+    // fixed headers
+    out << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
+
+    // samples
+    const auto& samples = header.Samples();
+    if (!samples.empty()) {
+        out << "\tFORMAT";
+        for (const auto& sample : samples)
+            out << '\t' << sample;
+    }
+
+    return out.str();
+}
+
+ContigDefinition VcfFormat::ParsedContigDefinition(std::string line)
+{
+    // should already be checked by "normal" code path
+    assert(line.find(Tokens::contig_lead) == 0);
+
+    // substring between brackets
+    const auto lastBracketPos = line.find_last_of('>');
+    if (lastBracketPos == std::string::npos)
+        throw std::runtime_error{"VCF format error: malformed ##contig line: " + line};
+    line = std::string(line.cbegin() + 10, line.cbegin() + lastBracketPos);
+
+    std::string id;
+    std::vector<std::pair<std::string, std::string>> attributes;
+
+    const auto fields = PacBio::BAM::Split(line, ',');
+    for (const auto& field : fields) {
+        const auto tokens = PacBio::BAM::Split(field, '=');
+        if (tokens.size() != 2) {
+            throw std::runtime_error{"VCF format error: malformed ##contig line: " + line};
+        }
+        if (tokens[0] == Tokens::id)
+            id = tokens[1];
+        else
+            attributes.push_back(std::make_pair(tokens[0], tokens[1]));
+    }
+
+    return ContigDefinition{std::move(id), std::move(attributes)};
+}
+
+FilterDefinition VcfFormat::ParsedFilterDefinition(std::string line)
+{
+    // should already be checked by "normal" code path
+    assert(line.find(Tokens::filter_lead) == 0);
+
+    // substring between brackets
+    const auto lastBracketPos = line.find_last_of('>');
+    if (lastBracketPos == std::string::npos)
+        throw std::runtime_error{"VCF format error: malformed FILTER line: " + line};
+    line = std::string(line.cbegin() + 10, line.cbegin() + lastBracketPos);
+
+    std::string id;
+    std::string description;
+
+    const auto fields = PacBio::BAM::Split(line, ',');
+    for (const auto& field : fields) {
+        const auto tokens = PacBio::BAM::Split(field, '=');
+        if (tokens.size() != 2) {
+            throw std::runtime_error{"VCF format error: malformed FILTER line: " + line};
+        }
+        if (tokens[0] == Tokens::id)
+            id = tokens[1];
+        else if (tokens[0] == Tokens::description) {
+            description = UnquotedText(tokens[1]);
+        } else
+            throw std::runtime_error{"VCF format error: unrecognized FILTER field: " + tokens[0]};
+    }
+
+    return FilterDefinition{std::move(id), std::move(description)};
+}
+
+FormatDefinition VcfFormat::ParsedFormatDefinition(std::string line)
+{
+    // should already be checked by "normal" code path
+    assert(line.find(Tokens::format_lead) == 0);
+
+    // substring between brackets
+    const auto lastBracketPos = line.find_last_of('>');
+    if (lastBracketPos == std::string::npos)
+        throw std::runtime_error{"VCF format error: malformed FORMAT line: " + line};
+    line = std::string(line.cbegin() + 10, line.cbegin() + lastBracketPos);
+
+    std::string id;
+    std::string number;
+    std::string type;
+    std::string description;
+
+    const auto fields = PacBio::BAM::Split(line, ',');
+    for (const auto& field : fields) {
+        const auto tokens = PacBio::BAM::Split(field, '=');
+        if (tokens.size() != 2) {
+            throw std::runtime_error{"VCF format error: malformed FORMAT line: " + line};
+        }
+        if (tokens[0] == Tokens::id)
+            id = tokens[1];
+        else if (tokens[0] == Tokens::number)
+            number = tokens[1];
+        else if (tokens[0] == Tokens::type)
+            type = tokens[1];
+        else if (tokens[0] == Tokens::description) {
+            description = UnquotedText(tokens[1]);
+        } else
+            throw std::runtime_error{"VCF format error: unrecognized FORMAT field: " + tokens[0]};
+    }
+
+    return FormatDefinition{std::move(id), std::move(number), std::move(type),
+                            std::move(description)};
+}
+
+GeneralDefinition VcfFormat::ParsedGeneralDefinition(const std::string& line)
+{
+    const auto tokens = PacBio::BAM::Split(line, '=');
+    if (tokens.size() != 2 || tokens[0].find(Tokens::double_hash) != 0) {
+        throw std::runtime_error{"VCF format error: malformed header line: " + line};
+    }
+
+    return GeneralDefinition{tokens[0].substr(2), tokens[1]};
+}
+
+InfoDefinition VcfFormat::ParsedInfoDefinition(std::string line)
+{
+    // should already be checked by "normal" code path
+    assert(line.find(Tokens::info_lead) == 0);
+
+    // substring between brackets
+    const auto lastBracketPos = line.find_last_of('>');
+    if (lastBracketPos == std::string::npos)
+        throw std::runtime_error{"VCF format error: malformed INFO line: " + line};
+    line = std::string(line.cbegin() + 8, line.cbegin() + lastBracketPos);
+
+    std::string id;
+    std::string number;
+    std::string type;
+    std::string description;
+    std::string source;
+    std::string version;
+
+    const auto fields = PacBio::BAM::Split(line, ',');
+    for (const auto& field : fields) {
+        const auto tokens = PacBio::BAM::Split(field, '=');
+        if (tokens.size() != 2) {
+            throw std::runtime_error{"VCF format error: malformed INFO line: " + line};
+        }
+        if (tokens[0] == Tokens::id)
+            id = tokens[1];
+        else if (tokens[0] == Tokens::number)
+            number = tokens[1];
+        else if (tokens[0] == Tokens::type)
+            type = tokens[1];
+        else if (tokens[0] == Tokens::description) {
+            description = UnquotedText(tokens[1]);
+        } else if (tokens[0] == Tokens::source) {
+            source = UnquotedText(tokens[1]);
+        } else if (tokens[0] == Tokens::version) {
+            version = UnquotedText(tokens[1]);
+        } else
+            throw std::runtime_error{"VCF format error: unrecognized INFO field: " + tokens[0]};
+    }
+
+    return InfoDefinition{std::move(id),          std::move(number), std::move(type),
+                          std::move(description), std::move(source), std::move(version)};
+}
+
+VcfHeader VcfFormat::ParsedHeader(const std::string& hdrText)
+{
+    VcfHeader hdr;
+
+    std::istringstream text{hdrText};
+    std::string line;
+
+    // quick check for fileformat - should be the first line
+    std::getline(text, line);
+    {
+        auto genDef = ParsedGeneralDefinition(line);
+        if (genDef.Id() != Tokens::file_format)
+            throw std::runtime_error{"VCF format error: file must begin with #fileformat line"};
+        hdr.AddGeneralDefinition(std::move(genDef));
+    }
+
+    // read through rest of header
+    bool chromLineFound = false;
+    for (; std::getline(text, line);) {
+        if (line.empty()) continue;
+
+        // info line
+        if (line.find(Tokens::info_lead) == 0) hdr.AddInfoDefinition(ParsedInfoDefinition(line));
+
+        // filter line
+        else if (line.find(Tokens::filter_lead) == 0)
+            hdr.AddFilterDefinition(ParsedFilterDefinition(line));
+
+        // format line
+        else if (line.find(Tokens::format_lead) == 0)
+            hdr.AddFormatDefinition(ParsedFormatDefinition(line));
+
+        // contig line
+        else if (line.find(Tokens::contig_lead) == 0)
+            hdr.AddContigDefinition(ParsedContigDefinition(line));
+
+        // general comment line
+        //
+        // NOTE: Check this after all other specific header line types. This
+        //       catches all remaining lines starting with "##"
+        //
+        else if (line.find(Tokens::double_hash) == 0)
+            hdr.AddGeneralDefinition(ParsedGeneralDefinition(line));
+
+        // CHROM line (maybe w/ samples)
+        else if (line.find(Tokens::chrom_lead) == 0) {
+            std::vector<Sample> samples;
+
+            // If samples are present, skip the fixed colums & FORMAT column (9)
+            // and read the remaining column labels as sample names.
+            //
+            auto columns = PacBio::BAM::Split(line, '\t');
+            for (size_t i = 9; i < columns.size(); ++i)
+                samples.push_back(std::move(columns[i]));
+            hdr.Samples(std::move(samples));
+
+            // quit header parsing after CHROM line
+            chromLineFound = true;
+            break;
+        } else
+            throw std::runtime_error{"VCF format error: unexpected line found in header:\n" + line};
+    }
+
+    if (!chromLineFound) throw std::runtime_error{"VCF format error: CHROM column line is missing"};
+
+    return hdr;
+}
+
+// Reads and parses the header of the VCF file 'fn'.
+//
+// Throws std::runtime_error if the file cannot be opened or if its header is
+// malformed.
+VcfHeader VcfFormat::HeaderFromFile(const std::string& fn)
+{
+    std::ifstream in(fn);
+
+    // report an unreadable file as such, rather than as a "malformed header"
+    if (!in) throw std::runtime_error{"VCF error: could not open file for reading: " + fn};
+
+    return HeaderFromStream(in);
+}
+
+// Reads header lines from 'in' and parses them into a VcfHeader.
+//
+// Empty lines are skipped. Reading stops at the first non-empty line that does
+// not start with '#' (that line is consumed from the stream) or at end of input.
+VcfHeader VcfFormat::HeaderFromStream(std::istream& in)
+{
+    std::stringstream headerText;
+
+    for (std::string current; std::getline(in, current);) {
+        if (current.empty()) continue;
+        if (current.front() != '#') break;
+        headerText << current << '\n';
+    }
+
+    return ParsedHeader(headerText.str());
+}
+
+InfoField VcfFormat::ParsedInfoField(const std::string& text)
+{
+    const auto& tokens = PacBio::BAM::Split(text, '=');
+    if (tokens.empty()) throw std::runtime_error{"VCF format error: malformed INFO field: " + text};
+
+    // required ID
+    InfoField result;
+    result.id = tokens.at(0);
+    if (tokens.size() == 1) return result;
+
+    // optional value or values
+    const auto& valueStr = tokens.at(1);
+    const auto commaFound = valueStr.find(',');
+    if (commaFound != std::string::npos) {
+        std::vector<std::string> values;
+        for (auto&& value : PacBio::BAM::Split(valueStr, ','))
+            values.push_back(std::move(value));
+        result.values = std::move(values);
+    } else
+        result.value = valueStr;
+
+    return result;
+}
+
+// Parses a ';'-separated INFO column into its individual fields, in order.
+std::vector<InfoField> VcfFormat::ParsedInfoFields(const std::string& text)
+{
+    std::vector<InfoField> parsed;
+    for (const auto& entry : PacBio::BAM::Split(text, ';')) {
+        parsed.push_back(ParsedInfoField(entry));
+    }
+    return parsed;
+}
+
+VcfVariant VcfFormat::ParsedVariant(const std::string& line)
+{
+    const auto fields = PacBio::BAM::Split(line, '\t');
+    if (fields.size() < 7)
+        throw std::runtime_error{"VCF format error: record is missing required fields: " + line};
+
+    // CHROM POS ID REF ALT REF
+    auto chrom = fields.at(0);
+    auto pos = std::stoi(fields.at(1));
+    auto id = fields.at(2);
+    auto ref = fields.at(3);
+    auto alt = fields.at(4);
+
+    VcfVariant var{std::move(id), std::move(chrom), std::move(pos), std::move(ref), std::move(alt)};
+
+    // QUAL
+    const auto& qualStr = fields.at(5);
+    const float qual = (qualStr == "." ? NAN : stof(qualStr));
+    var.Quality(qual);
+
+    // FILTER
+    auto filter = fields.at(6);
+    var.Filter(std::move(filter));
+
+    // INFO (allow empty)
+    if (fields.size() >= 8) var.InfoFields(ParsedInfoFields(fields.at(7)));
+
+    // GENOTYPE (samples)
+    if (fields.size() > 9) {
+        std::vector<std::string> genotypeIds;
+        const auto& formatField = fields.at(8);
+        const auto formatFields = PacBio::BAM::Split(formatField, ':');
+        for (auto&& genotypeId : formatFields)
+            genotypeIds.push_back(genotypeId);
+        var.GenotypeIds(std::move(genotypeIds));
+
+        // per-sample
+        std::vector<GenotypeField> sampleGenotypes;
+        for (size_t i = 9; i < fields.size(); ++i) {
+
+            GenotypeField g;
+
+            const auto& sampleField = fields.at(i);
+            const auto fieldValues = PacBio::BAM::Split(sampleField, ':');
+            for (const auto fieldValue : fieldValues) {
+                GenotypeData data;
+                const auto genotypeDataValues = PacBio::BAM::Split(fieldValue, ',');
+                if (genotypeDataValues.size() == 1)
+                    data.value = genotypeDataValues.at(0);
+                else
+                    data.values = genotypeDataValues;
+                g.data.push_back(std::move(data));
+            }
+
+            sampleGenotypes.push_back(std::move(g));
+        }
+        var.Genotypes(std::move(sampleGenotypes));
+    }
+
+    return var;
+}
+
+std::string VcfFormat::FormattedInfoField(const InfoField& field)
+{
+    std::ostringstream out;
+    out << field.id;
+    if (field.value.is_initialized()) {
+        out << '=' << field.value.get();
+    } else if (field.values.is_initialized()) {
+        out << '=';
+        bool first = true;
+        for (const auto& value : field.values.get()) {
+            if (!first) out << ',';
+            out << value;
+            first = false;
+        }
+    }
+    return out.str();
+}
+
+// Renders a list of INFO entries as a ';'-separated INFO column.
+// Returns an empty string for an empty list.
+std::string VcfFormat::FormattedInfoFields(const std::vector<InfoField>& fields)
+{
+    std::ostringstream out;
+    bool first = true;
+    // iterate by reference: each InfoField owns strings and should not be copied
+    for (const auto& field : fields) {
+        if (!first) out << ';';
+        out << FormattedInfoField(field);
+        first = false;
+    }
+    return out.str();
+}
+
+std::string VcfFormat::FormattedVariant(const VcfVariant& var)
+{
+    std::ostringstream out;
+    out << var.Chrom() << '\t' << var.Position() << '\t' << var.Id() << '\t' << var.RefAllele()
+        << '\t' << var.AltAllele() << '\t'
+        << (var.IsQualityMissing() ? "." : std::to_string(var.Quality())) << '\t' << var.Filter()
+        << '\t' << FormattedInfoFields(var.InfoFields());
+
+    const auto& genotypeIds = var.GenotypeIds();
+    if (!genotypeIds.empty()) {
+        out << '\t';
+        bool firstId = true;
+        for (const auto id : genotypeIds) {
+            if (!firstId) out << ':';
+            out << id;
+            firstId = false;
+        }
+
+        const auto& sampleGenotypes = var.Genotypes();
+        for (const auto sampleGenotype : sampleGenotypes) {
+            out << '\t';
+            bool firstDataEntry = true;
+            for (const auto& d : sampleGenotype.data) {
+                if (!firstDataEntry) out << ':';
+                if (d.value.is_initialized())
+                    out << d.value.get();
+                else {
+                    assert(d.values.is_initialized());
+                    bool firstDatapoint = true;
+                    for (const auto& datapoint : d.values.get()) {
+                        if (!firstDatapoint) out << ',';
+                        out << datapoint;
+                        firstDatapoint = false;
+                    }
+                }
+                firstDataEntry = false;
+            }
+        }
+    }
+    return out.str();
+}
+
+}  // namespace VCF
+}  // namespace PacBio
diff --git a/src/vcf/VcfHeader.cpp b/src/vcf/VcfHeader.cpp

new file mode 100644 (file)

index 0000000..bca70a8
--- /dev/null
+++ b/src/vcf/VcfHeader.cpp
@@ -0,0 +1,15 @@
+// Author: Derek Barnett
+
+#include <pbbam/vcf/VcfHeader.h>
+
+#include <pbbam/vcf/VcfFormat.h>
+
+namespace PacBio {
+namespace VCF {
+
+VcfHeader::VcfHeader() { Version(VcfFormat::CurrentVersion()); }  // a new header starts at the current VCF version
+
+VcfHeader::VcfHeader(const std::string& hdrText) { *this = VcfFormat::ParsedHeader(hdrText); }  // parses hdrText; parse errors propagate
+
+}  // namespace VCF
+}  // namespace PacBio
diff --git a/src/vcf/VcfQuery.cpp b/src/vcf/VcfQuery.cpp

new file mode 100644 (file)

index 0000000..c191ca8
--- /dev/null
+++ b/src/vcf/VcfQuery.cpp
@@ -0,0 +1,16 @@
+#include <pbbam/vcf/VcfQuery.h>
+
+namespace PacBio {
+namespace VCF {
+
+VcfQuery::VcfQuery(std::string fn) : VcfQuery{VcfFile{std::move(fn)}} {}  // delegates to the VcfFile overload
+
+VcfQuery::VcfQuery(const VcfFile& file)
+    : PacBio::BAM::internal::QueryBase<VcfVariant>(), reader_{file}  // reader_ is constructed directly from 'file'
+{
+}
+
+bool VcfQuery::GetNext(VcfVariant& var) { return reader_.GetNext(var); }  // forwards to reader_ and returns its result
+
+}  // namespace VCF
+}  // namespace PacBio
diff --git a/src/vcf/VcfReader.cpp b/src/vcf/VcfReader.cpp

new file mode 100644 (file)

index 0000000..a181958
--- /dev/null
+++ b/src/vcf/VcfReader.cpp
@@ -0,0 +1,38 @@
+// Author: Derek Barnett
+
+#include <pbbam/vcf/VcfReader.h>
+
+namespace PacBio {
+namespace VCF {
+
+VcfReader::VcfReader(std::string fn) : VcfReader{VcfFile{std::move(fn)}} {}  // delegates to the VcfFile overload
+
+// Opens the file's input stream, skips past the header, and buffers the first
+// record line so that GetNext() can serve it.
+VcfReader::VcfReader(const VcfFile& file) : in_{file.Filename()}, header_{file.Header()}
+{
+    // skip header lines
+    std::string skipped;
+    size_t remaining = file.Header().NumLines();
+    while (remaining > 0) {
+        std::getline(in_, skipped);
+        --remaining;
+    }
+
+    FetchNext();
+}
+
+void VcfReader::FetchNext()  // buffers the next input line in line_
+{
+    line_.clear();  // GetNext() treats an empty line_ as "no more records"
+    std::getline(in_, line_);
+}
+
+// Parses the buffered line into 'var' and buffers the following line.
+//
+// Returns false, leaving 'var' untouched, when the buffered line is empty.
+bool VcfReader::GetNext(VcfVariant& var)
+{
+    const bool hasRecord = !line_.empty();
+    if (hasRecord) {
+        var = VcfVariant{line_};
+        FetchNext();
+    }
+    return hasRecord;
+}
+
+const VcfHeader& VcfReader::Header() const { return header_; }  // header copied from the VcfFile at construction
+
+}  // namespace VCF
+}  // namespace PacBio
diff --git a/src/vcf/VcfSort.cpp b/src/vcf/VcfSort.cpp

new file mode 100644 (file)

index 0000000..98f52b7
--- /dev/null
+++ b/src/vcf/VcfSort.cpp
@@ -0,0 +1,51 @@
+// Author: Derek Barnett
+
+#include <pbbam/vcf/VcfSort.h>
+
+#include <algorithm>
+#include <tuple>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <pbbam/vcf/VcfQuery.h>
+#include <pbbam/vcf/VcfWriter.h>
+
+namespace PacBio {
+namespace VCF {
+
+void SortFile(const VcfFile& file, const std::string& outputFilename)
+{
+    const auto& header = file.Header();
+
+    // configure contig sort order
+    std::unordered_map<std::string, size_t> contigLookup;
+    const auto& contigDefs = header.ContigDefinitions();
+    for (size_t i = 0; i < contigDefs.size(); ++i) {
+        const auto& contigId = contigDefs.at(i).Id();
+        contigLookup.insert(std::make_pair(contigId, i));
+    }
+
+    // read & sort variants
+    std::vector<VcfVariant> variants;
+    VcfQuery query{file};
+    for (const auto& v : query)
+        variants.push_back(v);
+
+    std::sort(variants.begin(), variants.end(),
+              [&contigLookup](const VcfVariant& lhs, const VcfVariant& rhs) {
+                  const auto lhsIdx = contigLookup.at(lhs.Chrom());
+                  const auto rhsIdx = contigLookup.at(rhs.Chrom());
+                  const auto lhsPos = lhs.Position();
+                  const auto rhsPos = rhs.Position();
+                  return std::tie(lhsIdx, lhsPos) < std::tie(rhsIdx, rhsPos);
+              });
+
+    // write results to file
+    VcfWriter writer{outputFilename, header};
+    for (const auto& var : variants)
+        writer.Write(var);
+}
+
+}  // namespace VCF
+}  // namespace PacBio
diff --git a/src/vcf/VcfVariant.cpp b/src/vcf/VcfVariant.cpp

new file mode 100644 (file)

index 0000000..9e73acb
--- /dev/null
+++ b/src/vcf/VcfVariant.cpp
@@ -0,0 +1,13 @@
+// Author: Derek Barnett
+
+#include <pbbam/vcf/VcfVariant.h>
+
+#include <pbbam/vcf/VcfFormat.h>
+
+namespace PacBio {
+namespace VCF {
+
+VcfVariant::VcfVariant(const std::string& text) { *this = VcfFormat::ParsedVariant(text); }  // parses one record line; parse errors propagate
+
+}  // namespace VCF
+}  // namespace PacBio
diff --git a/src/vcf/VcfWriter.cpp b/src/vcf/VcfWriter.cpp

new file mode 100644 (file)

index 0000000..8a2a1b3
--- /dev/null
+++ b/src/vcf/VcfWriter.cpp
@@ -0,0 +1,44 @@
+// Author: Derek Barnett
+
+#include <pbbam/vcf/VcfWriter.h>
+
+#include <fstream>
+#include <iostream>
+
+#include <pbbam/MakeUnique.h>
+#include <pbbam/vcf/VcfFormat.h>
+#include <pbbam/vcf/VcfHeader.h>
+#include <pbbam/vcf/VcfVariant.h>
+#include "../FileProducer.h"
+
+namespace PacBio {
+namespace VCF {
+
+struct VcfWriter::VcfWriterPrivate : public PacBio::BAM::internal::FileProducer
+{
+    VcfWriterPrivate(std::string fn, const VcfHeader& header)
+        : PacBio::BAM::internal::FileProducer{std::move(fn)}, out_{TempFilename()}
+    {
+        out_ << VcfFormat::FormattedHeader(header) << '\n';
+    }
+
+    bool Write(const VcfVariant& var)
+    {
+        out_ << VcfFormat::FormattedVariant(var) << '\n';
+        return true;  // TODO: handle errors
+    }
+
+    std::ofstream out_;
+};
+
+VcfWriter::VcfWriter(std::string fn, const VcfHeader& header)
+    : d_{std::make_unique<VcfWriterPrivate>(std::move(fn), header)}  // the private impl writes the header on construction
+{
+}
+
+bool VcfWriter::Write(const VcfVariant& var) { return d_->Write(var); }  // forwards to the private implementation
+
+VcfWriter::~VcfWriter() {}  // defined in this file, where VcfWriterPrivate is a complete type (presumably needed to destroy d_)
+
+}  // namespace VCF
+}  // namespace PacBio
diff --git a/subprojects/gtest.wrap b/subprojects/gtest.wrap

new file mode 100644 (file)

index 0000000..773a713
--- /dev/null
+++ b/subprojects/gtest.wrap
@@ -0,0 +1,10 @@
+[wrap-file]
+directory = googletest-release-1.8.0
+
+source_url = https://github.com/google/googletest/archive/release-1.8.0.zip
+source_filename = gtest-1.8.0.zip
+source_hash = f3ed3b58511efd272eb074a3a6d6fb79d7c2e6a0e374323d1e6bcbcc1ef141bf
+
+patch_url = https://wrapdb.mesonbuild.com/v1/projects/gtest/1.8.0/5/get_zip
+patch_filename = gtest-1.8.0-5-wrap.zip
+patch_hash = 7eeaede4aa2610a403313b74e04baf91ccfbaef03203d8f56312e22df1834ec5
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt

new file mode 100644 (file)

index 0000000..16d3778
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,77 @@
+
+if(PacBioBAM_build_tests)
+
+    # setup GoogleTest
+    if (NOT GTEST_SRC_DIR)
+        set(PREBUILT_GTEST_SRC ${PacBioBAM_RootDir}/../../../../prebuilt.tmpout/gtest/gtest_1.7.0/)
+        if(EXISTS ${PREBUILT_GTEST_SRC})
+            set(GTEST_SRC_DIR ${PREBUILT_GTEST_SRC})
+        else()
+            set(GTEST_SRC_DIR ${PacBioBAM_RootDir}/../gtest) # keep old fallback behavior for external builds, for now at least
+        endif()
+    endif()
+    add_subdirectory(${GTEST_SRC_DIR} external/gtest/build)
+
+    # generate paths/values used by the unit tests
+    configure_file(
+        ${PacBioBAM_TestsDir}/src/PbbamTestData.h.in
+        ${CMAKE_BINARY_DIR}/generated/PbbamTestData.h
+    )
+    configure_file(
+        ${PacBioBAM_TestsDir}/data/group/group.fofn.in
+        ${CMAKE_BINARY_DIR}/generated/group.fofn
+    )
+    configure_file(
+        ${PacBioBAM_TestsDir}/data/pbbamify/synthetic_movie_all.subreadset.xml.in
+        ${CMAKE_BINARY_DIR}/generated/synthetic_movie_all.subreadset.xml
+    )
+
+    # grab PacBioBAM unit test source files
+    include(files.cmake)
+    set(SOURCES
+        ${PacBioBAMTest_H}
+        ${PacBioBAMTest_CPP}
+    )
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PacBioBAM_CXX_FLAGS}")
+
+    # define unit test executable
+    if(MSVC)
+        # VS2012+ pooh-pooh's Derek's "#define private public" trick
+        add_definitions(-D_ALLOW_KEYWORD_MACROS)
+    endif()
+
+    add_executable(test_pbbam ${SOURCES})
+    set_target_properties(test_pbbam PROPERTIES
+        RUNTIME_OUTPUT_DIRECTORY ${PacBioBAM_BinDir}
+    )
+    target_include_directories(test_pbbam
+        PUBLIC
+        ${CMAKE_BINARY_DIR}/generated
+        ${PacBioBAM_INCLUDE_DIRS}
+        ${gtest_SOURCE_DIR}/include
+        ${gtest_SOURCE_DIR}
+    )
+
+    # generate test data
+    add_custom_target(
+        generate_test_data
+        WORKING_DIRECTORY ${PacBioBAM_TestsDir}/scripts
+        COMMAND "python" generate_data.py
+            ${PacBioBAM_TestsDir}/data/
+            ${GeneratedTestDataDir}
+    )
+
+    # add unit tests to test framework
+    add_test(
+        NAME UnitTests
+        WORKING_DIRECTORY ${PacBioBAM_BinDir}
+        COMMAND test_pbbam
+    )
+    add_dependencies(test_pbbam generate_test_data)
+    target_link_libraries(test_pbbam
+        pbbam
+        ${CMAKE_THREAD_LIBS_INIT} # quirky pthreads
+        gtest
+        gtest_main
+    )
+endif() # PacBioBAM_build_tests
diff --git a/tests/data/aligned.bam b/tests/data/aligned.bam

new file mode 100644 (file)

index 0000000..34d81e5

Binary files /dev/null and b/tests/data/aligned.bam differ
diff --git a/tests/data/aligned.bam.bai b/tests/data/aligned.bam.bai

new file mode 100644 (file)

index 0000000..66ba855

Binary files /dev/null and b/tests/data/aligned.bam.bai differ
diff --git a/tests/data/aligned.bam.pbi b/tests/data/aligned.bam.pbi

new file mode 100644 (file)

index 0000000..f2cf207

Binary files /dev/null and b/tests/data/aligned.bam.pbi differ
diff --git a/tests/data/aligned.sam b/tests/data/aligned.sam

new file mode 100644 (file)

index 0000000..ad45e63
--- /dev/null
+++ b/tests/data/aligned.sam
@@ -0,0 +1,8 @@
+@HD    VN:1.3.1        SO:coordinate   pb:3.0.3
+@SQ    SN:lambda_NEB3011       LN:48502        M5:a1319ff90e994c8190a4fe6569d0822a
+@RG    ID:0d7b28fa     PL:PACBIO       DS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100      PU:singleInsertion      PM:SEQUEL
+@PG    ID:bwa  PN:bwa  VN:0.7.10-r1017-dirty   CL:bwa mem lambdaNEB.fa singleInsertion.fasta
+singleInsertion/100/0_49       2048    lambda_NEB3011  5211    60      3H8=1D19=1I21=59H       *       0       0       GGCTGCAGGTACAGCGGTCAGGAGGCCAATTGATGCCGGACTGGCTGAT       *       NM:i:2  MD:Z:8^T40      AS:i:34 XS:i:0  RG:Z:0d7b28fa   SA:Z:lambda_NEB3011,9378,+,52S37=2D10=1I11=,60,3;       qe:i:49 qs:i:0  np:i:1  zm:i:100        rq:f:0.6        sn:B:f,0.2,0.2,0.2,0.2
+singleInsertion/200/0_49       2048    lambda_NEB3011  5211    60      3H8=1D19=1I21=59H       *       0       0       GGCTGCAGGTACAGCGGTCAGGAGGCCAATTGATGCCGGACTGGCTGAT       *       NM:i:2  MD:Z:8^T40      AS:i:34 XS:i:0  RG:Z:0d7b28fa   SA:Z:lambda_NEB3011,9378,-,37=2D10=1I11=52S,60,3;       qe:i:49 qs:i:0  np:i:1  zm:i:200        rq:f:0.6        sn:B:f,0.2,0.2,0.2,0.2
+singleInsertion/100/0_111      0       lambda_NEB3011  9378    60      52S37=2D10=1I11=        *       0       0       TTTGGCTGCAGGTACAGCGGTCAGGAGGCCAATTGATGCCGGACTGGCTGATAAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGAGCAGCACGGTAAACAGCGGCAA *       NM:i:3  MD:Z:37^TC21    AS:i:43 XS:i:0  RG:Z:0d7b28fa   SA:Z:lambda_NEB3011,5211,+,3S8=1D19=1I21=59S,60,2;      qe:i:111        qs:i:0  np:i:1  zm:i:100        rq:f:0.6        sn:B:f,0.2,0.2,0.2,0.2
+singleInsertion/100/0_111      16      lambda_NEB3011  9378    60      37=2D10=1I11=52S        *       0       0       AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGAGCAGCACGGTAAACAGCGGCAAATCAGCCAGTCCGGCATCAATTGGCCTCCTGACCGCTGTACCTGCAGCCAAA *       NM:i:3  MD:Z:37^TC21    AS:i:43 XS:i:0  RG:Z:0d7b28fa   SA:Z:lambda_NEB3011,5211,+,3S8=1D19=1I21=59S,60,2;      qe:i:111        qs:i:0  np:i:1  zm:i:100        rq:f:0.6        sn:B:f,0.2,0.2,0.2,0.2
diff --git a/tests/data/aligned2.bam b/tests/data/aligned2.bam

new file mode 100644 (file)

index 0000000..672e5e5

Binary files /dev/null and b/tests/data/aligned2.bam differ
diff --git a/tests/data/aligned2.bam.bai b/tests/data/aligned2.bam.bai

new file mode 100644 (file)

index 0000000..f954ab0

Binary files /dev/null and b/tests/data/aligned2.bam.bai differ
diff --git a/tests/data/aligned2.bam.pbi b/tests/data/aligned2.bam.pbi

new file mode 100644 (file)

index 0000000..c1e82de

Binary files /dev/null and b/tests/data/aligned2.bam.pbi differ
diff --git a/tests/data/chemistry.xml b/tests/data/chemistry.xml

new file mode 100644 (file)

index 0000000..c6a6521
--- /dev/null
+++ b/tests/data/chemistry.xml
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="utf-8"?>
+<MappingTable>
+  <Mapping>
+    <SequencingChemistry>FOUND</SequencingChemistry>
+    <BindingKit>1</BindingKit>
+    <SequencingKit>2</SequencingKit>
+    <SoftwareVersion>3.4</SoftwareVersion>
+  </Mapping>
+</MappingTable>
diff --git a/tests/data/chimera_minimal.fasta b/tests/data/chimera_minimal.fasta

new file mode 100644 (file)

index 0000000..f2eb86c
--- /dev/null
+++ b/tests/data/chimera_minimal.fasta
@@ -0,0 +1,220 @@
+>Barcode0--0_Cluster1_Phase1_NumReads297
+GCAGGTGCCTTTGCAGAAACAAAGTCAGGGTTCTTCAAGTCACAAAGGGAAGGGCAGGAA
+CAACTCTTGCCTCTCAGTCCCACACAAGGCAGCTGTCTCACACTATAGAAAAAAATATTC
+ATGAACAAATTCGTATCTGTCACAGTGAGGGGTCACACTTTAAACAGCCCATCGCATGCT
+CAATACATCCAATGGAAAGAAACCCCATAGCACAGCTGTGTCCACTGTTCCGCCCAACAC
+CCAACACACATCAGGCCCTCCAGGCTCTCACCTTTACAAGCTGTGAGAGACACATCAGAG
+CCCTGGGCACTGTCACTGCCTGGGGTAGAACAAAAACAGAACCTGGTCAGATCCCACAGA
+AGATGTGGCTAGAGGAGGAATTGTGGGGTGGGTGAGCTCCCCCATGGGCTCCCAAACACA
+ATATCCCAAGGACCTCAGGCATCAGCCTCCTTCATACTTACTTGCAGCCTGAGAGTAGCT
+CCCTCCTTTTCTATCTGTGGGAAGAAAATGTCCTGTGAGATACCAGAAAGGAGTCAGGGC
+CTTAAGGTCCTAGAGGAACCTCCAAGTCTTGGACCTCAGAGAAGTTTCCAGAAATGTGTG
+ACTGCAGACCCAGGGCGGGATCAGGAAACATGAAGAAAGCAGGTGTGGGTCCTGGACCAA
+CCGCCCTCCTGAAGGTCCTCAGGGACCTTCCCCTGTGACTTGTGACTGCTGGGATCAGGT
+CCCATCACCGCTGTAATCAAGGTGATAAATCTGTCCTTCATTTTAACAGGTGCTTTACAA
+AAGAGTAAGTGCTGGCACACAGGGCCCAGGCTGGGTAGGCCCATAATTGTGGGTGGTGCT
+TCCCAGTAACGAGGCAGGGCACACTTCTACCTGGGTCTTGGAACCCTCAGTGAGACAAGA
+AATCTCAGACCCACCCTTCACCCCTTCCCCACCTGAGCTCTTCCTCCTCCACATCACAGC
+AGCGACCACAGCTCCAGTGATCACAGCTCCAAAGAGAACCAGGCCAGCAATGATGCCCAC
+GATGGGGATGGTGGGCTGGGAAGACGGCTCTGGGAAAAGAGGGGAAGGTGAGGGGCCCTG
+ACCCTGCTAAAGGTCAGAGAGGCTCCTGCTTTCCCTAAAAGACATGACACCCCCGTCTCC
+CTCCTTACCCCATCTCAGGGTGAGGGGCTTGGGCAAACCCTCATGCTGCACATGGCAGGT
+GTATCTCTGCTCCTGTCCAGAAGGCACCACCACAGCCGCCCACTTCTGGAAGGTTCCATC
+CCCTGCAGGCCTGGTCTCCACGAGCTCCGTGTCCTGGGTCTGGTCCTCCCCATCCCGCTG
+CCAGGTCAGTGTGATCTCCGCAGGGTAGAAGCTCAGGGCCCAGCACCTCAGGGTGGCTTC
+ATGGTCAGAGACAGCGTGGTGAGTCATATGCGTTTTGGGGGCGTCTGTCAGGAAGAGTCA
+GATCATTCAGGCATTTTGCATCTGTCATGGGACACTCCTCCAGCACACATGTGGCTATCT
+TGAGAATGGACAGGACACCTGGGATGGGGAAGGGAGCACAGAACCCAGACACCAGCCTGG
+ACACAGGCACCTGGGATAATCTCCTATTCCGTGGAAAATTCTAGTCCCTGAAGAGGGAAC
+AGCGACTTCTGGTCCTGACCTGAGTGGAGGCTGAAGGACTCAGAAGTGCTGGACTCAGAC
+CCCCACACACATTGAGTGTGAAGCAGAGAACAAGGCCTGAGAGGAAAAGTCACGGGCCCA
+AGGCTGCTGCCTGTGTGTGTCAAAGGGAACCACTCATCAGTATTCGAGGGATCGTCTTCC
+CGTCATTCCTTCAGAGATTTTATCCCTTAATTGTGTCAGAGAGCAGGGCGGAACCTCAGA
+GTCACTCTCTGGTACAGGATCTGGAAACCCAGGAGGATTCCTCTCCCTCAGGACCAGAGG
+GAGGGCGATATTCTAGTGTTGGTCCCAATTGTCTCCCCTCCTTGTGGGAGGCCAGCCCGG
+GAGATCTACAGGCGATCAGGGAGGCGCCCCGTGGCCCCTGGTACCCGTGCGCTGCAGCGT
+CTCCTTCCCGTTCTCCAGGTATCTGCGGAGCCACTCCACGCACGTGCCCTCCAGGTAGGC
+TCTCAACTGCTCCGCCACATGGGCCGCCTCCCACTTGTGCTTGGTGGTCTGAGCTGCCAT
+GTCCGCCGCGGTCCAAGAGCGCAGGTCCTCTTTCAGGGCGATGTAATCCTTGCCGTCGTA
+GGCGTACTGGTGGTACCCGCGGAGGAAGCGCCAGTCCGACCCCACGTCGCAGCCATACAT
+CCTCTGGACGGTGTGAGAACCTGGCCCGGACCCCGCGGTCAGCCCGGTCCCCCGAGCCCC
+GCCCCGCCCCGACCAACCTGGGGGGATTTTTGGCCTAAACTGAAAATGAAACCGGGTAAA
+GGCGCCTGGGCCTCTCCCGGGGCAAGGGTCTCGGGGTCCCGCGGCTTCGGGGCGGATCTC
+GGACCCGGAGACTGTGGGCGACCTGGCCCGTCCGTGGGGGATGAGAGGTCGTGACCTGCG
+CCCCGGGCCGGGGTCACTCACCGGCCTCGCTCTGGTTGTAGTAGCCGCGCAGGGTCCCCA
+GGTCCACTCGGTGAGTCTGTGAGTGGGCCTTCACTTTCCGTGTCTCCCCGTCCCAATACT
+CCGGACCCTCCTGCTCTATCCACGGCGCCCGCGGCTCCATCCTCTGGCTCGCGGCGTCGC
+TGTCGAACCGCACGAACTGCGTGTCGTCCACGTAGCCCACTGCGATGAAGCGGGGCTCCC
+CGCGGCCGGGCCGGGACACGGATGTGAAGAAATACCTCATGGAGTGAGAGCCTGGGGACG
+AGGAGTGGCTGAGACCCGCCCGACCCTCCTCCCGGCGCGGCTTCCCGGGTCCTGCGCCCC
+CGCCAGGCGGGCCCGTTGCTTCTCCCCACAGAGGCCGTTTCCCTCCCGACCCCGCACTCA
+CCCGCCCAGGTCTGGGTCAGGGCCAGAGCCCCCGAGAGTAGCAGGACGAGGGTTCGGGGC
+GCCATGACGGCCATCCTCGGCGTCTGGGGAGAATCTGAGTCCCGGTGGGTGCGTGCGGAC
+TTTAGAACCGCGACCGCGACGACACTGATTGGCTTCTCTGGAAACCCGACACCCAATGGG
+AGTGAGAACTGGGTCCGCGTCGTGAGTATCCA
+>Barcode0--0_Cluster3_Phase1_NumReads294
+GCAGGTGCCTTTGCAGAAACAAAGTCAGGGTTCTTCAAGTCACAAAGGGAAGGGCAGGAA
+CAACTCTTGCCTCTCAGTCCCACACAAGGCAGCTGTCTCACACTATAGAAAAAAATATTC
+ATGAACAAATTCATATCCATCACAGTGAGGGGTCACACCTTAAACAGCCCATCGCATGCT
+CAATACATCCAATGCAAAGAAACCCCATAGCACAGCTGTGTCCACTGTTCCGCCCAACAC
+CCAACACACATTAGGTCCTCCAAGCTCTCACCTTTACAAGCTGTGAGGGACACATCAGAG
+CCCTGGGCACTGTCACTGCCTGGGGTAGAACAAAAACAGAACCTGGTCAGATCCCACAGA
+AGATGTGGCTAGAGGAGGAATTGTGAGGTGGGTGGGCTCCCCCATGGGCTCCCAAACACA
+ATATCCCAAGGACCTCAGGCATCAGCCTCCTTCATACTTACTTGCAGCCTGAGTGTAACT
+CCCTCCTTTTCTATCTGTGAGAAGAAAATGTCCTGTGAGATACCAGAAAGGAGCCAGGGC
+CTTAAGGTCCTAGAGGAACCTCCTAGTCTTGGACCCCAGAGAAGTTTCCAGAAATGTGTG
+ACTGCAGACCCAGGGCGGGATCAGGAAACATGAAGAAAGCAGGTGTGGGTCCTGGACCAA
+TAGCCCTCCTGAGGTCTGTCCTCAGGGACCTTCCCCTGTGACTTGTGACTGCTGGGATCA
+GGTCCCATCACCGCCGTAATCAAGGTGATAAATCTGTCCTTCATTTTAACAGGTGCTTTA
+CAAAAGAGTAAGTGCTGGCACACAGGGCCCAGACTGGGTAGGCCCATGATTGTGGACGGT
+GCTTCCCAGTAATGAGACAGGGCACATTTCTAGCTGGGGCTTGGAACCCTCAGTGAGACA
+AGAAATCTCAGACCCCACCCTTCACCCCTTCTCCACCTGAGCTCTTCCTCCTCCACATCA
+CGGCAGCGACCACAGCTCCAGTGATCACAGCTCCAAGGAGAACCAGGCCAGCAATGATGC
+CCACGATGGGGATGGTGGGCTGGGAAGACAGCTCTGGGAAAAGAGGGGAAGGTGAGGGGC
+CCTGACCCTGCTAAAGGTCTCCAGAGAGGCTCCTGCTTTCCCTAAGAGACATGACACCCC
+CATCTCCCTCCTTACCCCATCTCAGGGTGAGGGGCTTGGGCAGACCCTCATGCTGCACAT
+GGCAGGTGTATCTCTGCTCCTCTCCAGAAGGCACCACCACAGCCGCCCACTTCTGGAAGG
+TTCCATCCCCTGCAGGCCTGGTCTCCACGAGCTCCGTGTCCTGGGTCTGGTCCTCCCCAT
+CCCGCTGCCAGGTCAGTGTGATCTCCGCAGGGTAGAAGCCCAGGGCCCAGCACCTCAGGG
+TGGCCTCATGGTCAGAGATGGGGTGGTGGGTCATATGTGTCTTGGGGGGGTCTGACGGGA
+AGAGTCAGAAAATTCAGGCATTTTGCATCTGTCATGGGACACTCCACCAGCACGCATGTG
+GCCATCTTGAGAATGGACAGGACACCCGGGATGGGGAAGAGAGCACAGAACCCAGACACC
+AGCCTGGACACAGGCACCTGGGATAATCTTCTATTCCCTGAGAAGGGAACAGCGACTTCT
+GGTCCTGACCTGAGTGGAGGCTGAGGGACTCAGAAGTGCTGGACTCAGACCCCCACACAC
+ATTGAGTGTGAAGCAGAGAACAAGGCCTGAGAGGAAAAGTCACGGGCCCAAGGCTGCTGC
+CGGTGTCAAAGGGAACCACTCATCAGTATTCGAGGGATCGTCTTCCCGTCACTCCTTCAG
+AGATTTTATCCCTTAATTGTGTCAGAGAGCAGGGCGGAACCTCAGAGTCACTCTCTGGTA
+CAGGATCTGGAACCCAGGAGGATTCCTCTCCCTCAGGACCAGAGGGAGGGTGATATTCTA
+GTGTTGGTCCCAATTGTCTCCCCTCCTTGTGGGAGGCCAGCCCGGGAGATCTACAGGCGA
+TCAGGGAGGCGCCCCGTGGCCCCTGGTACCCGTGCGCTGCAGCGTCTCCTTCCCGTTCTC
+CAGGTATCTGCGGAGCCACTCCACGCACGTGCCATCCAGGTAGGCTCTCAACTGCTCCGC
+CTCATGGGCCGCCTCCCACTTGCGCTTGGTGATCTGAGCCGCCATGTCCGCCGCGGTCCA
+AGAGCGCAGGTCCTCGTTCAGGGCGATGTAATCCTTGCCGTCGTAGGCGTCCTGCCGGTA
+CCCGCGGAGGAAGCGCCCGTCCGACCCCACGTCGCAGCCATACATTATCTGGATGGTGTG
+AGAACCTGGCCCCGACCCCGCGGTCAGCCCAGTCCCCCGAGCCCCGCCCAGCCCCGACCA
+ACCCGGGGGGATTTTTGGCCTAAACTGAAAATGAAACCGGGTAAAGGCGCCTGGGCCTCT
+CCCGGGGCAAGGGTCTCGGGGTCCCGCGGCTTCGGGGTGGATCTCGGACCCGGAGACTGT
+GGGCGACCTGGCCCGTCCGTGGGGGATGAGGGGTCCTGACCTGCGCCCCCGGCCGGGGTC
+ACTCACCGGCCTCGCTCTGGTTGTAGTAGCCGCGCAGGGTCCCCAGGTCCACTCGGTCAG
+TCTGTGACTGGGCCTTCACATTCCGTGTCTCCTGGTCCCAATACTCCGGCCCCTCCTGCT
+CTATCCACGGCGCCCGCGGCTCCATCCTCTGGCTCGCGGCGTCGCTGTCGAACCGCACGA
+ACTGCGTGTCGTCCACGTAGCCCACGGCGATGAAGCGGGGCTCCCCGCGGCCGGGCCGGG
+ACACGGATGTGAAGAAATACCTCATGGAGTGGGAGCCTGGGGGCGAGCAGTGGCTGAGAC
+CTGCCCGACCCTCGTCCCGGCGCGGCTCCCCCGGTCCTGCGCCCCCGCCAGGAGGGCCCC
+TTGCTTCTCCCCGCAGAGGCGGTTTCCCTCCCGACCCCGCACTCACCCGCCCAGGTCTGG
+GTCAGGGCCAGGGCCCCCGAGAGTAGCAGGAGGAGGGTTCGGGGCGCCATGACGGCCATC
+CTCGGCGTCTGGGGAGAATCTGAGTCCCGGTGGGTGCGTGCGGGCTTTAGAACAGCGACC
+GCGACGACACTGATTGGCTTCTCTGGAAACCCGACACCCAATGGGAGTGAGAACTGGGTC
+CGCGTCGTGAGTATCCA
+>Barcode0--0_Cluster0_Phase2_NumReads92
+CTGGGGAGGAAACACAGGTCAGCATGGGAACAGGGGTCACAGTGGACACGGGGGTGGGCT
+GTCTCTCCACCTCCTCACATTATGCTAACAGGGACGCAGACACATTCAGGTGCCTTTGCA
+GAAAGAGATGCCAGAGGCTCTTGAAGTCACAAAGGGGAGGAGTGAAGAAATCCTGCATCT
+CAGTCCCTCACAAGACAGCTGTCTCAGGCTACAGAAAACAACAGTCATGAACAAATTCTG
+GTTAGTCATGGTAAGTGATGACACTCTAAACAGCCCACCACACACGCGAAACATCCCAAT
+CAAAGAATCTCCATTACCCAGGCCTTTCCCCTCTGCCCCCTCCCCACCCCACCCCCCCCG
+CCCACTCTAGACCCCAAGAATCTCACCTTTTCAAGCTGTGAGAGACACATCAGAGCCCTG
+GGCACTGTCGCTGGCTGGAGTAGAACAAAAACAGGACCTGGTCAGAGCCCGCAGGAGACG
+TGGGACAGGAGGAATTATGGGGTGGGTGAGCTCCTCCACACTCCCACCCCCACCACTTAC
+ACGCAGCCTGAGAGTAGCTCCCTCCTTTTCCACCTGTGGGAAGAAAATGTCCTGTGAGGG
+GACTGGGAGGAAGCAGGGCCATGAGATCTTAGAGGAACCTCCTCGTCTTGGACCCAAAAG
+GAATTTCCAGAAGTATGACTACAGACCCAAGGCAGGATCAGGAAACACGAGGAAAGCAAG
+TGTGGGTCCTGGACCAACTGCCCTCCTAAGGTCTGTCCTTAGCAGGGACCTTCCCCTGAC
+TCATGAATGCTGAAATCAGGACCCCAACACCACAACCATCAAGGTGATACATCCGTCCTT
+CATTGTCACATGTGCTGCACAAAAGAGTAAGTGCTGGCACACAGGGTCCCAGGCTGCATT
+AGCCCCTGTGTGGATGCTGCTTCCCAGTAATGAGGCAGGGAACACTTCTACCTGGGGCTT
+GAAACCCCCAGTGGGACAAGAAAACCCAGACCCCACCCCTCACCCCTTCCCTACCTGAGC
+TCTTCCTCCTACACATCACAGTAGCGACCACAGCTCCGATGACCACAACTGCTAGGACAG
+CCAGGCCAGCAACAATGCCCACGATGGGGATGGTGGACTGGGAAGATGGCTCTGGGAAAG
+GAGGGGAAGACGAGGGGCCCTGACCCTGCTGAAGGGCTCCAGAAGGGCTCCTGCTTTCCC
+TGAGAAGAGATATGACCCCTCATCCCCCTCCTTACCCCATCTCAGGGTGAGGGGCTTCGG
+CAGCCCCTCATGCTGTACATGGCATGTGTATCTCTGCTCTTCTCCAGAAGGCACCACCAC
+AGCTGCCCACTTCTGGAAGGTTCTATCTCCTGCTGGTCTGGTCTCCACAAGCTCAGTGTC
+CTGAGTTTGGTCCTCGCCATCCCGCTGCCAGGTCAGTGTGATCTCCGCAGGGTAGAAGCC
+CAGGGCCCAGCACCTCAGGGTGGCCTCATGGTCAGAGATGGGGTGGTGGGTCACGTGTGT
+CTTTGGGGGGTCTGATGGGAAGAGTCAGAAAATTCAGGCGCTTTGCATCTCTCATAGGAC
+ACCCTAGGACCACCCATGTGACCAGCCTGAGAATGGACAGGACACCTGGGGTGGGGAAGG
+GGCACAGAACCCAGACACCAGCCTGGACGCAGGCACCTGGGATAATCTCCTATTCATTGG
+AAAGTTCGAGTCTCTGAGCGGGGAACAGGGACTTCTGCTCCTGATCTGAGTGGAGGTAAA
+GTGACTCAGAAGTGCTGGAATCAGAGCCCCAAACACACTGAGTGTGAGGCAGAGAACAAG
+GCCTGAGAGGAAAAGTCACGGTTCCCAAGGCTGCTGCAGGGGTCAAAGAGGACCCCTGAT
+CAGTATTCTAGGGACTGTCTTCCCCTCCATTTCCTCAGAGACGTCATCCCTTAATTGTCC
+TAGAGAGAAGAGGGGGCCCTCAGAGGAAACTCAGGAAAACTCATGCCATTCTCCATTCAA
+GGGAGGGCGACATTCTAGCGCTGATCCCATTTTCCTCCTCTTCTCGTGGGAGGCCATCCC
+CGGCGACCTATAGGAGATGGGGAAGGCTCCCCACTGCCCCTGGTACCCGCGCGCTGCAGC
+GTCTCCTTCCCGTTCTCCAGGTATCTGCGGAGCCACTCCACGCACAGGCCCTCCAGGTAG
+GCTCTCAGCTGCTCCGCCACACGGGCCGCCTCCCACTTGCGCTGGGTGATCTGAGCCGCG
+GTGTCCGCCGCGGTCCAGGAGCTCAGGTCCTCGTTCAGGGCGATGTAATCCTTGCCGTCG
+TAGGCTAACTGGTTATGCCCGCGGAGGAGGCGCCCGTCCGGCCCCAGGTCGCAGCCATAC
+ATCGTCTGCCAAGTGTGAGACCCTGGCCCCGGCCCCGCGGTCAGCCCCGTCCCCCGAGCC
+CCGCCCCGCCCCGACCAACCCGCGGGGATTTTGGCCTCAACTGAAAATGAAACCGGGTAA
+ACGCGCCTGGGGCTCTCGCCGGTCGAGGGTCTGGGCGGGTCCCGCGGCCTCAGGGAGGCG
+GATCTCGGACCCGGAGACTCGGGGCGACCCGGGCCGTACGTGGGGGATGGGGAGTCGTGA
+CCTGCGCCCCGGGCCGGGGTCACTCACCGGCCTCGCTCTGGTTGTAGTAGCCGCGCAGGT
+TCCGCAGGCTCTCTCGGTCAGTCTGTGCCTGGGCCTTGTAGATCTGTGTGTTCCGGTCCC
+AATACTCCGGCCCCTCCTGCTCTATCCACGGCGCCCGCGGCTCCTCTCTCGGACTCGCGG
+CGTCGCTGTCGAACCTCACGAACTGCGTGTCGTCCACGTAGCCCACTGCGATGAAGCGGG
+GCTCCCCGCGGCCGGGCCGGGACATGGCGGTGTAGAAATACCTCATGGAGTGGGAGCCTG
+GGGGCGAGGAGGGGCTGAGACCCGCCAGACCCTCCTCCCGGCGCGGCTCCCCGGGTCCTG
+CGCCCCCGCCTGCGGTCCCCTCGCTCCTCCCCACAGAGGCCATTTCCCTCCCGACCCGCA
+CTCACCGGCCCAGGTCTCGGTCAGGGCCAGGGCCCCCCAGAGCAGCAGGAGGAGGGTTCG
+GGGTGCCGTGACCCGCATCTCGGTGTCTGAGGAGACTCTGAGTCCGGGTGGGTGCGTGGG
+GACTTTAGAACTGGGACCCCGGCGACACTGATTGGCTTCTCTAGACACCCGACACCCAAT
+GGGAGTGGGAAATGGGGACGCGTCACGAGTATCCTGGAAGAAGGACCCGACATAGGTTGG
+GAGAAGAAGTGAAACTCGTGGGAGTGGGGAATCCCCAACGCTGCGCCTCCCCAATGCAGA
+CGCGGCCCTCGGAGCCTGAGACCCTGAGAGCCCCGTCCGGGACATGGGACTTCGTCCTGA
+TCCCTCTTCTCCTACACCAGCCTCTTTGTCACACTGTCTGCC
+>Barcode0--0_Cluster1_Phase3_NumReads56
+GCAGGTGCCTTTGCAGAAACAAAGTCAGGGTTCTTCAAGTCACAAAGGGAAGGGCAGGAA
+CAACTCTTGCCTCTCAGTCCCACACAAGGCAGCTGTCTCACACTATAGAAAAAAATATTC
+ATGAACAAATTCGTATCTGTCACAGTGAGGGGTCACACTTTAAACAGCCCATCGCATGCT
+CAATACATCCAATGGAAAGAAACCCCATAGCACAGCTGTGTCCACTGTTCCGCCCAACAC
+CCAACACACATCAGGCCCTCCAGGCTCTCACCTTTACAAGCTGTGAGAGACACATCAGAG
+CCCTGGGCACTGTCACTGCCTGGGGTAGAACAAAAACAGAACCTGGTCAGATCCCACAGA
+AGATGTGGCTAGAGGAGGAATTGTGGGGTGGGTGAGCTCCCCCATGGGCTCCCAAACACA
+ATATCCCAAGGACCTCAGGCATCAGCCTCCTTCATACTTACTTGCAGCCTGAGAGTAGCT
+CCCTCCTTTTCTATCTGTGGGAAGAAAATGTCCTGTGAGATACCAGAAAGGAGTCAGGGC
+CTTAAGGTCCTAGAGGAACCTCCAAGTCTTGGACCTCAGAGAAGTTTCCAGAAATGTGTG
+ACTGCAGACCCAGGGCGGGATCAGGAAACATGAAGAAAGCAGGTGTGGGTCCTGGACCAA
+CCGCCCTCCTGAAGGTCCTCAGGGACCTTCCCCTGTGACTTGTGACTGCTGGGATCAGGT
+CCCATCACCGCTGTAATCAAGGTGATAAATCTGTCCTTCATTTTAACAGGTGCTTTACAA
+AAGAGTAAGTGCTGGCACACAGGGCCCAGGCTGGGTAGGCCCATAATTGTGGGTGGTGCT
+TCCCAGTAACGAGGCAGGGCACACTTCTACCTGGGTCTTGGAACCCTCAGTGAGACAAGA
+AATCTCAGACCCACCCTTCACCCCTTCCCCACCTGAGCTCTTCCTCCTCCACATCACAGC
+AGCGACCACAGCTCCAGTGATCACAGCTCCAAAGAGAACCAGGCCAGCAATGATGCCCAC
+GATGGGGATGGTGGGCTGGGAAGACGGCTCTGGGAAAAGAGGGGAAGGTGAGGGGCCCTG
+ACCCTGCTAAAGGTCAGAGAGGCTCCTGCTTTCCCTAAAAGACATGACACCCCCGTCTCC
+CTCCTTACCCCATCTCAGGGTGAGGGGCTTGGGCAAACCCTCATGCTGCACATGGCAGGT
+GTATCTCTGCTCCTGTCCAGAAGGCACCACCACAGCCGCCCACTTCTGGAAGGTTCCATC
+CCCTGCAGGCCTGGTCTCCACGAGCTCCGTGTCCTGGGTCTGGTCCTCCCCATCCCGCTG
+CCAGGTCAGTGTGATCTCCGCAGGGTAGAAGCTCAGGGCCCAGCACCTCAGGGTGGCTTC
+ATGGTCAGAGACAGCGTGGTGAGTCATATGCGTTTTGGGGGCGTCTGTCAGGAAGAGTCA
+GATCATTCAGGCATTTTGCATCTGTCATGGGACACTCCTCCAGCACACATGTGGCTATCT
+TGAGAATGGACAGGACACCTGGGATGGGGAAGGGAGCACAGAACCCAGACACCAGCCTGG
+ACACAGGCACCTGGGATAATCTCCTATTCCGTGGAAAATTCTAGTCCCTGAAGAGGGAAC
+AGCGACTTCTGGTCCTGACCTGAGTGGAGGCTGAAGGACTCAGAAGTGCTGGACTCAGAC
+CCCCACACACATTGAGTGTGAAGCAGAGAACAAGGCCTGAGAGGAAAAGTCACGGGCCCA
+AGGCTGCTGCCTGTGTGTGTCAAAGGGAACCACTCATCAGTATTCGAGGGATCGTCTTCC
+CGTCATTCCTTCAGAGATTTTATCCCTTAATTGTGTCAGAGAGCAGGGCGGAACCTCAGA
+GTCACTCTCTGGTACAGGATCTGGAACCCAGGAGGATTCCTCTCCCTCAGGACCAGAGGG
+AGGGCGATATTCTAGTGTTGGTCCCAATTGTCTCCCCTCCTTGTGGGAGGCCAGCCCGGG
+AGATCTACAGGCGATCAGGGAGGCGCCCCGTGGCCCCTGGTACCCGTGCGCTGCAGCGTC
+TCCTTCCCGTTCTCCAGGTATCTGCGGAGCCACTCCACGCACGTGCCATCCAGGTAGGCT
+CTCAACTGCTCCGCCTCATGGGCCGCCTCCCACTTGCGCTTGGTGATCTGAGCCGCCATG
+TCCGCCGCGGTCCAAGAGCGCAGGTCCTCGTTCAGGGCGATGTAATCCTTGCCGTCGTAG
+GCGTCCTGCCGGTACCCGCGGAGGAAGCGCCCGTCCGACCCCACGTCGCAGCCATACATT
+ATCTGGATGGTGTGAGAACCTGGCCCCGACCCCGCGGTCAGCCCAGTCCCCCGAGCCCCG
+CCCAGCCCCGACCAACCCGGGGGGATTTTTGGCCTAAACTGAAAATGAAACCGGGTAAAG
+GCGCCTGGGCCTCTCCCGGGGCAAGGGTCTCGGGGTCCCGCGGCTTCGGGGTGGATCTCG
+GACCCGGAGACTGTGGGCGACCTGGCCCGTCCGTGGGGGATGAGGGGTCCTGACCTGCGC
+CCCCGGCCGGGGTCACTCACCGGCCTCGCTCTGGTTGTAGTAGCCGCGCAGGGTCCCCAG
+GTCCACTCGGTCAGTCTGTGACTGGGCCTTCACATTCCGTGTCTCCTGGTCCCAATACTC
+CGGCCCCTCCTGCTCTATCCACGGCGCCCGCGGCTCCATCCTCTGGCTCGCGGCGTCGCT
+GTCGAACCGCACGAACTGCGTGTCGTCCACGTAGCCCACGGCGATGAAGCGGGGCTCCCC
+GCGGCCGGGCCGGGACACGGATGTGAAGAAATACCTCATGGAGTGGGAGCCTGGGGGCGA
+GCAGTGGCTGAGACCTGCCCGACCCTCGTCCCGGCGCGGCTCCCCCGGTCCTGCGCCCCC
+GCCAGGAGGGCCCCTTGCTTCTCCCCGCAGAGGCGGTTTCCCTCCCGACCCCGCACTCAC
+CCGCCCAGGTCTGGGTCAGGGCCAGGGCCCCCGAGAGTAGCAGGAGGAGGGTTCGGGGCG
+CCATGACGGCCATCCTCGGCGTCTGGGGAGAATCTGAGTCCCGGTGGGTGCGTGCGGGCT
+TTAGAACAGCGACCGCGACGACACTGATTGGCTTCTCTGGAAACCCGACACCCAATGGGA
+GTGAGAACTGGGTCCGCGTCGTGAGTATCCA
+\ No newline at end of file
diff --git a/tests/data/chunking/chunking.subreadset.xml b/tests/data/chunking/chunking.subreadset.xml

new file mode 100644 (file)

index 0000000..6d15ff1
--- /dev/null
+++ b/tests/data/chunking/chunking.subreadset.xml
@@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet 
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" 
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd" 
+    UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" 
+    TimeStampedName="subreadset_150304_231155" 
+    MetaType="PacBio.DataSet.SubreadSet" 
+    Name="DataSet_SubreadSet" 
+    Tags="" 
+    Version="3.0.0" 
+    CreatedAt="2015-01-27T09:00:01"> 
+<pbbase:ExternalResources>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam">
+        <pbbase:FileIndices>
+            <pbbase:FileIndex 
+                UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5194" 
+                TimeStampedName="bam_index_150304_231155" 
+                MetaType="PacBio.Index.PacBioIndex" 
+                ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi"/>
+        </pbbase:FileIndices>
+    </pbbase:ExternalResource>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5197" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam">
+        <pbbase:FileIndices>
+            <pbbase:FileIndex 
+                UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5198" 
+                TimeStampedName="bam_index_150304_231155" 
+                MetaType="PacBio.Index.PacBioIndex" 
+                ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi"/>
+        </pbbase:FileIndices>
+    </pbbase:ExternalResource><pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5195" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam">
+        <pbbase:FileIndices>
+            <pbbase:FileIndex 
+                UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5196" 
+                TimeStampedName="bam_index_150304_231155" 
+                MetaType="PacBio.Index.PacBioIndex" 
+                ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi"/>
+        </pbbase:FileIndices>
+    </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+<pbds:Filters>
+    <pbds:Filter>
+        <pbbase:Properties>
+            <pbbase:Property Name="movie" Operator="=" Value="m150404_101626_42267_c100807920800000001823174110291514_s1_p0"/>
+            <pbbase:Property Name="zm" Operator="lt" Value="1816"/>
+        </pbbase:Properties>
+    </pbds:Filter>
+</pbds:Filters>
+</pbds:SubreadSet>
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam

new file mode 100644 (file)

index 0000000..c4ec7ea

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam differ
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..4af87e2

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi differ
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam

new file mode 100644 (file)

index 0000000..e623aca

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam differ
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..6479979

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi differ
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam

new file mode 100644 (file)

index 0000000..8544f6a

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam differ
diff --git a/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..a9f4edb

Binary files /dev/null and b/tests/data/chunking/m150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi differ
diff --git a/tests/data/dataset/ali1.xml b/tests/data/dataset/ali1.xml

new file mode 100644 (file)

index 0000000..ab0a82a
--- /dev/null
+++ b/tests/data/dataset/ali1.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:AlignmentSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.AlignmentSet" Name="DataSet_AlignmentSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0"  xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First Alignments BAM" Description="Points to an example Alignments BAM file." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:///mnt/path/to/alignments0.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments0.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource Name="Second Alignments BAM" Description="Points to another example Alignments BAM file, by relative path." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:./alignments1.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments1.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSets>
+               <pbds:DataSet UniqueId="ab95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="HighQuality Read Alignments">
+                       <pbds:Filters> <!-- These Filters are in addition to those above. This provides a means to subset and label the parent DataSet further. -->
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="rq" Value="0.85" Operator=">"/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+               <pbds:DataSet UniqueId="ac95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="Alignments to chromosome 1">
+                       <pbds:Filters>
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="RNAME" Value="chr1" Operator="=="/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+       </pbds:DataSets>
+</pbds:AlignmentSet>
diff --git a/tests/data/dataset/ali2.xml b/tests/data/dataset/ali2.xml

new file mode 100644 (file)

index 0000000..c35f9ec
--- /dev/null
+++ b/tests/data/dataset/ali2.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:AlignmentSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.AlignmentSet" Name="DataSet_AlignmentSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First Alignments BAM" Description="Points to an example Alignments BAM file." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:///mnt/path/to/alignments2.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments2.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource Name="Second Alignments BAM" Description="Points to another example Alignments BAM file, by relative path." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:./alignments3.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments3.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSets>
+               <pbds:DataSet UniqueId="ab95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="HighQuality Read Alignments">
+                       <pbds:Filters> <!-- These Filters are in addition to those above. This provides a means to subset and label the parent DataSet further. -->
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="rq" Value="0.85" Operator=">"/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+               <pbds:DataSet UniqueId="ac95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="Alignments to chromosome 1">
+                       <pbds:Filters>
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="RNAME" Value="chr1" Operator="=="/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+       </pbds:DataSets>
+</pbds:AlignmentSet>
diff --git a/tests/data/dataset/ali3.xml b/tests/data/dataset/ali3.xml

new file mode 100644 (file)

index 0000000..f58d25f
--- /dev/null
+++ b/tests/data/dataset/ali3.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:AlignmentSet CreatedAt="2015-01-27T09:00:01"  MetaType="PacBio.DataSet.AlignmentSet" Name="DataSet_AlignmentSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First Alignments BAM" Description="Points to an example Alignments BAM file." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:///mnt/path/to/alignments2.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments2.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource Name="Second Alignments BAM" Description="Points to another example Alignments BAM file, by relative path." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:./alignments3.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments3.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSets>
+               <pbds:DataSet UniqueId="ab95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="HighQuality Read Alignments">
+                       <pbds:Filters> <!-- These Filters are in addition to those above. This provides a means to subset and label the parent DataSet further. -->
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="rq" Value="0.75" Operator=">"/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+               <pbds:DataSet UniqueId="ac95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="Alignments to chromosome 1">
+                       <pbds:Filters>
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="RNAME" Value="chr1" Operator="=="/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+       </pbds:DataSets>
+</pbds:AlignmentSet>
diff --git a/tests/data/dataset/ali4.xml b/tests/data/dataset/ali4.xml

new file mode 100644 (file)

index 0000000..ab0a82a
--- /dev/null
+++ b/tests/data/dataset/ali4.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:AlignmentSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.AlignmentSet" Name="DataSet_AlignmentSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0"  xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First Alignments BAM" Description="Points to an example Alignments BAM file." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:///mnt/path/to/alignments0.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments0.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource Name="Second Alignments BAM" Description="Points to another example Alignments BAM file, by relative path." MetaType="AlignmentFile.AlignmentBamFile" ResourceId="file:./alignments1.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:///mnt/path/to/alignments1.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSets>
+               <pbds:DataSet UniqueId="ab95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="HighQuality Read Alignments">
+                       <pbds:Filters> <!-- These Filters are in addition to those above. This provides a means to subset and label the parent DataSet further. -->
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="rq" Value="0.85" Operator=">"/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+               <pbds:DataSet UniqueId="ac95d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" Name="Alignments to chromosome 1">
+                       <pbds:Filters>
+                               <pbds:Filter>
+                    <pbbase:Properties>
+                        <pbbase:Property Name="RNAME" Value="chr1" Operator="=="/>
+                    </pbbase:Properties>
+                </pbds:Filter>
+                       </pbds:Filters>
+               </pbds:DataSet>
+       </pbds:DataSets>
+</pbds:AlignmentSet>
diff --git a/tests/data/dataset/bam_mapping.bam b/tests/data/dataset/bam_mapping.bam

new file mode 100644 (file)

index 0000000..2d4ae7b

Binary files /dev/null and b/tests/data/dataset/bam_mapping.bam differ
diff --git a/tests/data/dataset/bam_mapping.bam.pbi b/tests/data/dataset/bam_mapping.bam.pbi

new file mode 100644 (file)

index 0000000..fe7c3be

Binary files /dev/null and b/tests/data/dataset/bam_mapping.bam.pbi differ
diff --git a/tests/data/dataset/bam_mapping_1.bam b/tests/data/dataset/bam_mapping_1.bam

new file mode 100644 (file)

index 0000000..1e9670e

Binary files /dev/null and b/tests/data/dataset/bam_mapping_1.bam differ
diff --git a/tests/data/dataset/bam_mapping_1.bam.pbi b/tests/data/dataset/bam_mapping_1.bam.pbi

new file mode 100644 (file)

index 0000000..d99a174

Binary files /dev/null and b/tests/data/dataset/bam_mapping_1.bam.pbi differ
diff --git a/tests/data/dataset/bam_mapping_2.bam b/tests/data/dataset/bam_mapping_2.bam

new file mode 100644 (file)

index 0000000..09678ea

Binary files /dev/null and b/tests/data/dataset/bam_mapping_2.bam differ
diff --git a/tests/data/dataset/bam_mapping_2.bam.pbi b/tests/data/dataset/bam_mapping_2.bam.pbi

new file mode 100644 (file)

index 0000000..d1765ef

Binary files /dev/null and b/tests/data/dataset/bam_mapping_2.bam.pbi differ
diff --git a/tests/data/dataset/bam_mapping_new.bam b/tests/data/dataset/bam_mapping_new.bam

new file mode 100644 (file)

index 0000000..3039331

Binary files /dev/null and b/tests/data/dataset/bam_mapping_new.bam differ
diff --git a/tests/data/dataset/bam_mapping_new.bam.pbi b/tests/data/dataset/bam_mapping_new.bam.pbi

new file mode 100644 (file)

index 0000000..82d497c

Binary files /dev/null and b/tests/data/dataset/bam_mapping_new.bam.pbi differ
diff --git a/tests/data/dataset/bam_mapping_staggered.xml b/tests/data/dataset/bam_mapping_staggered.xml

new file mode 100644 (file)

index 0000000..879c193
--- /dev/null
+++ b/tests/data/dataset/bam_mapping_staggered.xml
@@ -0,0 +1,35 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<pbds:DataSet CreatedAt="2015-05-13T10:58:26" MetaType="PacBio.DataSet.DataSet" Name="" Tags="" UniqueId="30f72098-bc5b-e06b-566c-8b28dda909a8" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"  xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource ResourceId="file:tests/data/bam_mapping_1.bam">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:tests/data/bam_mapping_1.bam.bai"/>
+                       </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+        <pbbase:ExternalResource ResourceId="file:tests/data/bam_mapping_2.bam">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex ResourceId="file:tests/data/bam_mapping_2.bam.bai"/>
+                       </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+    </pbbase:ExternalResources>
+    <pbds:DataSets>
+        <pbds:DataSet CreatedAt="2015-05-13T10:58:26" UniqueId="c5402d06-4643-057c-e300-fe229b4e8909" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+            <pbbase:ExternalResources>
+                <pbbase:ExternalResource ResourceId="file:tests/data/bam_mapping_2.bam">
+                               <pbbase:FileIndices>
+                                       <pbbase:FileIndex ResourceId="file:tests/data/bam_mapping_2.bam.bai"/>
+                               </pbbase:FileIndices>
+                </pbbase:ExternalResource>
+            </pbbase:ExternalResources>
+        </pbds:DataSet>
+        <pbds:DataSet CreatedAt="2015-05-13T10:58:26" UniqueId="f8b54a55-5fb7-706f-ab35-39afc9c86924" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+            <pbbase:ExternalResources>
+                <pbbase:ExternalResource ResourceId="file:tests/data/bam_mapping_1.bam">
+                               <pbbase:FileIndices>
+                                       <pbbase:FileIndex ResourceId="file:tests/data/bam_mapping_1.bam.bai"/>
+                               </pbbase:FileIndices>
+                </pbbase:ExternalResource>
+            </pbbase:ExternalResources>
+        </pbds:DataSet>
+    </pbds:DataSets>
+</pbds:DataSet>
diff --git a/tests/data/dataset/barcode.dataset.xml b/tests/data/dataset/barcode.dataset.xml

new file mode 100644 (file)

index 0000000..1fbbb18
--- /dev/null
+++ b/tests/data/dataset/barcode.dataset.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:BarcodeSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.BarcodeSet" Name="DataSet_BarcodeSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First Barcodes FASTA" Description="Points to an example Barcodes FASTA file." MetaType="BarcodeFile.BarcodeFastaFile" ResourceId="file:///mnt/path/to/barcode.fasta" Tags="Example"/>
+       </pbbase:ExternalResources>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>400</pbds:TotalLength>
+               <pbds:NumRecords>30</pbds:NumRecords>
+               <pbds:BarcodeConstruction>paired</pbds:BarcodeConstruction>
+       </pbds:DataSetMetadata>
+</pbds:BarcodeSet>
diff --git a/tests/data/dataset/ccsread.dataset.xml b/tests/data/dataset/ccsread.dataset.xml

new file mode 100644 (file)

index 0000000..97b5943
--- /dev/null
+++ b/tests/data/dataset/ccsread.dataset.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:ConsensusReadSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.ConsensusReadSet" Name="DataSet_ConsensusReadSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First ConsensusRead BAM" Description="Points to an example ConsensusRead BAM file." MetaType="PacBio.ConsensusReadFile.ConsensusReadBamFile" ResourceId="file:///mnt/path/to/ccsreads0.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="file:///mnt/path/to/ccsreads0.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource Name="Second ConsensusRead BAM" Description="Points to another example ConsensusRead BAM file." MetaType="PacBio.ConsensusReadFile.ConsensusReadBamFile" ResourceId="file:///mnt/path/to/ccsreads1.bam" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="file:///mnt/path/to/ccsreads0.pbi"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+</pbds:ConsensusReadSet>
diff --git a/tests/data/dataset/lambda_contigs.xml b/tests/data/dataset/lambda_contigs.xml

new file mode 100644 (file)

index 0000000..4abc8cc
--- /dev/null
+++ b/tests/data/dataset/lambda_contigs.xml
@@ -0,0 +1,6 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<pbds:ReferenceSet CreatedAt="2015-05-28T10:56:36" MetaType="PacBio.DataSet.ReferenceSet" Name="" Tags="" UniqueId="596e87db-34f9-d2fd-c905-b017543170e1" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource ResourceId="file:tests/data/lambda_contigs.fasta"/>
+    </pbbase:ExternalResources>
+</pbds:ReferenceSet>
+\ No newline at end of file
diff --git a/tests/data/dataset/malformed.xml b/tests/data/dataset/malformed.xml

new file mode 100644 (file)

index 0000000..31e0942
--- /dev/null
+++ b/tests/data/dataset/malformed.xml
@@ -0,0 +1,85 @@
+<?xml version="1.0" encoding="utf-8"?>
+<SubreadSet 
+    CreatedAt="2015-08-19T15:39:36.331"
+    Description="Merged dataset from 1 files using DatasetMerger 0.1.2" 
+    MetaType="PacBio.DataSet.HdfSubreadSet" 
+    Name="Subreads from runr000013_42267_150403" 
+    Tags="pacbio.secondary.instrument=RS" 
+    TimeStampedName="hdfsubreadset_2015-08-19T15:39:36.331-07:00" 
+    UniqueId="b4741521-2a4c-42df-8a13-0a755ca9ed1e"
+    Version="0.5" 
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" 
+    xmlns:ns0="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:ns1="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+    xmlns:ns2="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:ns3="http://pacificbiosciences.com/PacBioReagentKit.xsd">
+       <ns0:ExternalResources>
+        <ns0:ExternalResource 
+            MetaType="SubreadFile.SubreadBamFile"
+            TimeStampedName="SubreadFile.SubreadBamFile_00000000000000"
+            UniqueId="251acf71-9eb0-489e-9dd1-cdbd11432753" 
+            ResourceId="file:///mnt/secondary-siv/jenkins/jenkins-bot01/workspace/Ubuntu1404_Mainline_SA3_Tiny_tests/software/smrtanalysis/siv/testkit-jobs/sa3_pipelines/mapping/tiny/job_output-ubuntu1404/tasks/pbsmrtpipe.tasks.h5_subreads_to_subread-0//mnt/secondary-siv/jenkins/jenkins-bot01/workspace/Ubuntu1404_Mainline_SA3_Tiny_tests/software/smrtanalysis/siv/testkit-jobs/sa3_pipelines/mapping/tiny/job_output-ubuntu1404/tasks/pbsmrtpipe.tasks.h5_subreads_to_subread-0/file.subreads.subreads.bam"  />
+    </ns0:ExternalResources>
+    <DataSetMetadata>
+        <TotalLength>50000000</TotalLength>
+        <NumRecords>150000</NumRecords>
+        <ns2:Collections>
+            <ns2:CollectionMetadata 
+                Context="m150404_101626_42267_c100807920800000001823174110291514_s1_p0" 
+                InstrumentId="1" 
+                InstrumentName="42267" 
+                MetaType="PacBio.Collection" 
+                TimeStampedName="m150404_101626_42267_c100807920800000001823174110291514_s1_p0" 
+                UniqueId="d66c8372-2b70-4dcf-b64f-9f8b5cc351fd">
+                <ns2:InstCtrlVer>2.3.0.1.142990</ns2:InstCtrlVer>
+                <ns2:SigProcVer>NRT@172.31.128.10:8082, SwVer=2301.142990, HwVer=1.0</ns2:SigProcVer>
+                <ns2:RunDetails>
+                    <ns2:RunId>r000013_42267_150403</ns2:RunId>
+                    <ns2:Name>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:Name>
+                </ns2:RunDetails>
+                <ns2:WellSample Name="Inst42267-040315-SAT-100pM-2kb-P6C4">
+                    <ns2:PlateId>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:PlateId>
+                    <ns2:WellName>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:WellName>
+                    <ns2:Concentration>0.0</ns2:Concentration>                         
+                    <ns2:SampleReuseEnabled>false</ns2:SampleReuseEnabled>
+                    <ns2:StageHotstartEnabled>false</ns2:StageHotstartEnabled>
+                    <ns2:SizeSelectionEnabled>false</ns2:SizeSelectionEnabled>
+                    <ns2:UseCount>1</ns2:UseCount>
+                    <ns1:BioSamplePointers>
+                        <ns1:BioSamplePointer>251acf71-9eb0-489e-9dd1-cdbd11432752</ns1:BioSamplePointer>
+                    </ns1:BioSamplePointers>
+                </ns2:WellSample>
+                <ns2:Automation>
+                    <ns0:AutomationParameters>
+                        <ns0:AutomationParameter />
+                    </ns0:AutomationParameters>
+                </ns2:Automation>
+                <ns2:CollectionNumber>7</ns2:CollectionNumber>
+                <ns2:CellIndex>4</ns2:CellIndex>
+                <ns2:CellPac Barcode="10080792080000000182317411029151" />
+                <ns2:Primary>
+                    <ns2:AutomationName>BasecallerV1</ns2:AutomationName>
+                    <ns2:ConfigFileName>2-3-0_P6-C4.xml</ns2:ConfigFileName>
+                    <ns2:SequencingCondition />
+                    <ns2:OutputOptions>
+                        <ns2:ResultsFolder>Analysis_Results</ns2:ResultsFolder>
+                        <ns2:CollectionPathUri>rsy://mp-rsync/vol55//RS_DATA_STAGING/42267/Inst42267-040315-SAT-100pM-2kb-P6C4_13/A04_7/</ns2:CollectionPathUri>
+                        <ns2:CopyFiles>
+                            <ns2:CollectionFileCopy>Fasta</ns2:CollectionFileCopy>
+                        </ns2:CopyFiles>
+                        <ns2:Readout>Bases</ns2:Readout>
+                        <ns2:MetricsVerbosity>Minimal</ns2:MetricsVerbosity>
+                     </ns2:OutputOptions>
+                 </ns2:Primary>
+             </ns2:CollectionMetadata>
+         </ns2:Collections>
+         <ns1:BioSamples>
+             <ns1:BioSample
+                 Description="Inst42267-SAT-100pM-2kbLambda-P6C4-Std120_CPS_040315"
+                 MetaType="PacBio.Sample" 
+                 Name="Inst42267-040315-SAT-100pM-2kb-P6C4" 
+                 TimeStampedName="biosample_2015-08-19T15:39:36.331-07:00" 
+                 UniqueId="251acf71-9eb0-489e-9dd1-cdbd11432752" />
+         </ns1:BioSamples>
+      </DataSetMetadata>
+</SubreadSet>
diff --git a/tests/data/dataset/pbalchemy10kbp.xml b/tests/data/dataset/pbalchemy10kbp.xml

new file mode 100644 (file)

index 0000000..96189ad
--- /dev/null
+++ b/tests/data/dataset/pbalchemy10kbp.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:DataSet CreatedAt="2015-05-22T16:56:16" MetaType="PacBio.DataSet.DataSet" Name="" Tags="" UniqueId="58e3f7c5-24c1-b58b-fbd5-37de268cc2f0" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+  <pbbase:ExternalResources>
+    <pbbase:ExternalResource ResourceId="file:tests/data/pbalchemy10kbp.pbalign.sorted.pbver1.bam">
+      <pbbase:FileIndices>
+        <pbbase:FileIndex ResourceId="file:tests/data/pbalchemy10kbp.pbalign.sorted.pbver1.bam.bai"/>
+      </pbbase:FileIndices>
+    </pbbase:ExternalResource>
+  </pbbase:ExternalResources>
+  <pbds:Filters>
+      <pbds:Filter>
+          <pbbase:Properties>
+              <pbbase:Property Name="rname" Value="E.faecalis.1" Operator="=" />
+          </pbbase:Properties>
+      </pbds:Filter>
+  </pbds:Filters>
+</pbds:DataSet>
diff --git a/tests/data/dataset/reference.dataset.xml b/tests/data/dataset/reference.dataset.xml

new file mode 100644 (file)

index 0000000..3cfbe8c
--- /dev/null
+++ b/tests/data/dataset/reference.dataset.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:ReferenceSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.ReferenceSet" Name="DataSet_ReferenceSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0"  xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource Name="First References FASTA" Description="Points to an example references FASTA file." MetaType="PacBio.ReferenceFile.ReferenceFastaFile" ResourceId="file:///mnt/path/to/reference.fasta" Tags="Example">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.SaWriterIndex" ResourceId="file:///mnt/path/to/reference.fasta.sa"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.SamIndex" ResourceId="file:///mnt/path/to/reference.fasta.fai"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>5000000</pbds:TotalLength>
+               <pbds:NumRecords>500</pbds:NumRecords>
+               <pbds:Organism>Tribble</pbds:Organism>
+               <pbds:Ploidy>Diploid</pbds:Ploidy>
+               <pbds:Contigs>
+                       <pbds:Contig Name="gi|229359445|emb|AM181176.4|" Description="Pseudomonas fluorescens SBW25 complete genome|quiver" Length="6722109" Digest="f627c795efad7ce0050ed42b942d408e"/>
+               </pbds:Contigs>
+       </pbds:DataSetMetadata>
+</pbds:ReferenceSet>
diff --git a/tests/data/dataset/subread_dataset1.xml b/tests/data/dataset/subread_dataset1.xml

new file mode 100644 (file)

index 0000000..1d64e79
--- /dev/null
+++ b/tests/data/dataset/subread_dataset1.xml
@@ -0,0 +1,76 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.SubreadSet" Name="DataSet_SubreadSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0"  xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd" xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd" >
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource Name="First Subreads BAM" Description="Points to an example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads0.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads0.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+        <pbbase:ExternalResource Name="Second Subreads BAM" Description="Points to another example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads1.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads0.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+    </pbbase:ExternalResources>
+    <pbds:Filters>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="rq" Value="0.75" Operator=">"/>
+            </pbbase:Properties>
+        </pbds:Filter>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="QNAME" Value="100/0/0_100" Operator="=="/>
+            </pbbase:Properties>
+        </pbds:Filter>
+    </pbds:Filters>
+    <pbds:DataSetMetadata>
+        <pbds:TotalLength>500000</pbds:TotalLength>
+        <pbds:NumRecords>500</pbds:NumRecords>
+        <pbmeta:Collections>
+            <pbmeta:CollectionMetadata Context="m152720_092723_00114_c100480560100000001823075906281381_s1_p0" InstrumentName="RS" InstrumentId="43210">
+                <pbmeta:InstCtrlVer>2.3.0.0.140640</pbmeta:InstCtrlVer>
+                <pbmeta:SigProcVer>NRT@172.31.128.10:8082, SwVer=2300.140640, HwVer=1.0</pbmeta:SigProcVer>
+                <pbmeta:RunDetails>
+                    <pbmeta:RunId>e903682f-e502-465c-a2b6-9dd77c9f43fc</pbmeta:RunId>
+                    <pbmeta:Name>beta4_130726_biotin_DEV_vs_MFG_PB11K_9458p</pbmeta:Name>
+                </pbmeta:RunDetails>
+                <pbmeta:WellSample Name="Well Sample 1" UniqueId="aaa2df90-d44f-4a48-9f35-3b99473c68f5">
+                    <pbmeta:PlateId>2014-12-24_141_NGAT_Igor_bisPNA Enrichment_Mag Bead Elution Buffers</pbmeta:PlateId>
+                    <pbmeta:WellName>B01</pbmeta:WellName>
+                    <pbmeta:Concentration>10</pbmeta:Concentration>
+                    <pbmeta:SampleReuseEnabled>true</pbmeta:SampleReuseEnabled>
+                    <pbmeta:StageHotstartEnabled>true</pbmeta:StageHotstartEnabled>
+                    <pbmeta:SizeSelectionEnabled>true</pbmeta:SizeSelectionEnabled>
+                    <pbmeta:UseCount>0</pbmeta:UseCount>
+                    <pbmeta:Comments>Lorem ipsum</pbmeta:Comments>
+                    <pbsample:BioSamplePointers>
+                        <pbsample:BioSamplePointer>abc2df90-d44f-4a48-9f35-3b99473c68f5</pbsample:BioSamplePointer>
+                    </pbsample:BioSamplePointers>
+                </pbmeta:WellSample>
+                <pbmeta:Automation>
+                    <pbbase:AutomationParameters>
+                                   <pbbase:AutomationParameter/>
+                    </pbbase:AutomationParameters>
+                </pbmeta:Automation>
+                <pbmeta:CollectionNumber>0</pbmeta:CollectionNumber>
+                <pbmeta:CellIndex>0</pbmeta:CellIndex>
+                <pbmeta:CellPac Barcode="100480560100000001823075906281381"/>
+                <pbmeta:Primary>
+                    <pbmeta:AutomationName>BasecallerV1</pbmeta:AutomationName>
+                    <pbmeta:ConfigFileName>1-3-0_Standard_C2.xml</pbmeta:ConfigFileName>
+                    <pbmeta:SequencingCondition/>
+                    <pbmeta:ResultsFolder>Analysis_Results</pbmeta:ResultsFolder>
+                    <pbmeta:CollectionPathUri>rsy://mp-rsync/vol56//RS_DATA_STAGING//2014-12-24_141_NGAT_Igor_bisPNA%20Enrichment_Mag%20Bead%20Elution%20Buffers_1094/B01_1</pbmeta:CollectionPathUri>
+                    <pbmeta:CopyFiles>
+                        <pbmeta:CollectionFileCopy>Bam</pbmeta:CollectionFileCopy>
+                    </pbmeta:CopyFiles>
+                </pbmeta:Primary>
+            </pbmeta:CollectionMetadata>
+        </pbmeta:Collections>
+        <pbsample:BioSamples>
+            <pbsample:BioSample UniqueId="abc2df90-d44f-4a48-9f35-3b99473c68f5" Name="consectetur purus" Description="Risus sit amet lectus vehicula vulputate quisque porta accumsan venenatis." CreatedAt="2015-01-20T13:27:23.9271737-08:00"/>
+        </pbsample:BioSamples>
+    </pbds:DataSetMetadata>
+</pbds:SubreadSet>
+<!-- TODO what do internal references look like?-->
diff --git a/tests/data/dataset/subread_dataset2.xml b/tests/data/dataset/subread_dataset2.xml

new file mode 100644 (file)

index 0000000..a395330
--- /dev/null
+++ b/tests/data/dataset/subread_dataset2.xml
@@ -0,0 +1,76 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.SubreadSet" Name="DataSet_SubreadSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd" xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd" >
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource Name="First Subreads BAM" Description="Points to an example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads2.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads2.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+        <pbbase:ExternalResource Name="Second Subreads BAM" Description="Points to another example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads3.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads3.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+    </pbbase:ExternalResources>
+    <pbds:Filters>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="rq" Value="0.75" Operator=">"/>
+            </pbbase:Properties>
+        </pbds:Filter>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="QNAME" Value="100/0/0_100" Operator="=="/>
+            </pbbase:Properties>
+        </pbds:Filter>
+    </pbds:Filters>
+    <pbds:DataSetMetadata>
+        <pbds:TotalLength>500000</pbds:TotalLength>
+        <pbds:NumRecords>500</pbds:NumRecords>
+        <pbmeta:Collections>
+            <pbmeta:CollectionMetadata Context="m152720_092723_00114_c100480560100000001823075906281381_s1_p0" InstrumentName="RS" InstrumentId="43210">
+                <pbmeta:InstCtrlVer>2.3.0.0.140640</pbmeta:InstCtrlVer>
+                <pbmeta:SigProcVer>NRT@172.31.128.10:8082, SwVer=2300.140640, HwVer=1.0</pbmeta:SigProcVer>
+                <pbmeta:RunDetails>
+                    <pbmeta:RunId>e903682f-e502-465c-a2b6-9dd77c9f43fc</pbmeta:RunId>
+                    <pbmeta:Name>beta4_130726_biotin_DEV_vs_MFG_PB11K_9458p</pbmeta:Name>
+                </pbmeta:RunDetails>
+                <pbmeta:WellSample Name="Well Sample 1" UniqueId="aaa2df90-d44f-4a48-9f35-3b99473c68f5">
+                    <pbmeta:PlateId>2014-12-24_141_NGAT_Igor_bisPNA Enrichment_Mag Bead Elution Buffers</pbmeta:PlateId>
+                    <pbmeta:WellName>B01</pbmeta:WellName>
+                    <pbmeta:Concentration>10</pbmeta:Concentration>
+                    <pbmeta:SampleReuseEnabled>true</pbmeta:SampleReuseEnabled>
+                    <pbmeta:StageHotstartEnabled>true</pbmeta:StageHotstartEnabled>
+                    <pbmeta:SizeSelectionEnabled>true</pbmeta:SizeSelectionEnabled>
+                    <pbmeta:UseCount>0</pbmeta:UseCount>
+                    <pbmeta:Comments>Lorem ipsum</pbmeta:Comments>
+                    <pbsample:BioSamplePointers>
+                        <pbsample:BioSamplePointer>abc2df90-d44f-4a48-9f35-3b99473c68f5</pbsample:BioSamplePointer>
+                    </pbsample:BioSamplePointers>
+                </pbmeta:WellSample>
+                <pbmeta:Automation>
+                    <pbbase:AutomationParameters>
+                            <pbbase:AutomationParameter/>
+                    </pbbase:AutomationParameters>
+                </pbmeta:Automation>
+                <pbmeta:CollectionNumber>0</pbmeta:CollectionNumber>
+                <pbmeta:CellIndex>0</pbmeta:CellIndex>
+                <pbmeta:CellPac Barcode="100480560100000001823075906281381"/>
+                <pbmeta:Primary>
+                    <pbmeta:AutomationName>BasecallerV1</pbmeta:AutomationName>
+                    <pbmeta:ConfigFileName>1-3-0_Standard_C2.xml</pbmeta:ConfigFileName>
+                    <pbmeta:SequencingCondition/>
+                    <pbmeta:ResultsFolder>Analysis_Results</pbmeta:ResultsFolder>
+                    <pbmeta:CollectionPathUri>rsy://mp-rsync/vol56//RS_DATA_STAGING//2014-12-24_141_NGAT_Igor_bisPNA%20Enrichment_Mag%20Bead%20Elution%20Buffers_1094/B01_1</pbmeta:CollectionPathUri>
+                    <pbmeta:CopyFiles>
+                        <pbmeta:CollectionFileCopy>Bam</pbmeta:CollectionFileCopy>
+                    </pbmeta:CopyFiles>
+                </pbmeta:Primary>
+            </pbmeta:CollectionMetadata>
+        </pbmeta:Collections>
+        <pbsample:BioSamples>
+            <pbsample:BioSample UniqueId="abc2df90-d44f-4a48-9f35-3b99473c68f5" Name="consectetur purus" Description="Risus sit amet lectus vehicula vulputate quisque porta accumsan venenatis." CreatedAt="2015-01-20T13:27:23.9271737-08:00"/>
+        </pbsample:BioSamples>
+    </pbds:DataSetMetadata>
+</pbds:SubreadSet>
+<!-- TODO what do internal references look like?-->
diff --git a/tests/data/dataset/subread_dataset3.xml b/tests/data/dataset/subread_dataset3.xml

new file mode 100644 (file)

index 0000000..91923a8
--- /dev/null
+++ b/tests/data/dataset/subread_dataset3.xml
@@ -0,0 +1,76 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet CreatedAt="2015-01-27T09:00:01" MetaType="PacBio.DataSet.SubreadSet" Name="DataSet_SubreadSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0"  xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd" xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd" >
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource Name="First Subreads BAM" Description="Points to an example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads2.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads2.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+        <pbbase:ExternalResource Name="Second Subreads BAM" Description="Points to another example Subreads BAM file." MetaType="SubreadFile.SubreadBamFile" ResourceId="file:///mnt/path/to/subreads3.bam" Tags="Example">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex ResourceId="file:///mnt/path/to/subreads3.pbi"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+    </pbbase:ExternalResources>
+    <pbds:Filters>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="rq" Value="0.85" Operator=">"/>
+            </pbbase:Properties>
+        </pbds:Filter>
+        <pbds:Filter>
+            <pbbase:Properties>
+                <pbbase:Property Name="QNAME" Value="100/0/0_100" Operator="=="/>
+            </pbbase:Properties>
+        </pbds:Filter>
+    </pbds:Filters>
+    <pbds:DataSetMetadata>
+        <pbds:TotalLength>500000</pbds:TotalLength>
+        <pbds:NumRecords>500</pbds:NumRecords>
+        <pbmeta:Collections>
+            <pbmeta:CollectionMetadata Context="m152720_092723_00114_c100480560100000001823075906281381_s1_p0" InstrumentName="RS" InstrumentId="43210">
+                <pbmeta:InstCtrlVer>2.3.0.0.140640</pbmeta:InstCtrlVer>
+                <pbmeta:SigProcVer>NRT@172.31.128.10:8082, SwVer=2300.140640, HwVer=1.0</pbmeta:SigProcVer>
+                <pbmeta:RunDetails>
+                    <pbmeta:RunId>e903682f-e502-465c-a2b6-9dd77c9f43fc</pbmeta:RunId>
+                    <pbmeta:Name>beta4_130726_biotin_DEV_vs_MFG_PB11K_9458p</pbmeta:Name>
+                </pbmeta:RunDetails>
+                <pbmeta:WellSample Name="Well Sample 1" UniqueId="aaa2df90-d44f-4a48-9f35-3b99473c68f5">
+                    <pbmeta:PlateId>2014-12-24_141_NGAT_Igor_bisPNA Enrichment_Mag Bead Elution Buffers</pbmeta:PlateId>
+                    <pbmeta:WellName>B01</pbmeta:WellName>
+                    <pbmeta:Concentration>10</pbmeta:Concentration>
+                    <pbmeta:SampleReuseEnabled>true</pbmeta:SampleReuseEnabled>
+                    <pbmeta:StageHotstartEnabled>true</pbmeta:StageHotstartEnabled>
+                    <pbmeta:SizeSelectionEnabled>true</pbmeta:SizeSelectionEnabled>
+                    <pbmeta:UseCount>0</pbmeta:UseCount>
+                    <pbmeta:Comments>Lorem ipsum</pbmeta:Comments>
+                    <pbsample:BioSamplePointers>
+                        <pbsample:BioSamplePointer>abc2df90-d44f-4a48-9f35-3b99473c68f5</pbsample:BioSamplePointer>
+                    </pbsample:BioSamplePointers>
+                </pbmeta:WellSample>
+                <pbmeta:Automation>
+                    <pbbase:AutomationParameters>
+                                   <pbbase:AutomationParameter/>
+                    </pbbase:AutomationParameters>
+                </pbmeta:Automation>
+                <pbmeta:CollectionNumber>0</pbmeta:CollectionNumber>
+                <pbmeta:CellIndex>0</pbmeta:CellIndex>
+                <pbmeta:CellPac Barcode="100480560100000001823075906281381"/>
+                <pbmeta:Primary>
+                    <pbmeta:AutomationName>BasecallerV1</pbmeta:AutomationName>
+                    <pbmeta:ConfigFileName>1-3-0_Standard_C2.xml</pbmeta:ConfigFileName>
+                    <pbmeta:SequencingCondition/>
+                    <pbmeta:ResultsFolder>Analysis_Results</pbmeta:ResultsFolder>
+                    <pbmeta:CollectionPathUri>rsy://mp-rsync/vol56//RS_DATA_STAGING//2014-12-24_141_NGAT_Igor_bisPNA%20Enrichment_Mag%20Bead%20Elution%20Buffers_1094/B01_1</pbmeta:CollectionPathUri>
+                    <pbmeta:CopyFiles>
+                        <pbmeta:CollectionFileCopy>Bam</pbmeta:CollectionFileCopy>
+                    </pbmeta:CopyFiles>
+                </pbmeta:Primary>
+            </pbmeta:CollectionMetadata>
+        </pbmeta:Collections>
+        <pbsample:BioSamples>
+            <pbsample:BioSample UniqueId="abc2df90-d44f-4a48-9f35-3b99473c68f5" Name="consectetur purus" Description="Risus sit amet lectus vehicula vulputate quisque porta accumsan venenatis." CreatedAt="2015-01-20T13:27:23.9271737-08:00"/>
+        </pbsample:BioSamples>
+    </pbds:DataSetMetadata>
+</pbds:SubreadSet>
+<!-- TODO what do internal references look like?-->
diff --git a/tests/data/dataset/transformed_rs_subread_dataset.xml b/tests/data/dataset/transformed_rs_subread_dataset.xml

new file mode 100644 (file)

index 0000000..0750655
--- /dev/null
+++ b/tests/data/dataset/transformed_rs_subread_dataset.xml
@@ -0,0 +1,75 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:HdfSubreadSet 
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xmlns:bax="http://whatever"
+    xmlns:fn="http://www.w3.org/2005/xpath-functions"
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd" 
+    xmlns:uuid="java:java.util.UUID" 
+    xmlns:xs="http://www.w3.org/2001/XMLSchema"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+    xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    Name="Subreads from run r001173_42129_130607"
+    MetaType="PacBio.DataSet.SubreadSet"
+    Tags="pacbio.secondary.instrument=RS"
+    Version="0.5"
+    UniqueId="abbc9183-b01e-4671-8c12-19efee534647">
+   <pbbase:ExternalResources>
+      <pbbase:ExternalResource MetaType="PacBio.SubreadFile.BaxFile"
+          ResourceId="file:///pbi/dept/secondary/siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.0.bax.h5"/>
+      <pbbase:ExternalResource MetaType="PacBio.SubreadFile.BaxFile"
+          ResourceId="file:///pbi/dept/secondary/siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.1.bax.h5"/>
+      <pbbase:ExternalResource MetaType="PacBio.SubreadFile.BaxFile"
+          ResourceId="file:///pbi/dept/secondary/siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.2.bax.h5"/>
+   </pbbase:ExternalResources>
+   <pbds:DataSetMetadata>
+      <pbds:TotalLength>50000000</pbds:TotalLength>
+      <pbds:NumRecords>150000</pbds:NumRecords>
+      <pbmeta:Collections>
+         <pbmeta:CollectionMetadata Context="m130608_033634_42129_c100515232550000001823076608221351_s1_p0"
+                             InstrumentName="42129"
+                             InstrumentId="1">
+            <pbmeta:InstCtrlVer>2.0.1.0.124174</pbmeta:InstCtrlVer>
+            <pbmeta:SigProcVer>NRT@172.31.128.10:8082, SwVer=2010.124174, HwVer=1.0</pbmeta:SigProcVer>
+            <pbmeta:RunDetails>
+               <pbmeta:RunId>r001173_42129_130607</pbmeta:RunId>
+               <pbmeta:Name>2013-06-07_42129_10kb_Ecoli_201-validation_2</pbmeta:Name>
+            </pbmeta:RunDetails>
+            <pbmeta:WellSample Name="P4-C2_Ecoli_10kb_MBS_stageHS">
+               <pbmeta:PlateId>2013-06-07_42129_10kb_Ecoli_201-validation_2</pbmeta:PlateId>
+               <pbmeta:WellName>P4-C2_Ecoli_10kb_MBS_stageHS</pbmeta:WellName>
+               <pbmeta:Concentration>0</pbmeta:Concentration>
+               <pbmeta:SampleReuseEnabled>false</pbmeta:SampleReuseEnabled>
+               <pbmeta:StageHotstartEnabled>true</pbmeta:StageHotstartEnabled>
+               <pbmeta:SizeSelectionEnabled>
+                                    false
+                                                               </pbmeta:SizeSelectionEnabled>
+               <pbmeta:UseCount>1</pbmeta:UseCount>
+               <pbmeta:Comments>P4-C2_Ecoli_10kb_MBS_stageHS</pbmeta:Comments>
+               <pbsample:BioSamplePointers>
+                  <pbsample:BioSamplePointer>abafd4ed-5cf7-4b83-a869-1a5d239d30e2</pbsample:BioSamplePointer>
+               </pbsample:BioSamplePointers>
+            </pbmeta:WellSample>
+            <pbmeta:AutomationName>MagBead Standard Seq v2</pbmeta:AutomationName>
+            <pbmeta:CollectionNumber>2</pbmeta:CollectionNumber>
+            <pbmeta:CellIndex>1</pbmeta:CellIndex>
+            <pbmeta:CellPac Barcode="10051523255000000182307660822135"/>
+            <pbmeta:Primary>
+               <pbmeta:AutomationName>BasecallerV1</pbmeta:AutomationName>
+               <pbmeta:ConfigFileName>2-0-0_P4-C2.xml</pbmeta:ConfigFileName>
+               <pbmeta:SequencingCondition/>
+               <pbmeta:ResultsFolder>Analysis_Results</pbmeta:ResultsFolder>
+               <pbmeta:CollectionPathUri>rsy://mp-f030-io/vol54//RS_DATA_STAGING/42129/2013-06-07_42129_10kb_Ecoli_201-validation_2_1173/A01_2/</pbmeta:CollectionPathUri>
+               <pbmeta:CopyFiles>
+                  <pbmeta:CollectionFileCopy>Fasta</pbmeta:CollectionFileCopy>
+               </pbmeta:CopyFiles>
+            </pbmeta:Primary>
+         </pbmeta:CollectionMetadata>
+      </pbmeta:Collections>
+      <pbsample:BioSamples>
+         <pbsample:BioSample Name="P4-C2_Ecoli_10kb_MBS_stageHS" Description="P4-C2_Ecoli_10kb_MBS_stageHS"
+                    UniqueId="abafd4ed-5cf7-4b83-a869-1a5d239d30e2"/>
+      </pbsample:BioSamples>
+   </pbds:DataSetMetadata>
+</pbds:HdfSubreadSet>
diff --git a/tests/data/empty.bam b/tests/data/empty.bam

new file mode 100644 (file)

index 0000000..1b22456

Binary files /dev/null and b/tests/data/empty.bam differ
diff --git a/tests/data/empty.bam.pbi b/tests/data/empty.bam.pbi

new file mode 100644 (file)

index 0000000..e398d79

Binary files /dev/null and b/tests/data/empty.bam.pbi differ
diff --git a/tests/data/group/group.fofn.in b/tests/data/group/group.fofn.in

new file mode 100644 (file)

index 0000000..c2621c5
--- /dev/null
+++ b/tests/data/group/group.fofn.in
@@ -0,0 +1,3 @@
+@PacBioBAM_TestsDir@/data/group/test1.bam
+@PacBioBAM_TestsDir@/data/group/test2.bam
+@PacBioBAM_TestsDir@/data/group/test3.bam
diff --git a/tests/data/group/test1.bam b/tests/data/group/test1.bam

new file mode 100644 (file)

index 0000000..2ba687b

Binary files /dev/null and b/tests/data/group/test1.bam differ
diff --git a/tests/data/group/test2.bam b/tests/data/group/test2.bam

new file mode 100644 (file)

index 0000000..9e22b30

Binary files /dev/null and b/tests/data/group/test2.bam differ
diff --git a/tests/data/group/test2.bam.pbi b/tests/data/group/test2.bam.pbi

new file mode 100644 (file)

index 0000000..761600b

Binary files /dev/null and b/tests/data/group/test2.bam.pbi differ
diff --git a/tests/data/group/test3.bam b/tests/data/group/test3.bam

new file mode 100644 (file)

index 0000000..093e93a

Binary files /dev/null and b/tests/data/group/test3.bam differ
diff --git a/tests/data/lambdaNEB.fa b/tests/data/lambdaNEB.fa

new file mode 100644 (file)

index 0000000..33011e5
--- /dev/null
+++ b/tests/data/lambdaNEB.fa
@@ -0,0 +1,608 @@
+>lambda_NEB3011
+GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTA
+ATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGACAGGTGCTGAAAGCGAGCTTTTTGGCCTCTGTCGTTTCC
+TTTCTCTGTTTTTGTCCGTGGAATGAACAATGGAAGTCAACAAAAAGCAGCTGGCTGACATTTTCGGTGCGAGTATCCGT
+ACCATTCAGAACTGGCAGGAACAGGGAATGCCCGTTCTGCGAGGCGGTGGCAAGGGTAATGAGGTGCTTTATGACTCTGC
+CGCCGTCATAAAATGGTATGCCGAAAGGGATGCTGAAATTGAGAACGAAAAGCTGCGCCGGGAGGTTGAAGAACTGCGGC
+AGGCCAGCGAGGCAGATCTCCAGCCAGGAACTATTGAGTACGAACGCCATCGACTTACGCGTGCGCAGGCCGACGCACAG
+GAACTGAAGAATGCCAGAGACTCCGCTGAAGTGGTGGAAACCGCATTCTGTACTTTCGTGCTGTCGCGGATCGCAGGTGA
+AATTGCCAGTATTCTCGACGGGCTCCCCCTGTCGGTGCAGCGGCGTTTTCCGGAACTGGAAAACCGACATGTTGATTTCC
+TGAAACGGGATATCATCAAAGCCATGAACAAAGCAGCCGCGCTGGATGAACTGATACCGGGGTTGCTGAGTGAATATATC
+GAACAGTCAGGTTAACAGGCTGCGGCATTTTGTCCGCGCCGGGCTTCGCTCACTGTTCAGGCCGGAGCCACAGACCGCCG
+TTGAATGGGCGGATGCTAATTACTATCTCCCGAAAGAATCCGCATACCAGGAAGGGCGCTGGGAAACACTGCCCTTTCAG
+CGGGCCATCATGAATGCGATGGGCAGCGACTACATCCGTGAGGTGAATGTGGTGAAGTCTGCCCGTGTCGGTTATTCCAA
+AATGCTGCTGGGTGTTTATGCCTACTTTATAGAGCATAAGCAGCGCAACACCCTTATCTGGTTGCCGACGGATGGTGATG
+CCGAGAACTTTATGAAAACCCACGTTGAGCCGACTATTCGTGATATTCCGTCGCTGCTGGCGCTGGCCCCGTGGTATGGC
+AAAAAGCACCGGGATAACACGCTCACCATGAAGCGTTTCACTAATGGGCGTGGCTTCTGGTGCCTGGGCGGTAAAGCGGC
+AAAAAACTACCGTGAAAAGTCGGTGGATGTGGCGGGTTATGATGAACTTGCTGCTTTTGATGATGATATTGAACAGGAAG
+GCTCTCCGACGTTCCTGGGTGACAAGCGTATTGAAGGCTCGGTCTGGCCAAAGTCCATCCGTGGCTCCACGCCAAAAGTG
+AGAGGCACCTGTCAGATTGAGCGTGCAGCCAGTGAATCCCCGCATTTTATGCGTTTTCATGTTGCCTGCCCGCATTGCGG
+GGAGGAGCAGTATCTTAAATTTGGCGACAAAGAGACGCCGTTTGGCCTCAAATGGACGCCGGATGACCCCTCCAGCGTGT
+TTTATCTCTGCGAGCATAATGCCTGCGTCATCCGCCAGCAGGAGCTGGACTTTACTGATGCCCGTTATATCTGCGAAAAG
+ACCGGGATCTGGACCCGTGATGGCATTCTCTGGTTTTCGTCATCCGGTGAAGAGATTGAGCCACCTGACAGTGTGACCTT
+TCACATCTGGACAGCGTACAGCCCGTTCACCACCTGGGTGCAGATTGTCAAAGACTGGATGAAAACGAAAGGGGATACGG
+GAAAACGTAAAACCTTCGTAAACACCACGCTCGGTGAGACGTGGGAGGCGAAAATTGGCGAACGTCCGGATGCTGAAGTG
+ATGGCAGAGCGGAAAGAGCATTATTCAGCGCCCGTTCCTGACCGTGTGGCTTACCTGACCGCCGGTATCGACTCCCAGCT
+GGACCGCTACGAAATGCGCGTATGGGGATGGGGGCCGGGTGAGGAAAGCTGGCTGATTGACCGGCAGATTATTATGGGCC
+GCCACGACGATGAACAGACGCTGCTGCGTGTGGATGAGGCCATCAATAAAACCTATACCCGCCGGAATGGTGCAGAAATG
+TCGATATCCCGTATCTGCTGGGATACTGGCGGGATTGACCCGACCATTGTGTATGAACGCTCGAAAAAACATGGGCTGTT
+CCGGGTGATCCCCATTAAAGGGGCATCCGTCTACGGAAAGCCGGTGGCCAGCATGCCACGTAAGCGAAACAAAAACGGGG
+TTTACCTTACCGAAATCGGTACGGATACCGCGAAAGAGCAGATTTATAACCGCTTCACACTGACGCCGGAAGGGGATGAA
+CCGCTTCCCGGTGCCGTTCACTTCCCGAATAACCCGGATATTTTTGATCTGACCGAAGCGCAGCAGCTGACTGCTGAAGA
+GCAGGTCGAAAAATGGGTGGATGGCAGGAAAAAAATACTGTGGGACAGCAAAAAGCGACGCAATGAGGCACTCGACTGCT
+TCGTTTATGCGCTGGCGGCGCTGCGCATCAGTATTTCCCGCTGGCAGCTGGATCTCAGTGCGCTGCTGGCGAGCCTGCAG
+GAAGAGGATGGTGCAGCAACCAACAAGAAAACACTGGCAGATTACGCCCGTGCCTTATCCGGAGAGGATGAATGACGCGA
+CAGGAAGAACTTGCCGCTGCCCGTGCGGCACTGCATGACCTGATGACAGGTAAACGGGTGGCAACAGTACAGAAAGACGG
+ACGAAGGGTGGAGTTTACGGCCACTTCCGTGTCTGACCTGAAAAAATATATTGCAGAGCTGGAAGTGCAGACCGGCATGA
+CACAGCGACGCAGGGGACCTGCAGGATTTTATGTATGAAAACGCCCACCATTCCCACCCTTCTGGGGCCGGACGGCATGA
+CATCGCTGCGCGAATATGCCGGTTATCACGGCGGTGGCAGCGGATTTGGAGGGCAGTTGCGGTCGTGGAACCCACCGAGT
+GAAAGTGTGGATGCAGCCCTGTTGCCCAACTTTACCCGTGGCAATGCCCGCGCAGACGATCTGGTACGCAATAACGGCTA
+TGCCGCCAACGCCATCCAGCTGCATCAGGATCATATCGTCGGGTCTTTTTTCCGGCTCAGTCATCGCCCAAGCTGGCGCT
+ATCTGGGCATCGGGGAGGAAGAAGCCCGTGCCTTTTCCCGCGAGGTTGAAGCGGCATGGAAAGAGTTTGCCGAGGATGAC
+TGCTGCTGCATTGACGTTGAGCGAAAACGCACGTTTACCATGATGATTCGGGAAGGTGTGGCCATGCACGCCTTTAACGG
+TGAACTGTTCGTTCAGGCCACCTGGGATACCAGTTCGTCGCGGCTTTTCCGGACACAGTTCCGGATGGTCAGCCCGAAGC
+GCATCAGCAACCCGAACAATACCGGCGACAGCCGGAACTGCCGTGCCGGTGTGCAGATTAATGACAGCGGTGCGGCGCTG
+GGATATTACGTCAGCGAGGACGGGTATCCTGGCTGGATGCCGCAGAAATGGACATGGATACCCCGTGAGTTACCCGGCGG
+GCGCGCCTCGTTCATTCACGTTTTTGAACCCGTGGAGGACGGGCAGACTCGCGGTGCAAATGTGTTTTACAGCGTGATGG
+AGCAGATGAAGATGCTCGACACGCTGCAGAACACGCAGCTGCAGAGCGCCATTGTGAAGGCGATGTATGCCGCCACCATT
+GAGAGTGAGCTGGATACGCAGTCAGCGATGGATTTTATTCTGGGCGCGAACAGTCAGGAGCAGCGGGAAAGGCTGACCGG
+CTGGATTGGTGAAATTGCCGCGTATTACGCCGCAGCGCCGGTCCGGCTGGGAGGCGCAAAAGTACCGCACCTGATGCCGG
+GTGACTCACTGAACCTGCAGACGGCTCAGGATACGGATAACGGCTACTCCGTGTTTGAGCAGTCACTGCTGCGGTATATC
+GCTGCCGGGCTGGGTGTCTCGTATGAGCAGCTTTCCCGGAATTACGCCCAGATGAGCTACTCCACGGCACGGGCCAGTGC
+GAACGAGTCGTGGGCGTACTTTATGGGGCGGCGAAAATTCGTCGCATCCCGTCAGGCGAGCCAGATGTTTCTGTGCTGGC
+TGGAAGAGGCCATCGTTCGCCGCGTGGTGACGTTACCTTCAAAAGCGCGCTTCAGTTTTCAGGAAGCCCGCAGTGCCTGG
+GGGAACTGCGACTGGATAGGCTCCGGTCGTATGGCCATCGATGGTCTGAAAGAAGTTCAGGAAGCGGTGATGCTGATAGA
+AGCCGGACTGAGTACCTACGAGAAAGAGTGCGCAAAACGCGGTGACGACTATCAGGAAATTTTTGCCCAGCAGGTCCGTG
+AAACGATGGAGCGCCGTGCAGCCGGTCTTAAACCGCCCGCCTGGGCGGCTGCAGCATTTGAATCCGGGCTGCGACAATCA
+ACAGAGGAGGAGAAGAGTGACAGCAGAGCTGCGTAATCTCCCGCATATTGCCAGCATGGCCTTTAATGAGCCGCTGATGC
+TTGAACCCGCCTATGCGCGGGTTTTCTTTTGTGCGCTTGCAGGCCAGCTTGGGATCAGCAGCCTGACGGATGCGGTGTCC
+GGCGACAGCCTGACTGCCCAGGAGGCACTCGCGACGCTGGCATTATCCGGTGATGATGACGGACCACGACAGGCCCGCAG
+TTATCAGGTCATGAACGGCATCGCCGTGCTGCCGGTGTCCGGCACGCTGGTCAGCCGGACGCGGGCGCTGCAGCCGTACT
+CGGGGATGACCGGTTACAACGGCATTATCGCCCGTCTGCAACAGGCTGCCAGCGATCCGATGGTGGACGGCATTCTGCTC
+GATATGGACACGCCCGGCGGGATGGTGGCGGGGGCATTTGACTGCGCTGACATCATCGCCCGTGTGCGTGACATAAAACC
+GGTATGGGCGCTTGCCAACGACATGAACTGCAGTGCAGGTCAGTTGCTTGCCAGTGCCGCCTCCCGGCGTCTGGTCACGC
+AGACCGCCCGGACAGGCTCCATCGGCGTCATGATGGCTCACAGTAATTACGGTGCTGCGCTGGAGAAACAGGGTGTGGAA
+ATCACGCTGATTTACAGCGGCAGCCATAAGGTGGATGGCAACCCCTACAGCCATCTTCCGGATGACGTCCGGGAGACACT
+GCAGTCCCGGATGGACGCAACCCGCCAGATGTTTGCGCAGAAGGTGTCGGCATATACCGGCCTGTCCGTGCAGGTTGTGC
+TGGATACCGAGGCTGCAGTGTACAGCGGTCAGGAGGCCATTGATGCCGGACTGGCTGATGAACTTGTTAACAGCACCGAT
+GCGATCACCGTCATGCGTGATGCACTGGATGCACGTAAATCCCGTCTCTCAGGAGGGCGAATGACCAAAGAGACTCAATC
+AACAACTGTTTCAGCCACTGCTTCGCAGGCTGACGTTACTGACGTGGTGCCAGCGACGGAGGGCGAGAACGCCAGCGCGG
+CGCAGCCGGACGTGAACGCGCAGATCACCGCAGCGGTTGCGGCAGAAAACAGCCGCATTATGGGGATCCTCAACTGTGAG
+GAGGCTCACGGACGCGAAGAACAGGCACGCGTGCTGGCAGAAACCCCCGGTATGACCGTGAAAACGGCCCGCCGCATTCT
+GGCCGCAGCACCACAGAGTGCACAGGCGCGCAGTGACACTGCGCTGGATCGTCTGATGCAGGGGGCACCGGCACCGCTGG
+CTGCAGGTAACCCGGCATCTGATGCCGTTAACGATTTGCTGAACACACCAGTGTAAGGGATGTTTATGACGAGCAAAGAA
+ACCTTTACCCATTACCAGCCGCAGGGCAACAGTGACCCGGCTCATACCGCAACCGCGCCCGGCGGATTGAGTGCGAAAGC
+GCCTGCAATGACCCCGCTGATGCTGGACACCTCCAGCCGTAAGCTGGTTGCGTGGGATGGCACCACCGACGGTGCTGCCG
+TTGGCATTCTTGCGGTTGCTGCTGACCAGACCAGCACCACGCTGACGTTCTACAAGTCCGGCACGTTCCGTTATGAGGAT
+GTGCTCTGGCCGGAGGCTGCCAGCGACGAGACGAAAAAACGGACCGCGTTTGCCGGAACGGCAATCAGCATCGTTTAACT
+TTACCCTTCATCACTAAAGGCCGCCTGTGCGGCTTTTTTTACGGGATTTTTTTATGTCGATGTACACAACCGCCCAACTG
+CTGGCGGCAAATGAGCAGAAATTTAAGTTTGATCCGCTGTTTCTGCGTCTCTTTTTCCGTGAGAGCTATCCCTTCACCAC
+GGAGAAAGTCTATCTCTCACAAATTCCGGGACTGGTAAACATGGCGCTGTACGTTTCGCCGATTGTTTCCGGTGAGGTTA
+TCCGTTCCCGTGGCGGCTCCACCTCTGAATTTACGCCGGGATATGTCAAGCCGAAGCATGAAGTGAATCCGCAGATGACC
+CTGCGTCGCCTGCCGGATGAAGATCCGCAGAATCTGGCGGACCCGGCTTACCGCCGCCGTCGCATCATCATGCAGAACAT
+GCGTGACGAAGAGCTGGCCATTGCTCAGGTCGAAGAGATGCAGGCAGTTTCTGCCGTGCTTAAGGGCAAATACACCATGA
+CCGGTGAAGCCTTCGATCCGGTTGAGGTGGATATGGGCCGCAGTGAGGAGAATAACATCACGCAGTCCGGCGGCACGGAG
+TGGAGCAAGCGTGACAAGTCCACGTATGACCCGACCGACGATATCGAAGCCTACGCGCTGAACGCCAGCGGTGTGGTGAA
+TATCATCGTGTTCGATCCGAAAGGCTGGGCGCTGTTCCGTTCCTTCAAAGCCGTCAAGGAGAAGCTGGATACCCGTCGTG
+GCTCTAATTCCGAGCTGGAGACAGCGGTGAAAGACCTGGGCAAAGCGGTGTCCTATAAGGGGATGTATGGCGATGTGGCC
+ATCGTCGTGTATTCCGGACAGTACGTGGAAAACGGCGTCAAAAAGAACTTCCTGCCGGACAACACGATGGTGCTGGGGAA
+CACTCAGGCACGCGGTCTGCGCACCTATGGCTGCATTCAGGATGCGGACGCACAGCGCGAAGGCATTAACGCCTCTGCCC
+GTTACCCGAAAAACTGGGTGACCACCGGCGATCCGGCGCGTGAGTTCACCATGATTCAGTCAGCACCGCTGATGCTGCTG
+GCTGACCCTGATGAGTTCGTGTCCGTACAACTGGCGTAATCATGGCCCTTCGGGGCCATTGTTTCTCTGTGGAGGAGTCC
+ATGACGAAAGATGAACTGATTGCCCGTCTCCGCTCGCTGGGTGAACAACTGAACCGTGATGTCAGCCTGACGGGGACGAA
+AGAAGAACTGGCGCTCCGTGTGGCAGAGCTGAAAGAGGAGCTTGATGACACGGATGAAACTGCCGGTCAGGACACCCCTC
+TCAGCCGGGAAAATGTGCTGACCGGACATGAAAATGAGGTGGGATCAGCGCAGCCGGATACCGTGATTCTGGATACGTCT
+GAACTGGTCACGGTCGTGGCACTGGTGAAGCTGCATACTGATGCACTTCACGCCACGCGGGATGAACCTGTGGCATTTGT
+GCTGCCGGGAACGGCGTTTCGTGTCTCTGCCGGTGTGGCAGCCGAAATGACAGAGCGCGGCCTGGCCAGAATGCAATAAC
+GGGAGGCGCTGTGGCTGATTTCGATAACCTGTTCGATGCTGCCATTGCCCGCGCCGATGAAACGATACGCGGGTACATGG
+GAACGTCAGCCACCATTACATCCGGTGAGCAGTCAGGTGCGGTGATACGTGGTGTTTTTGATGACCCTGAAAATATCAGC
+TATGCCGGACAGGGCGTGCGCGTTGAAGGCTCCAGCCCGTCCCTGTTTGTCCGGACTGATGAGGTGCGGCAGCTGCGGCG
+TGGAGACACGCTGACCATCGGTGAGGAAAATTTCTGGGTAGATCGGGTTTCGCCGGATGATGGCGGAAGTTGTCATCTCT
+GGCTTGGACGGGGCGTACCGCCTGCCGTTAACCGTCGCCGCTGAAAGGGGGATGTATGGCCATAAAAGGTCTTGAGCAGG
+CCGTTGAAAACCTCAGCCGTATCAGCAAAACGGCGGTGCCTGGTGCCGCCGCAATGGCCATTAACCGCGTTGCTTCATCC
+GCGATATCGCAGTCGGCGTCACAGGTTGCCCGTGAGACAAAGGTACGCCGGAAACTGGTAAAGGAAAGGGCCAGGCTGAA
+AAGGGCCACGGTCAAAAATCCGCAGGCCAGAATCAAAGTTAACCGGGGGGATTTGCCCGTAATCAAGCTGGGTAATGCGC
+GGGTTGTCCTTTCGCGCCGCAGGCGTCGTAAAAAGGGGCAGCGTTCATCCCTGAAAGGTGGCGGCAGCGTGCTTGTGGTG
+GGTAACCGTCGTATTCCCGGCGCGTTTATTCAGCAACTGAAAAATGGCCGGTGGCATGTCATGCAGCGTGTGGCTGGGAA
+AAACCGTTACCCCATTGATGTGGTGAAAATCCCGATGGCGGTGCCGCTGACCACGGCGTTTAAACAAAATATTGAGCGGA
+TACGGCGTGAACGTCTTCCGAAAGAGCTGGGCTATGCGCTGCAGCATCAACTGAGGATGGTAATAAAGCGATGAAACATA
+CTGAACTCCGTGCAGCCGTACTGGATGCACTGGAGAAGCATGACACCGGGGCGACGTTTTTTGATGGTCGCCCCGCTGTT
+TTTGATGAGGCGGATTTTCCGGCAGTTGCCGTTTATCTCACCGGCGCTGAATACACGGGCGAAGAGCTGGACAGCGATAC
+CTGGCAGGCGGAGCTGCATATCGAAGTTTTCCTGCCTGCTCAGGTGCCGGATTCAGAGCTGGATGCGTGGATGGAGTCCC
+GGATTTATCCGGTGATGAGCGATATCCCGGCACTGTCAGATTTGATCACCAGTATGGTGGCCAGCGGCTATGACTACCGG
+CGCGACGATGATGCGGGCTTGTGGAGTTCAGCCGATCTGACTTATGTCATTACCTATGAAATGTGAGGACGCTATGCCTG
+TACCAAATCCTACAATGCCGGTGAAAGGTGCCGGGACCACCCTGTGGGTTTATAAGGGGAGCGGTGACCCTTACGCGAAT
+CCGCTTTCAGACGTTGACTGGTCGCGTCTGGCAAAAGTTAAAGACCTGACGCCCGGCGAACTGACCGCTGAGTCCTATGA
+CGACAGCTATCTCGATGATGAAGATGCAGACTGGACTGCGACCGGGCAGGGGCAGAAATCTGCCGGAGATACCAGCTTCA
+CGCTGGCGTGGATGCCCGGAGAGCAGGGGCAGCAGGCGCTGCTGGCGTGGTTTAATGAAGGCGATACCCGTGCCTATAAA
+ATCCGCTTCCCGAACGGCACGGTCGATGTGTTCCGTGGCTGGGTCAGCAGTATCGGTAAGGCGGTGACGGCGAAGGAAGT
+GATCACCCGCACGGTGAAAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGTAACAGCGGCAACCG
+GCATGACCGTGACGCCTGCCAGCACCTCGGTGGTGAAAGGGCAGAGCACCACGCTGACCGTGGCCTTCCAGCCGGAGGGC
+GTAACCGACAAGAGCTTTCGTGCGGTGTCTGCGGATAAAACAAAAGCCACCGTGTCGGTCAGTGGTATGACCATCACCGT
+GAACGGCGTTGCTGCAGGCAAGGTCAACATTCCGGTTGTATCCGGTAATGGTGAGTTTGCTGCGGTTGCAGAAATTACCG
+TCACCGCCAGTTAATCCGGAGAGTCAGCGATGTTCCTGAAAACCGAATCATTTGAACATAACGGTGTGACCGTCACGCTT
+TCTGAACTGTCAGCCCTGCAGCGCATTGAGCATCTCGCCCTGATGAAACGGCAGGCAGAACAGGCGGAGTCAGACAGCAA
+CCGGAAGTTTACTGTGGAAGACGCCATCAGAACCGGCGCGTTTCTGGTGGCGATGTCCCTGTGGCATAACCATCCGCAGA
+AGACGCAGATGCCGTCCATGAATGAAGCCGTTAAACAGATTGAGCAGGAAGTGCTTACCACCTGGCCCACGGAGGCAATT
+TCTCATGCTGAAAACGTGGTGTACCGGCTGTCTGGTATGTATGAGTTTGTGGTGAATAATGCCCCTGAACAGACAGAGGA
+CGCCGGGCCCGCAGAGCCTGTTTCTGCGGGAAAGTGTTCGACGGTGAGCTGAGTTTTGCCCTGAAACTGGCGCGTGAGAT
+GGGGCGACCCGACTGGCGTGCCATGCTTGCCGGGATGTCATCCACGGAGTATGCCGACTGGCACCGCTTTTACAGTACCC
+ATTATTTTCATGATGTTCTGCTGGATATGCACTTTTCCGGGCTGACGTACACCGTGCTCAGCCTGTTTTTCAGCGATCCG
+GATATGCATCCGCTGGATTTCAGTCTGCTGAACCGGCGCGAGGCTGACGAAGAGCCTGAAGATGATGTGCTGATGCAGAA
+AGCGGCAGGGCTTGCCGGAGGTGTCCGCTTTGGCCCGGACGGGAATGAAGTTATCCCCGCTTCCCCGGATGTGGCGGACA
+TGACGGAGGATGACGTAATGCTGATGACAGTATCAGAAGGGATCGCAGGAGGAGTCCGGTATGGCTGAACCGGTAGGCGA
+TCTGGTCGTTGATTTGAGTCTGGATGCGGCCAGATTTGACGAGCAGATGGCCAGAGTCAGGCGTCATTTTTCTGGTACGG
+AAAGTGATGCGAAAAAAACAGCGGCAGTCGTTGAACAGTCGCTGAGCCGACAGGCGCTGGCTGCACAGAAAGCGGGGATT
+TCCGTCGGGCAGTATAAAGCCGCCATGCGTATGCTGCCTGCACAGTTCACCGACGTGGCCACGCAGCTTGCAGGCGGGCA
+AAGTCCGTGGCTGATCCTGCTGCAACAGGGGGGGCAGGTGAAGGACTCCTTCGGCGGGATGATCCCCATGTTCAGGGGGC
+TTGCCGGTGCGATCACCCTGCCGATGGTGGGGGCCACCTCGCTGGCGGTGGCGACCGGTGCGCTGGCGTATGCCTGGTAT
+CAGGGCAACTCAACCCTGTCCGATTTCAACAAAACGCTGGTCCTTTCCGGCAATCAGGCGGGACTGACGGCAGATCGTAT
+GCTGGTCCTGTCCAGAGCCGGGCAGGCGGCAGGGCTGACGTTTAACCAGACCAGCGAGTCACTCAGCGCACTGGTTAAGG
+CGGGGGTAAGCGGTGAGGCTCAGATTGCGTCCATCAGCCAGAGTGTGGCGCGTTTCTCCTCTGCATCCGGCGTGGAGGTG
+GACAAGGTCGCTGAAGCCTTCGGGAAGCTGACCACAGACCCGACGTCGGGGCTGACGGCGATGGCTCGCCAGTTCCATAA
+CGTGTCGGCGGAGCAGATTGCGTATGTTGCTCAGTTGCAGCGTTCCGGCGATGAAGCCGGGGCATTGCAGGCGGCGAACG
+AGGCCGCAACGAAAGGGTTTGATGACCAGACCCGCCGCCTGAAAGAGAACATGGGCACGCTGGAGACCTGGGCAGACAGG
+ACTGCGCGGGCATTCAAATCCATGTGGGATGCGGTGCTGGATATTGGTCGTCCTGATACCGCGCAGGAGATGCTGATTAA
+GGCAGAGGCTGCGTATAAGAAAGCAGACGACATCTGGAATCTGCGCAAGGATGATTATTTTGTTAACGATGAAGCGCGGG
+CGCGTTACTGGGATGATCGTGAAAAGGCCCGTCTTGCGCTTGAAGCCGCCCGAAAGAAGGCTGAGCAGCAGACTCAACAG
+GACAAAAATGCGCAGCAGCAGAGCGATACCGAAGCGTCACGGCTGAAATATACCGAAGAGGCGCAGAAGGCTTACGAACG
+GCTGCAGACGCCGCTGGAGAAATATACCGCCCGTCAGGAAGAACTGAACAAGGCACTGAAAGACGGGAAAATCCTGCAGG
+CGGATTACAACACGCTGATGGCGGCGGCGAAAAAGGATTATGAAGCGACGCTGAAAAAGCCGAAACAGTCCAGCGTGAAG
+GTGTCTGCGGGCGATCGTCAGGAAGACAGTGCTCATGCTGCCCTGCTGACGCTTCAGGCAGAACTCCGGACGCTGGAGAA
+GCATGCCGGAGCAAATGAGAAAATCAGCCAGCAGCGCCGGGATTTGTGGAAGGCGGAGAGTCAGTTCGCGGTACTGGAGG
+AGGCGGCGCAACGTCGCCAGCTGTCTGCACAGGAGAAATCCCTGCTGGCGCATAAAGATGAGACGCTGGAGTACAAACGC
+CAGCTGGCTGCACTTGGCGACAAGGTTACGTATCAGGAGCGCCTGAACGCGCTGGCGCAGCAGGCGGATAAATTCGCACA
+GCAGCAACGGGCAAAACGGGCCGCCATTGATGCGAAAAGCCGGGGGCTGACTGACCGGCAGGCAGAACGGGAAGCCACGG
+AACAGCGCCTGAAGGAACAGTATGGCGATAATCCGCTGGCGCTGAATAACGTCATGTCAGAGCAGAAAAAGACCTGGGCG
+GCTGAAGACCAGCTTCGCGGGAACTGGATGGCAGGCCTGAAGTCCGGCTGGAGTGAGTGGGAAGAGAGCGCCACGGACAG
+TATGTCGCAGGTAAAAAGTGCAGCCACGCAGACCTTTGATGGTATTGCACAGAATATGGCGGCGATGCTGACCGGCAGTG
+AGCAGAACTGGCGCAGCTTCACCCGTTCCGTGCTGTCCATGATGACAGAAATTCTGCTTAAGCAGGCAATGGTGGGGATT
+GTCGGGAGTATCGGCAGCGCCATTGGCGGGGCTGTTGGTGGCGGCGCATCCGCGTCAGGCGGTACAGCCATTCAGGCCGC
+TGCGGCGAAATTCCATTTTGCAACCGGAGGATTTACGGGAACCGGCGGCAAATATGAGCCAGCGGGGATTGTTCACCGTG
+GTGAGTTTGTCTTCACGAAGGAGGCAACCAGCCGGATTGGCGTGGGGAATCTTTACCGGCTGATGCGCGGCTATGCCACC
+GGCGGTTATGTCGGTACACCGGGCAGCATGGCAGACAGCCGGTCGCAGGCGTCCGGGACGTTTGAGCAGAATAACCATGT
+GGTGATTAACAACGACGGCACGAACGGGCAGATAGGTCCGGCTGCTCTGAAGGCGGTGTATGACATGGCCCGCAAGGGTG
+CCCGTGATGAAATTCAGACACAGATGCGTGATGGTGGCCTGTTCTCCGGAGGTGGACGATGAAGACCTTCCGCTGGAAAG
+TGAAACCCGGTATGGATGTGGCTTCGGTCCCTTCTGTAAGAAAGGTGCGCTTTGGTGATGGCTATTCTCAGCGAGCGCCT
+GCCGGGCTGAATGCCAACCTGAAAACGTACAGCGTGACGCTTTCTGTCCCCCGTGAGGAGGCCACGGTACTGGAGTCGTT
+TCTGGAAGAGCACGGGGGCTGGAAATCCTTTCTGTGGACGCCGCCTTATGAGTGGCGGCAGATAAAGGTGACCTGCGCAA
+AATGGTCGTCGCGGGTCAGTATGCTGCGTGTTGAGTTCAGCGCAGAGTTTGAACAGGTGGTGAACTGATGCAGGATATCC
+GGCAGGAAACACTGAATGAATGCACCCGTGCGGAGCAGTCGGCCAGCGTGGTGCTCTGGGAAATCGACCTGACAGAGGTC
+GGTGGAGAACGTTATTTTTTCTGTAATGAGCAGAACGAAAAAGGTGAGCCGGTCACCTGGCAGGGGCGACAGTATCAGCC
+GTATCCCATTCAGGGGAGCGGTTTTGAACTGAATGGCAAAGGCACCAGTACGCGCCCCACGCTGACGGTTTCTAACCTGT
+ACGGTATGGTCACCGGGATGGCGGAAGATATGCAGAGTCTGGTCGGCGGAACGGTGGTCCGGCGTAAGGTTTACGCCCGT
+TTTCTGGATGCGGTGAACTTCGTCAACGGAAACAGTTACGCCGATCCGGAGCAGGAGGTGATCAGCCGCTGGCGCATTGA
+GCAGTGCAGCGAACTGAGCGCGGTGAGTGCCTCCTTTGTACTGTCCACGCCGACGGAAACGGATGGCGCTGTTTTTCCGG
+GACGTATCATGCTGGCCAACACCTGCACCTGGACCTATCGCGGTGACGAGTGCGGTTATAGCGGTCCGGCTGTCGCGGAT
+GAATATGACCAGCCAACGTCCGATATCACGAAGGATAAATGCAGCAAATGCCTGAGCGGTTGTAAGTTCCGCAATAACGT
+CGGCAACTTTGGCGGCTTCCTTTCCATTAACAAACTTTCGCAGTAAATCCCATGACACAGACAGAATCAGCGATTCTGGC
+GCACGCCCGGCGATGTGCGCCAGCGGAGTCGTGCGGCTTCGTGGTAAGCACGCCGGAGGGGGAAAGATATTTCCCCTGCG
+TGAATATCTCCGGTGAGCCGGAGGCGTATTTCCGTATGTCGCCGGAAGACTGGCTGCAGGCAGAAATGCAGGGTGAGATT
+GTGGCGCTGGTCCACAGCCACCCCGGTGGTCTGCCCTGGCTGAGTGAGGCCGACCGGCGGCTGCAGGTGCAGAGTGATTT
+GCCGTGGTGGCTGGTCTGCCGGGGGACGATTCATAAGTTCCGCTGTGTGCCGCATCTCACCGGGCGGCGCTTTGAGCACG
+GTGTGACGGACTGTTACACACTGTTCCGGGATGCTTATCATCTGGCGGGGATTGAGATGCCGGACTTTCATCGTGAGGAT
+GACTGGTGGCGTAACGGCCAGAATCTCTATCTGGATAATCTGGAGGCGACGGGGCTGTATCAGGTGCCGTTGTCAGCGGC
+ACAGCCGGGCGATGTGCTGCTGTGCTGTTTTGGTTCATCAGTGCCGAATCACGCCGCAATTTACTGCGGCGACGGCGAGC
+TGCTGCACCATATTCCTGAACAACTGAGCAAACGAGAGAGGTACACCGACAAATGGCAGCGACGCACACACTCCCTCTGG
+CGTCACCGGGCATGGCGCGCATCTGCCTTTACGGGGATTTACAACGATTTGGTCGCCGCATCGACCTTCGTGTGAAAACG
+GGGGCTGAAGCCATCCGGGCACTGGCCACACAGCTCCCGGCGTTTCGTCAGAAACTGAGCGACGGCTGGTATCAGGTACG
+GATTGCCGGGCGGGACGTCAGCACGTCCGGGTTAACGGCGCAGTTACATGAGACTCTGCCTGATGGCGCTGTAATTCATA
+TTGTTCCCAGAGTCGCCGGGGCCAAGTCAGGTGGCGTATTCCAGATTGTCCTGGGGGCTGCCGCCATTGCCGGATCATTC
+TTTACCGCCGGAGCCACCCTTGCAGCATGGGGGGCAGCCATTGGGGCCGGTGGTATGACCGGCATCCTGTTTTCTCTCGG
+TGCCAGTATGGTGCTCGGTGGTGTGGCGCAGATGCTGGCACCGAAAGCCAGAACTCCCCGTATACAGACAACGGATAACG
+GTAAGCAGAACACCTATTTCTCCTCACTGGATAACATGGTTGCCCAGGGCAATGTTCTGCCTGTTCTGTACGGGGAAATG
+CGCGTGGGGTCACGCGTGGTTTCTCAGGAGATCAGCACGGCAGACGAAGGGGACGGTGGTCAGGTTGTGGTGATTGGTCG
+CTGATGCAAAATGTTTTATGTGAAACCGCCTGCGGGCGGTTTTGTCATTTATGGAGCGTGAGGAATGGGTAAAGGAAGCA
+GTAAGGGGCATACCCCGCGCGAAGCGAAGGACAACCTGAAGTCCACGCAGTTGCTGAGTGTGATCGATGCCATCAGCGAA
+GGGCCGATTGAAGGTCCGGTGGATGGCTTAAAAAGCGTGCTGCTGAACAGTACGCCGGTGCTGGACACTGAGGGGAATAC
+CAACATATCCGGTGTCACGGTGGTGTTCCGGGCTGGTGAGCAGGAGCAGACTCCGCCGGAGGGATTTGAATCCTCCGGCT
+CCGAGACGGTGCTGGGTACGGAAGTGAAATATGACACGCCGATCACCCGCACCATTACGTCTGCAAACATCGACCGTCTG
+CGCTTTACCTTCGGTGTACAGGCACTGGTGGAAACCACCTCAAAGGGTGACAGGAATCCGTCGGAAGTCCGCCTGCTGGT
+TCAGATACAACGTAACGGTGGCTGGGTGACGGAAAAAGACATCACCATTAAGGGCAAAACCACCTCGCAGTATCTGGCCT
+CGGTGGTGATGGGTAACCTGCCGCCGCGCCCGTTTAATATCCGGATGCGCAGGATGACGCCGGACAGCACCACAGACCAG
+CTGCAGAACAAAACGCTCTGGTCGTCATACACTGAAATCATCGATGTGAAACAGTGCTACCCGAACACGGCACTGGTCGG
+CGTGCAGGTGGACTCGGAGCAGTTCGGCAGCCAGCAGGTGAGCCGTAATTATCATCTGCGCGGGCGTATTCTGCAGGTGC
+CGTCGAACTATAACCCGCAGACGCGGCAATACAGCGGTATCTGGGACGGAACGTTTAAACCGGCATACAGCAACAACATG
+GCCTGGTGTCTGTGGGATATGCTGACCCATCCGCGCTACGGCATGGGGAAACGTCTTGGTGCGGCGGATGTGGATAAATG
+GGCGCTGTATGTCATCGGCCAGTACTGCGACCAGTCAGTGCCGGACGGCTTTGGCGGCACGGAGCCGCGCATCACCTGTA
+ATGCGTACCTGACCACACAGCGTAAGGCGTGGGATGTGCTCAGCGATTTCTGCTCGGCGATGCGCTGTATGCCGGTATGG
+AACGGGCAGACGCTGACGTTCGTGCAGGACCGACCGTCGGATAAGACGTGGACCTATAACCGCAGTAATGTGGTGATGCC
+GGATGATGGCGCGCCGTTCCGCTACAGCTTCAGCGCCCTGAAGGACCGCCATAATGCCGTTGAGGTGAACTGGATTGACC
+CGAACAACGGCTGGGAGACGGCGACAGAGCTTGTTGAAGATACGCAGGCCATTGCCCGTTACGGTCGTAATGTTACGAAG
+ATGGATGCCTTTGGCTGTACCAGCCGGGGGCAGGCACACCGCGCCGGGCTGTGGCTGATTAAAACAGAACTGCTGGAAAC
+GCAGACCGTGGATTTCAGCGTCGGCGCAGAAGGGCTTCGCCATGTACCGGGCGATGTTATTGAAATCTGCGATGATGACT
+ATGCCGGTATCAGCACCGGTGGTCGTGTGCTGGCGGTGAACAGCCAGACCCGGACGCTGACGCTCGACCGTGAAATCACG
+CTGCCATCCTCCGGTACCGCGCTGATAAGCCTGGTTGACGGAAGTGGCAATCCGGTCAGCGTGGAGGTTCAGTCCGTCAC
+CGACGGCGTGAAGGTAAAAGTGAGCCGTGTTCCTGACGGTGTTGCTGAATACAGCGTATGGGAGCTGAAGCTGCCGACGC
+TGCGCCAGCGACTGTTCCGCTGCGTGAGTATCCGTGAGAACGACGACGGCACGTATGCCATCACCGCCGTGCAGCATGTG
+CCGGAAAAAGAGGCCATCGTGGATAACGGGGCGCACTTTGACGGCGAACAGAGTGGCACGGTGAATGGTGTCACGCCGCC
+AGCGGTGCAGCACCTGACCGCAGAAGTCACTGCAGACAGCGGGGAATATCAGGTGCTGGCGCGATGGGACACACCGAAGG
+TGGTGAAGGGCGTGAGTTTCCTGCTCCGTCTGACCGTAACAGCGGACGACGGCAGTGAGCGGCTGGTCAGCACGGCCCGG
+ACGACGGAAACCACATACCGCTTCACGCAACTGGCGCTGGGGAACTACAGGCTGACAGTCCGGGCGGTAAATGCGTGGGG
+GCAGCAGGGCGATCCGGCGTCGGTATCGTTCCGGATTGCCGCACCGGCAGCACCGTCGAGGATTGAGCTGACGCCGGGCT
+ATTTTCAGATAACCGCCACGCCGCATCTTGCCGTTTATGACCCGACGGTACAGTTTGAGTTCTGGTTCTCGGAAAAGCAG
+ATTGCGGATATCAGACAGGTTGAAACCAGCACGCGTTATCTTGGTACGGCGCTGTACTGGATAGCCGCCAGTATCAATAT
+CAAACCGGGCCATGATTATTACTTTTATATCCGCAGTGTGAACACCGTTGGCAAATCGGCATTCGTGGAGGCCGTCGGTC
+GGGCGAGCGATGATGCGGAAGGTTACCTGGATTTTTTCAAAGGCAAGATAACCGAATCCCATCTCGGCAAGGAGCTGCTG
+GAAAAAGTCGAGCTGACGGAGGATAACGCCAGCAGACTGGAGGAGTTTTCGAAAGAGTGGAAGGATGCCAGTGATAAGTG
+GAATGCCATGTGGGCTGTCAAAATTGAGCAGACCAAAGACGGCAAACATTATGTCGCGGGTATTGGCCTCAGCATGGAGG
+ACACGGAGGAAGGCAAACTGAGCCAGTTTCTGGTTGCCGCCAATCGTATCGCATTTATTGACCCGGCAAACGGGAATGAA
+ACGCCGATGTTTGTGGCGCAGGGCAACCAGATATTCATGAACGACGTGTTCCTGAAGCGCCTGACGGCCCCCACCATTAC
+CAGCGGCGGCAATCCTCCGGCCTTTTCCCTGACACCGGACGGAAAGCTGACCGCTAAAAATGCGGATATCAGTGGCAGTG
+TGAATGCGAACTCCGGGACGCTCAGTAATGTGACGATAGCTGAAAACTGTACGATAAACGGTACGCTGAGGGCGGAAAAA
+ATCGTCGGGGACATTGTAAAGGCGGCGAGCGCGGCTTTTCCGCGCCAGCGTGAAAGCAGTGTGGACTGGCCGTCAGGTAC
+CCGTACTGTCACCGTGACCGATGACCATCCTTTTGATCGCCAGATAGTGGTGCTTCCGCTGACGTTTCGCGGAAGTAAGC
+GTACTGTCAGCGGCAGGACAACGTATTCGATGTGTTATCTGAAAGTACTGATGAACGGTGCGGTGATTTATGATGGCGCG
+GCGAACGAGGCGGTACAGGTGTTCTCCCGTATTGTTGACATGCCAGCGGGTCGGGGAAACGTGATCCTGACGTTCACGCT
+TACGTCCACACGGCATTCGGCAGATATTCCGCCGTATACGTTTGCCAGCGATGTGCAGGTTATGGTGATTAAGAAACAGG
+CGCTGGGCATCAGCGTGGTCTGAGTGTGTTACAGAGGTTCGTCCGGGAACGGGCGTTTTATTATAAAACAGTGAGAGGTG
+AACGATGCGTAATGTGTGTATTGCCGTTGCTGTCTTTGCCGCACTTGCGGTGACAGTCACTCCGGCCCGTGCGGAAGGTG
+GACATGGTACGTTTACGGTGGGCTATTTTCAAGTGAAACCGGGTACATTGCCGTCGTTGTCGGGCGGGGATACCGGTGTG
+AGTCATCTGAAAGGGATTAACGTGAAGTACCGTTATGAGCTGACGGACAGTGTGGGGGTGATGGCTTCCCTGGGGTTCGC
+CGCGTCGAAAAAGAGCAGCACAGTGATGACCGGGGAGGATACGTTTCACTATGAGAGCCTGCGTGGACGTTATGTGAGCG
+TGATGGCCGGACCGGTTTTACAAATCAGTAAGCAGGTCAGTGCGTACGCCATGGCCGGAGTGGCTCACAGTCGGTGGTCC
+GGCAGTACAATGGATTACCGTAAGACGGAAATCACTCCCGGGTATATGAAAGAGACGACCACTGCCAGGGACGAAAGTGC
+AATGCGGCATACCTCAGTGGCGTGGAGTGCAGGTATACAGATTAATCCGGCAGCGTCCGTCGTTGTTGATATTGCTTATG
+AAGGCTCCGGCAGTGGCGACTGGCGTACTGACGGATTCATCGTTGGGGTCGGTTATAAATTCTGATTAGCCAGGTAACAC
+AGTGTTATGACAGCCCGCCGGAACCGGTGGGCTTTTTTGTGGGGTGAATATGGCAGTAAAGATTTCAGGAGTCCTGAAAG
+ACGGCACAGGAAAACCGGTACAGAACTGCACCATTCAGCTGAAAGCCAGACGTAACAGCACCACGGTGGTGGTGAACACG
+GTGGGCTCAGAGAATCCGGATGAAGCCGGGCGTTACAGCATGGATGTGGAGTACGGTCAGTACAGTGTCATCCTGCAGGT
+TGACGGTTTTCCACCATCGCACGCCGGGACCATCACCGTGTATGAAGATTCACAACCGGGGACGCTGAATGATTTTCTCT
+GTGCCATGACGGAGGATGATGCCCGGCCGGAGGTGCTGCGTCGTCTTGAACTGATGGTGGAAGAGGTGGCGCGTAACGCG
+TCCGTGGTGGCACAGAGTACGGCAGACGCGAAGAAATCAGCCGGCGATGCCAGTGCATCAGCTGCTCAGGTCGCGGCCCT
+TGTGACTGATGCAACTGACTCAGCACGCGCCGCCAGCACGTCCGCCGGACAGGCTGCATCGTCAGCTCAGGAAGCGTCCT
+CCGGCGCAGAAGCGGCATCAGCAAAGGCCACTGAAGCGGAAAAAAGTGCCGCAGCCGCAGAGTCCTCAAAAAACGCGGCG
+GCCACCAGTGCCGGTGCGGCGAAAACGTCAGAAACGAATGCTGCAGCGTCACAACAATCAGCCGCCACGTCTGCCTCCAC
+CGCGGCCACGAAAGCGTCAGAGGCCGCCACTTCAGCACGAGATGCGGTGGCCTCAAAAGAGGCAGCAAAATCATCAGAAA
+CGAACGCATCATCAAGTGCCGGTCGTGCAGCTTCCTCGGCAACGGCGGCAGAAAATTCTGCCAGGGCGGCAAAAACGTCC
+GAGACGAATGCCAGGTCATCTGAAACAGCAGCGGAACGGAGCGCCTCTGCCGCGGCAGACGCAAAAACAGCGGCGGCGGG
+GAGTGCGTCAACGGCATCCACGAAGGCGACAGAGGCTGCGGGAAGTGCGGTATCAGCATCGCAGAGCAAAAGTGCGGCAG
+AAGCGGCGGCAATACGTGCAAAAAATTCGGCAAAACGTGCAGAAGATATAGCTTCAGCTGTCGCGCTTGAGGATGCGGAC
+ACAACGAGAAAGGGGATAGTGCAGCTCAGCAGTGCAACCAACAGCACGTCTGAAACGCTTGCTGCAACGCCAAAGGCGGT
+TAAGGTGGTAATGGATGAAACGAACAGAAAAGCCCACTGGACAGTCCGGCACTGACCGGAACGCCAACAGCACCAACCGC
+GCTCAGGGGAACAAACAATACCCAGATTGCGAACACCGCTTTTGTACTGGCCGCGATTGCAGATGTTATCGACGCGTCAC
+CTGACGCACTGAATACGCTGAATGAACTGGCCGCAGCGCTCGGGAATGATCCAGATTTTGCTACCACCATGACTAACGCG
+CTTGCGGGTAAACAACCGAAGAATGCGACACTGACGGCGCTGGCAGGGCTTTCCACGGCGAAAAATAAATTACCGTATTT
+TGCGGAAAATGATGCCGCCAGCCTGACTGAACTGACTCAGGTTGGCAGGGATATTCTGGCAAAAAATTCCGTTGCAGATG
+TTCTTGAATACCTTGGGGCCGGTGAGAATTCGGCCTTTCCGGCAGGTGCGCCGATCCCGTGGCCATCAGATATCGTTCCG
+TCTGGCTACGTCCTGATGCAGGGGCAGGCGTTTGACAAATCAGCCTACCCAAAACTTGCTGTCGCGTATCCATCGGGTGT
+GCTTCCTGATATGCGAGGCTGGACAATCAAGGGGAAACCCGCCAGCGGTCGTGCTGTATTGTCTCAGGAACAGGATGGAA
+TTAAGTCGCACACCCACAGTGCCAGTGCATCCGGTACGGATTTGGGGACGAAAACCACATCGTCGTTTGATTACGGGACG
+AAAACAACAGGCAGTTTCGATTACGGCACCAAATCGACGAATAACACGGGGGCTCATGCTCACAGTCTGAGCGGTTCAAC
+AGGGGCCGCGGGTGCTCATGCCCACACAAGTGGTTTAAGGATGAACAGTTCTGGCTGGAGTCAGTATGGAACAGCAACCA
+TTACAGGAAGTTTATCCACAGTTAAAGGAACCAGCACACAGGGTATTGCTTATTTATCGAAAACGGACAGTCAGGGCAGC
+CACAGTCACTCATTGTCCGGTACAGCCGTGAGTGCCGGTGCACATGCGCATACAGTTGGTATTGGTGCGCACCAGCATCC
+GGTTGTTATCGGTGCTCATGCCCATTCTTTCAGTATTGGTTCACACGGACACACCATCACCGTTAACGCTGCGGGTAACG
+CGGAAAACACCGTCAAAAACATTGCATTTAACTATATTGTGAGGCTTGCATAATGGCATTCAGAATGAGTGAACAACCAC
+GGACCATAAAAATTTATAATCTGCTGGCCGGAACTAATGAATTTATTGGTGAAGGTGACGCATATATTCCGCCTCATACC
+GGTCTGCCTGCAAACAGTACCGATATTGCACCGCCAGATATTCCGGCTGGCTTTGTGGCTGTTTTCAACAGTGATGAGGC
+ATCGTGGCATCTCGTTGAAGACCATCGGGGTAAAACCGTCTATGACGTGGCTTCCGGCGACGCGTTATTTATTTCTGAAC
+TCGGTCCGTTACCGGAAAATTTTACCTGGTTATCGCCGGGAGGGGAATATCAGAAGTGGAACGGCACAGCCTGGGTGAAG
+GATACGGAAGCAGAAAAACTGTTCCGGATCCGGGAGGCGGAAGAAACAAAAAAAAGCCTGATGCAGGTAGCCAGTGAGCA
+TATTGCGCCGCTTCAGGATGCTGCAGATCTGGAAATTGCAACGAAGGAAGAAACCTCGTTGCTGGAAGCCTGGAAGAAGT
+ATCGGGTGTTGCTGAACCGTGTTGATACATCAACTGCACCTGATATTGAGTGGCCTGCTGTCCCTGTTATGGAGTAATCG
+TTTTGTGATATGCCGCAGAAACGTTGTATGAAATAACGTTCTGCGGTTAGTTAGTATATTGTAAAGCTGAGTATTGGTTT
+ATTTGGCGATTATTATCTTCAGGAGAATAATGGAAGTTCTATGACTCAATTGTTCATAGTGTTTACATCACCGCCAATTG
+CTTTTAAGACTGAACGCATGAAATATGGTTTTTCGTCATGTTTTGAGTCTGCTGTTGATATTTCTAAAGTCGGTTTTTTT
+TCTTCGTTTTCTCTAACTATTTTCCATGAAATACATTTTTGATTATTATTTGAATCAATTCCAATTACCTGAAGTCTTTC
+ATCTATAATTGGCATTGTATGTATTGGTTTATTGGAGTAGATGCTTGCTTTTCTGAGCCATAGCTCTGATATCCAAATGA
+AGCCATAGGCATTTGTTATTTTGGCTCTGTCAGCTGCATAACGCCAAAAAATATATTTATCTGCTTGATCTTCAAATGTT
+GTATTGATTAAATCAATTGGATGGAATTGTTTATCATAAAAAATTAATGTTTGAATGTGATAACCGTCCTTTAAAAAAGT
+CGTTTCTGCAAGCTTGGCTGTATAGTCAACTAACTCTTCTGTCGAAGTGATATTTTTAGGCTTATCTACCAGTTTTAGAC
+GCTCTTTAATATCTTCAGGAATTATTTTATTGTCATATTGTATCATGCTAAATGACAATTTGCTTATGGAGTAATCTTTT
+AATTTTAAATAAGTTATTCTCCTGGCTTCATCAAATAAAGAGTCGAATGATGTTGGCGAAATCACATCGTCACCCATTGG
+ATTGTTTATTTGTATGCCAAGAGAGTTACAGCAGTTATACATTCTGCCATAGATTATAGCTAAGGCATGTAATAATTCGT
+AATCTTTTAGCGTATTAGCGACCCATCGTCTTTCTGATTTAATAATAGATGATTCAGTTAAATATGAAGGTAATTTCTTT
+TGTGCAAGTCTGACTAACTTTTTTATACCAATGTTTAACATACTTTCATTTGTAATAAACTCAATGTCATTTTCTTCAAT
+GTAAGATGAAATAAGAGTAGCCTTTGCCTCGCTATACATTTCTAAATCGCCTTGTTTTTCTATCGTATTGCGAGAATTTT
+TAGCCCAAGCCATTAATGGATCATTTTTCCATTTTTCAATAACATTATTGTTATACCAAATGTCATATCCTATAATCTGG
+TTTTTGTTTTTTTGAATAATAAATGTTACTGTTCTTGCGGTTTGGAGGAATTGATTCAAATTCAAGCGAAATAATTCAGG
+GTCAAAATATGTATCAATGCAGCATTTGAGCAAGTGCGATAAATCTTTAAGTCTTCTTTCCCATGGTTTTTTAGTCATAA
+AACTCTCCATTTTGATAGGTTGCATGCTAGATGCTGATATATTTTAGAGGTGATAAAATTAACTGCTTAACTGTCAATGT
+AATACAAGTTGTTTGATCTTTGCAATGATTCTTATCAGAAACCATATAGTAAATTAGTTACACAGGAAATTTTTAATATT
+ATTATTATCATTCATTATGTATTAAAATTAGAGTTGTGGCTTGGCTCTGCTAACACGTTGCTCATAGGAGATATGGTAGA
+GCCGCAGACACGTCGTATGCAGGAACGTGCTGCGGCTGGCTGGTGAACTTCCGATAGTGCGGGTGTTGAATGATTTCCAG
+TTGCTACCGATTTTACATATTTTTTGCATGAGAGAATTTGTACCACCTCCCACCGACCATCTATGACTGTACGCCACTGT
+CCCTAGGACTGCTATGTGCCGGAGCGGACATTACAAACGTCCTTCTCGGTGCATGCCACTGTTGCCAATGACCTGCCTAG
+GAATTGGTTAGCAAGTTACTACCGGATTTTGTAAAAACAGCCCTCCTCATATAAAAAGTATTCGTTCACTTCCGATAAGC
+GTCGTAATTTTCTATCTTTCATCATATTCTAGATCCCTCTGAAAAAATCTTCCGAGTTTGCTAGGCACTGATACATAACT
+CTTTTCCAATAATTGGGGAAGTCATTCAAATCTATAATAGGTTTCAGATTTGCTTCAATAAATTCTGACTGTAGCTGCTG
+AAACGTTGCGGTTGAACTATATTTCCTTATAACTTTTACGAAAGAGTTTCTTTGAGTAATCACTTCACTCAAGTGCTTCC
+CTGCCTCCAAACGATACCTGTTAGCAATATTTAATAGCTTGAAATGATGAAGAGCTCTGTGTTTGTCTTCCTGCCTCCAG
+TTCGCCGGGCATTCAACATAAAAACTGATAGCACCCGGAGTTCCGGAAACGAAATTTGCATATACCCATTGCTCACGAAA
+AAAAATGTCCTTGTCGATATAGGGATGAATCGCTTGGTGTACCTCATCTACTGCGAAAACTTGACCTTTCTCTCCCATAT
+TGCAGTCGCGGCACGATGGAACTAAATTAATAGGCATCACCGAAAATTCAGGATAATGTGCAATAGGAAGAAAATGATCT
+ATATTTTTTGTCTGTCCTATATCACCACAAAATGGACATTTTTCACCTGATGAAACAAGCATGTCATCGTAATATGTTCT
+AGCGGGTTTGTTTTTATCTCGGAGATTATTTTCATAAAGCTTTTCTAATTTAACCTTTGTCAGGTTACCAACTACTAAGG
+TTGTAGGCTCAAGAGGGTGTGTCCTGTCGTAGGTAAATAACTGACCTGTCGAGCTTAATATTCTATATTGTTGTTCTTTC
+TGCAAAAAAGTGGGGAAGTGAGTAATGAAATTATTTCTAACATTTATCTGCATCATACCTTCCGAGCATTTATTAAGCAT
+TTCGCTATAAGTTCTCGCTGGAAGAGGTAGTTTTTTCATTGTACTTTACCTTCATCTCTGTTCATTATCATCGCTTTTAA
+AACGGTTCGACCTTCTAATCCTATCTGACCATTATAATTTTTTAGAATGGTTTCATAAGAAAGCTCTGAATCAACGGACT
+GCGATAATAAGTGGTGGTATCCAGAATTTGTCACTTCAAGTAAAAACACCTCACGAGTTAAAACACCTAAGTTCTCACCG
+AATGTCTCAATATCCGGACGGATAATATTTATTGCTTCTCTTGACCGTAGGACTTTCCACATGCAGGATTTTGGAACCTC
+TTGCAGTACTACTGGGGAATGAGTTGCAATTATTGCTACACCATTGCGTGCATCGAGTAAGTCGCTTAATGTTCGTAAAA
+AAGCAGAGAGCAAAGGTGGATGCAGATGAACCTCTGGTTCATCGAATAAAACTAATGACTTTTCGCCAACGACATCTACT
+AATCTTGTGATAGTAAATAAAACAATTGCATGTCCAGAGCTCATTCGAAGCAGATATTTCTGGATATTGTCATAAAACAA
+TTTAGTGAATTTATCATCGTCCACTTGAATCTGTGGTTCATTACGTCTTAACTCTTCATATTTAGAAATGAGGCTGATGA
+GTTCCATATTTGAAAAGTTTTCATCACTACTTAGTTTTTTGATAGCTTCAAGCCAGAGTTGTCTTTTTCTATCTACTCTC
+ATACAACCAATAAATGCTGAAATGAATTCTAAGCGGAGATCGCCTAGTGATTTTAAACTATTGCTGGCAGCATTCTTGAG
+TCCAATATAAAAGTATTGTGTACCTTTTGCTGGGTCAGGTTGTTCTTTAGGAGGAGTAAAAGGATCAAATGCACTAAACG
+AAACTGAAACAAGCGATCGAAAATATCCCTTTGGGATTCTTGACTCGATAAGTCTATTATTTTCAGAGAAAAAATATTCA
+TTGTTTTCTGGGTTGGTGATTGCACCAATCATTCCATTCAAAATTGTTGTTTTACCACACCCATTCCGCCCGATAAAAGC
+ATGAATGTTCGTGCTGGGCATAGAATTAACCGTCACCTCAAAAGGTATAGTTAAATCACTGAATCCGGGAGCACTTTTTC
+TATTAAATGAAAAGTGGAAATCTGACAATTCTGGCAAACCATTTAACACACGTGCGAACTGTCCATGAATTTCTGAAAGA
+GTTACCCCTCTAAGTAATGAGGTGTTAAGGACGCTTTCATTTTCAATGTCGGCTAATCGATTTGGCCATACTACTAAATC
+CTGAATAGCTTTAAGAAGGTTATGTTTAAAACCATCGCTTAATTTGCTGAGATTAACATAGTAGTCAATGCTTTCACCTA
+AGGAAAAAAACATTTCAGGGAGTTGACTGAATTTTTTATCTATTAATGAATAAGTGCTTACTTCTTCTTTTTGACCTACA
+AAACCAATTTTAACATTTCCGATATCGCATTTTTCACCATGCTCATCAAAGACAGTAAGATAAAACATTGTAACAAAGGA
+ATAGTCATTCCAACCATCTGCTCGTAGGAATGCCTTATTTTTTTCTACTGCAGGAATATACCCGCCTCTTTCAATAACAC
+TAAACTCCAACATATAGTAACCCTTAATTTTATTAAAATAACCGCAATTTATTTGGCGGCAACACAGGATCTCTCTTTTA
+AGTTACTCTCTATTACATACGTTTTCCATCTAAAAATTAGTAGTATTGAACTTAACGGGGCATCGTATTGTAGTTTTCCA
+TATTTAGCTTTCTGCTTCCTTTTGGATAACCCACTGTTATTCATGTTGCATGGTGCACTGTTTATACCAACGATATAGTC
+TATTAATGCATATATAGTATCGCCGAACGATTAGCTCTTCAGGCTTCTGAAGAAGCGTTTCAAGTACTAATAAGCCGATA
+GATAGCCACGGACTTCGTAGCCATTTTTCATAAGTGTTAACTTCCGCTCCTCGCTCATAACAGACATTCACTACAGTTAT
+GGCGGAAAGGTATGCATGCTGGGTGTGGGGAAGTCGTGAAAGAAAAGAAGTCAGCTGCGTCGTTTGACATCACTGCTATC
+TTCTTACTGGTTATGCAGGTCGTAGTGGGTGGCACACAAAGCTTTGCACTGGATTGCGAGGCTTTGTGCTTCTCTGGAGT
+GCGACAGGTTTGATGACAAAAAATTAGCGCAAGAAGACAAAAATCACCTTGCGCTAATGCTCTGTTACAGGTCACTAATA
+CCATCTAAGTAGTTGATTCATAGTGACTGCATATGTTGTGTTTTACAGTATTATGTAGTCTGTTTTTTATGCAAAATCTA
+ATTTAATATATTGATATTTATATCATTTTACGTTTCTCGTTCAGCTTTTTTATACTAAGTTGGCATTATAAAAAAGCATT
+GCTTATCAATTTGTTGCAACGAACAGGTCACTATCAGTCAAAATAAAATCATTATTTGATTTCAATTTTGTCCCACTCCC
+TGCCTCTGTCATCACGATACTGTGATGCCATGGTGTCCGACTTATGCCCGAGAAGATGTTGAGCAAACTTATCGCTTATC
+TGCTTCTCATAGAGTCTTGCAGACAAACTGCGCAACTCGTGAAAGGTAGGCGGATCCCCTTCGAAGGAAAGACCTGATGC
+TTTTCGTGCGCGCATAAAATACCTTGATACTGTGCCGGATGAAAGCGGTTCGCGACGAGTAGATGCAATTATGGTTTCTC
+CGCCAAGAATCTCTTTGCATTTATCAAGTGTTTCCTTCATTGATATTCCGAGAGCATCAATATGCAATGCTGTTGGGATG
+GCAATTTTTACGCCTGTTTTGCTTTGCTCGACATAAAGATATCCATCTACGATATCAGACCACTTCATTTCGCATAAATC
+ACCAACTCGTTGCCCGGTAACAACAGCCAGTTCCATTGCAAGTCTGAGCCAACATGGTGATGATTCTGCTGCTTGATAAA
+TTTTCAGGTATTCGTCAGCCGTAAGTCTTGATCTCCTTACCTCTGATTTTGCTGCGCGAGTGGCAGCGACATGGTTTGTT
+GTTATATGGCCTTCAGCTATTGCCTCTCGGAATGCATCGCTCAGTGTTGATCTGATTAACTTGGCTGACGCCGCCTTGCC
+CTCGTCTATGTATCCATTGAGCATTGCCGCAATTTCTTTTGTGGTGATGTCTTCAAGTGGAGCATCAGGCAGACCCCTCC
+TTATTGCTTTAATTTTGCTCATGTAATTTATGAGTGTCTTCTGCTTGATTCCTCTGCTGGCCAGGATTTTTTCGTAGCGA
+TCAAGCCATGAATGTAACGTAACGGAATTATCACTGTTGATTCTCGCTGTCAGAGGCTTGTGTTTGTGTCCTGAAAATAA
+CTCAATGTTGGCCTGTATAGCTTCAGTGATTGCGATTCGCCTGTCTCTGCCTAATCCAAACTCTTTACCCGTCCTTGGGT
+CCCTGTAGCAGTAATATCCATTGTTTCTTATATAAAGGTTAGGGGGTAAATCCCGGCGCTCATGACTTCGCCTTCTTCCC
+ATTTCTGATCCTCTTCAAAAGGCCACCTGTTACTGGTCGATTTAAGTCAACCTTTACCGCTGATTCGTGGAACAGATACT
+CTCTTCCATCCTTAACCGGAGGTGGGAATATCCTGCATTCCCGAACCCATCGACGAACTGTTTCAAGGCTTCTTGGACGT
+CGCTGGCGTGCGTTCCACTCCTGAAGTGTCAAGTACATCGCAAAGTCTCCGCAATTACACGCAAGAAAAAACCGCCATCA
+GGCGGCTTGGTGTTCTTTCAGTTCTTCAATTCGAATATTGGTTACGTCTGCATGTGCTATCTGCGCCCATATCATCCAGT
+GGTCGTAGCAGTCGTTGATGTTCTCCGCTTCGATAACTCTGTTGAATGGCTCTCCATTCCATTCTCCTGTGACTCGGAAG
+TGCATTTATCATCTCCATAAAACAAAACCCGCCGTAGCGAGTTCAGATAAAATAAATCCCCGCGAGTGCGAGGATTGTTA
+TGTAATATTGGGTTTAATCATCTATATGTTTTGTACAGAGAGGGCAAGTATCGTTTCCACCGTACTCGTGATAATAATTT
+TGCACGGTATCAGTCATTTCTCGCACATTGCAGAATGGGGATTTGTCTTCATTAGACTTATAAACCTTCATGGAATATTT
+GTATGCCGACTCTATATCTATACCTTCATCTACATAAACACCTTCGTGATGTCTGCATGGAGACAAGACACCGGATCTGC
+ACAACATTGATAACGCCCAATCTTTTTGCTCAGACTCTAACTCATTGATACTCATTTATAAACTCCTTGCAATGTATGTC
+GTTTCAGCTAAACGGTATCAGCAATGTTTATGTAAAGAAACAGTAAGATAATACTCAACCCGATGTTTGAGTACGGTCAT
+CATCTGACACTACAGACTCTGGCATCGCTGTGAAGACGACGCGAAATTCAGCATTTTCACAAGCGTTATCTTTTACAAAA
+CCGATCTCACTCTCCTTTGATGCGAATGCCAGCGTCAGACATCATATGCAGATACTCACCTGCATCCTGAACCCATTGAC
+CTCCAACCCCGTAATAGCGATGCGTAATGATGTCGATAGTTACTAACGGGTCTTGTTCGATTAACTGCCGCAGAAACTCT
+TCCAGGTCACCAGTGCAGTGCTTGATAACAGGAGTCTTCCCAGGATGGCGAACAACAAGAAACTGGTTTCCGTCTTCACG
+GACTTCGTTGCTTTCCAGTTTAGCAATACGCTTACTCCCATCCGAGATAACACCTTCGTAATACTCACGCTGCTCGTTGA
+GTTTTGATTTTGCTGTTTCAAGCTCAACACGCAGTTTCCCTACTGTTAGCGCAATATCCTCGTTCTCCTGGTCGCGGCGT
+TTGATGTATTGCTGGTTTCTTTCCCGTTCATCCAGCAGTTCCAGCACAATCGATGGTGTTACCAATTCATGGAAAAGGTC
+TGCGTCAAATCCCCAGTCGTCATGCATTGCCTGCTCTGCCGCTTCACGCAGTGCCTGAGAGTTAATTTCGCTCACTTCGA
+ACCTCTCTGTTTACTGATAAGTTCCAGATCCTCCTGGCAACTTGCACAAGTCCGACAACCCTGAACGACCAGGCGTCTTC
+GTTCATCTATCGGATCGCCACACTCACAACAATGAGTGGCAGATATAGCCTGGTGGTTCAGGCGGCGCATTTTTATTGCT
+GTGTTGCGCTGTAATTCTTCTATTTCTGATGCTGAATCAATGATGTCTGCCATCTTTCATTAATCCCTGAACTGTTGGTT
+AATACGCTTGAGGGTGAATGCGAATAATAAAAAAGGAGCCTGTAGCTCCCTGATGATTTTGCTTTTCATGTTCATCGTTC
+CTTAAAGACGCCGTTTAACATGCCGATTGCCAGGCTTAAATGAGTCGGTGTGAATCCCATCAGCGTTACCGTTTCGCGGT
+GCTTCTTCAGTACGCTACGGCAAATGTCATCGACGTTTTTATCCGGAAACTGCTGTCTGGCTTTTTTTGATTTCAGAATT
+AGCCTGACGGGCAATGCTGCGAAGGGCGTTTTCCTGCTGAGGTGTCATTGAACAAGTCCCATGTCGGCAAGCATAAGCAC
+ACAGAATATGAAGCCCGCTGCCAGAAAAATGCATTCCGTGGTTGTCATACCTGGTTTCTCTCATCTGCTTCTGCTTTCGC
+CACCATCATTTCCAGCTTTTGTGAAAGGGATGCGGCTAACGTATGAAATTCTTCGTCTGTTTCTACTGGTATTGGCACAA
+ACCTGATTCCAATTTGAGCAAGGCTATGTGCCATCTCGATACTCGTTCTTAACTCAACAGAAGATGCTTTGTGCATACAG
+CCCCTCGTTTATTATTTATCTCCTCAGCCAGCCGCTGTGCTTTCAGTGGATTTCGGATAACAGAAAGGCCGGGAAATACC
+CAGCCTCGCTTTGTAACGGAGTAGACGAAAGTGATTGCGCCTACCCGGATATTATCGTGAGGATGCGTCATCGCCATTGC
+TCCCCAAATACAAAACCAATTTCAGCCAGTGCCTCGTCCATTTTTTCGATGAACTCCGGCACGATCTCGTCAAAACTCGC
+CATGTACTTTTCATCCCGCTCAATCACGACATAATGCAGGCCTTCACGCTTCATACGCGGGTCATAGTTGGCAAAGTACC
+AGGCATTTTTTCGCGTCACCCACATGCTGTACTGCACCTGGGCCATGTAAGCTGACTTTATGGCCTCGAAACCACCGAGC
+CGGAACTTCATGAAATCCCGGGAGGTAAACGGGCATTTCAGTTCAAGGCCGTTGCCGTCACTGCATAAACCATCGGGAGA
+GCAGGCGGTACGCATACTTTCGTCGCGATAGATGATCGGGGATTCAGTAACATTCACGCCGGAAGTGAATTCAAACAGGG
+TTCTGGCGTCGTTCTCGTACTGTTTTCCCCAGGCCAGTGCTTTAGCGTTAACTTCCGGAGCCACACCGGTGCAAACCTCA
+GCAAGCAGGGTGTGGAAGTAGGACATTTTCATGTCAGGCCACTTCTTTCCGGAGCGGGGTTTTGCTATCACGTTGTGAAC
+TTCTGAAGCGGTGATGACGCCGAGCCGTAATTTGTGCCACGCATCATCCCCCTGTTCGACAGCTCTCACATCGATCCCGG
+TACGCTGCAGGATAATGTCCGGTGTCATGCTGCCACCTTCTGCTCTGCGGCTTTCTGTTTCAGGAATCCAAGAGCTTTTA
+CTGCTTCGGCCTGTGTCAGTTCTGACGATGCACGAATGTCGCGGCGAAATATCTGGGAACAGAGCGGCAATAAGTCGTCA
+TCCCATGTTTTATCCAGGGCGATCAGCAGAGTGTTAATCTCCTGCATGGTTTCATCGTTAACCGGAGTGATGTCGCGTTC
+CGGCTGACGTTCTGCAGTGTATGCAGTATTTTCGACAATGCGCTCGGCTTCATCCTTGTCATAGATACCAGCAAATCCGA
+AGGCCAGACGGGCACACTGAATCATGGCTTTATGACGTAACATCCGTTTGGGATGCGACTGCCACGGCCCCGTGATTTCT
+CTGCCTTCGCGAGTTTTGAATGGTTCGCGGCGGCATTCATCCATCCATTCGGTAACGCAGATCGGATGATTACGGTCCTT
+GCGGTAAATCCGGCATGTACAGGATTCATTGTCCTGCTCAAAGTCCATGCCATCAAACTGCTGGTTTTCATTGATGATGC
+GGGACCAGCCATCAACGCCCACCACCGGAACGATGCCATTCTGCTTATCAGGAAAGGCGTAAATTTCTTTCGTCCACGGA
+TTAAGGCCGTACTGGTTGGCAACGATCAGTAATGCGATGAACTGCGCATCGCTGGCATCACCTTTAAATGCCGTCTGGCG
+AAGAGTGGTGATCAGTTCCTGTGGGTCGACAGAATCCATGCCGACACGTTCAGCCAGCTTCCCAGCCAGCGTTGCGAGTG
+CAGTACTCATTCGTTTTATACCTCTGAATCAATATCAACCTGGTGGTGAGCAATGGTTTCAACCATGTACCGGATGTGTT
+CTGCCATGCGCTCCTGAAACTCAACATCGTCATCAAACGCACGGGTAATGGATTTTTTGCTGGCCCCGTGGCGTTGCAAA
+TGATCGATGCATAGCGATTCAAACAGGTGCTGGGGCAGGCCTTTTTCCATGTCGTCTGCCAGTTCTGCCTCTTTCTCTTC
+ACGGGCGAGCTGCTGGTAGTGACGCGCCCAGCTCTGAGCCTCAAGACGATCCTGAATGTAATAAGCGTTCATGGCTGAAC
+TCCTGAAATAGCTGTGAAAATATCGCCCGCGAAATGCCGGGCTGATTAGGAAAACAGGAAAGGGGGTTAGTGAATGCTTT
+TGCTTGATCTCAGTTTCAGTATTAATATCCATTTTTTATAAGCGTCGACGGCTTCACGAAACATCTTTTCATCGCCAATA
+AAAGTGGCGATAGTGAATTTAGTCTGGATAGCCATAAGTGTTTGATCCATTCTTTGGGACTCCTGGCTGATTAAGTATGT
+CGATAAGGCGTTTCCATCCGTCACGTAATTTACGGGTGATTCGTTCAAGTAAAGATTCGGAAGGGCAGCCAGCAACAGGC
+CACCCTGCAATGGCATATTGCATGGTGTGCTCCTTATTTATACATAACGAAAAACGCCTCGAGTGAAGCGTTATTGGTAT
+GCGGTAAAACCGCACTCAGGCGGCCTTGATAGTCATATCATCTGAATCAAATATTCCTGATGTATCGATATCGGTAATTC
+TTATTCCTTCGCTACCATCCATTGGAGGCCATCCTTCCTGACCATTTCCATCATTCCAGTCGAACTCACACACAACACCA
+TATGCATTTAAGTCGCTTGAAATTGCTATAAGCAGAGCATGTTGCGCCAGCATGATTAATACAGCATTTAATACAGAGCC
+GTGTTTATTGAGTCGGTATTCAGAGTCTGACCAGAAATTATTAATCTGGTGAAGTTTTTCCTCTGTCATTACGTCATGGT
+CGATTTCAATTTCTATTGATGCTTTCCAGTCGTAATCAATGATGTATTTTTTGATGTTTGACATCTGTTCATATCCTCAC
+AGATAAAAAATCGCCCTCACACTGGAGGGCAAAGAAGATTTCCAATAATCAGAACAAGTCGGCTCCTGTTTAGTTACGAG
+CGACATTGCTCCGTGTATTCACTCGTTGGAATGAATACACAGTGCAGTGTTTATTCTGTTATTTATGCCAAAAATAAAGG
+CCACTATCAGGCAGCTTTGTTGTTCTGTTTACCAAGTTCTCTGGCAATCATTGCCGTCGTTCGTATTGCCCATTTATCGA
+CATATTTCCCATCTTCCATTACAGGAAACATTTCTTCAGGCTTAACCATGCATTCCGATTGCAGCTTGCATCCATTGCAT
+CGCTTGAATTGTCCACACCATTGATTTTTATCAATAGTCGTAGTCATACGGATAGTCCTGGTATTGTTCCATCACATCCT
+GAGGATGCTCTTCGAACTCTTCAAATTCTTCTTCCATATATCACCTTAAATAGTGGATTGCGGTAGTAAAGATTGTGCCT
+GTCTTTTAACCACATCAGGCTCGGTGGTTCTCGTGTACCCCTACAGCGAGAAATCGGATAAACTATTACAACCCCTACAG
+TTTGATGAGTATAGAAATGGATCCACTCGTTATTCTCGGACGAGTGTTCAGTAATGAACCTCTGGAGAGAACCATGTATA
+TGATCGTTATCTGGGTTGGACTTCTGCTTTTAAGCCCAGATAACTGGCCTGAATATGTTAATGAGAGAATCGGTATTCCT
+CATGTGTGGCATGTTTTCGTCTTTGCTCTTGCATTTTCGCTAGCAATTAATGTGCATCGATTATCAGCTATTGCCAGCGC
+CAGATATAAGCGATTTAAGCTAAGAAAACGCATTAAGATGCAAAACGATAAAGTGCGATCAGTAATTCAAAACCTTACAG
+AAGAGCAATCTATGGTTTTGTGCGCAGCCCTTAATGAAGGCAGGAAGTATGTGGTTACATCAAAACAATTCCCATACATT
+AGTGAGTTGATTGAGCTTGGTGTGTTGAACAAAACTTTTTCCCGATGGAATGGAAAGCATATATTATTCCCTATTGAGGA
+TATTTACTGGACTGAATTAGTTGCCAGCTATGATCCATATAATATTGAGATAAAGCCAAGGCCAATATCTAAGTAACTAG
+ATAAGAGGAATCGATTTTCCCTTAATTTTCTGGCGTCCACTGCATGTTATGCCGCGTTCGCCAGGCTTGCTGTACCATGT
+GCGCTGATTCTTGCGCTCAATACGTTGCAGGTTGCTTTCAATCTGTTTGTGGTATTCAGCCAGCACTGTAAGGTCTATCG
+GATTTAGTGCGCTTTCTACTCGTGATTTCGGTTTGCGATTCAGCGAGAGAATAGGGCGGTTAACTGGTTTTGCGCTTACC
+CCAACCAACAGGGGATTTGCTGCTTTCCATTGAGCCTGTTTCTCTGCGCGACGTTCGCGGCGGCGTGTTTGTGCATCCAT
+CTGGATTCTCCTGTCAGTTAGCTTTGGTGGTGTGTGGCAGTTGTAGTCCTGAACGAAAACCCCCCGCGATTGGCACATTG
+GCAGCTAATCCGGAATCGCACTTACGGCCAATGCTTCGTTTCGTATCACACACCCCAAAGCCTTCTGCTTTGAATGCTGC
+CCTTCTTCAGGGCTTAATTTTTAAGAGCGTCACCTTCATGGTGGTCAGTGCGTCCTGCTGATGTGCTCAGTATCACCGCC
+AGTGGTATTTATGTCAACACCGCCAGAGATAATTTATCACCGCAGATGGTTATCTGTATGTTTTTTATATGAATTTATTT
+TTTGCAGGGGGGCATTGTTTGGTAGGTGAGAGATCTGAATTGCTATGTTTAGTGAGTTGTATCTATTTATTTTTCAATAA
+ATACAATTGGTTATGTGTTTTGGGGGCGATCGTGAGGCAAAGAAAACCCGGCGCTGAGGCCGGGTTATTCTTGTTCTCTG
+GTCAAATTATATAGTTGGAAAACAAGGATGCATATATGAATGAACGATGCAGAGGCAATGCCGATGGCGATAGTGGGTAT
+CATGTAGCCGCTTATGCTGGAAAGAAGCAATAACCCGCAGAAAAACAAAGCTCCAAGCTCAACAAAACTAAGGGCATAGA
+CAATAACTACCGATGTCATATACCCATACTCTCTAATCTTGGCCAGTCGGCGCGTTCTGCTTCCGATTAGAAACGTCAAG
+GCAGCAATCAGGATTGCAATCATGGTTCCTGCATATGATGACAATGTCGCCCCAAGACCATCTCTATGAGCTGAAAAAGA
+AACACCAGGAATGTAGTGGCGGAAAAGGAGATAGCAAATGCTTACGATAACGTAAGGAATTATTACTATGTAAACACCAG
+GCATGATTCTGTTCCGCATAATTACTCCTGATAATTAATCCTTAACTTTGCCCACCTGCCTTTTAAAACATTCCAGTATA
+TCACTTTTCATTCTTGCGTAGCAATATGCCATCTCTTCAGCTATCTCAGCATTGGTGACCTTGTTCAGAGGCGCTGAGAG
+ATGGCCTTTTTCTGATAGATAATGTTCTGTTAAAATATCTCCGGCCTCATCTTTTGCCCGCAGGCTAATGTCTGAAAATT
+GAGGTGACGGGTTAAAAATAATATCCTTGGCAACCTTTTTTATATCCCTTTTAAATTTTGGCTTAATGACTATATCCAAT
+GAGTCAAAAAGCTCCCCTTCAATATCTGTTGCCCCTAAGACCTTTAATATATCGCCAAATACAGGTAGCTTGGCTTCTAC
+CTTCACCGTTGTTCGGCCGATGAAATGCATATGCATAACATCGTCTTTGGTGGTTCCCCTCATCAGTGGCTCTATCTGAA
+CGCGCTCTCCACTGCTTAATGACATTCCTTTCCCGATTAAAAAATCTGTCAGATCGGATGTGGTCGGCCCGAAAACAGTT
+CTGGCAAAACCAATGGTGTCGCCTTCAACAAACAAAAAAGATGGGAATCCCAATGATTCGTCATCTGCGAGGCTGTTCTT
+AATATCTTCAACTGAAGCTTTAGAGCGATTTATCTTCTGAACCAGACTCTTGTCATTTGTTTTGGTAAAGAGAAAAGTTT
+TTCCATCGATTTTATGAATATACAAATAATTGGAGCCAACCTGCAGGTGATGATTATCAGCCAGCAGAGAATTAAGGAAA
+ACAGACAGGTTTATTGAGCGCTTATCTTTCCCTTTATTTTTGCTGCGGTAAGTCGCATAAAAACCATTCTTCATAATTCA
+ATCCATTTACTATGTTATGTTCTGAGGGGAGTGAAAATTCCCCTAATTCGATGAAGATTCTTGCTCAATTGTTATCAGCT
+ATGCGCCGACCAGAACACCTTGCCGATCAGCCAAACGTCTCTTCAGGCCACTGACTAGCGATAACTTTCCCCACAACGGA
+ACAACTCTCATTGCATGGGATCATTGGGTACTGTGGGTTTAGTGGTTGTAAAAACACCTGACCGCTATCCCTGATCAGTT
+TCTTGAAGGTAAACTCATCACCCCCAAGTCTGGCTATGCAGAAATCACCTGGCTCAACAGCCTGCTCAGGGTCAACGAGA
+ATTAACATTCCGTCAGGAAAGCTTGGCTTGGAGCCTGTTGGTGCGGTCATGGAATTACCTTCAACCTCAAGCCAGAATGC
+AGAATCACTGGCTTTTTTGGTTGTGCTTACCCATCTCTCCGCATCACCTTTGGTAAAGGTTCTAAGCTTAGGTGAGAACA
+TCCCTGCCTGAACATGAGAAAAAACAGGGTACTCATACTCACTTCTAAGTGACGGCTGCATACTAACCGCTTCATACATC
+TCGTAGATTTCTCTGGCGATTGAAGGGCTAAATTCTTCAACGCTAACTTTGAGAATTTTTGTAAGCAATGCGGCGTTATA
+AGCATTTAATGCATTGATGCCATTAAATAAAGCACCAACGCCTGACTGCCCCATCCCCATCTTGTCTGCGACAGATTCCT
+GGGATAAGCCAAGTTCATTTTTCTTTTTTTCATAAATTGCTTTAAGGCGACGTGCGTCCTCAAGCTGCTCTTGTGTTAAT
+GGTTTCTTTTTTGTGCTCATACGTTAAATCTATCACCGCAAGGGATAAATATCTAACACCGTGCGTGTTGACTATTTTAC
+CTCTGGCGGTGATAATGGTTGCATGTACTAAGGAGGTTGTATGGAACAACGCATAACCCTGAAAGATTATGCAATGCGCT
+TTGGGCAAACCAAGACAGCTAAAGATCTCGGCGTATATCAAAGCGCGATCAACAAGGCCATTCATGCAGGCCGAAAGATT
+TTTTTAACTATAAACGCTGATGGAAGCGTTTATGCGGAAGAGGTAAAGCCCTTCCCGAGTAACAAAAAAACAACAGCATA
+AATAACCCCGCTCTTACACATTCCAGCCCTGAAAAAGGGCATCAAATTAAACCACACCTATGGTGTATGCATTTATTTGC
+ATACATTCAATCAATTGTTATCTAAGGAAATACTTACATATGGTTCGTGCAAACAAACGCAACGAGGCTCTACGAATCGA
+GAGTGCGTTGCTTAACAAAATCGCAATGCTTGGAACTGAGAAGACAGCGGAAGCTGTGGGCGTTGATAAGTCGCAGATCA
+GCAGGTGGAAGAGGGACTGGATTCCAAAGTTCTCAATGCTGCTTGCTGTTCTTGAATGGGGGGTCGTTGACGACGACATG
+GCTCGATTGGCGCGACAAGTTGCTGCGATTCTCACCAATAAAAAACGCCCGGCGGCAACCGAGCGTTCTGAACAAATCCA
+GATGGAGTTCTGAGGTCATTACTGGATCTATCAACAGGAGTCATTATGACAAATACAGCAAAAATACTCAACTTCGGCAG
+AGGTAACTTTGCCGGACAGGAGCGTAATGTGGCAGATCTCGATGATGGTTACGCCAGACTATCAAATATGCTGCTTGAGG
+CTTATTCGGGCGCAGATCTGACCAAGCGACAGTTTAAAGTGCTGCTTGCCATTCTGCGTAAAACCTATGGGTGGAATAAA
+CCAATGGACAGAATCACCGATTCTCAACTTAGCGAGATTACAAAGTTACCTGTCAAACGGTGCAATGAAGCCAAGTTAGA
+ACTCGTCAGAATGAATATTATCAAGCAGCAAGGCGGCATGTTTGGACCAAATAAAAACATCTCAGAATGGTGCATCCCTC
+AAAACGAGGGAAAATCCCCTAAAACGAGGGATAAAACATCCCTCAAATTGGGGGATTGCTATCCCTCAAAACAGGGGGAC
+ACAAAAGACACTATTACAAAAGAAAAAAGAAAAGATTATTCGTCAGAGAATTCTGGCGAATCCTCTGACCAGCCAGAAAA
+CGACCTTTCTGTGGTGAAACCGGATGCTGCAATTCAGAGCGGCAGCAAGTGGGGGACAGCAGAAGACCTGACCGCCGCAG
+AGTGGATGTTTGACATGGTGAAGACTATCGCACCATCAGCCAGAAAACCGAATTTTGCTGGGTGGGCTAACGATATCCGC
+CTGATGCGTGAACGTGACGGACGTAACCACCGCGACATGTGTGTGCTGTTCCGCTGGGCATGCCAGGACAACTTCTGGTC
+CGGTAACGTGCTGAGCCCGGCCAAACTCCGCGATAAGTGGACCCAACTCGAAATCAACCGTAACAAGCAACAGGCAGGCG
+TGACAGCCAGCAAACCAAAACTCGACCTGACAAACACAGACTGGATTTACGGGGTGGATCTATGAAAAACATCGCCGCAC
+AGATGGTTAACTTTGACCGTGAGCAGATGCGTCGGATCGCCAACAACATGCCGGAACAGTACGACGAAAAGCCGCAGGTA
+CAGCAGGTAGCGCAGATCATCAACGGTGTGTTCAGCCAGTTACTGGCAACTTTCCCGGCGAGCCTGGCTAACCGTGACCA
+GAACGAAGTGAACGAAATCCGTCGCCAGTGGGTTCTGGCTTTTCGGGAAAACGGGATCACCACGATGGAACAGGTTAACG
+CAGGAATGCGCGTAGCCCGTCGGCAGAATCGACCATTTCTGCCATCACCCGGGCAGTTTGTTGCATGGTGCCGGGAAGAA
+GCATCCGTTACCGCCGGACTGCCAAACGTCAGCGAGCTGGTTGATATGGTTTACGAGTATTGCCGGAAGCGAGGCCTGTA
+TCCGGATGCGGAGTCTTATCCGTGGAAATCAAACGCGCACTACTGGCTGGTTACCAACCTGTATCAGAACATGCGGGCCA
+ATGCGCTTACTGATGCGGAATTACGCCGTAAGGCCGCAGATGAGCTTGTCCATATGACTGCGAGAATTAACCGTGGTGAG
+GCGATCCCTGAACCAGTAAAACAACTTCCTGTCATGGGCGGTAGACCTCTAAATCGTGCACAGGCTCTGGCGAAGATCGC
+AGAAATCAAAGCTAAGTTCGGACTGAAAGGAGCAAGTGTATGACGGGCAAAGAGGCAATTATTCATTACCTGGGGACGCA
+TAATAGCTTCTGTGCGCCGGACGTTGCCGCGCTAACAGGCGCAACAGTAACCAGCATAAATCAGGCCGCGGCTAAAATGG
+CACGGGCAGGTCTTCTGGTTATCGAAGGTAAGGTCTGGCGAACGGTGTATTACCGGTTTGCTACCAGGGAAGAACGGGAA
+GGAAAGATGAGCACGAACCTGGTTTTTAAGGAGTGTCGCCAGAGTGCCGCGATGAAACGGGTATTGGCGGTATATGGAGT
+TAAAAGATGACCATCTACATTACTGAGCTAATAACAGGCCTGCTGGTAATCGCAGGCCTTTTTATTTGGGGGAGAGGGAA
+GTCATGAAAAAACTAACCTTTGAAATTCGATCTCCAGCACATCAGCAAAACGCTATTCACGCAGTACAGCAAATCCTTCC
+AGACCCAACCAAACCAATCGTAGTAACCATTCAGGAACGCAACCGCAGCTTAGACCAAAACAGGAAGCTATGGGCCTGCT
+TAGGTGACGTCTCTCGTCAGGTTGAATGGCATGGTCGCTGGCTGGATGCAGAAAGCTGGAAGTGTGTGTTTACCGCAGCA
+TTAAAGCAGCAGGATGTTGTTCCTAACCTTGCCGGGAATGGCTTTGTGGTAATAGGCCAGTCAACCAGCAGGATGCGTGT
+AGGCGAATTTGCGGAGCTATTAGAGCTTATACAGGCATTCGGTACAGAGCGTGGCGTTAAGTGGTCAGACGAAGCGAGAC
+TGGCTCTGGAGTGGAAAGCGAGATGGGGAGACAGGGCTGCATGATAAATGTCGTTAGTTTCTCCGGTGGCAGGACGTCAG
+CATATTTGCTCTGGCTAATGGAGCAAAAGCGACGGGCAGGTAAAGACGTGCATTACGTTTTCATGGATACAGGTTGTGAA
+CATCCAATGACATATCGGTTTGTCAGGGAAGTTGTGAAGTTCTGGGATATACCGCTCACCGTATTGCAGGTTGATATCAA
+CCCGGAGCTTGGACAGCCAAATGGTTATACGGTATGGGAACCAAAGGATATTCAGACGCGAATGCCTGTTCTGAAGCCAT
+TTATCGATATGGTAAAGAAATATGGCACTCCATACGTCGGCGGCGCGTTCTGCACTGACAGATTAAAACTCGTTCCCTTC
+ACCAAATACTGTGATGACCATTTCGGGCGAGGGAATTACACCACGTGGATTGGCATCAGAGCTGATGAACCGAAGCGGCT
+AAAGCCAAAGCCTGGAATCAGATATCTTGCTGAACTGTCAGACTTTGAGAAGGAAGATATCCTCGCATGGTGGAAGCAAC
+AACCATTCGATTTGCAAATACCGGAACATCTCGGTAACTGCATATTCTGCATTAAAAAATCAACGCAAAAAATCGGACTT
+GCCTGCAAAGATGAGGAGGGATTGCAGCGTGTTTTTAATGAGGTCATCACGGGATCCCATGTGCGTGACGGACATCGGGA
+AACGCCAAAGGAGATTATGTACCGAGGAAGAATGTCGCTGGACGGTATCGCGAAAATGTATTCAGAAAATGATTATCAAG
+CCCTGTATCAGGACATGGTACGAGCTAAAAGATTCGATACCGGCTCTTGTTCTGAGTCATGCGAAATATTTGGAGGGCAG
+CTTGATTTCGACTTCGGGAGGGAAGCTGCATGATGCGATGTTATCGGTGCGGTGAATGCAAAGAAGATAACCGCTTCCGA
+CCAAATCAACCTTACTGGAATCGATGGTGTCTCCGGTGTGAAAGAACACCAACAGGGGTGTTACCACTACCGCAGGAAAA
+GGAGGACGTGTGGCGAGACAGCGACGAAGTATCACCGACATAATCTGCGAAAACTGCAAATACCTTCCAACGAAACGCAC
+CAGAAATAAACCCAAGCCAATCCCAAAAGAATCTGACGTAAAAACCTTCAACTACACGGCTCACCTGTGGGATATCCGGT
+GGCTAAGACGTCGTGCGAGGAAAACAAGGTGATTGACCAAAATCGAAGTTACGAACAAGAAAGCGTCGAGCGAGCTTTAA
+CGTGCGCTAACTGCGGTCAGAAGCTGCATGTGCTGGAAGTTCACGTGTGTGAGCACTGCTGCGCAGAACTGATGAGCGAT
+CCGAATAGCTCGATGCACGAGGAAGAAGATGATGGCTAAACCAGCGCGAAGACGATGTAAAAACGATGAATGCCGGGAAT
+GGTTTCACCCTGCATTCGCTAATCAGTGGTGGTGCTCTCCAGAGTGTGGAACCAAGATAGCACTCGAACGACGAAGTAAA
+GAACGCGAAAAAGCGGAAAAAGCAGCAGAGAAGAAACGACGACGAGAGGAGCAGAAACAGAAAGATAAACTTAAGATTCG
+AAAACTCGCCTTAAAGCCCCGCAGTTACTGGATTAAACAAGCCCAACAAGCCGTAAACGCCTTCATCAGAGAAAGAGACC
+GCGACTTACCATGTATCTCGTGCGGAACGCTCACGTCTGCTCAGTGGGATGCCGGACATTACCGGACAACTGCTGCGGCA
+CCTCAACTCCGATTTAATGAACGCAATATTCACAAGCAATGCGTGGTGTGCAACCAGCACAAAAGCGGAAATCTCGTTCC
+GTATCGCGTCGAACTGATTAGCCGCATCGGGCAGGAAGCAGTAGACGAAATCGAATCAAACCATAACCGCCATCGCTGGA
+CTATCGAAGAGTGCAAGGCGATCAAGGCAGAGTACCAACAGAAACTCAAAGACCTGCGAAATAGCAGAAGTGAGGCCGCA
+TGACGTTCTCAGTAAAAACCATTCCAGACATGCTCGTTGAAACATACGGAAATCAGACAGAAGTAGCACGCAGACTGAAA
+TGTAGTCGCGGTACGGTCAGAAAATACGTTGATGATAAAGACGGGAAAATGCACGCCATCGTCAACGACGTTCTCATGGT
+TCATCGCGGATGGAGTGAAAGAGATGCGCTATTACGAAAAAATTGATGGCAGCAAATACCGAAATATTTGGGTAGTTGGC
+GATCTGCACGGATGCTACACGAACCTGATGAACAAACTGGATACGATTGGATTCGACAACAAAAAAGACCTGCTTATCTC
+GGTGGGCGATTTGGTTGATCGTGGTGCAGAGAACGTTGAATGCCTGGAATTAATCACATTCCCCTGGTTCAGAGCTGTAC
+GTGGAAACCATGAGCAAATGATGATTGATGGCTTATCAGAGCGTGGAAACGTTAATCACTGGCTGCTTAATGGCGGTGGC
+TGGTTCTTTAATCTCGATTACGACAAAGAAATTCTGGCTAAAGCTCTTGCCCATAAAGCAGATGAACTTCCGTTAATCAT
+CGAACTGGTGAGCAAAGATAAAAAATATGTTATCTGCCACGCCGATTATCCCTTTGACGAATACGAGTTTGGAAAGCCAG
+TTGATCATCAGCAGGTAATCTGGAACCGCGAACGAATCAGCAACTCACAAAACGGGATCGTGAAAGAAATCAAAGGCGCG
+GACACGTTCATCTTTGGTCATACGCCAGCAGTGAAACCACTCAAGTTTGCCAACCAAATGTATATCGATACCGGCGCAGT
+GTTCTGCGGAAACCTAACATTGATTCAGGTACAGGGAGAAGGCGCATGAGACTCGAAAGCGTAGCTAAATTTCATTCGCC
+AAAAAGCCCGATGATGAGCGACTCACCACGGGCCACGGCTTCTGACTCTCTTTCCGGTACTGATGTGATGGCTGCTATGG
+GGATGGCGCAATCACAAGCCGGATTCGGTATGGCTGCATTCTGCGGTAAGCACGAACTCAGCCAGAACGACAAACAAAAG
+GCTATCAACTATCTGATGCAATTTGCACACAAGGTATCGGGGAAATACCGTGGTGTGGCAAAGCTTGAAGGAAATACTAA
+GGCAAAGGTACTGCAAGTGCTCGCAACATTCGCTTATGCGGATTATTGCCGTAGTGCCGCGACGCCGGGGGCAAGATGCA
+GAGATTGCCATGGTACAGGCCGTGCGGTTGATATTGCCAAAACAGAGCTGTGGGGGAGAGTTGTCGAGAAAGAGTGCGGA
+AGATGCAAAGGCGTCGGCTATTCAAGGATGCCAGCAAGCGCAGCATATCGCGCTGTGACGATGCTAATCCCAAACCTTAC
+CCAACCCACCTGGTCACGCACTGTTAAGCCGCTGTATGACGCTCTGGTGGTGCAATGCCACAAAGAAGAGTCAATCGCAG
+ACAACATTTTGAATGCGGTCACACGTTAGCAGCATGATTGCCACGGATGGCAACATATTAACGGCATGATATTGACTTAT
+TGAATAAAATTGGGTAAATTTGACTCAACGATGGGTTAATTCGCTCGTTGTGGTAGTGAGATGAAAAGAGGCGGCGCTTA
+CTACCGATTCCGCCTAGTTGGTCACTTCGACGTATCGTCTGGAACTCCAACCATCGCAGGCAGAGAGGTCTGCAAAATGC
+AATCCCGAAACAGTTCGCAGGTAATAGTTAGAGCCTGCATAACGGTTTCGGGATTTTTTATATCTGCACAACAGGTAAGA
+GCATTGAGTCGATAATCGTGAAGAGTCGGCGAGCCTGGTTAGCCAGTGCTCTTTCCGTTGTGCTGAATTAAGCGAATACC
+GGAAGCAGAACCGGATCACCAAATGCGTACAGGCGTCATCGCCGCCCAGCAACAGCACAACCCAAACTGAGCCGTAGCCA
+CTGTCTGTCCTGAATTCATTAGTAATAGTTACGCTGCGGCCTTTTACACATGACCTTCGTGAAAGCGGGTGGCAGGAGGT
+CGCGCTAACAACCTCCTGCCGTTTTGCCCGTGCATATCGGTCACGAACAAATCTGATTACTAAACACAGTAGCCTGGATT
+TGTTCTATCAGTAATCGACCTTATTCCTAATTAAATAGAGCAAATCCCCTTATTGGGGGTAAGACATGAAGATGCCAGAA
+AAACATGACCTGTTGGCCGCCATTCTCGCGGCAAAGGAACAAGGCATCGGGGCAATCCTTGCGTTTGCAATGGCGTACCT
+TCGCGGCAGATATAATGGCGGTGCGTTTACAAAAACAGTAATCGACGCAACGATGTGCGCCATTATCGCCTAGTTCATTC
+GTGACCTTCTCGACTTCGCCGGACTAAGTAGCAATCTCGCTTATATAACGAGCGTGTTTATCGGCTACATCGGTACTGAC
+TCGATTGGTTCGCTTATCAAACGCTTCGCTGCTAAAAAAGCCGGAGTAGAAGATGGTAGAAATCAATAATCAACGTAAGG
+CGTTCCTCGATATGCTGGCGTGGTCGGAGGGAACTGATAACGGACGTCAGAAAACCAGAAATCATGGTTATGACGTCATT
+GTAGGCGGAGAGCTATTTACTGATTACTCCGATCACCCTCGCAAACTTGTCACGCTAAACCCAAAACTCAAATCAACAGG
+CGCCGGACGCTACCAGCTTCTTTCCCGTTGGTGGGATGCCTACCGCAAGCAGCTTGGCCTGAAAGACTTCTCTCCGAAAA
+GTCAGGACGCTGTGGCATTGCAGCAGATTAAGGAGCGTGGCGCTTTACCTATGATTGATCGTGGTGATATCCGTCAGGCA
+ATCGACCGTTGCAGCAATATCTGGGCTTCACTGCCGGGCGCTGGTTATGGTCAGTTCGAGCATAAGGCTGACAGCCTGAT
+TGCAAAATTCAAAGAAGCGGGCGGAACGGTCAGAGAGATTGATGTATGAGCAGAGTCACCGCGATTATCTCCGCTCTGGT
+TATCTGCATCATCGTCTGCCTGTCATGGGCTGTTAATCATTACCGTGATAACGCCATTACCTACAAAGCCCAGCGCGACA
+AAAATGCCAGAGAACTGAAGCTGGCGAACGCGGCAATTACTGACATGCAGATGCGTCAGCGTGATGTTGCTGCGCTCGAT
+GCAAAATACACGAAGGAGTTAGCTGATGCTAAAGCTGAAAATGATGCTCTGCGTGATGATGTTGCCGCTGGTCGTCGTCG
+GTTGCACATCAAAGCAGTCTGTCAGTCAGTGCGTGAAGCCACCACCGCCTCCGGCGTGGATAATGCAGCCTCCCCCCGAC
+TGGCAGACACCGCTGAACGGGATTATTTCACCCTCAGAGAGAGGCTGATCACTATGCAAAAACAACTGGAAGGAACCCAG
+AAGTATATTAATGAGCAGTGCAGATAGAGTTGCCCATATCGATGGGCAACTCATGCAATTATTGTGAGCAATACACACGC
+GCTTCCAGCGGAGTATAAATGCCTAAAGTAATAAAACCGAGCAATCCATTTACGAATGTTTGCTGGGTTTCTGTTTTAAC
+AACATTTTCTGCGCCGCCACAAATTTTGGCTGCATCGACAGTTTTCTTCTGCCCAATTCCAGAAACGAAGAAATGATGGG
+TGATGGTTTCCTTTGGTGCTACTGCTGCCGGTTTGTTTTGAACAGTAAACGTCTGTTGAGCACATCCTGTAATAAGCAGG
+GCCAGCGCAGTAGCGAGTAGCATTTTTTTCATGGTGTTATTCCCGATGCTTTTTGAAGTTCGCAGAATCGTATGTGTAGA
+AAATTAAACAAACCCTAAACAATGAGTTGAAATTTCATATTGTTAATATTTATTAATGTATGTCAGGTGCGATGAATCGT
+CATTGTATTCCCGGATTAACTATGTCCACAGCCCTGACGGGGAACTTCTCTGCGGGAGTGTCCGGGAATAATTAAAACGA
+TGCACACAGGGTTTAGCGCGTACACGTATTGCATTATGCCAACGCCCCGGTGCTGACACGGAAGAAACCGGACGTTATGA
+TTTAGCGTGGAAAGATTTGTGTAGTGTTCTGAATGCTCTCAGTAAATAGTAATGAATTATCAAAGGTATAGTAATATCTT
+TTATGTTCATGGATATTTGTAACCCATCGGAAAACTCCTGCTTTAGCAAGATTTTCCCTGTATTGCTGAAATGTGATTTC
+TCTTGATTTCAACCTATCATAGGACGTTTCTATAAGATGCGTGTTTCTTGAGAATTTAACATTTACAACCTTTTTAAGTC
+CTTTTATTAACACGGTGTTATCGTTTTCTAACACGATGTGAATATTATCTGTGGCTAGATAGTAAATATAATGTGAGACG
+TTGTGACGTTTTAGTTCAGAATAAAACAATTCACAGTCTAAATCTTTTCGCACTTGATCGAATATTTCTTTAAAAATGGC
+AACCTGAGCCATTGGTAAAACCTTCCATGTGATACGAGGGCGCGTAGTTTGCATTATCGTTTTTATCGTTTCAATCTGGT
+CTGACCTCCTTGTGTTTTGTTGATGATTTATGTCAAATATTAGGAATGTTTTCACTTAATAGTATTGGTTGCGTAACAAA
+GTGCGGTCCTGCTGGCATTCTGGAGGGAAATACAACCGACAGATGTATGTAAGGCCAACGTGCTCAAATCTTCATACAGA
+AAGATTTGAAGTAATATTTTAACCGCTAGATGAAGAGCAAGCGCATGGAGCGACAAAATGAATAAAGAACAATCTGCTGA
+TGATCCCTCCGTGGATCTGATTCGTGTAAAAAATATGCTTAATAGCACCATTTCTATGAGTTACCCTGATGTTGTAATTG
+CATGTATAGAACATAAGGTGTCTCTGGAAGCATTCAGAGCAATTGAGGCAGCGTTGGTGAAGCACGATAATAATATGAAG
+GATTATTCCCTGGTGGTTGACTGATCACCATAACTGCTAATCATTCAAACTATTTAGTCTGTGACAGAGCCAACACGCAG
+TCTGTCACTGTCAGGAAAGTGGTAAAACTGCAACTCAATTACTGCAATGCCCTCGTAATTAAGTGAATTTACAATATCGT
+CCTGTTCGGAGGGAAGAACGCGGGATGTTCATTCTTCATCACTTTTAATTGATGTATATGCTCTCTTTTCTGACGTTAGT
+CTCCGACGGCAGGCTTCAATGACCCAGGCTGAGAAATTCCCGGACCCTTTTTGCTCAAGAGCGATGTTAATTTGTTCAAT
+CATTTGGTTAGGAAAGCGGATGTTGCGGGTTGTTGTTCTGCGGGTTCTGTTCTTCGTTGACATGAGGTTGCCCCGTATTC
+AGTGTCGCTGATTTGTATTGTCTGAAGTTGTTTTTACGTTAAGTTGATGCAGATCAATTAATACGATACCTGCGTCATAA
+TTGATTATTTGACGTGGTTTGATGGCCTCCACGCACGTTGTGATATGTAGATGATAATCATTATCACTTTACGGGTCCTT
+TCCGGTGATCCGACAGGTTACG
diff --git a/tests/data/lambdaNEB.fa.fai b/tests/data/lambdaNEB.fa.fai

new file mode 100644 (file)

index 0000000..064af36
--- /dev/null
+++ b/tests/data/lambdaNEB.fa.fai
@@ -0,0 +1 @@
+lambda_NEB3011 48502   16      80      81
diff --git a/tests/data/long-cigar-1.7.bam b/tests/data/long-cigar-1.7.bam

new file mode 100644 (file)

index 0000000..480c776

Binary files /dev/null and b/tests/data/long-cigar-1.7.bam differ
diff --git a/tests/data/merge.fofn b/tests/data/merge.fofn

new file mode 100644 (file)

index 0000000..8a79dff
--- /dev/null
+++ b/tests/data/merge.fofn
@@ -0,0 +1,2 @@
+aligned.bam
+aligned2.bam
diff --git a/tests/data/pbbamify/input-aligned-1.bam b/tests/data/pbbamify/input-aligned-1.bam

new file mode 100644 (file)

index 0000000..b46607c

Binary files /dev/null and b/tests/data/pbbamify/input-aligned-1.bam differ
diff --git a/tests/data/pbbamify/input-aligned-2.bam b/tests/data/pbbamify/input-aligned-2.bam

new file mode 100644 (file)

index 0000000..345f5ac

Binary files /dev/null and b/tests/data/pbbamify/input-aligned-2.bam differ
diff --git a/tests/data/pbbamify/input-aligned-3.bam b/tests/data/pbbamify/input-aligned-3.bam

new file mode 100644 (file)

index 0000000..59b1e38

Binary files /dev/null and b/tests/data/pbbamify/input-aligned-3.bam differ
diff --git a/tests/data/pbbamify/input-aligned-all.bam b/tests/data/pbbamify/input-aligned-all.bam

new file mode 100644 (file)

index 0000000..7160327

Binary files /dev/null and b/tests/data/pbbamify/input-aligned-all.bam differ
diff --git a/tests/data/pbbamify/synthetic-ref-1.fa b/tests/data/pbbamify/synthetic-ref-1.fa

new file mode 100644 (file)

index 0000000..9e49565
--- /dev/null
+++ b/tests/data/pbbamify/synthetic-ref-1.fa
@@ -0,0 +1,2 @@
+>synthetic_ref_1
+GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGACAGGTGCTGAAAGCGAGGCTTTTTGGCCT
diff --git a/tests/data/pbbamify/synthetic-ref-1.fa.fai b/tests/data/pbbamify/synthetic-ref-1.fa.fai

new file mode 100644 (file)

index 0000000..d9d41c0
--- /dev/null
+++ b/tests/data/pbbamify/synthetic-ref-1.fa.fai
@@ -0,0 +1 @@
+synthetic_ref_1        150     17      150     151
diff --git a/tests/data/pbbamify/synthetic_movie_1.subreads.bam b/tests/data/pbbamify/synthetic_movie_1.subreads.bam

new file mode 100644 (file)

index 0000000..22da3d3

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_1.subreads.bam differ
diff --git a/tests/data/pbbamify/synthetic_movie_1.subreads.bam.bai b/tests/data/pbbamify/synthetic_movie_1.subreads.bam.bai

new file mode 100644 (file)

index 0000000..bfd4731

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_1.subreads.bam.bai differ
diff --git a/tests/data/pbbamify/synthetic_movie_1.subreads.bam.pbi b/tests/data/pbbamify/synthetic_movie_1.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..06fb614

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_1.subreads.bam.pbi differ
diff --git a/tests/data/pbbamify/synthetic_movie_2.subreads.bam b/tests/data/pbbamify/synthetic_movie_2.subreads.bam

new file mode 100644 (file)

index 0000000..9526f60

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_2.subreads.bam differ
diff --git a/tests/data/pbbamify/synthetic_movie_2.subreads.bam.bai b/tests/data/pbbamify/synthetic_movie_2.subreads.bam.bai

new file mode 100644 (file)

index 0000000..ac6f2ae

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_2.subreads.bam.bai differ
diff --git a/tests/data/pbbamify/synthetic_movie_2.subreads.bam.pbi b/tests/data/pbbamify/synthetic_movie_2.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..2f08904

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_2.subreads.bam.pbi differ
diff --git a/tests/data/pbbamify/synthetic_movie_3.subreads.bam b/tests/data/pbbamify/synthetic_movie_3.subreads.bam

new file mode 100644 (file)

index 0000000..d19da7b

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_3.subreads.bam differ
diff --git a/tests/data/pbbamify/synthetic_movie_3.subreads.bam.bai b/tests/data/pbbamify/synthetic_movie_3.subreads.bam.bai

new file mode 100644 (file)

index 0000000..8e3bba4

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_3.subreads.bam.bai differ
diff --git a/tests/data/pbbamify/synthetic_movie_3.subreads.bam.pbi b/tests/data/pbbamify/synthetic_movie_3.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..89b0adc

Binary files /dev/null and b/tests/data/pbbamify/synthetic_movie_3.subreads.bam.pbi differ
diff --git a/tests/data/pbbamify/synthetic_movie_all.subreadset.xml.in b/tests/data/pbbamify/synthetic_movie_all.subreadset.xml.in

new file mode 100644 (file)

index 0000000..508d167
--- /dev/null
+++ b/tests/data/pbbamify/synthetic_movie_all.subreadset.xml.in
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:SubreadSet CreatedAt="2017-12-05T04:48:31" MetaType="PacBio.DataSet.SubreadSet" Name="pacbio_dataset_subreadset-171205_124831161" Tags="" TimeStampedName="pacbio_dataset_subreadset-171205_124831161" UniqueId="a9e4086c-db3a-bded-923b-3c84fbeae52b" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.SubreadFile.SubreadBamFile" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_1.subreads.bam" TimeStampedName="pacbio_subreadfile_subreadbamfile-171205_124831158" UniqueId="f44a1fe8-236d-442a-80bb-50733b70cd36">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_1.subreads.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-171205_124831159" UniqueId="5de6a1ac-ffbd-41a8-b104-2d7ef4f4ff13"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_1.subreads.bam.bai" TimeStampedName="pacbio_index_bamindex-171205_124831158" UniqueId="30a69956-c1ff-4e39-a901-f35041c1071a"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource MetaType="PacBio.SubreadFile.SubreadBamFile" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_2.subreads.bam" TimeStampedName="pacbio_subreadfile_subreadbamfile-171205_124831158" UniqueId="80a97293-bb25-4cc5-a0b2-5189335beecf">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_2.subreads.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-171205_124831158" UniqueId="5482eae8-7c35-48c4-9a6f-dca2c1ee3181"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_2.subreads.bam.bai" TimeStampedName="pacbio_index_bamindex-171205_124831158" UniqueId="7e75f3b5-29ee-4c90-b0a6-7bc5e917d41f"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+               <pbbase:ExternalResource MetaType="PacBio.SubreadFile.SubreadBamFile" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_3.subreads.bam" TimeStampedName="pacbio_subreadfile_subreadbamfile-171205_124831158" UniqueId="4ddcb97d-e879-455f-bc99-97b24efe7d65">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_3.subreads.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-171205_124831158" UniqueId="d9d1d76b-2a63-4b39-b3bb-709c93c64114"/>
+                               <pbbase:FileIndex MetaType="PacBio.Index.BamIndex" ResourceId="@PacBioBAM_TestsDir@/data/pbbamify/synthetic_movie_3.subreads.bam.bai" TimeStampedName="pacbio_index_bamindex-171205_124831158" UniqueId="5310cd63-9f1f-47ee-992d-d60c7b16e278"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>502</pbds:TotalLength>
+               <pbds:NumRecords>6</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:SubreadSet>
diff --git a/tests/data/phi29.bam b/tests/data/phi29.bam

new file mode 100644 (file)

index 0000000..46176b6

Binary files /dev/null and b/tests/data/phi29.bam differ
diff --git a/tests/data/phi29.bam.pbi b/tests/data/phi29.bam.pbi

new file mode 100644 (file)

index 0000000..5282b94

Binary files /dev/null and b/tests/data/phi29.bam.pbi differ
diff --git a/tests/data/polymerase/consolidate.subread.dataset.xml b/tests/data/polymerase/consolidate.subread.dataset.xml

new file mode 100644 (file)

index 0000000..ca85a7a
--- /dev/null
+++ b/tests/data/polymerase/consolidate.subread.dataset.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet 
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" 
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd" 
+    UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" 
+    TimeStampedName="subreadset_150304_231155" 
+    MetaType="PacBio.DataSet.SubreadSet" 
+    Name="DataSet_SubreadSet" 
+    Version="3.0.0" 
+    CreatedAt="2015-01-27T09:00:01"> 
+<pbbase:ExternalResources>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="production.subreads.bam">
+    </pbbase:ExternalResource>
+    <pbbase:ExternalResource
+        UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5195"
+        TimeStampedName="scraps_bam_150304_231155"
+        MetaType="PacBio.SubreadFile.ScrapsBamFile"
+        ResourceId="production.scraps.bam">
+    </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+<pbds:Filters>
+    <pbds:Filter>
+        <pbbase:Properties>
+            <pbbase:Property Name="qStart" Value="4000" Operator=">"/>
+            <pbbase:Property Name="qStart" Value="5000" Operator="<"/>
+        </pbbase:Properties>
+    </pbds:Filter>
+</pbds:Filters>
+</pbds:SubreadSet>
diff --git a/tests/data/polymerase/filtered_resources.subread.dataset.xml b/tests/data/polymerase/filtered_resources.subread.dataset.xml

new file mode 100644 (file)

index 0000000..e414e00
--- /dev/null
+++ b/tests/data/polymerase/filtered_resources.subread.dataset.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet 
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" 
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd" 
+    UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" 
+    TimeStampedName="subreadset_150304_231155" 
+    MetaType="PacBio.DataSet.SubreadSet" 
+    Name="DataSet_SubreadSet" 
+    Version="3.0.0" 
+    CreatedAt="2015-01-27T09:00:01"> 
+<pbbase:ExternalResources>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="./production.subreads.bam">
+        <pbbase:ExternalResources>
+            <pbbase:ExternalResource 
+                UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5195" 
+                TimeStampedName="scraps_bam_150304_231155" 
+                MetaType="PacBio.SubreadFile.ScrapsBamFile" 
+                ResourceId="./production.scraps.bam">
+            </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+    </pbbase:ExternalResource>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="./internal.subreads.bam">
+        <pbbase:ExternalResources>
+            <pbbase:ExternalResource 
+                UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5195" 
+                TimeStampedName="scraps_bam_150304_231155" 
+                MetaType="PacBio.SubreadFile.ScrapsBamFile" 
+                ResourceId="./internal.scraps.bam">
+            </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+    </pbbase:ExternalResource>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5197" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.HqRegionBamFile" 
+        ResourceId="./production_hq.hqregion.bam">
+        <pbbase:ExternalResources>
+            <pbbase:ExternalResource 
+                UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5199" 
+                TimeStampedName="scraps_bam_150304_231155" 
+                MetaType="PacBio.SubreadFile.HqScrapsBamFile" 
+                ResourceId="./production_hq.scraps.bam">
+            </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+    </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+<pbds:Filters>
+    <pbds:Filter>
+        <pbbase:Properties>
+            <pbbase:Property Name="zm" Value="100000" Operator="=="/>
+        </pbbase:Properties>
+    </pbds:Filter>
+</pbds:Filters>
+</pbds:SubreadSet>
+\ No newline at end of file
diff --git a/tests/data/polymerase/internal.hqregions.bam b/tests/data/polymerase/internal.hqregions.bam

new file mode 100644 (file)

index 0000000..8e31e6b

Binary files /dev/null and b/tests/data/polymerase/internal.hqregions.bam differ
diff --git a/tests/data/polymerase/internal.hqregions.bam.pbi b/tests/data/polymerase/internal.hqregions.bam.pbi

new file mode 100644 (file)

index 0000000..b79e661

Binary files /dev/null and b/tests/data/polymerase/internal.hqregions.bam.pbi differ
diff --git a/tests/data/polymerase/internal.lqregions.bam b/tests/data/polymerase/internal.lqregions.bam

new file mode 100644 (file)

index 0000000..96878a3

Binary files /dev/null and b/tests/data/polymerase/internal.lqregions.bam differ
diff --git a/tests/data/polymerase/internal.lqregions.bam.pbi b/tests/data/polymerase/internal.lqregions.bam.pbi

new file mode 100644 (file)

index 0000000..a4b7237

Binary files /dev/null and b/tests/data/polymerase/internal.lqregions.bam.pbi differ
diff --git a/tests/data/polymerase/internal.polymerase.bam b/tests/data/polymerase/internal.polymerase.bam

new file mode 100644 (file)

index 0000000..8f293c1

Binary files /dev/null and b/tests/data/polymerase/internal.polymerase.bam differ
diff --git a/tests/data/polymerase/internal.polymerase.bam.pbi b/tests/data/polymerase/internal.polymerase.bam.pbi

new file mode 100644 (file)

index 0000000..c423905

Binary files /dev/null and b/tests/data/polymerase/internal.polymerase.bam.pbi differ
diff --git a/tests/data/polymerase/internal.scraps.bam b/tests/data/polymerase/internal.scraps.bam

new file mode 100644 (file)

index 0000000..47c1689

Binary files /dev/null and b/tests/data/polymerase/internal.scraps.bam differ
diff --git a/tests/data/polymerase/internal.scraps.bam.pbi b/tests/data/polymerase/internal.scraps.bam.pbi

new file mode 100644 (file)

index 0000000..9db21f2

Binary files /dev/null and b/tests/data/polymerase/internal.scraps.bam.pbi differ
diff --git a/tests/data/polymerase/internal.subreads.bam b/tests/data/polymerase/internal.subreads.bam

new file mode 100644 (file)

index 0000000..00ad171

Binary files /dev/null and b/tests/data/polymerase/internal.subreads.bam differ
diff --git a/tests/data/polymerase/internal.subreads.bam.pbi b/tests/data/polymerase/internal.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..b0d7e28

Binary files /dev/null and b/tests/data/polymerase/internal.subreads.bam.pbi differ
diff --git a/tests/data/polymerase/multiple_resources.subread.dataset.xml b/tests/data/polymerase/multiple_resources.subread.dataset.xml

new file mode 100644 (file)

index 0000000..109535d
--- /dev/null
+++ b/tests/data/polymerase/multiple_resources.subread.dataset.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet 
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" 
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd" 
+    UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" 
+    TimeStampedName="subreadset_150304_231155" 
+    MetaType="PacBio.DataSet.SubreadSet" 
+    Name="DataSet_SubreadSet" 
+    Version="3.0.0" 
+    CreatedAt="2015-01-27T09:00:01"> 
+<pbbase:ExternalResources>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.SubreadBamFile" 
+        ResourceId="./production.subreads.bam">
+        <pbbase:ExternalResources>
+            <pbbase:ExternalResource 
+                UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5195" 
+                TimeStampedName="scraps_bam_150304_231155" 
+                MetaType="PacBio.SubreadFile.ScrapsBamFile" 
+                ResourceId="./production.scraps.bam">
+            </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+    </pbbase:ExternalResource>
+    <pbbase:ExternalResource 
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5197" 
+        TimeStampedName="subread_bam_150304_231155" 
+        MetaType="PacBio.SubreadFile.HqRegionBamFile" 
+        ResourceId="./production_hq.hqregion.bam">
+        <pbbase:ExternalResources>
+            <pbbase:ExternalResource 
+                UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5199" 
+                TimeStampedName="scraps_bam_150304_231155" 
+                MetaType="PacBio.SubreadFile.HqScrapsBamFile" 
+                ResourceId="./production_hq.scraps.bam">
+            </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+    </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+</pbds:SubreadSet>
+\ No newline at end of file
diff --git a/tests/data/polymerase/production.polymerase.bam b/tests/data/polymerase/production.polymerase.bam

new file mode 100644 (file)

index 0000000..4c84b23

Binary files /dev/null and b/tests/data/polymerase/production.polymerase.bam differ
diff --git a/tests/data/polymerase/production.scraps.bam b/tests/data/polymerase/production.scraps.bam

new file mode 100644 (file)

index 0000000..a32bdfb

Binary files /dev/null and b/tests/data/polymerase/production.scraps.bam differ
diff --git a/tests/data/polymerase/production.scraps.bam.pbi b/tests/data/polymerase/production.scraps.bam.pbi

new file mode 100644 (file)

index 0000000..5ef119d

Binary files /dev/null and b/tests/data/polymerase/production.scraps.bam.pbi differ
diff --git a/tests/data/polymerase/production.subreads.bam b/tests/data/polymerase/production.subreads.bam

new file mode 100644 (file)

index 0000000..452aad5

Binary files /dev/null and b/tests/data/polymerase/production.subreads.bam differ
diff --git a/tests/data/polymerase/production.subreads.bam.pbi b/tests/data/polymerase/production.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..7ff2fcc

Binary files /dev/null and b/tests/data/polymerase/production.subreads.bam.pbi differ
diff --git a/tests/data/polymerase/production_hq.hqregion.bam b/tests/data/polymerase/production_hq.hqregion.bam

new file mode 100644 (file)

index 0000000..66d436b

Binary files /dev/null and b/tests/data/polymerase/production_hq.hqregion.bam differ
diff --git a/tests/data/polymerase/production_hq.hqregion.bam.pbi b/tests/data/polymerase/production_hq.hqregion.bam.pbi

new file mode 100644 (file)

index 0000000..ec8f166

Binary files /dev/null and b/tests/data/polymerase/production_hq.hqregion.bam.pbi differ
diff --git a/tests/data/polymerase/production_hq.scraps.bam b/tests/data/polymerase/production_hq.scraps.bam

new file mode 100644 (file)

index 0000000..716e098

Binary files /dev/null and b/tests/data/polymerase/production_hq.scraps.bam differ
diff --git a/tests/data/polymerase/production_hq.scraps.bam.pbi b/tests/data/polymerase/production_hq.scraps.bam.pbi

new file mode 100644 (file)

index 0000000..1017562

Binary files /dev/null and b/tests/data/polymerase/production_hq.scraps.bam.pbi differ
diff --git a/tests/data/polymerase/qnameFiltered.subreads.dataset.xml b/tests/data/polymerase/qnameFiltered.subreads.dataset.xml

new file mode 100644 (file)

index 0000000..c200ded
--- /dev/null
+++ b/tests/data/polymerase/qnameFiltered.subreads.dataset.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet
+    xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+    xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+    xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+    xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+    xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd"
+    UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c"
+    TimeStampedName="subreadset_150304_231155"
+    MetaType="PacBio.DataSet.SubreadSet"
+    Name="DataSet_SubreadSet"
+    Version="3.0.0"
+    CreatedAt="2015-01-27T09:00:01">
+<pbbase:ExternalResources>
+    <pbbase:ExternalResource
+        UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193"
+        TimeStampedName="subread_bam_150304_231155"
+        MetaType="PacBio.SubreadFile.SubreadBamFile"
+        ResourceId="production.subreads.bam">
+    </pbbase:ExternalResource>
+    <pbbase:ExternalResource
+        UniqueId="b096d0a3-94b8-4918-b3af-a3f81bbe5195"
+        TimeStampedName="scraps_bam_150304_231155"
+        MetaType="PacBio.SubreadFile.ScrapsBamFile"
+        ResourceId="production.scraps.bam">
+    </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+<pbds:Filters>
+    <pbds:Filter>
+        <pbbase:Properties>
+            <pbbase:Property Name="qname_file" Value="qname_whitelist.txt" Operator="="/>
+        </pbbase:Properties>
+    </pbds:Filter>
+</pbds:Filters>
+</pbds:SubreadSet>
diff --git a/tests/data/polymerase/qname_whitelist.txt b/tests/data/polymerase/qname_whitelist.txt

new file mode 100644 (file)

index 0000000..0004061
--- /dev/null
+++ b/tests/data/polymerase/qname_whitelist.txt
@@ -0,0 +1,3 @@
+ArminsFakeMovie/0/3116_3628
+ArminsFakeMovie/0/3722_4267
+ArminsFakeMovie/0/6812_7034
diff --git a/tests/data/polymerase/scrapless.scraps.bam b/tests/data/polymerase/scrapless.scraps.bam

new file mode 100644 (file)

index 0000000..7b989c4

Binary files /dev/null and b/tests/data/polymerase/scrapless.scraps.bam differ
diff --git a/tests/data/polymerase/scrapless.scraps.bam.pbi b/tests/data/polymerase/scrapless.scraps.bam.pbi

new file mode 100644 (file)

index 0000000..140af8a

Binary files /dev/null and b/tests/data/polymerase/scrapless.scraps.bam.pbi differ
diff --git a/tests/data/polymerase/scrapless.subreads.bam b/tests/data/polymerase/scrapless.subreads.bam

new file mode 100644 (file)

index 0000000..739b3b4

Binary files /dev/null and b/tests/data/polymerase/scrapless.subreads.bam differ
diff --git a/tests/data/polymerase/scrapless.subreads.bam.pbi b/tests/data/polymerase/scrapless.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..a20a00f

Binary files /dev/null and b/tests/data/polymerase/scrapless.subreads.bam.pbi differ
diff --git a/tests/data/referenceset.xml b/tests/data/referenceset.xml

new file mode 100644 (file)

index 0000000..3099906
--- /dev/null
+++ b/tests/data/referenceset.xml
@@ -0,0 +1,21 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<pbds:ReferenceSet TimeStampedName="referenceset_150304_231155" MetaType="PacBio.DataSet.ReferenceSet" Name="lambdaNEB" CreatedAt="2016-01-04T18:02:13.181-08:00" UniqueId="3ad75a14-f43a-48bd-9dc1-8a08af29f587" Version="3.0.1" Author="pbscala 0.50.1-SNAPSHOT reference_info_dataset_0.4.2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd">
+    <pbbase:ExternalResources>
+        <pbbase:ExternalResource Name="First References FASTA" Description="Points to an example references FASTA file." MetaType="PacBio.ReferenceFile.ReferenceFastaFile" ResourceId="./lambdaNEB.fa" UniqueId="fc549593-3c6b-4d21-82a6-5236fbbdf1c8" TimeStampedName="pacbio_dataset_index-fc549593-3c6b-4d21-82a6-5236fbbdf1c8" Tags="converted">
+            <pbbase:FileIndices>
+                <pbbase:FileIndex UniqueId="3a83d609-f6f1-49df-84d6-ace38fd6e9c2" TimeStampedName="pacbio_dataset_index-3a83d609-f6f1-49df-84d6-ace38fd6e9c2" MetaType="PacBio.Index.SamIndex" ResourceId="./lambdaNEB.fa.fai"/>
+            </pbbase:FileIndices>
+        </pbbase:ExternalResource>
+        <pbbase:ExternalResource Name="Second References FASTA" Description="Points to an example references FASTA file." MetaType="PacBio.ReferenceFile.ReferenceFastaFile" ResourceId="./chimera_minimal.fasta" UniqueId="fc549593-3c6b-4d21-82a6-5236fbbdf1c8" TimeStampedName="pacbio_dataset_index-fc549593-3c6b-4d21-82a6-5236fbbdf1c8" Tags="converted">
+        </pbbase:ExternalResource>
+    </pbbase:ExternalResources>
+    <pbds:DataSetMetadata>
+        <pbds:TotalLength>48502</pbds:TotalLength>
+        <pbds:NumRecords>1</pbds:NumRecords>
+        <pbds:Organism>lambdaNEB</pbds:Organism>
+        <pbds:Ploidy>haploid</pbds:Ploidy>
+        <pbds:Contigs>
+            <pbds:Contig Name="ref000001" Description="lambda_NEB3011" Length="48502" Digest="a1319ff90e994c8190a4fe6569d0822a"/>
+        </pbds:Contigs>
+    </pbds:DataSetMetadata>
+</pbds:ReferenceSet>
diff --git a/tests/data/relative/a/test.bam b/tests/data/relative/a/test.bam

new file mode 100644 (file)

index 0000000..bd06b8a

Binary files /dev/null and b/tests/data/relative/a/test.bam differ
diff --git a/tests/data/relative/b/test1.bam b/tests/data/relative/b/test1.bam

new file mode 100644 (file)

index 0000000..bd06b8a

Binary files /dev/null and b/tests/data/relative/b/test1.bam differ
diff --git a/tests/data/relative/b/test2.bam b/tests/data/relative/b/test2.bam

new file mode 100644 (file)

index 0000000..bd06b8a

Binary files /dev/null and b/tests/data/relative/b/test2.bam differ
diff --git a/tests/data/relative/relative.fofn b/tests/data/relative/relative.fofn

new file mode 100644 (file)

index 0000000..755c589
--- /dev/null
+++ b/tests/data/relative/relative.fofn
@@ -0,0 +1,3 @@
+a/test.bam
+b/test1.bam
+b/test2.bam
diff --git a/tests/data/relative/relative.xml b/tests/data/relative/relative.xml

new file mode 100644 (file)

index 0000000..0e78fe4
--- /dev/null
+++ b/tests/data/relative/relative.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet xmlns="http://pacificbiosciences.com/PacBioDataModel.xsd" MetaType="PacBio.DataSet.AlignmentSet" Name="DataSet_AlignmentSet" Tags="barcode moreTags mapping mytags" UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" Version="2.3.0" CreatedAt="2015-01-27T09:00:01" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd">
+        <pbbase:ExternalResources>
+                <pbbase:ExternalResource MetaType="SubreadFile.SubreadBamFile" ResourceId="./a/test.bam" />
+                <pbbase:ExternalResource MetaType="SubreadFile.SubreadBamFile" ResourceId="./b/test1.bam" />
+                <pbbase:ExternalResource MetaType="SubreadFile.SubreadBamFile" ResourceId="./b/test2.bam"/>
+        </pbbase:ExternalResources>
+</pbds:SubreadSet>
diff --git a/tests/data/relative/relative2.fofn b/tests/data/relative/relative2.fofn

new file mode 100644 (file)

index 0000000..f1969ac
--- /dev/null
+++ b/tests/data/relative/relative2.fofn
@@ -0,0 +1,4 @@
+a/test.bam
+b/test1.bam
+b/test2.bam
+relative.xml
diff --git a/tests/data/segfault.bam b/tests/data/segfault.bam

new file mode 100644 (file)

index 0000000..755c7eb

Binary files /dev/null and b/tests/data/segfault.bam differ
diff --git a/tests/data/test_group_query/group.fofn.in b/tests/data/test_group_query/group.fofn.in

new file mode 100644 (file)

index 0000000..4af9e82
--- /dev/null
+++ b/tests/data/test_group_query/group.fofn.in
@@ -0,0 +1,3 @@
+@PacBioBAM_TestsDir@/data/test_group_query/test1.bam
+@PacBioBAM_TestsDir@/data/test_group_query/test2.bam
+@PacBioBAM_TestsDir@/data/test_group_query/test3.bam
diff --git a/tests/data/test_group_query/test1.bam b/tests/data/test_group_query/test1.bam

new file mode 100644 (file)

index 0000000..5673abc

Binary files /dev/null and b/tests/data/test_group_query/test1.bam differ
diff --git a/tests/data/test_group_query/test2.bam b/tests/data/test_group_query/test2.bam

new file mode 100644 (file)

index 0000000..565b224

Binary files /dev/null and b/tests/data/test_group_query/test2.bam differ
diff --git a/tests/data/test_group_query/test2.bam.pbi b/tests/data/test_group_query/test2.bam.pbi

new file mode 100644 (file)

index 0000000..384ad28

Binary files /dev/null and b/tests/data/test_group_query/test2.bam.pbi differ
diff --git a/tests/data/test_group_query/test3.bam b/tests/data/test_group_query/test3.bam

new file mode 100644 (file)

index 0000000..3b1e21b

Binary files /dev/null and b/tests/data/test_group_query/test3.bam differ
diff --git a/tests/data/transcript.subreads.bam b/tests/data/transcript.subreads.bam

new file mode 100644 (file)

index 0000000..13f5efd

Binary files /dev/null and b/tests/data/transcript.subreads.bam differ
diff --git a/tests/data/transcript.subreads.bam.pbi b/tests/data/transcript.subreads.bam.pbi

new file mode 100644 (file)

index 0000000..2729f5d

Binary files /dev/null and b/tests/data/transcript.subreads.bam.pbi differ
diff --git a/tests/data/transcriptset.xml b/tests/data/transcriptset.xml

new file mode 100644 (file)

index 0000000..60e68c7
--- /dev/null
+++ b/tests/data/transcriptset.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<pbds:TranscriptSet CreatedAt="2018-04-20T17:33:59.218Z" MetaType="PacBio.DataSet.TranscriptSet" Name="2016-08-30_AppsInst_SMS_Flea_IsoSeq_RC0_largeSizeTranscripts_2 (P5--P3) (unpolished)" Tags="barcoded,unpolished" TimeStampedName="2016-08-30_AppsInst_SMS_Flea_IsoSeq_RC0_largeSizeTranscripts_2 (P5--P3) (unpolished)-180420_173359218" UniqueId="66b978ae-27bb-a9e9-abde-5a4e00d256e3" Version="3.0.1" xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd" xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pacificbiosciences.com/PacBioDatasets.xsd">
+       <pbbase:ExternalResources>
+               <pbbase:ExternalResource MetaType="PacBio.TranscriptReadFile.TranscriptReadBamFile" ResourceId="./transcript.subreads.bam" TimeStampedName="pacbio_transcriptreadfile_transcriptreadbamfile-180420_173359218" UniqueId="47874301-b8c8-415e-bcb9-d547a8527a78" Version="3.0.1">
+                       <pbbase:FileIndices>
+                               <pbbase:FileIndex MetaType="PacBio.Index.PacBioIndex" ResourceId="./transcript.subreads.bam.pbi" TimeStampedName="pacbio_index_pacbioindex-180420_173359218" UniqueId="097702db-54b9-4b7b-aa87-f21bfb820097" Version="3.0.1"/>
+                       </pbbase:FileIndices>
+               </pbbase:ExternalResource>
+       </pbbase:ExternalResources>
+       <pbds:Filters>
+               <pbds:Filter>
+                       <pbbase:Properties>
+                               <pbbase:Property Name="movie" Operator="=" Value="transcript"/>
+                       </pbbase:Properties>
+               </pbds:Filter>
+       </pbds:Filters>
+       <pbds:DataSetMetadata>
+               <pbds:TotalLength>0</pbds:TotalLength>
+               <pbds:NumRecords>4</pbds:NumRecords>
+       </pbds:DataSetMetadata>
+</pbds:TranscriptSet>
diff --git a/tests/data/unmap1.bam b/tests/data/unmap1.bam

new file mode 100644 (file)

index 0000000..3fe2af5

Binary files /dev/null and b/tests/data/unmap1.bam differ
diff --git a/tests/data/unmap1.bam.bai b/tests/data/unmap1.bam.bai

new file mode 100644 (file)

index 0000000..dd19971

Binary files /dev/null and b/tests/data/unmap1.bam.bai differ
diff --git a/tests/data/unmap2.bam b/tests/data/unmap2.bam

new file mode 100644 (file)

index 0000000..8feed79

Binary files /dev/null and b/tests/data/unmap2.bam differ
diff --git a/tests/data/unmap2.bam.bai b/tests/data/unmap2.bam.bai

new file mode 100644 (file)

index 0000000..f495714

Binary files /dev/null and b/tests/data/unmap2.bam.bai differ
diff --git a/tests/data/vcf/structural_variants.vcf b/tests/data/vcf/structural_variants.vcf

new file mode 100644 (file)

index 0000000..8a05b60
--- /dev/null
+++ b/tests/data/vcf/structural_variants.vcf
@@ -0,0 +1,32 @@
+##fileformat=VCFv4.2
+##fileDate=20180509
+##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variant">
+##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the structural variant described in this record">
+##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
+##INFO=<ID=SVANN,Number=.,Type=String,Description="Repeat annotation of structural variant">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Per-sample read depth of this structural variant">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth at this position for this sample">
+#CHROM POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  UnnamedSample
+chrI   12573   pbsv.INS.1      A       ATGGGTAACAGGTGGTAATGAAGACGTAATTTCTGACTTGTTGATTGTATGGAAAGGTGGTTAATGATGAAGTAATTTCTGATTGGTGTTGTATGGTAAACAGGTGGGTAATGAAGAAGTATTTCGGACTTTGTTGCCACGGTAACGGTGGAAGATGAAGTAAGTAATTTCATGACTTGTTGTTGTCTTGTACTGGGTAACAAGGTAGGTAATGATGAAGTAATTTCTGACTTGTTGTTTACTGGTAACAGGTGGTACTGAAGAAAGTAATGCCTGACTGTTGTTGCACGGGTAACCGGTGGTAATGATGGAAGTTGAAGTAATTCCTGATTGTTGTTGTACTGGAACAGGGTGTAAGAAGAAAGAAGTAATTTCCTGACTTGTTGTTGTA .       PASS    IMPRECISE;SVTYPE=INS;END=12573;SVLEN=390;SVANN=TANDEM   GT:AD:DP        1/1:13:13
+chrIII 91471   pbsv.DEL.2      GCTATCGATGCTACAGGTGTTCCACTTCCAGATGAGGCGCTGGAAGCCTCCAAGAAGGCTGATGCCGTTTTGTTAGGTGCTGTGGGTGGTCCTAAATGGGGTACCGGTAGTGTTAGACCTGAACAAGGTTTACTAAAAATCCGTAAAGAACTTCAATTGTACGCCAACTTAAGACCATGTAACTTTGCATCCGACTCTCTTTTAGACTTATCTCCAATCAAGCCACAATTTGCTAAAGGTACTGACTTCGTTGTTGTCAGAGAATTAGTGGGAGGTATTTACTTTGGTAAGAGAAAGGAAGACGATGGTGATGGTGTCGCTTGGGATAGTGAACAATACACCGTTCCAGAAGTGCAAAGAATCACAAGAATGGCCGCTTTCATGGCCCTACAACATGAGCCACCATTGCCTATTTGGTCCTTGGATAAAGCTAATGTTTTGGCCTCTTCAAGATTATGGAGAAAAACTGTGGAGGAAACCATC     G       .       PASS    IMPRECISE;SVTYPE=DEL;END=91953;SVLEN=-482       GT:AD:DP        0/1:2:5
+chrIII 169209  pbsv.INS.3      T       TTGCCACACGGTGACTGGTTATTCAATAGCGGTATAGCTTCACTGACTGCGTGTTTTCTGCCTTCTGTAGTTTGAAGTGCCTGTTAACGAGCCCTTTGTCGACTTTCCTCCAATCACTTTTCCGTTGAGTAGGAAAATGTTCCCAATTTGTGACTTGTAATATGGTTTTTACCATATGACAGCATCGCTTTGCGACTAGTTTATTATCTGTTGGTAGGTTTGTTTTGTGCCATATACTTGTTTATCTCTAGTGTCCCACTGAATTGTATTACACTCATATGTCATGTCTAAAACTTGCCTAAGGGGAAAGTATTGTTGAGCAAGTGTGTTTGATGTAGTATATAAGTCAATCTACATGTTTATATCCAACATATGAATCTAGACCCAATCAACTTTTGCATTTTCATGTACCTTCTCTTTGTATTCATCTCATCTCTTTCAGTTCATCCTGTCATATAAAACCTGTTGCCCTGAGCGCTAATTTTTCTTCCTTTTGGAGTCAAAGTACGTTTAATTTGGGGTAATTTTCTCAGTTAATGAGTTTTCCATACCTAAATTTCATGTATTTACCTCTTTGATATTCGTTTCTAAGGCCAAGTATGTATATTGCATTTCCTCATCACTTCGCCCTAGATTTATAATCTTGGTGTCGTATTGCCTCTTAAGCTTTCTTCTCTAAATTCTTTGTTTGAATTTATATTTTATGCTAAACAATACCATACATATCATCCTACCGACTAAACAAATTGTCACTTGACTGTTTTTTAAATACGCAATGACCATCCACGAACTTCTTCCCTACCAATTGTTGTTCAGGTAATGATTTGATAGTTTCGTACCGTTCGCTCCACTTTGTTTTCAATCCATAAAGTTGATTTCTTTCTTCAAACGTATAACTTATCATTCATCCTAATGTGGTGGAGGTCTTATGTATATTCTTCTTTATTCCTGCATACAAATATGCCGAAGATATGTCTAATTGGTACTATAGTAGTTATTGTCTAAGCAAGTGACAGGGTGTCATTAATCTAGTGAGTACGGGTATTGGATTGCATGCCTGAGTCGTAAGTGTCGGATGCTGAATATCACCTCTTGCAACAAATCTAGCTTTATGAGGTACCAGTCACGTTTCTTGTTAAGATAAAACTGAGTTTATTACTCTTTTAGGGTCTATTTCGTTCTGTCCATAATATTCGTCAGTGTCCCCAAGTTTTCACTTCAACAGTTTGATTGACGTCTTCTTTGTGGTTATGCCTCGATCTATTTTTCTTTTTCTTTAATATCTTTATTATAGGTGATTGCCCTCATCGTATCGTAAGGTTGTCCGTATTGGTTTGATTGATTTTACTGCTTTTAACAGCTGGCAATCGGGAATTCGTTTCTCTCTCGATCTCGGAGGTTCTAACTACGGCATTTCTAGTATTCCCATGTGTCTCGTGATACCTTACTTTCAGTTTCTTATTTCTAATGATCTTTTCTTCTCTTTTCTTACTGTTGATAGTAGTATAGGCCATTAGCGTCACCATACCACCCACACTGGAATTAGTTTGACGAGAATTTATCGGTGGGAGTTTCTTTACATGGGTCAGGGACTTCGGTAAGGAGATTCTGGAGGTAGATCAGGAGTGGGAGATCAGCGATGATAGATTCTCGGTATTCTGTTTAGAAACATGTTGGCGTTTTGATAGGACAATATTGGTGCGATGAATTTTTTCCCGGTGGAGAAGCATCGATTGAAGGTGAAACGGTGGGGTATAATCCCTTTTTCTCAGTCTCTTCTTGGTCACTTATCTTGCGGACATTTTGTTGTTGGTAACCCCGTACTGGATATTGGTACGTTTGTATGATTAGTCCCATTTTCACTGTACGAGTTCTGAATGTCGAAATCTTAGATTACTGGCGTGCGACGACCATGTGTGTTAGATTGGACATGGGAGCAAGTAAAGGAACATTTAATTTA
TGCCATACCACCCAACCGGTACTTACTCTCGATATTGGAAATTTGGGGGGTGCTAGATCTCTTCTTTGAATGGAAGCAATATTAAGATTCAATAGTGGGGGTCCAACTCTCTGGTGCAACAAGATTGATTTCAGATATAGGTTGGAATCTTCAGTATGAGTTGACGGAGGTGTGGAATCGGTTGGACTCACAGCTTTTGACAGACATTTCTCGGTTGCCTCAGGATGTAGTTCAATGTCAGATTGGAAGTCAGGTCAGATTCTAATGTAAAAAACATTGGATCTTGATCTCCTTTGACGCAATACGAATGATATGAAAGTCATTTCAACGGTTTAAGTCTTCATCGAAAGTGAGTGCTCGTAAGTGAATGATCTTCTGGATTCCTTGCCCTGAAGAATAACATGTAGTTGTCTCTACTGTCTTCTTTAAGGAGGAAGATAGATGAATATATCCATAAGATTTCAGACGGATGTAGAGCTTAACCTGGATGCCACGAGGGATGTATTTGGAGTTAGGGTTGTATCATTGACTATAACAGGTTGACCGAAGTAACACTACTGATATCAAGTCCTCCAAGCCAGCATGTGGCTTGGCAGATTTTTTGCTTTAGGTGAAGCTAAGTGACTTCCTCACAATAGTAGAAAATCGATTGGCAGAAACCATAAATGGTTCGGTAAACCACTACCATGCAGTTGAGTAACGGCAGTCATCTAATAAGGTACTGTTGAGGCCGTTCAGCGACTCCAGTCTCGGGATCCGCTGTGGGTTGTATAGCAATGGATTTACCATTTTTTTCAAGGAATTTATGGAGAGTTCTGATAGTATACTCAGAACCACGGTCCTTTGTATAACCAAGACACTGCCTGAATTGGTTTTAATGAAAGCTAGTAATCGTGGTAAATAAAATCGAGGTATAGTCCTCCGCGACGGTCGTGCAATGGATCAACCCAACGGAATTTTGTTGTCTCATCAGACTGAGATGAAATAGGATGGTGCACTTTTTTGGTAGGTTGTGACTGGACCAAATATGTCATAGTAGGTATTGAAGGGTTCGTAAAATGACATTTTTGGATTTTATCGTGAACCTTGATATGTCTGTGGTTTGGTGCTTTTGCCGATTTAAACAATCAGGACATTGATAGTCAATAGCACTAACCAGTTCGACATCTGATTCGTTAACAATACGTGATTGGTGTTATTTTTTAAGTGATATCGAATTGTCTTGCTTGGTCATATGCGCAAGCATTCGATGAATGAAAGATAAGGATATTTGCGTGTACTTTCACTTGTATGACAGTATTGAGGTGGGTAACGGAGATATTTGGACTGAAGCAAGTACTTTTTAGATACCCGTAAAAGTCTCCAATTGTACATAGTGCAAGTCCATGCCGTTAGATCGTTCTAAGACGTTTTTGGTAAAGCTGCTTGCTATCTACTGCAGCAATTCATCAAACTGAGTAAGTCATAGGCTAGGTTAGGAGTTTGCAATACCTTTATTGATGTTTTTGGGTTGTTGTCCTGGAAAGTGAAATTGTAGGTCACCAATAGCGTTAATTGGTAATTTCGTTTTTTAGATCACTACGTTTATGCAGGGATTAGATGATGCTGATGTATGTGATGAGCAGATCTTTAAGGGTTCGTGATGCTCCTGAATCGAGAAAGAGGTGTCATGGATTTCATCTCAGAATGAATTAGTGTGATTTACCGTATATTCAGTAAAGTTCCTTGCCTAAGGTGAAGTCGGCTTATTGTTCAAGGAATCGTTCAGTAGTTTATTTACTGATGGAATCGTTGTCCGTGCTGGGAGAGTTATTAGATGTGGATACTTGTGGAGCCTGGCTGTTTCGATTTCGAATTATTTGTTTTTTGAGGATTCCGAGCTATAACTTTTTTGGGGTTGGTTGTATTCGTATAGCTGCAGAGAATCCTTTCTTCTCATCCTCAGATTTCTCCTGTATTAGGTTTGCTGTTTCTCGATCCCTTTTGTTCTTCATAAATAG
CATGGATTCTAGAACAGTTCAGCGACTGTCATATTTAAGATGTCGAGAGTGTTAGCGTAAAAAATTTATATTCGCCAGATAGACCTCTTCCATAATTATTCATGCGACCTTGTTATTGATATGAATCCTTATTGTTTCAGTTCTGTCGATAATGTTTGTGACTTTTGTTTCAAATGCCTCTGCAGGTGTACCTGCCATTATATTGCAAATTTGCCAGGGTCACAATGTCGTTTTGCCTCTTGGTCATCAGATTGCTTTTTTCAATGACTTTTTGAAAGAATTTTCATGAGTATCCGTATAATCAACGGATAGGATGTCTTTGAACCCGGTAGGTTATGAATGAGAGGAGCAAATATTTGAAAAAGTGTTATACAAGAAGGGTATTCAATCCTCAGTGATCTTGACTACGGTTTTCGTTTACTGTCGGAATAAACCACCGAGATTCGAGTTTTTGTAAAATTTGATGTATGTTTTAACCCAATTTGGGAAAGTCCTTAGGTGAGGTTACCATTGGGGTGGTCTGACTATTTTTTAGGTGGATGCCATATCAGAGTCCGCGTGGGATGAATCAGTAAATGTAGTACCTGCTCAGTTGATGGAGTGCTCAGAGTCGTTCCACCAACTGATGATGGATACTGCGGAAACGTGATTGTGGCCAGGTGGGAAAGTAACCATAGGCGACATTTTGATAAGTGTATACGGAATCCTAGATGGGTGTCCCCTAAAATGACCAACCAGATGGATTTGCTTGGTTTTTGGGTCATCAAGCACTGCTGTGGGTACGGCCCATTCTGTGGAGTGGTACTGAAGCAGTGAGGAGAGGCATGATGGGGGTTCTCTCTGGAACAGCTGATGACGCAGTGTTGTTGTCTGTTTGAGAGTTTAGCCTTATGGAAGCCCTTATCATATTCTGGAATTTTGAAGCTGAAAACGTCTAATCGGATCTTGATTTGTGTTGGACTTCCTTATAATCACCGAAGCACAGGCGCTACATGAGAATTTTTGTGGGTAATTAGTAATTAGTTGGGATTCTTGTTGATAAAGCTATAATAATTATGTATCGAATATACTAGAAGTTCTCCTCGATGATTTAGGATCCAGAAAAGGGAATCTGCAATTCTACACAATTCTATAAATATTATTATCACGTTTTATATGTTAATATTCATTGATCTATGACACTTATCATCCTTGCGTTTTCAGCTTCCACTAATTTAGATACTATTTCTCAATCATTTGCGTCATCTTCTAACACCGTATATGATCATATACTCGTAACGTAAATACTAGTTAGTAATGATGTATTTTTTATCCACATTATATCCATATATAAGAAATAACATTCCGTGAATAATCTGATAACTGTTTTGAAAACTGGTTACTTCCCTAAAGACTGTTTATAATTAGGATTTCAAGACACTCCGGTATTACTCGAGCCTCGTAATACAACATAATATCCTACATTCTAGATCCCGACATATCAATCATAAATCTGAATATTGAGACATCTTTTAATTACGTTTGGTTGCCGCAAACCAAAAATGCACTTTACATGGGTGAATAATTGAATGTTAGAAATCTTTTGTTTTTGAATAAAAATCCACTTCGCTATCAACTAATAGTTATATTCTCATATATTAATCATATATACGGTGTTAAGATGATGACATAGTTAATAGAAAGCTGTCATCGTGTTAGAGGAAGCTGAAACGCAAGGATTGATACTGTAATAGGATCAATGAATATAAACATATAAACGGAATGAGGAATAATCGTAATATTAGTATGTAAAAATATAGATTCCATTTTGAGGATTCCTATATCCTCGCCGGAGAACTTCTAGTATATTCGGTTACCTAATATTATAGCCTTTATTCACACTGGAATCCCAACAATTATCTAATTACCCACACACACATTTCCTTTCTCATGGTAGCGCTGTGCTTCGGTTATTCTAAGGAAGGCCACACAATCAAGATCCGTTAGAACGTTTCGCTTCC
AAAACCAAGAATGTGAGAAGGCCTCCACTAAGGCTACCTCTCAACAACAACAACACCTGCTTCATCGCTGTTCCATAGAACCCCCATCATGCCTCTCCTCAAACTGCTCAGTCACATTCACCACAGAATGGCCGTACCCACACAGTGCATGATGACCAAAACCAAGCCAATCCATCTGGTTGGTCATTTTACGGACACCCACTATGATTCCGTATACAACCTGATCAAATGTCGCCTATGTACTTCCACCTGGGGCCACAATCACAGTTTCCGCCGTCCCTCATCATTTGGAACGCCTCTGAGCACTCCATCACCTGAGTCAGGTAATACATTTACTGATTCATCCTCAGCAGACTCTGATATGACATCCCTAACAAATATGTCAGACCACCACCAATGTTAACCTCACCTAATGACTTTCCGATTGGTTAAAACATACCCAATTTTTACAAAACGTCGAACTCGTGGTATTAATTTCCCCCCCGAACAGTAACGAAAACCCGTACGTCGATCACTGCTGATGAAACTACTTTCTGTATAACACTTTTCAATATTGCTCCCTCTCACTTCCTACCTACCTGTCAAAGACATCCTATCCGTTGATTATACGGATCTCATGAAAATTCTTTCCAAGTATTGAAAAATGCAATCTGATACCCAAAGCAAACGACTTTGACCCTGGCACATTTTGCCATATAATGGCAAATACACCGCAGATGCCATTTGAACAAAGTCCCAAACATTTACTCGACAGACTACAATAATGGCATTCATATCAATAACACGGTCCGCATGCCAATTAATTATTAAGGTCTATCGGGCGAATATAAATTTTTACGCTAACACCGTCTCGACATCTAATATGACAGTCGCTGAACTGTCTTAATACCATGCTATTTATGAAGACAACAGGGATCGAAAACAGAAACTAATTCAGGAGAAATCGAGTGATGAGAAGAATGATTCTCGCAGCTCTACGAATTACCACCAAACCCAAATTTTAGCTCGGATCCTCAAAAAACACATAATTCGAAAATCGAAAACAGCAGGGCCCACAATGTATCCACATCTATCACTCTCCCAGCACGGACAACGATTCCATCATAAATCAACTAACGGAACCTTCAAGTTGAACAATTAAGCCCCGACCTTCATCTTATGCCAGAAACTTACTGAATCTACCGTAAATCATACTCATCTCTGATAGAACTCCCTGGACACCTCCTTCTCTGATTCAGAGCATCACGAACCCTTATAAGATCTGCTCATCACATACACTCCAGCATCATCAATCCTTACCTAAACGTAGTTGATGCTCAAAAAAAAATATACCAATTACGCTAATTGGTGACCCACAATTCACTTCAGGACAACACCAAAACATCAATAAAGGTATTGACACCTCCTAACATAGCCTATGACTTACTCAGTTTGAATTATTGGCTCAGTAGATATCACAGCATGGCTTACCAAAAACGTCTTAGACGGTCTGAGGCACTTACTTGCACCTATCGTAAAATAGGAGACGTTTACTGGGTACTAAAAAGTAACTTTGCTTCTCAATATCTCCTACCCCACCATCATATGTCCATCCCAGTGAAAGTTACACGCAAATATCCTTATCCTTTCATTCATCGAATGCTTGCGCATGCCCATGCATCAGACAATTCGATCTCACTTAAAATAACACCCCATCACGTAATTTAACGAATCAGATGTACGACTGGCTATGCTATTGACTATCAATGTCCTGATTGTTTTAATCGGCAAAAGCACCAAACACAGACATATCAAAGTTCACGACCTAAAATACAAAATTCATACGAACCCGTTCAATACCTACATACTGACCATTTGGTCAGTTTCACACCCTACCAAAAGTGCACCATCCTATTTCATCTCATTACTTTAACAACAAATTCCGTTGGTTTATCCTTGCACCCGTCCGAGGACTCATCCTCGATGTTTTACCCACGATACTAGCTTTTCATTAAAAACTC
ACTTTTCAGCCATGTCTGGTTATACAAATGGACCGTGGTTGCCTGAGTAACTACAGAACCTCCATACCTTCCTTGAAAAAAATGGGTATAACTCCATGCTATAACAACCACAGCGGATCCCGAGCACATGGAGTCGCTGACGGCTCAACCGTACCTTATTATATGACTGCCGTACTCACTGCACTGTAGTGGTTTACCGAACCATTTAGGGTTCTCTGCAATCGAATTTTCTACTCTTGTGAGAAATCACTAGCTTCACCAAAAGCAAAAATCTGCAAGCCAACATGCTTGGCTGGCAGACTTGATATCCAGTACTTTGTACCTTTCGGTCAACACTGTTATCGTCAATTATCACAACCCTAACTCCAAAATACATCCTCGTGGCTCCCAGGTTACGCTCTACATCCGTCTCGAAACTCTTATTGATATATATCTATCTTCCTCCTAAAGAAGACAGTAGATCAACTAACTATTTTATTCTTCCGGGGCAAGGATCCAATTAGATCAATTCAATCTCGACGCACTCACTTTCGATATGAAAGGACTTCACCGTTTACAATGCTTCCTATCATTCGTTTTCATTTGCGTCAAAATGAGATCCAAGCTGCCAATGATCTAACATAGAATCTGACATGACTTCCCATCTGACTTGAACACAATCCTGACAAAAAACCGAGAAATGTCCTTTCAAAAGCTGTAGTCCCAACCGATTCCACACCTCCGTCAACCATATGAAGATTTCAAACCTATTCTGAATCATCTTCGTGCACCCAGAGAAGTTGACCCCAACATATTGAATCTAATATTCTCCATCAAAAGCGATCTAAGCACCCCCCAAATTTCCAATATCGAGAGTACCCGGTTCGGGTGGTATGGCCATCAATTAAATGTTCCTTTACTGCTCCCAATGGTCCCAATCTAACACACATGAGTCTTTCGCAACGCCATAAATCTAAAGATTTCAGACACTCAAACTCGTAACATTGAAAATGAACTATCATACAAACGTCCAATTCCAGTACGGTGGTACCCAACACAAACTTTCCGCCGATAGTGGACCAAGAGACTGAGAAACGGATATAACACCGTTCAACCTTCAATCGATGCTTCTCTCCAACCGGACCAATAATTCACCAATCGCCCAATATTGTTCCTAGCAAAACGCCAACTCTTGTTTCTGACAGAAACCGAGGGAACTATCACATCGTGATCTCCACCCCTATCTACCTCCAGAATCTCCTACCGCAATTCCCTGACCCATTTAAAGAACCCCCCCCCCGATAAATTCCTCGTCAAACTATGCCAGTTTTTTTTTGGGTGGTATTGTTGACTCTAATGCTATACTAACTATCAACCGTAAGAAAAGATCAATTAGAAGATAATGAAACTGAAAATTACGGTATCACGAACACTGGAATACTAATAATATGCGTCGTTTAGAACCTCCGAGATCGAAGAAACGAATCCCCTGTTGCAGCTATAAAAGCAGTACACTCAATCAACCAATACGGACCCCCTTACGATACGATGATGCACACCTATAATAAAGATATTAAAAGAAAAAGAAAAATATATCAGGCTACCAACAAAGAAGTCAATCAACTGTTGAAGATGAAAACTGGGACACTGACGAATATTATGACAGAAAAGAAATAGACCTAAAAGAGTAATAAACTCAATGTTTTATCTTCAACACAAACGTGACGGGTACCATAAAGCTAGATTTGTTGCAAGCAGGGTATATTCAGCATCCTGAACACTACGACTCAGGCATGCAATCCACTACCGTACATCACTATCCTTAATGACATCCCTGTCACTTGCATCAGACCATAACTACTATATTACACACTTAGACCTAGTCTTCGGCAGATTTGTATGCAGACCTCAAAGAAGAAATTCTAATACATAAGACCTCCACCACATTTAGGAATGAATGATAAAAGTTGATACGTTTGACGAAATACTTATGGATTGAACAAAGTGAGCGAACCTGGTAC
GAAACTATCAAATCCATACTGATAACAAGTGTATGGAAGCGCAAGTTCGTGGATGTATGCTAAAATTAAACAGTCAGTGACCATTTGTTTTATTCGTCGATGATATGGTATTGTTTCGCAAAAATCTAATTCAAACAACAGAATTATAGAGGCTTAAGATGCAATACGACACCAAGATATCAAATCTAGGCGAAAGTGATGAGGAATCATCTGACATACTTGGCTTAGAATCAAATATCAAAGAGGTACATACATGAAATTAGGTAGGGCACATCTTTGACCGCCAAAAATTTAACCAAATTAAACGTAACCTTTGACTCCAAAAGAAAAACTTACGTCCAGGTCAACCAGGTCTTAATAACCAGATGAAACTAGAAATAGATGAAGAGAATACAAAGAGAAGTACATGAAATGCAAAAGTTTGATTGGTCGAGCTTCATATTTGGATATAAATTTAGATTTGACTTACTATACTACATCAACCCACTGCTCACATATACTATTCCCCTTCCCCTCTAGGCAAGTTTTAGACATGACATATGAGTTAATACAATTCATGTGGACACTAAGAAATACAACACTGATATTTCAACCAAAAAACAAACCGACGAGCCAGATAATAAACTTCCGCAATAAGTGGAGGCTTCCTATGGTAAACCAACCATATACAAGTCCAACATTGTCAACCATATATTTACTTATGGAAAGGTAATTGGAGAAGTCCCCCAGAGGCTTCATTCACATGTACTACTTCACAACTAGGACGCAGAAATACACCGCGATAAGTGATCTGCCCCATTATTAAAATAACCTCATCCACCTTGTGCAAGAACTTAACAAAAACCAATTACTAAAGGATTACTAACCGACAGTAAATCCCCCCCCCCTCCGTCTCCTTCTCCTCCCTCCCTGCCGCGCCCCCTTTCGCACCCCTTTTTTGGTACCTACGCAATGAGACTCGCCAGTGATATCAGGAACCTCCGCTGCACGTATGCTCCTATCGCCCCCCAAAACAGAATACTGCAGACGTATGACCAAACCTCTTCCGATAACAAATTCAAACTATTAACAAACAAATGGATCATTAGATTATTACATTATGGGTGGTAATG        .       PASS    IMPRECISE;SVTYPE=INS;END=169209;SVLEN=11071     GT:AD:DP        0/1:2:6
+chrIII 169539  pbsv.INS.4      T       TCTATGGTAGCGATGTGCTTCGGGACTCTAAGGAAGTCCACACAAATTCAAGCATCCGTTTTAGAACGTTTCAGCTTCAAAACAGAAGAATGTGAGAGGCGTCCCATAGGCCTTAACTCTCAACAGACAACAAACATGCTCATCAGCTTGTTCCAGAGGAACCCCCCATCATGCCTTCCTAAAACTGCGCAGTCACATTCACACAGAATGCGGACGTACCCAAGAGGCATATGACCAAAACAAGGGCAAATCCATCTGGTTGGTCATTTACGGACACCCATCTATGATTCGTATACACCTATCAAAATGTTCGCGATGTACTTTCCACATGGGCCACCCAATCACAGTTTCCGCAGTATCAATATCAGGTTGGAACGCTCGAGCACTCAATCACCGAGTCAGGTACTACATGTACTGATTCATCCTCAGCAGACTCTGATATGACATCACTAAAAAATATGTCAGACCCACCCACCAATTAAACCCACTATGACTTTCCAAATTGGGTTAAAAACATACATCAAATTTTACAAAACTCGAATCTCGGTGGTATTATTCCGACAGTAACGGAAAACCCGTACGTAGATCACTGGATGATGAACTCACCTTCTTGTATAAACACTTTTCACAATATTTGCTCCCTCTCAATTCCTACTACCGGGGTCAAAGACATCTAGCCGTTTGATTATACGGATATTCATGAAAATTCTTTCCAAAGTATTGAAAAAAATGCAATCGATACCCAAAGAGCAAACGACATGTGACCTGGCAAATTTGCCAATATAATGGCAGTCACCTGCAGAGGCTTTGAAACAAAAGTCAAAAACATTTCGACCAGGGCAGACTGAACAATAATGGCATTCATATCAATAACAGGGTCGCATGCCAATTAATTATGAGAGGTCTATCTGGCGAATATAAATTTTACGCTACACACGTCCTCGACATCTAAAAGACAGTCGCTGAACTGGTCTTAGATATCCATGCTATTTATGAAAGAAAACAGGGATCGGAAACAGGCAACCTAATTACAGGAGAAATCGAGTGGAGAAAGTGATGAGAAGAATGATTCCGCAGTATACATACAACCAACCCAAAGGTATAGCTCGGAATCCTCAAAAAAACAATAATTCGAAATCGAGACCACAGCCAGGGCTCACAATGTATCCACATGCTAATAACTCTCCCAGCCGGACAACGATTCTATCAGTAAATCAACTACTGAACCGATCAATTGGAACAATAAGCAACACCTTCCCATCTTCGGCCAGAAACTTACTGCATCTACAGTAAATCATATAATCATTCTGATGATGAACTCCCTGCGACACCTCTTCTCGATTCAGGGCATTCCGAACCCTATAAGATCTGCTCATCACATTACACTCAGCATAATCTATCCGAATGAAACGTAGGTTGATGCTCAAAAAGAAATATACCAATTAAACGCATTGGTGACTACCAATTTAACTTCAGGACAACACCAAACATCAATAAAGGTATTGCCACACTCCTAACATAGCCTATTTGCTTACTCAGTTTGAATGAATTGGCTCAGTTAGATATCACAGCATGCGTTACACCAAAAACGTATTAGAACGGTCGACGGCACGTGTACTTGCACCTATAGTAAATTGGAGACTTTTATGGGTATCTAAAAAGTACTTGCTGCCATCAAAGATCTCCGGTACCACCATCAATAATGTCAGTACAAGTGCCAGTACACGCAAAATCCTTATCCTTTCATCATCGAATGCTTGCGCATGCAATGCATAGACCAATGTACGATACTTCACTAAAATAACACCCATCAACGTATTTAACGAATCGAGTCGACTGGTCTATGCTATTGACTATCATGTCTGATTGTTTAACGGCAAAAAGCACCAAACACAGGACATATCAAAGTTCACGACTCTAAAAAATACCAAATTATTACGAACCCTTTCAATACCTACAT
ACTGAATATTTGGTTCAGTCCCAAAACCTACCCAACAAAGTGCACCATCCTATTTCATCTTATTACTGATGAGACAACAAAATTCCGTTGGGTTTATCATGCCACGACCGTCGCGAGGACTCTATCCTCGATGTGTATTTACCACCGATACTAGCTTCATTAAAAACCAATTAGGCCAGGTCTTGGGATACAAATGGACGTGGTCTGGAGTATTACTAACAGAACTCTCCATAAATTCTTGGAAAAAAATGGAAAACTCCATTGCTATACAAACCACAGCGATTCCCGAGCACATGGAGTCGTGAACGGCTCAACCGTACCTTTTAGAGGACTGCCGTACGCAACTGCAAATGTGTGGTTTACCGAACCATTATGGTTCTCTGCATCGAATTTTCTACTATTGTGAGAAATTTCACTAGGCTTCACCTAAAAGCAAAAAATCTGCAAGCACATGCTGGTTGGCAGGACTTGATTATCAGTTTACTTGTTACCTTTCGGTCAACCTGTTATCGGCAATGATCAAAACCCTAACTCCAAAATACATCCTCGGGGGCAATCCCAGGTTACGCTTCTACATCCGTCTCGAAACTCTTATGGATATCTCATCTATCTTACATCTTAAAGAAGACAGTGAGATACAACTCACTATGTATTCTTCAGGGCAAGGGAATCCAGAATAAGAATCAATTCAATTACGACGCACTCATTTCGATGAAGACTTAAAACAAACCGTTTAACTGCTTCATATCATTCGTCAATTGCGTCAAATGAGATCCAAGAATCCAATGATATAACATAGATCTGACCATGACTTCCAATCTTGACATTGAATAATCCTGAGCAACCGAGGAAATGTCCTTCAAAAGCTGTGAGTCCACCCGATTCAACTACCGGCGTCAAACTCATACTGAAGATCAAACATAAACCATATCTGAAATCCAATTCGTGCACCCAGGAAGTGACCCCAACAATCTGAATCTAATATTCTTCCATCAAAGAGAGATCTAGCACCCCCCAAAGTTCCAGGATCGAGAGTACAGGTTCGGGTGGAGGCATAAATAAATGTTCCTTTACTTGCTCCATGTCCCAATATACACACATGAGTACCGCACGCCAGTAAATCTAAAGATTTCAGACACGCAGACTGTTACAGTGAAATGAGACTAATCATACAAACGTACCAATATCCAGTACGGGTGGTACCAACAACACAACTGTTCCGCAGATAAGTGACCAAAGAGACTGAGAACAGGATTATACACCGTTTGCACCTTCAATCGATGCTTATCCACCGTGAAAATATTCATCGCACAATATTGTTCTGATCAAAACGCAACTACTGTTTTGGAACAGAAATACGAGGAATCTATAATCGTGATCTCCCTCCCTGATTACCTACGAATCTCCTACCGAAATTCCTGACCCATTTAAAGAACTCCCACCGATAAATTCTCGGTCAAACTAATTCCAGTTTGGGTGGTATTGTGACTCTAATGCCTAGACTACTATCAACAGTAAGAAAAAGATCATTAGAGATAAGTGAAAACTGAAATTAAGGTATACGAGACACATGGGAATCCGAAGAATATGCGTACGTAGTTTAAACTCCGAGATCGAAGAAGAAACGATTCACCTGATTGCAGCTGTAAAGCCGTAAAATTCAATCAAACCAAACGGACAACCTTGACGATACGATGAGGCACTCACCTAGAATAAGGATATTAAAGAAAAAGAAATATATCGAGGCATACACAAAGAAGTCAATCAAACTGTGGAAGATGAAAACTTGGGACGACACGGACGAATATTATGACAGAAAAGAAATAGACCCAAAGAGTAATAACTCAAGGTTATCTCAACAAGGAACGTGACGGTACTCATAAAGTAGATGTGTTGCAAGAGGTGATATTCAGCATCCTGACCCTGACGACTCAGGCATTGCAATCCAATACCGACATCACTATGCATAATGACATCCCTGTCATGATTAGCAATTAAAC
TCAATATATTACACAATTAGCCCTATCTTCGGCATATTGTATGCGACATCAAAGAAGAAATTATACATAAGACCTCACCACATTTAGGAATGATGATAAGTTGATACGTTTGAAGAATCACTTGATGGATTGAACAACGTGGAGCGAGATGGTACGAACTATCCAAATCATACCTGATAAAACAATGTGGTGATGGAAGAGTTCGTGGATGGTCATGCGTATTTAAAAACAGTCAGTGGACAATTGTTTATTCGTAGATGATATGGTATTGTTTAGCAAAAATCTAAATTCAAACAAAAAGAATTATAGAGAAGCTTAAGATGCAATACGACACGAAGATTATAAATCTAGGCGAAAGTGTGAGAAAATTTCAAATGACATACTTGGCTTAGAAATCAACTATCAAAGAGGTAAATACAGGGAAATTAGCGCTATGGAAAAGCTTTGACAGAAAAATTACCCAAATTAAACGTACCTTTTGAATCAAAGGAAGAAAACTTAGCGCGCCAGTCAACAGGTCTTTCTAATAGACCAGATGTGAAACTAGAAATAGATGAAGATGAATACAAAGAGGAAGGTACATGAAATGAAAAGTTGATGGTTAGCTTCATATTTGATATATAAATTTAGATTTGACTTACTATACTACATCAAACACACTTGACTCACAATTAATCCCCTCTAGGCAAGTTTTGACTATGACATATGAGTTAATACAATTCATGTGGGACACTAGAGATAAACAACTGATATGGCACAAAAACAAAACCTACCGAGCCAGATAATAACTAGTCGCAATAAGTGAGCTGCATATGGTACCAAAACCATATAAGTCACAAATTGGTAAAAATTGGCAAAAATATATTTACTTAATGGAAAGGTAATGGAGGAAAGTTCCACCAAGGCTTCATTACATGTACTTCAAAATACGGAAGCAGAAATAACACGCGGTAAGTGGAAATCTGTCCCCCATTATTAAATAACCCTCAGTCCACTTGTGCAAGAACTTGAACAAGAAACCAATTACTAAGGATTACACGACAGTAAATCTACAAGTCAGTATAGTATAATTATATCCATAATGAAGAGAAAAATTGAGAAAAGATTTTGTGGTATAAAGCAATGAGACTAAGAGATGAGTATCAGGAAATCATCTGCACGTATGCTCTATCGAAACCCAAAAGAATATGCAGACGTATGACCAAACCTCTTCCGATAAACACATTCAAACTATTAACAAACAAATGGATCTAGATCTATTACATTATGGGTGGTATGTGGAATAAAAATCCACTATCGTCTATACTACAGGAGTTATATTATCAATATCTTATCATATCGTGTTAAGATGATGACATAAGTTATGAGACGCTGTCATCGAGTTTGTTAGAGGAAAGCTGAAACGCTAAGGATGATAATGTAATAGGATCAATGAATATAAACATTATAAAAAACGGAATGAGAATAATCGTAATATTAGTATGAGGAAATATAGATTCCATTTGGAGGATCCTATATCCTCGGAGGAGAACGTGAGTATATCTGTATACCTAATATTATAGCCTTATCAACAATGGAAATCCCAACACTTATCTAATTACCCACAACATT     .       PASS    IMPRECISE;SVTYPE=INS;END=169539;SVLEN=5570      GT:AD:DP        0/1:2:10
+chrIII 200765  pbsv.DEL.5      CGGTGTAAAACAAAATGTCTTGTCTTCTCTGCTCGCTGAAGAATGGCACGCGGACAAAATGCAGCACGGAATATGGGACTAC      C       .       PASS    IMPRECISE;SVTYPE=DEL;END=200846;SVLEN=-81       GT:AD:DP        0/1:7:13
+chrIV  461745  pbsv.DEL.6      GGTCGAAAAAAGAAAAGGAGAGGGCCAAGAGGGAGGGCATTGGTGACTATTGAGCACGTGAGTATACGTGATTAAGCACACAAAGGCAGCTTGGAGTATGTCTGTTATTAATTTCACAGGTAGTTCTGGTCCATTGGTGAAAGTTTGCGGCTTGCAGAGCACAGAGGCCGCAGAATGTGCTCTAGATTCCGATGCTGACTTGCTGGGTATTATATGTGTGCCCAATAGAAAGAGAACAATTGACCCGGTTATTGCAAGGAAAATTTCAAGTCTTGTAAAAGCATATAAAAATAGTTCAGGCACTCCGAAATACTTGGTTGGCGTGTTTCGTAATCAACCTAAGGAGGATGTTTTGGCTCTGGTCAATGATTACGGCATTGATATCGTCCAACTGCATGGAGATGAGTCGTGGCAAGAATACCAAGAGTTCCTCGGTTTGCCAGTTATTAAAAGACTCGTATTTCCAAAAGACTGCAACATACTACTCAGTGCAGCTTCACAGAAACCTCATTCGTTTATTCCCTTGTTTGATTCAGAAGCAGGTGGGACAGGTGAACTTTTGGATTGGAACTCGATTTCTGACTGGGTTGGAAGGCAAGAGAGCCCCGAA      G       .       PASS    IMPRECISE;SVTYPE=DEL;END=462354;SVLEN=-609      GT:AD:DP        0/1:8:9
+chrIV  1307718 pbsv.DEL.7      TGCAGTATCCTCGACGTACACGTCTTCACCATCAGCACCTGCTGCAATATCCTCAACGTACACGTCTTCACCATCGGCACCTGTTGCAGTATCCTCGACGTACACGTCTTCACCATCGGCACCTGCTGCAATATCCTCAACGTACACGTCTTCACCATCGGCACCTG T       .       PASS    IMPRECISE;SVTYPE=DEL;END=1307884;SVLEN=-166;SVANN=TANDEM        GT:AD:DP        1/1:6:6
+chrIX  25576   pbsv.INS.8      G       GCTTGAGACACCAGAAGGAGAGGAAGCGATTGACTGACAGAGTTGAGACATCAGAAGCTGAGGAAGAAGATTGATGAAAGAGGTTGATACATCAGAGGCTGAAGAAGAAGATTGACTGACAGAAGCTTGGACATCAGAAGTAGAGGAAGCTGATGGACTGCAGACCCCCGAG    .       PASS    IMPRECISE;SVTYPE=INS;END=25576;SVLEN=171;SVANN=TANDEM   GT:AD:DP        0/1:5:8
+chrV   116286  pbsv.INS.9      C       CTGTGGAAATAGAAATCACTATCATCTACCTAACCTAGTATTTACATTACTAGTATATATCATATACGTGTTAGCATATGACGCAAATGATGAAAATAGTCCATCCTAAATTAGTGGAAGGCTTAAACGCAAGGATTGATAAGTATAGGACAATGAATATAAACATATAAAAATGAAATGTAATAATATTTATAAAATTGTGTGAATGTGCAGATCCCTTTTATGGATTCCTAAATCCTTGAGGAGACTTCTAGTATATTCTGGGGTTACCTAATATTATAGGCCTTTATTCAACATGGAATCCCAACAATTATCTCAACACATTCACATATTTCTCATGGTAGCGAGCGCCTGTGCTCGGTTACGTCTAAGGAAGTCCAACACAAATCAAATCCGTGTAGACTTTTCAGCTTCCAAACAGAAGAATGTGAGAATGCTTCCACTAAGCGCTAACTCCAACAGACAACAACACCTGCTTCATCAGCTGTTCCAGAGAACCCCCATCATGGCCTCTCCTCAAACTGCTCAGTCACATGCACCACAGAATGGCCGTAACCCACAGCATGCATGATGACCCAAACCAAGCCAATCCTCGGTTGGTCTTTAACGGACACCCATCTCCATCTATGATCCGTATCAACACTTATCAAAATGTCGCCTATGTACTTTCCCCCTGGCCACCTCCAGTTTCCGCAGTACCCATCATCAGTTGGAACTGCCTCTGCAAGCACTCCATCACCTGAGTCAGGTATACATTTTACTGATTCATCCTTCAGCGGACTCTGAGATGACATCCACTATAAAAATATGTCAGACCACCACCCTTTTAACCTCACCTATGACTTTCCAAATTGGGTTAAACATACATCAAAAATTTTTACAAAACTCGAATCTCGGTGGTATTATTCCGACAGTAAACGGAAAACCGTACGTCAGATCACTGATGAATGACCTCACTTCTTGTATCACACTTTCAATATTTGCTCCCTCTCAAATTCCTACCTACCCTGGGTCAAGACATCCTAATCCGTTGATTATACGGTATCATATGAAAATTCTTTCCAAAAGTATGAAAAAATGCCATCTGATACCAAGAGGCAAACGACCATTGTGACCCTGGCAATTTGCAATATATGCATACACCTGCAGATGCATTTGAAACAAAAGTCACAAACATTTCGAAAAAAACAGACTGAACAATAAGGCATTCATATCATAACAAGTTCGCATGCCATTAATTATGAGAGGTCTATCTGGCGAATATAAATTTTTACGCTACAACACGTCATCGACATCCTAAATAGGACAGTCGCTGAAACTGTTCTTAGATATCCATGGCTATTTAATGAAAACAACAGGGATCGAGAACATTAAAACCTAATGACAGGAGAATCCGAAGTGATGATAAGAATGATTCTCGCAGCTATAACGAATACAAACCAAACCCAAATTTCTATCTCGGCGGAATCCTCAAAAAACAAATATTTCACTCGAAACAGCCATGGCTCACAATTATCCACATCCAATAACTCTCCCCAGCACGGACAAAACGATTCCATCAGTAAATCAACTACTGAACACCGTTCAAGTGAACAATAACACGACCGTTCATCTAGGCCAGAAACCTTACTGATCTACAGTAAATCATCTAATCATTCTGATGATGAACTCCCTGGACACCTCCTTTCGATTCAGGCGCATCACGCACCCTTGAAAGATCTGCTCACCACATCCTCAGCATCATCTAATCCTGACATAACGTATTGATGCTCAAACAAAGAAATATACATTAACGCTATTGGTGACCTACAATTCACTCAGGACACCAAAACATCAATAAAGTTATTGCACACTCCTAACATAGCCTATTACTTACTCAGTTTGAATGAATTGGCTGGCAGTAGATATCACAGCATGCTTACCAAAACGTCTTAGCACGTCTGACGGCAACTGTACTTGC
ACCTATCGTAAAATATGGAGACTTTTACTGGTCTCTAAAAAGTACTTGCTTCCATCAATATCTGTCCGTACCACCCTCAATAATGTCCCATACAGTGAAGTACAACACGCAAATATCCTATCCTTTCCTTTCTCATCGAATGCTTGCGCAATGCAATGCACAGACACTTCGATATCACTAAAAATAACACATCACGTACTTTTACGAATATGTCGACTGGTCTAGTGCTATTGACTATCAATGCCTGATTTTTTAATCGGCACAACCAACACAGACCTATCAAAGTTTCACGACTCAAATACCAAAATTAATACGACCCTTTCAATACCTACATACTGACATATTTGTCCAGTTCACAACCTACCCAATAGTGGCATCAACCATCCTATTTCAATCTCATTACTGTGAGACAACAAAATTCCGTTTGGTTTATCCTTACACGACCGTCGCGAAGGACCTATCCTCGTGTTTGTTTACTACGATACTAGCTTTATTAAGAACCATTTTTCAGGCCAGTGGTCTGGTTATACAAATGGACCGTGGTTCTGAGTATACTAACAGAACTCTCCATAAAATTCCTTGAAAAACAAATGGGTATAACTCCATGCTATACACCACAGCGGATTCCGAGCACTGGAGTCGCTGAACGGCTCAACCTACCTTATTAGATGACTGGCCGTACTCAACTGCAATGTAGTGGTTTTAACCGAACGCATTTTTATGGTTCTCTGCAATCGAATTTCTAACTATGTGAGAAATTCCACTAGCTTCACCTAAAGCAAAAATCTGCAAGACAACATGCTGGCTTGGCGGACTTGATATCAGTACTTTGTTACCTTCGGTCAACCTGTTATCGTCAAATGATCCACAACCCTAACTAACTCTCCAAAATACATCCTCGTGGCATCCCCGCTACCTCTACATCCGTCTCGAAACTCTTCTGGATCTATCATCTATCTTCATCCTTAGAAGAATAGATACACTAACTATGTTATTCTTCCAGGCAGGAATCCAGATTAATCAATTCATTAACGAACCGCCCTCACTTTCGATGAAGACTTAAACCGTTTAAACTGCTTCATATCAATCGTTCATTGCGTCAAATGAATTCCACAATCCGATGATCTTACAAAAATCTGACCATGACTTCCAATCTCCATCGAACTACATCCTGAGCAACCGAGAACTGTCCTTTTCAAAAGCTTGTGAGTCCAACCGATTCACACCTCCGTCAACTCATACTGAATATCGAAACGTGGTTTCAAAACCAATATCGATTCGCCATTCGCGCACCAGTAAGTTGGAACCCCCACATATCTGATCTAATTTCTTCCGATCAAAGAAGAGTTAGCACCCCCCAAATTTCCAATATCGAGAGTACCGGGTCGGGTGGTATGCATAAATTAATGTTCCTTTGTTACTTGCTCCCATGTCCCAATCTAACACACATGAGTCGTCGCACGCCAGTAAAATTAAGATTCAGACACTCAACTCGTACAGTGAAAATGAACTAATCATCAAACGTACCAAGACCCTACGGGTGGTAACCAACAACAAAACTGTTCCGCGATAAGTGACCAAGAGACTGAGAAAATGATTATACAACACCGTTCACTTCACCTTCAATCAATCGTCTTCTCCACCGGAACATAATTCATCGCCATATTGTTCCTTCAAACCGCCAACTACTGTTTCTGAACGAATACCGAGAATCTATCATCGCTGATCTCCCACTCCCGATCTACCTCCAGAATCTACCACCTACGAATTCCCTGACGCCATTTAAAGAACTCCACCGATCATTCTCGTCAACTAATTCCAGTTTGGGTGGTATTGGTGACTCTAATGCTATACTACTATCAACAGTAAGAAAAGATCATTAGAAGATAATGAACTGAAATTAAGGTATCACGAGACACATGGAATACTAAGAATATGGCGTAGTTTAGACCTCCGAGATCGAGAACGAATTCACCTGATTGGCAGCTGTAAAAGCAGTT
CAAAATCAATCAAACCAATACGGACACCTTACGATACGATGAGGCAAGCACCTATAATAAGATATTAAAGAAAAGAAAAATATATCGAGGCAACCACAAAGAAGTCATCAACTGTTGAAGATGAAACTTGGGACACTGACGACTATTTGACAGAAAGAAAGACTCCTCAAAGAGTACTAACTCAATGTTTTATCTTCAACCAAGAAACGTGACGGTACTCTAAAGCTAGATTTTTGCAAGAGGTGATTATTTCAGCATCCTGACACTTACGACTCAGGCATGGCAATCCAATATACCGTACATCACTATGCATAATGACCTCCCTGTCATTGCATTAGACAATAAACACTATATTACCCAATAGACATATCTTCGGCATATTTGTATGCAGACATCAAAGAAGAATAACATAAGACCCCTCCACCACATTTTAGGAATGAATGATAAGTTGATATCGTTGAAGAAATCCTTAGGATTAAACAAAAGTGGAGCGAACTGGTACGAAACTATCAATCATACCTGATACACACAATTGTATGGAAGAAGTTCGTGGATGTCATGCTATTTAAAAACAGTCAAGTGCTCAATTTGTTCTATTCGTAGATGATATGTATTGTTAGCAAAAATCTAAATTCAAACAAAAGATTATAGAAAGCTTAGATGCAATACACACCAAGATTATAAATCTAGGCGAAAGTGGATAAGGAAATTCCCTATGACATTTGGCTAGAATCAAATATCAAAGAGGTAAATACTGAAATTAGGTATGGAAAACTCATTAACTGAGAAAATACCAAATTAAAACGTAACCTTTGAATCCAAAAGAAGAAACTTAGCGCTTCCAGGTCAACGCAGGTCTTTATATAGACCAGGATGAACTAGAAATAGAAATGAAGATGAATACAAAGAGAAGGTACATGAAATGCCAGTTGATTGGTCTCTTCATATGTTTTGGATATAAAGTTAGATTTTGACTTACTATCTACATCAACACACTGCTCAACATATCACTTTCCCCTCTAGGCAGTTTTAGAACAGACTATTAGTTTAATTACAATTCATGTGGGCCACTAGAGATAACAACTGATATTGCCAAAAACAAAACCTACCGAGCCAGATATAAACTCGTCGCAATAAGCGTGCTTCTATGGTACCAACCATATTACAAGTCACAAATTGGTAACTTTTTTCCTACTCAACGGAAAGTGATTGGAGGAAAGTCGACAAAGGCTCATAACATGTACTTCAACTACGAAGGGAGAAGAATACACGATAAGTGAATCTGTCCATTAAAATAATCTAAGTTACCTGATACAAAGAACTTAACAAGAAACCAATTATTAAAGGCTTACTTACTGATAGTTAAAAAAGAATTTCCCAAAAAACGATCAGTATACTAGTCTACAAATGAAGAGAAATTTAGAAAAAAGATTTTTTGGCACAAAGCAATGAGACTTCGAATGAAGTATCAGGTAATAATTTATACGTATCCTAACTACATCGAGACCAAGAGAACATTGGCTGATTGATGACAAACCTCTTCCGATAAAAACAATTTAAACTATTAACTACATGGATTCATTAGATCTATTACATTATGGGTGGTAATGTTTGGAATAGAAATCCAACTATCATCTCTAACTAGTATTTCTTACTAGTATATTATCATATACGGTGTTAGTAAGATGAACGCAAAATGATTGAGAATAGTCCATCTAAATTAGTGGAAGCTGAAACGCAAGGATTGATATGTAATAGGATCATGAATATACATTAAAATGGATGACATAATATTTTAGAATTGTGTAGAATTGCAGATTCCCTTGTTGGATCCTAAATCCTTGAGGAGAACGTCTAGTATATTCTGTATACTCATATTATAGCCTTTATCAACAATTGACTCCCAACATTATCTCAAACCTTCACATATTCTCAGTACC    .       
PASS    IMPRECISE;SVTYPE=INS;END=116286;SVLEN=5899      GT:AD:DP        0/1:6:7
+chrV   444238  pbsv.INS.10     T       TGTTGTGCATATCAGGTGGTTATCTCTAGTGTCCACATGAATTGATATTCCAACTATAGGTCATGTCTAAAACTTGCCCTAGAGGGGAATAGTATATGTGAGCAAGGTGTTGAGTAGTATAGTAAGTCAAATCTAAATTTATATCCAACATATGAAGGGGAGACCAATCCAACTTTTGCAGTTATGTACCTTCTCTTGTAGTCATCTCAGCTATTTCTAGTTCATCCCTGGTCGATATAAGACGCCTGGTTGACCTGGAGCGTAAGTTTTTCTTCCTTTGGATGCAAGGTACGTTTAATTGGGGTATTTCTCAGTTGAATGGTTTCCATACCGAATTCAGGTATTTACCTCTGTGATATTTGATTCTAGCCAAGTATGTCGCGTATGAATTTGTGATCCACTTGCACCCAATTTTATTATCTTTGTATCGTACTTGTTTCTGAGTGGTGTGATGATTTTCTGATGTGCATTTAAGTCGTTGCTGAATAAAATCAGTCGTCAACAAAATAAACAGATTGTTACGATAGTTTAAATACGCATGACCATCCACGAAAACTTATTCCCAACCAACTGTTTTATCAGGTATGATTTGATAGTTTTGTACCCAGTTCGCGCCACTTTGTTCAATCCATAAAGTGATTTCTTCAAACGTTCAACTTATCATTAATTCCGAAATGTGGGGGGAGGTCTTATTGTGATAATTCTTCTTTGATGTCTCATACAAATATGCCCGAAGATGATGTCTAATTGTGTTAATATAGTAGTATTGTCTAATGCAAGTGACAGGGATGTTCATTAATGCATAGGGATGTGACGGGTATTGGATGCAGCCTGAGTCGTAAGGGTCTAGGATGCTGAATATCACCCTTGCAACAAATTAGCTTTAGTGAGTACGTCACGTCTTGTTGAAGCTACACATGAGTTTATTACTCTTTTAGGGTCTATTCTTTTCTGTCATAATCTTCGTCAGGTCCCAAGGTTTTCCTCTTCAAAGTATGACGGGTCTTGTGGTATGCCTCGAGATATTGTGCTTTCTTTAATACTTTATTTATAGGTGATTTGCTATCTATCGTAAGGTTGGCCGTAATTGGTTTGATTGGATTTTTAGGCGTTACAGCTGCAATCAGGTGAATTCGTTCGTCGATCTCGGAGGTTCTAAACTACGCATAGTATTAGTATTCCATGTGTCTGTGATACCTTAATTTCAGTTTCATATCTTCTAATGCTCTTTTCTTACTGTTGATGTAGTATAGGCATTTAGAGTCACCACTACACCAAACTGGAATGTTGATGAGAATTTACGGGGGGAGTTTCTTTAACATGGGTCAGGGGAATCGGTTAGGAGATCGGGAGGTAAGATCAGGGCGTGGGAGATCAGCGAGGGATAGATTCCTCGGTATTCTGTTCAGAAAAACAGTAGTTGGCGTTCAGTTTTGGGATAGGAAAACAAGGATTGGCGATGAATTATTTTCCGGTTTGGAGATGGAGAAGCATCGATTGAAGGTGAACGGTTTGGATAATCCTTTTCTCAGTCGCTTGGTCCCTTATCTGCGGAAAACAGTTTTGTTGTTGGTACCACCCGTATGGATATTGGTTACGTTGTGTATGATTAGTCTCATTTTCACTGTACGAGTCTGGTGTCATGAAAAAATCGTGAGATTTACTGGCGTGCGACGACTCTGTGTGTTAGTTGGGACATGGGGCAAGTAAAGGAACATTTAAGTTAGCATACCACCCGACACCGGTACTCTCGATATGGAAATTTGGGGGGTGCTAGAGGCTCGTCTTTGATGGAAGCATTTAGATTCAGATGATGTTGGGGGTGGTAAACTTCTCTGGGTGCGCGGGAATATTGGTTTTAGAAACACGTTTCGAATCTTCAGTAGGAGTGACGAGGTGTGGAATCGGTTGGGACTCCAACAGAGCTTGTGAAAGGAAACATTTCTCGGTTGCTCAGATG
TCGGTCGAATGTCAGATGGAAAGTCATGGTCAGATGTCTATTAAGATATCGATTTGTTGGATCTATTTGACGCAATGAACGATTGATATGAAGCAGTTAACGGTTTAATCTTAATGGAAAGGAGGGCGTCGTAATTGAATTGATCTAATGGATTCCTTGCCCTGAAGCAATAACATATTAGTTGATCTACTGCTTCTTTAGGATGGAAGGATTAGAGATATATCATTAAGAGTTTCGAGACGGATGTAGAGCGTAGCTGGGATGCCACGATGAATGTATTTGGAGTTAGGTTGTGATCATTGACGATAACAGGTTGACCGAAAGGAACAAAAGTATGATAATAAGTCTGCCAAGCCAGCATGTTGTCTTGCTTTGCGATTTTTGCTTTGAGGTTGAAGCTAGGTGGATGAATGTTCACAATAGTAGAAAAATTCGATTGCAGAGAACCAAAATGGTTCGGTTGGTTAAACCTACATTGCAGGGGAGTACGGGCAGTCATTAATAAGGTAAGGTTGAGCAGGTTCAGCGACTCCATGTGCTAGGGATCGCTTGTGGTTGTATAGCATGGAGTTATTTTATACCATTTTTTCAAGGAATTGTATGGAGAGTCTGTTAGTATATCAGAAACCCGGTCCATGTATAACCAAGACACTGGCGACTGAAACTGGTTCTTCATAAAAGCTAGTACGTAGTAAAAACATCGAGGAATAGAGATCCTCGCGACGGTCGTTTGTAAAGATACACCCAACGGAATTTTTGTTGTGCATCAGTAAAGGAGATGAAATAGGTGGTTGCAACCTTTGTGGTAGGTTGTGAACGGGACCGCAAATAGGTCAGTATGTAGTATTGAAGGGTCGTATGAAATTTTGGTATTTTAGTCGGTTGAACCCTTGAGATGTCTGTGTTGTGCTTTCCGAGTAAACAATCGGACATTGATAGTCATAGCCATAGAGTCCAGTCGACATCTGATCGTTTAAAATAGTGATGGTGGTATTTTTAAGTGAGTATCGAATTGTATGGGCATGGGCATGTGCAAGCATTGTAATATCCGATGATGAAAGGATAAGGATATTGCGTGTGTATTTCACTTGTATGGACTTATTGATGGTGGGTACGGCGGATTTGATTGGAAGCAAGTACTTTTAGAGACCCAGTAAAAGTCTCCATATTTTACGATAGGTGCAAGTGGACCGTGCCGTCAGATCGTTCTAAGAGTTTTTGGGAAGCATGCTGTGATATCTATGCAGCCAATCATTCAAACTGAGTAAGTCAAGTCATAGGCTAGGTTAGGAGTGTGGCAATAAACCTTTATTGATGTTTTGTGTTGTCTGGGAAGTGAATTGTAGGCACCAATAGGTTAATTGGGATATTTCTTTTGAGCATCAACTACGTTTATGTAAGGATTAGTGATGCTGAGTGTAGTGATGAGCAAGATCTTATAAGGTTTCGTGATGCTCCTGAATCGAGAAGGAGGTGTCCAGGGGAGTTCATCTCAGAAATGATTAGGTGATTAACCGTTAGATTCAGTAAGGTTCCGGCCCTAAGGTGAAGGTCGTGCTTAGTGTTCAATTGAATCGGTTCAAGTAGGTGATTTACTGATGGAATCGTTGTCCGTGGCTGGAGAGAGGAGTTATTAAGTGGTACAGTTGAGCCCTGGTGTTTTCCGATTTCGATTATTGGTTTTTGAGGATTCCGAGCTTAACTTTGGGTTTGTTGTATTCGTATAGCTGCGAGAATCATTTTTCTTCATCACTCAGATTTCTCCTGTAATTAGGTTGTGTTTCTCGATCCCTGTTGTTCTTCATAAATAGCATGGTATCTAAGAACAGTTCAGCGATGTCAATGTAGATGTCGATGACGTGTTGTTAGCGTAAAAATTTATAGTCGCAAGATAGACCCTTCTCATTAATTAATTGGCATGCGACCTTGTTATTGATAGGGGAGCCATTATTGTTCAGTCGTCGGATAATGTTTGTGACTTTTGTTTAAATGCATCTGCA
GGTGTTAACTGCCATTACATTATTTGCAAATTCCGGGTACATGTCGTTTGCCTCTTGGTATCAGATTGCATTGTTTTCAATACTTTGGAAAGAATTTCATGAGCCGTATAATCAAAGGATAGGATGTCTTTGGACCCAGGTAGGTAGGAATTGAGGGAGCAAATTTGAAAAGTGTTATACAAAGAGGTGAGTTATCATCAGTGATTGACGTTACGGGTTTTCCGTTTACTGTCGAATAATACACACCGAGATTGCGAGTTTTGTAAAATTTGAGTGGTTTAACCCAATTTGGAAGTCATGAGGTGAGGTTACATTGGTGGGGGTCTGACATATTTTTAGTGATGTCATATCAGAGTCCGCTGAGGATGAAGCAGTAAATGTATTACCTGGATCAGGTGATGGAGTGCTTCAGAGGCGTTAACTGATGATGGATACTGCGGAAACACTGTTGTTGTGGCAGGTGGAAAGTACATAGGCGCATTGATAAGGTGTATACGGATCATAGATGGGTTGTCCGTTAAATGACCACCAGATGGATTGGCTTGGGTGTGGGTCTCATGCACTGCTGTGGGACGGCCCATTCTTGGAGTGGTATGAAGCCAGGTTGGGAGAGCATGATGGGGGTTCTCTGGAACAGCTGATTGAAGCAGGTTGTTGTTGTCTTCTGTTGAGAGTTAGCCTTAGGGAAGCTTCTCACATTCTTCTGTTTGGAAGCTGGAAACGTCTAAACGGATCTTGATTGTTGGATTCCTAGAAGTAACCGAAGCACAGGCGCTACCATGAGAAATGGGTTGAATGTTGAGATAGATAATTGTTGGATTCCATTGTTGATAAAGGCCTTATAAGATTAGGTATACAGAATATACTAGAAGTTCTCCTCGAGGCTAATAGGAATCCTAAAATGGCATCTATATTTCGTACTTAATATTACGATTTTCCTCATTCCGTTTTATATGTTTATATTCATTGATCTATTACAGTATCAATCCTGGGTTCAGCTTCACTAATTTAGATGGACTATTTCTCATAATTGTGCGATCTTCTAACACCGTATATGATATATACTAGTAATGTAAATACTAGTAGTAGATGATAGTGAGTTCGATCCAACAACCACCCATAAGTGTAATAGATCTAATGAATCCATTGGTTAGTTAATAGTTTAAATGTTTTTATCGGAGAGGTTTTTGTCATCACATCCAGCAATGTTTCTGGTTCGAATGTAGGATACGTATAATGATTACCTGATACTTCATCTCTAAGTCTCATTGCTTTGTGCCAAAAAAATCTGTTTCTAAATTTTCTTCAGTGTAGACTTAATTATTACTGATCGTTGATTATACAGTTAAGGAAGCCTTAATAAGTGGTTTCATTGTTAAGTTCTGGTATCAGGTAACTTAGATTAGTAATAATGGACAGATTTCCTTACGCGTGTATTTCTGCTTCCGTAGGTGAAAAAGTACATGTTTAATGAAGCCTTGGGGGATTTCTCCAAGTACCGTTCCATTAAGTAAATATATGTGCCAATTGTGATTTATAATCGGTTGAGTTGCCCTACGAAGCATCAATTATGAGATCGTTATTATCTGCTCGGTAGGTTTTGT    .       PASS    IMPRECISE;SVTYPE=INS;END=444238;SVLEN=5563      GT:AD:DP        0/1:2:7
+chrVII 530040  pbsv.INS.11     A       AGGAGCAGGCAGCAGGCAGCAGCAGCAGAAGCAGCAGGCAGCAGGCCAGAGCGGGAGCGCAGCCGCGAGAGAGACCGAGCCGCAGGCA        .       PASS    IMPRECISE;SVTYPE=INS;END=530040;SVLEN=87;SVANN=TANDEM   GT:AD:DP        0/1:3:5
+chrVII 530453  pbsv.INS.12     C       CGGAAGGGGTTGGGGGGGGGGGGGGGGTTGGGGGGGGGGTTGGGGGGGGTGGGGTGGGGTGGGGGGG     .       PASS    IMPRECISE;SVTYPE=INS;END=530453;SVLEN=66;SVANN=TANDEM   GT:AD:DP        0/1:2:5
+chrVIII        1807    pbsv.INS.13     A       AGGGTAGTCGCACTAGTCCGGAAGGGGGAGGAGTTTTTGGCAGTAGTAGTAGCACTAGTCCTGACGTTGGTGATGGCAGTTGGTAGTAGCATGAGTGCTGAGTTGGTACTTTCAGTGGTAGTGCACTAGTGTTGGAGTGGTACTTCA     .       PASS    IMPRECISE;SVTYPE=INS;END=1807;SVLEN=146;SVANN=TANDEM    GT:AD:DP        1/1:12:13
+chrX   715090  pbsv.INS.14     T       TGGAGAAGTTGTAGAAAGTTGTAGAAGTTGTAGAAGTTGTAGAAGTGGTTCAGAGGTT      .       PASS    IMPRECISE;SVTYPE=INS;END=715090;SVLEN=57;SVANN=TANDEM   GT:AD:DP        0/1:8:14
+chrXII 319384  pbsv.INS.15     A       AGAAACCTATATGGGGATCTTAAGGTTAGAGGGTTGAAGACATGCTGAAAATTTTAAGTGTCAGTCAGAAAAATGAATTGGGGGG   .       PASS    IMPRECISE;SVTYPE=INS;END=319384;SVLEN=84        GT:AD:DP        0/1:2:9
+chrXIII        202197  pbsv.INS.16     T       TATAGCTTTATCAACATGGAATCCCCACACTTATCTCACTCACATTCACCCCCCATTTCTCACTAGAATAGTACCTGAAAAGGTGAATTTTTGAAATTGTTTGGGATTCCATTGTTGATAAAGGCTATAATATCAAGCTCTACAGAATAC  .       PASS    IMPRECISE;SVTYPE=INS;END=202197;SVLEN=149       GT:AD:DP        1/1:3:3
+chrXIII        908459  pbsv.INS.17     A       AGCTCAGTAAGTTCGGAAAGCCCATTGGCAACGTCTAGCGTAGTGAGGTTTCAGAAGCTCCATCGTCAAACATCTAGCTCAGTGAGGTTCAGAAGATCCATGTCAAACAACATCTAGTCAGTGATGTCAGAAGCTCCATCGTCAACGTCTAGCTCAGTAAGTTCAGAACTAAATCGGCAACGTCTTAGCGTAATTAGTTCAGAAGCTTCATGGGAACGTTCTAGCTCAGTGAGCTCGGAAGTACCATTGGCAACGTCTAGCGTAGTAGTTAGAAAGCTCCATCGGCAAACTAGTTCAGTGGTTCGAAATTCGTCAACAACAGCTA   .       PASS    IMPRECISE;SVTYPE=INS;END=908459;SVLEN=324;SVANN=TANDEM  GT:AD:DP        0/1:3:8
+chrXIII        908705  pbsv.INS.18     A       ACGTCTAGCTAGTGAGTTCAGAAGCTCCCATCGTCAACCATCTAGCTCGTGAGTTCCGAAATTTCGTCAACAACATCTAGCTCAGTAAGTTCGAAGCTTCCATTGGCAAACGTCGAGCGTAGTGAGTTCAGGAAGCTCCATCGTCAAACATCTCGCTCAGTGAGTTCAGAGCTCCATCGTCAACATCTAGCTCCGTGAGGTTCAGAAGCTCCATCGGCAAACGTCAGGCTCAGTGAGTTCGGAAATTAGTCACAAAATCTAGCGTAATGAGTTCAGAAGTGTCATTCGGCAACGTTTCTGCATAGGAGTCGGGAAGCTCATCGGCAA .       PASS    IMPRECISE;SVTYPE=INS;END=908705;SVLEN=326;SVANN=TANDEM  GT:AD:DP        0/1:4:8
+chrXV  31152   pbsv.INS.19     G       GCTGACCTGGATGTAATGGAAGTAGAGGAACCGGAGATGGAGCCCGATTCCAGTGGGGAGG   .       PASS    IMPRECISE;SVTYPE=INS;END=31152;SVLEN=60 GT:AD:DP        0/1:8:9
+chrXV  721730  pbsv.DEL.20     TTTCTTTTTCTATTACTCTTGGCCTCCTCTAGTACACTCTATATTTTTTTATGCCTCGGTAATGATTTTCATTTTTTTTTTTCCACCTAGCGGATGACTCTTTTTTTTTCTTAGCGATTGGCATTATCACATAATGAATTATACATTATATAAAGTAATGTGATTTCTTCGAAGAATATACTAAAAAATGAGCAGGCAAGATAAACGAAGGCAAAGATGACAGAGCAGAAAGCCCTAGTAAAGCGTATTACAAATGAAACCAAGATTCAGATTGCGATCTCTTTAAAGGGTGGTCCCCTAGCGATAGAGCACTCGATCTTCCCAGAAAAAGAGGCAGAAGCAGTAGCAGAACAGGCCACACAATCGCAAGTGATTAACGTCCACACAGGTATAGGGTTTCTGGACCATATGATACATGCTCTGGCCAAGCATTCCGGCTGGTCGCTAATCGTTGAGTGCATTGGTGACTTACACATAGACGACCATCACACCACTGAAGACTGCGGGATTGCTCTCGGTCAAGCTTTTAAAGAGGCCCTAGGGGCCGTGCGTGGAGTAAAAAGGTTTGGATCAGGATTTGCGCCTTTGGATGAGGCACTTTCCAGAGCGGTGGTAGATCTTTCGAACAGGCCGTACGCAGTTGTCGAACTTGGTTTGCAAAGGGAGAAAGTAGGAGATCTCTCTTGCGAGATGATCCCGCATTTTCTTGAAAGCTTTGCAGAGGCTAGCAGAATTACCCTCCACGTTGATTGTCTGCGAGGCAAGAATGATCATCACCGTAGTGAGAGTGCGTTCAAGGCTCTTGCGGTTGCCATAAGAGAAGCCACCTCGCCCAATGGTACCAACGATGTTCCCTCCACCAAAGGTGTTCTTATGTAGTGACACCGATTATTTAAAGCTGCAGCATACGATATATATACATGTGTATATATGTATACCTATGAATGTCAGTAAGTATGTATACGAACAGTATGATACTGAAGATGACAAGGTAATGCATCATTCTATACGTGTCATTCTGAACGAGGCGCGC       T       .       PASS    IMPRECISE;SVTYPE=DEL;END=722762;SVLEN=-1032     GT:AD:DP        0/1:3:6
+chrXVI 660831  pbsv.INS.21     C       CAAAGGAATGGTAAAGATGGGGGGTCAACGGACAAGGGAAAGGATCCATGGGGGCA        .       PASS    IMPRECISE;SVTYPE=INS;END=660831;SVLEN=55        GT:AD:DP        0/1:2:5
diff --git a/tests/data/vcf/unsorted.vcf b/tests/data/vcf/unsorted.vcf

new file mode 100644 (file)

index 0000000..62933f4
--- /dev/null
+++ b/tests/data/vcf/unsorted.vcf
@@ -0,0 +1,12 @@
+##fileformat=VCFv4.2
+##fileDate=20180531
+##contig=<ID=ctg1,length=4200,assembly=foo,md5=dead>
+##contig=<ID=ctg2,length=5000,assembly=foo,md5=beef>
+##contig=<ID=ctg3,length=3000,assembly=foo,md5=1234>
+#CHROM POS     ID      REF     ALT     QUAL    FILTER  INFO
+ctg1   1       variant0        A       T       .       PASS
+ctg1   10      variant1        A       T       .       PASS
+ctg3   50      variant2        A       T       .       PASS
+ctg2   20      variant3        A       T       .       PASS
+ctg3   10      variant4        A       T       .       PASS
+ctg1   5       variant5        A       T       .       PASS
diff --git a/tests/files.cmake b/tests/files.cmake

new file mode 100644 (file)

index 0000000..a57dad4
--- /dev/null
+++ b/tests/files.cmake
@@ -0,0 +1,66 @@
+# test case headers
+set( PacBioBAMTest_H
+
+)
+
+# test case sources
+set( PacBioBAMTest_CPP
+
+    ${PacBioBAM_TestsDir}/src/test_Accuracy.cpp
+    ${PacBioBAM_TestsDir}/src/test_AlignmentPrinter.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamFile.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamHeader.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamRecord.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamRecordBuilder.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamRecordClipping.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamRecordImplCore.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamRecordImplTags.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamRecordImplVariableData.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamRecordMapping.cpp
+    ${PacBioBAM_TestsDir}/src/test_BamWriter.cpp
+    ${PacBioBAM_TestsDir}/src/test_BarcodeQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_Cigar.cpp
+    ${PacBioBAM_TestsDir}/src/test_Compare.cpp
+    ${PacBioBAM_TestsDir}/src/test_DataSetCore.cpp
+    ${PacBioBAM_TestsDir}/src/test_DataSetIO.cpp
+    ${PacBioBAM_TestsDir}/src/test_DataSetQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_DataSetXsd.cpp
+    ${PacBioBAM_TestsDir}/src/test_EndToEnd.cpp
+    ${PacBioBAM_TestsDir}/src/test_EntireFileQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_Fasta.cpp
+    ${PacBioBAM_TestsDir}/src/test_Fastq.cpp
+    ${PacBioBAM_TestsDir}/src/test_FileUtils.cpp
+    ${PacBioBAM_TestsDir}/src/test_Frames.cpp
+    ${PacBioBAM_TestsDir}/src/test_GenomicIntervalQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_IndexedBamWriter.cpp
+    ${PacBioBAM_TestsDir}/src/test_IndexedFastaReader.cpp
+    ${PacBioBAM_TestsDir}/src/test_Intervals.cpp
+    ${PacBioBAM_TestsDir}/src/test_LongCigar.cpp
+    ${PacBioBAM_TestsDir}/src/test_PacBioIndex.cpp
+    ${PacBioBAM_TestsDir}/src/test_PbiFilter.cpp
+    ${PacBioBAM_TestsDir}/src/test_PbiFilterQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_QNameQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_QualityValues.cpp
+    ${PacBioBAM_TestsDir}/src/test_Pulse2BaseCache.cpp
+    ${PacBioBAM_TestsDir}/src/test_ReadAccuracyQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_ReadGroupInfo.cpp
+    ${PacBioBAM_TestsDir}/src/test_SamWriter.cpp
+    ${PacBioBAM_TestsDir}/src/test_SequenceUtils.cpp
+    ${PacBioBAM_TestsDir}/src/test_StringUtils.cpp
+    ${PacBioBAM_TestsDir}/src/test_SubreadLengthQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_Tags.cpp
+    ${PacBioBAM_TestsDir}/src/test_TimeUtils.cpp
+    ${PacBioBAM_TestsDir}/src/test_Validator.cpp
+    ${PacBioBAM_TestsDir}/src/test_VcfFile.cpp
+    ${PacBioBAM_TestsDir}/src/test_VcfFormat.cpp
+    ${PacBioBAM_TestsDir}/src/test_VcfHeader.cpp
+    ${PacBioBAM_TestsDir}/src/test_VcfReader.cpp
+    ${PacBioBAM_TestsDir}/src/test_VcfSort.cpp
+    ${PacBioBAM_TestsDir}/src/test_VcfQuery.cpp
+    ${PacBioBAM_TestsDir}/src/test_VcfVariant.cpp
+    ${PacBioBAM_TestsDir}/src/test_VcfWriter.cpp
+    ${PacBioBAM_TestsDir}/src/test_Version.cpp
+    ${PacBioBAM_TestsDir}/src/test_WhitelistedZmwReadStitcher.cpp
+    ${PacBioBAM_TestsDir}/src/test_ZmwReadStitcher.cpp
+    ${PacBioBAM_TestsDir}/src/test_ZmwQuery.cpp
+)
diff --git a/tests/meson.build b/tests/meson.build

new file mode 100644 (file)

index 0000000..bfa9187
--- /dev/null
+++ b/tests/meson.build
@@ -0,0 +1,70 @@
+subdir('src')
+
+pbbam_cram_script = find_program('cram', required : false)
+if not pbbam_cram_script.found()
+  warning('Using bundled cram script')
+  pbbam_cram_script = find_program('scripts/cram.py', required : true)
+endif
+
+pbbam_gtest_dep = dependency('gtest_main', fallback : ['gtest', 'gtest_main_dep'])
+
+pbbam_PbbamTestData_h_config = configuration_data()
+pbbam_PbbamTestData_h_config.set('PacBioBAM_BinDir', join_paths([meson.current_build_dir(), '../tools']))
+pbbam_PbbamTestData_h_config.set('PacBioBAM_TestsDir', meson.current_source_dir())
+pbbam_PbbamTestData_h_config.set('CMAKE_CURRENT_BINARY_DIR', meson.current_build_dir())
+pbbam_PbbamTestData_h_config.set('GeneratedDir', meson.current_build_dir())
+pbbam_PbbamTestData_h_config.set('GeneratedTestDataDir', meson.current_build_dir())
+
+pbbam_group_fofn_in = configure_file(
+  input : files('data/group/group.fofn.in'),
+  output : 'group.fofn',
+  configuration : pbbam_PbbamTestData_h_config)
+pbbam_PbbamTestData_h = configure_file(
+  input : files('src/PbbamTestData.h.in'),
+  output : 'PbbamTestData.h',
+  configuration : pbbam_PbbamTestData_h_config)
+pbbam_test_cpp_sources += pbbam_PbbamTestData_h
+
+pbbam_test = executable(
+  'pbbam_test',
+  pbbam_test_cpp_sources,
+  dependencies : [pbbam_gtest_dep, pbbam_boost_dep, pbbam_htslib_dep, pbbam_zlib_dep],
+  include_directories : pbbam_include_directories,
+  link_with : pbbam_lib,
+  cpp_args : pbbam_warning_flags,
+  install : false)
+
+custom_target('pbbam_generate_data',
+  output : 'input.fa',
+  command : [
+    pbbam_python,
+    files('scripts/generate_data.py'),
+    join_paths([meson.current_source_dir(), 'data']),
+    meson.current_build_dir()],
+  build_by_default : true,
+  install : false)
+
+pbbamify_synthetic_dataset = configure_file(
+  input : files('data/pbbamify/synthetic_movie_all.subreadset.xml.in'),
+  output : 'synthetic_movie_all.subreadset.xml',
+  configuration : pbbam_PbbamTestData_h_config)
+
+#########
+# tests #
+#########
+
+test(
+  'pbbam formatting check',
+  pbbam_clang_formatter,
+  args : [
+    '--all'],
+  workdir : meson.source_root())
+
+test(
+  'pbbam gtest unittests',
+  pbbam_test,
+  args : [
+    '--gtest_output=xml:' + join_paths(meson.build_root(), 'pbbam-gtest-unittests.xml')],
+  env : [
+    'ARGS=-V',
+    'VERBOSE=1'])
diff --git a/tests/scripts/cram.py b/tests/scripts/cram.py

new file mode 100755 (executable)

index 0000000..33f118e
--- /dev/null
+++ b/tests/scripts/cram.py
@@ -0,0 +1,9 @@
+#!/usr/bin/env python
+import sys
+
+import cram
+
+try:
+    sys.exit(cram.main(sys.argv[1:]))
+except KeyboardInterrupt:
+    pass
diff --git a/tests/scripts/cram/__init__.py b/tests/scripts/cram/__init__.py

new file mode 100644 (file)

index 0000000..4b626c4
--- /dev/null
+++ b/tests/scripts/cram/__init__.py
@@ -0,0 +1,6 @@
+"""Functional testing framework for command line applications"""
+
+from cram._main import main
+from cram._test import test, testfile
+
+__all__ = ['main', 'test', 'testfile']
diff --git a/tests/scripts/cram/__main__.py b/tests/scripts/cram/__main__.py

new file mode 100644 (file)

index 0000000..e6b0aef
--- /dev/null
+++ b/tests/scripts/cram/__main__.py
@@ -0,0 +1,10 @@
+"""Main module (invoked by "python -m cram")"""
+
+import sys
+
+import cram
+
+try:
+    sys.exit(cram.main(sys.argv[1:]))
+except KeyboardInterrupt:
+    pass
diff --git a/tests/scripts/cram/_cli.py b/tests/scripts/cram/_cli.py

new file mode 100644 (file)

index 0000000..8333b6b
--- /dev/null
+++ b/tests/scripts/cram/_cli.py
@@ -0,0 +1,134 @@
+"""The command line interface implementation"""
+
+import os
+import sys
+
+from cram._encoding import b, bytestype, stdoutb
+from cram._process import execute
+
+__all__ = ['runcli']
+
+def _prompt(question, answers, auto=None):
+    """Write a prompt to stdout and ask for answer in stdin.
+
+    answers should be a string, with each character a single
+    answer. An uppercase letter is considered the default answer.
+
+    If an invalid answer is given, this asks again until it gets a
+    valid one.
+
+    If auto is set, the question is answered automatically with the
+    specified value.
+    """
+    default = [c for c in answers if c.isupper()]
+    while True:
+        sys.stdout.write('%s [%s] ' % (question, answers))
+        sys.stdout.flush()
+        if auto is not None:
+            sys.stdout.write(auto + '\n')
+            sys.stdout.flush()
+            return auto
+
+        answer = sys.stdin.readline().strip().lower()
+        if not answer and default:
+            return default[0]
+        elif answer and answer in answers.lower():
+            return answer
+
+def _log(msg=None, verbosemsg=None, verbose=False):
+    """Write msg to standard out and flush.
+
+    If verbose is True, write verbosemsg instead.
+    """
+    if verbose:
+        msg = verbosemsg
+    if msg:
+        if isinstance(msg, bytestype):
+            stdoutb.write(msg)
+        else: # pragma: nocover
+            sys.stdout.write(msg)
+        sys.stdout.flush()
+
+def _patch(cmd, diff):
+    """Run echo [lines from diff] | cmd -p0"""
+    out, retcode = execute([cmd, '-p0'], stdin=b('').join(diff))
+    return retcode == 0
+
+def runcli(tests, quiet=False, verbose=False, patchcmd=None, answer=None):
+    """Run tests with command line interface input/output.
+
+    tests should be a sequence of 2-tuples containing the following:
+
+        (test path, test function)
+
+    This function yields a new sequence where each test function is wrapped
+    with a function that handles CLI input/output.
+
+    If quiet is True, diffs aren't printed. If verbose is True,
+    filenames and status information are printed.
+
+    If patchcmd is set, a prompt is written to stdout asking if
+    changed output should be merged back into the original test. The
+    answer is read from stdin. If 'y', the test is patched using patch
+    based on the changed output.
+    """
+    total, skipped, failed = [0], [0], [0]
+
+    for path, test in tests:
+        def testwrapper():
+            """Test function that adds CLI output"""
+            total[0] += 1
+            _log(None, path + b(': '), verbose)
+
+            refout, postout, diff = test()
+            if refout is None:
+                skipped[0] += 1
+                _log('s', 'empty\n', verbose)
+                return refout, postout, diff
+
+            abspath = os.path.abspath(path)
+            errpath = abspath + b('.err')
+
+            if postout is None:
+                skipped[0] += 1
+                _log('s', 'skipped\n', verbose)
+            elif not diff:
+                _log('.', 'passed\n', verbose)
+                if os.path.exists(errpath):
+                    os.remove(errpath)
+            else:
+                failed[0] += 1
+                _log('!', 'failed\n', verbose)
+                if not quiet:
+                    _log('\n', None, verbose)
+
+                errfile = open(errpath, 'wb')
+                try:
+                    for line in postout:
+                        errfile.write(line)
+                finally:
+                    errfile.close()
+
+                if not quiet:
+                    origdiff = diff
+                    diff = []
+                    for line in origdiff:
+                        stdoutb.write(line)
+                        diff.append(line)
+
+                    if (patchcmd and
+                        _prompt('Accept this change?', 'yN', answer) == 'y'):
+                        if _patch(patchcmd, diff):
+                            _log(None, path + b(': merged output\n'), verbose)
+                            os.remove(errpath)
+                        else:
+                            _log(path + b(': merge failed\n'))
+
+            return refout, postout, diff
+
+        yield (path, testwrapper)
+
+    if total[0] > 0:
+        _log('\n', None, verbose)
+        _log('# Ran %s tests, %s skipped, %s failed.\n'
+             % (total[0], skipped[0], failed[0]))
diff --git a/tests/scripts/cram/_diff.py b/tests/scripts/cram/_diff.py

new file mode 100644 (file)

index 0000000..4877305
--- /dev/null
+++ b/tests/scripts/cram/_diff.py
@@ -0,0 +1,158 @@
+"""Utilities for diffing test files and their output"""
+
+import codecs
+import difflib
+import re
+
+from cram._encoding import b
+
+__all__ = ['esc', 'glob', 'regex', 'unified_diff']
+
+def _regex(pattern, s):
+    """Match a regular expression or return False if invalid.
+
+    >>> from cram._encoding import b
+    >>> [bool(_regex(r, b('foobar'))) for r in (b('foo.*'), b('***'))]
+    [True, False]
+    """
+    try:
+        return re.match(pattern + b(r'\Z'), s)
+    except re.error:
+        return False
+
+def _glob(el, l):
+    r"""Match a glob-like pattern.
+
+    Only * and ? are special; a backslash escapes *, ? or a backslash.
+    A lone trailing backslash is matched literally.
+
+    >>> from cram._encoding import b
+    >>> bool(_glob(b(r'\* \\ \? fo?b*'), b('* \\ ? foobar')))
+    True
+    """
+    i, n = 0, len(el)
+    res = b('')
+    while i < n:
+        c = el[i:i + 1]
+        i += 1
+        if c == b('\\') and i < n and el[i] in b('*?\\'):  # guards el[i]
+            res += el[i - 1:i + 1]
+            i += 1
+        elif c == b('*'):
+            res += b('.*')
+        elif c == b('?'):
+            res += b('.')
+        else:
+            res += re.escape(c)
+    return _regex(res, l)
+
+def _matchannotation(keyword, matchfunc, el, l):
+    """Apply match function based on annotation keyword"""
+    ann = b(' (%s)\n' % keyword)
+    return el.endswith(ann) and matchfunc(el[:-len(ann)], l[:-1])
+
+def regex(el, l):
+    """Apply a regular expression match to a line annotated with '(re)'"""
+    return _matchannotation('re', _regex, el, l)
+
+def glob(el, l):
+    """Apply a glob match to a line annotated with '(glob)'"""
+    return _matchannotation('glob', _glob, el, l)
+
+def esc(el, l):
+    """Apply an escape match to a line annotated with '(esc)'"""
+    ann = b(' (esc)\n')
+
+    if el.endswith(ann):
+        el = codecs.escape_decode(el[:-len(ann)])[0] + b('\n')
+    if el == l:
+        return True
+
+    if l.endswith(ann):
+        l = codecs.escape_decode(l[:-len(ann)])[0] + b('\n')
+    return el == l
+
+class _SequenceMatcher(difflib.SequenceMatcher, object):
+    """Like difflib.SequenceMatcher, but supports custom match functions"""
+    def __init__(self, *args, **kwargs):
+        self._matchers = kwargs.pop('matchers', [])
+        super(_SequenceMatcher, self).__init__(*args, **kwargs)
+
+    def _match(self, el, l):
+        """Tests for matching lines using custom matchers"""
+        for matcher in self._matchers:
+            if matcher(el, l):
+                return True
+        return False
+
+    def find_longest_match(self, alo, ahi, blo, bhi):
+        """Find longest matching block in a[alo:ahi] and b[blo:bhi]"""
+        # SequenceMatcher uses find_longest_match() to slowly whittle down
+        # the differences between a and b until it has each matching block.
+        # Because of this, we can end up doing the same matches many times.
+        matches = []
+        for n, (el, line) in enumerate(zip(self.a[alo:ahi], self.b[blo:bhi])):
+            if el != line and self._match(el, line):
+                # This fools the superclass's method into thinking that the
+                # regex/glob in a is identical to b by replacing a's line (the
+                # expected output) with b's line (the actual output).
+                self.a[alo + n] = line
+                matches.append((n, el))
+        ret = super(_SequenceMatcher, self).find_longest_match(alo, ahi,
+                                                               blo, bhi)
+        # Restore the lines replaced above. Otherwise, the diff output
+        # would seem to imply that the tests never had any regexes/globs.
+        for n, el in matches:
+            self.a[alo + n] = el
+        return ret
+
+def unified_diff(l1, l2, fromfile=b(''), tofile=b(''), fromfiledate=b(''),
+                 tofiledate=b(''), n=3, lineterm=b('\n'), matchers=None):
+    r"""Compare two sequences of lines; generate the delta as a unified diff.
+
+    This is like difflib.unified_diff(), but allows custom matchers.
+
+    >>> from cram._encoding import b
+    >>> l1 = [b('a\n'), b('? (glob)\n')]
+    >>> l2 = [b('a\n'), b('b\n')]
+    >>> (list(unified_diff(l1, l2, b('f1'), b('f2'), b('1970-01-01'),
+    ...                    b('1970-01-02'))) ==
+    ...  [b('--- f1\t1970-01-01\n'), b('+++ f2\t1970-01-02\n'),
+    ...   b('@@ -1,2 +1,2 @@\n'), b(' a\n'), b('-? (glob)\n'), b('+b\n')])
+    True
+
+    >>> from cram._diff import glob
+    >>> list(unified_diff(l1, l2, matchers=[glob]))
+    []
+    """
+    if matchers is None:
+        matchers = []
+    started = False
+    matcher = _SequenceMatcher(None, l1, l2, matchers=matchers)
+    for group in matcher.get_grouped_opcodes(n):
+        if not started:
+            if fromfiledate:
+                fromdate = b('\t') + fromfiledate
+            else:
+                fromdate = b('')
+            if tofiledate:
+                todate = b('\t') + tofiledate
+            else:
+                todate = b('')
+            yield b('--- ') + fromfile + fromdate + lineterm
+            yield b('+++ ') + tofile + todate + lineterm
+            started = True
+        i1, i2, j1, j2 = group[0][1], group[-1][2], group[0][3], group[-1][4]
+        yield (b("@@ -%d,%d +%d,%d @@" % (i1 + 1, i2 - i1, j1 + 1, j2 - j1)) +
+               lineterm)
+        for tag, i1, i2, j1, j2 in group:
+            if tag == 'equal':
+                for line in l1[i1:i2]:
+                    yield b(' ') + line
+                continue
+            if tag == 'replace' or tag == 'delete':
+                for line in l1[i1:i2]:
+                    yield b('-') + line
+            if tag == 'replace' or tag == 'insert':
+                for line in l2[j1:j2]:
+                    yield b('+') + line
diff --git a/tests/scripts/cram/_encoding.py b/tests/scripts/cram/_encoding.py

new file mode 100644 (file)

index 0000000..d639cce
--- /dev/null
+++ b/tests/scripts/cram/_encoding.py
@@ -0,0 +1,106 @@
+"""Encoding utilities"""
+
+import os
+import sys
+
+try:
+    import builtins
+except ImportError:
+    import __builtin__ as builtins
+
+__all__ = ['b', 'bchr', 'bytestype', 'envencode', 'fsdecode', 'fsencode',
+           'stdoutb', 'stderrb', 'u', 'ul', 'unicodetype']
+
+bytestype = getattr(builtins, 'bytes', str)
+unicodetype = getattr(builtins, 'unicode', str)
+
+if getattr(os, 'fsdecode', None) is not None:
+    fsdecode = os.fsdecode
+    fsencode = os.fsencode
+elif bytestype is not str:
+    if sys.platform == 'win32':
+        def fsdecode(s):
+            """Decode a filename from the filesystem encoding"""
+            if isinstance(s, unicodetype):
+                return s
+            encoding = sys.getfilesystemencoding()
+            if encoding == 'mbcs':
+                return s.decode(encoding)
+            else:
+                return s.decode(encoding, 'surrogateescape')
+
+        def fsencode(s):
+            """Encode a filename to the filesystem encoding"""
+            if isinstance(s, bytestype):
+                return s
+            encoding = sys.getfilesystemencoding()
+            if encoding == 'mbcs':
+                return s.encode(encoding)
+            else:
+                return s.encode(encoding, 'surrogateescape')
+    else:
+        def fsdecode(s):
+            """Decode a filename from the filesystem encoding"""
+            if isinstance(s, unicodetype):
+                return s
+            return s.decode(sys.getfilesystemencoding(), 'surrogateescape')
+
+        def fsencode(s):
+            """Encode a filename to the filesystem encoding"""
+            if isinstance(s, bytestype):
+                return s
+            return s.encode(sys.getfilesystemencoding(), 'surrogateescape')
+else:
+    def fsdecode(s):
+        """Decode a filename from the filesystem encoding"""
+        return s
+
+    def fsencode(s):
+        """Encode a filename to the filesystem encoding"""
+        return s
+
+if bytestype is str:
+    def envencode(s):
+        """Encode a byte string to the os.environ encoding"""
+        return s
+else:
+    envencode = fsdecode
+
+if getattr(sys.stdout, 'buffer', None) is not None:
+    stdoutb = sys.stdout.buffer
+    stderrb = sys.stderr.buffer
+else:
+    stdoutb = sys.stdout
+    stderrb = sys.stderr
+
+if bytestype is str:
+    def b(s):
+        """Convert an ASCII string literal into a bytes object"""
+        return s
+
+    bchr = chr
+
+    def u(s):
+        """Convert an ASCII string literal into a unicode object"""
+        return s.decode('ascii')
+else:
+    def b(s):
+        """Convert an ASCII string literal into a bytes object"""
+        return s.encode('ascii')
+
+    def bchr(i):
+        """Return a bytes character for a given integer value"""
+        return bytestype([i])
+
+    def u(s):
+        """Convert an ASCII string literal into a unicode object"""
+        return s
+
+try:
+    eval(r'u""')
+except SyntaxError:
+    ul = eval
+else:
+    def ul(e):
+        """Evaluate e as a unicode string literal"""
+        return eval('u' + e)
diff --git a/tests/scripts/cram/_main.py b/tests/scripts/cram/_main.py

new file mode 100644 (file)

index 0000000..11d457b
--- /dev/null
+++ b/tests/scripts/cram/_main.py
@@ -0,0 +1,211 @@
+"""Main entry point"""
+
+import optparse
+import os
+import shlex
+import shutil
+import sys
+import tempfile
+
+try:
+    import configparser
+except ImportError: # pragma: nocover
+    import ConfigParser as configparser
+
+from cram._cli import runcli
+from cram._encoding import b, fsencode, stderrb, stdoutb
+from cram._run import runtests
+from cram._xunit import runxunit
+
+def _which(cmd):
+    """Return the path to cmd (searching PATH, else os.defpath) or None"""
+    cmd = fsencode(cmd)
+    for p in os.environ.get('PATH', os.defpath).split(os.pathsep):
+        path = os.path.join(fsencode(p), cmd)
+        if os.path.isfile(path) and os.access(path, os.X_OK):
+            return os.path.abspath(path)
+    return None
+
+def _expandpath(path):
+    """Expands ~ and environment variables in path"""
+    return os.path.expanduser(os.path.expandvars(path))
+
+class _OptionParser(optparse.OptionParser):
+    """Like optparse.OptionParser, but supports setting values through
+    CRAM= and .cramrc (config keys are option dests with '_' as '-')."""
+
+    def __init__(self, *args, **kwargs):
+        self._config_opts = {}
+        optparse.OptionParser.__init__(self, *args, **kwargs)
+
+    def add_option(self, *args, **kwargs):
+        option = optparse.OptionParser.add_option(self, *args, **kwargs)
+        if option.dest and option.dest != 'version':
+            key = option.dest.replace('_', '-')
+            self._config_opts[key] = option.action == 'store_true'
+        return option
+
+    def parse_args(self, args=None, values=None):
+        config = configparser.RawConfigParser()
+        config.read(_expandpath(os.environ.get('CRAMRC', '.cramrc')))
+        defaults = {}
+        for key, isbool in self._config_opts.items():
+            try:
+                if isbool:
+                    try:
+                        value = config.getboolean('cram', key)
+                    except ValueError:
+                        value = config.get('cram', key)
+                        self.error('--%s: invalid boolean value: %r'
+                                   % (key, value))
+                else:
+                    value = config.get('cram', key)
+            except (configparser.NoSectionError, configparser.NoOptionError):
+                pass
+            else:
+                defaults[key.replace('-', '_')] = value  # option dest
+        self.set_defaults(**defaults)
+
+        eargs = os.environ.get('CRAM', '').strip()
+        if eargs:
+            args = args or []
+            args += shlex.split(eargs)
+
+        try:
+            return optparse.OptionParser.parse_args(self, args, values)
+        except optparse.OptionValueError:
+            self.error(str(sys.exc_info()[1]))
+
+def _parseopts(args):
+    """Parse command line arguments"""
+    p = _OptionParser(usage='cram [OPTIONS] TESTS...', prog='cram')
+    p.add_option('-V', '--version', action='store_true',
+                 help='show version information and exit')
+    p.add_option('-q', '--quiet', action='store_true',
+                 help="don't print diffs")
+    p.add_option('-v', '--verbose', action='store_true',
+                 help='show filenames and test status')
+    p.add_option('-i', '--interactive', action='store_true',
+                 help='interactively merge changed test output')
+    p.add_option('-d', '--debug', action='store_true',
+                 help='write script output directly to the terminal')
+    p.add_option('-y', '--yes', action='store_true',
+                 help='answer yes to all questions')
+    p.add_option('-n', '--no', action='store_true',
+                 help='answer no to all questions')
+    p.add_option('-E', '--preserve-env', action='store_true',
+                 help="don't reset common environment variables")
+    p.add_option('--keep-tmpdir', action='store_true',
+                 help='keep temporary directories')
+    p.add_option('--shell', action='store', default='/bin/sh', metavar='PATH',
+                 help='shell to use for running tests (default: %default)')
+    p.add_option('--shell-opts', action='store', metavar='OPTS',
+                 help='arguments to invoke shell with')
+    p.add_option('--indent', action='store', default=2, metavar='NUM',
+                 type='int', help=('number of spaces to use for indentation '
+                                   '(default: %default)'))
+    p.add_option('--xunit-file', action='store', metavar='PATH',
+                 help='path to write xUnit XML output')
+    opts, paths = p.parse_args(args)
+    paths = [fsencode(path) for path in paths]
+    return opts, paths, p.get_usage
+
+def main(args):
+    """Main entry point.
+
+    If you're thinking of using Cram in other Python code (e.g., unit tests),
+    consider using the test() or testfile() functions instead.
+
+    :param args: Script arguments (excluding script name)
+    :type args: str
+    :return: Exit code (non-zero on failure)
+    :rtype: int
+    """
+    opts, paths, getusage = _parseopts(args)
+    if opts.version:
+        sys.stdout.write("""Cram CLI testing framework (version 0.7)
+
+Copyright (C) 2010-2016 Brodie Rao <brodie@bitheap.org> and others
+This is free software; see the source for copying conditions. There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+""")
+        return
+
+    conflicts = [('--yes', opts.yes, '--no', opts.no),
+                 ('--quiet', opts.quiet, '--interactive', opts.interactive),
+                 ('--debug', opts.debug, '--quiet', opts.quiet),
+                 ('--debug', opts.debug, '--interactive', opts.interactive),
+                 ('--debug', opts.debug, '--verbose', opts.verbose),
+                 ('--debug', opts.debug, '--xunit-file', opts.xunit_file)]
+    for s1, o1, s2, o2 in conflicts:
+        if o1 and o2:
+            sys.stderr.write('options %s and %s are mutually exclusive\n'
+                             % (s1, s2))
+            return 2
+
+    shellcmd = _which(opts.shell)
+    if not shellcmd:
+        stderrb.write(b('shell not found: ') + fsencode(opts.shell) + b('\n'))
+        return 2
+    shell = [shellcmd]
+    if opts.shell_opts:
+        shell += shlex.split(opts.shell_opts)
+
+    patchcmd = None
+    if opts.interactive:
+        patchcmd = _which('patch')
+        if not patchcmd:
+            sys.stderr.write('patch(1) required for -i\n')
+            return 2
+
+    if not paths:
+        sys.stdout.write(getusage())
+        return 2
+
+    badpaths = [path for path in paths if not os.path.exists(path)]
+    if badpaths:
+        stderrb.write(b('no such file: ') + badpaths[0] + b('\n'))
+        return 2
+
+    if opts.yes:
+        answer = 'y'
+    elif opts.no:
+        answer = 'n'
+    else:
+        answer = None
+
+    tmpdir = os.environ['CRAMTMP'] = tempfile.mkdtemp('', 'cramtests-')
+    tmpdirb = fsencode(tmpdir)
+    proctmp = os.path.join(tmpdir, 'tmp')
+    for s in ('TMPDIR', 'TEMP', 'TMP'):
+        os.environ[s] = proctmp
+
+    os.mkdir(proctmp)
+    try:
+        tests = runtests(paths, tmpdirb, shell, indent=opts.indent,
+                         cleanenv=not opts.preserve_env, debug=opts.debug)
+        if not opts.debug:
+            tests = runcli(tests, quiet=opts.quiet, verbose=opts.verbose,
+                           patchcmd=patchcmd, answer=answer)
+            if opts.xunit_file is not None:
+                tests = runxunit(tests, opts.xunit_file)
+
+        hastests = False
+        failed = False
+        for path, test in tests:
+            hastests = True
+            refout, postout, diff = test()
+            if diff:
+                failed = True
+
+        if not hastests:
+            sys.stderr.write('no tests found\n')
+            return 2
+
+        return int(failed)
+    finally:
+        if opts.keep_tmpdir:
+            stdoutb.write(b('# Kept temporary directory: ') + tmpdirb +
+                          b('\n'))
+        else:
+            shutil.rmtree(tmpdir)
diff --git a/tests/scripts/cram/_process.py b/tests/scripts/cram/_process.py

new file mode 100644 (file)

index 0000000..decdfbc
--- /dev/null
+++ b/tests/scripts/cram/_process.py
@@ -0,0 +1,54 @@
+"""Utilities for running subprocesses"""
+
+import os
+import signal
+import subprocess
+import sys
+
+from cram._encoding import fsdecode
+
+__all__ = ['PIPE', 'STDOUT', 'execute']
+
+PIPE = subprocess.PIPE
+STDOUT = subprocess.STDOUT
+
+def _makeresetsigpipe():
+    """Build a preexec_fn callback that restores default SIGPIPE handling.
+
+    Returns None on win32 or when the signal module has no SIGPIPE
+    attribute. Otherwise returns a zero-argument callable that sets the
+    SIGPIPE disposition to SIG_DFL, so that a child started with
+    subprocess.Popen(..., preexec_fn=_makeresetsigpipe()) does not inherit
+    Python's SIGPIPE handler (SIG_IGN).
+    """
+    sigpipe = getattr(signal, 'SIGPIPE', None)
+    if sys.platform != 'win32' and sigpipe is not None:
+        def resetsigpipe():
+            return signal.signal(signal.SIGPIPE, signal.SIG_DFL)
+        return resetsigpipe
+    return None # pragma: nocover
+
+def execute(args, stdin=None, stdout=None, stderr=None, cwd=None, env=None):
+    """Spawn args as a child process, feed it stdin, and wait for it to exit.
+
+    args is the argument list handed to subprocess.Popen (on win32 every
+    element is first passed through fsdecode).
+
+    stdin is None or the data to send to the child; the child's standard
+    input is always a pipe.
+
+    stdout is None or PIPE. With PIPE the child's output is captured and
+    returned.
+
+    stderr is None or STDOUT. With stdout=PIPE and stderr=STDOUT the child's
+    stderr is interleaved into the captured output.
+
+    cwd and env, when given, set the child's working directory and
+    environment variables.
+
+    Returns the 2-tuple (output, returncode).
+    """
+    if sys.platform == 'win32': # pragma: nocover
+        args = [fsdecode(arg) for arg in args]
+
+    popenargs = dict(stdin=PIPE, stdout=stdout, stderr=stderr, cwd=cwd,
+                     env=env, bufsize=-1, preexec_fn=_makeresetsigpipe(),
+                     close_fds=(os.name == 'posix'))
+    proc = subprocess.Popen(args, **popenargs)
+    output = proc.communicate(stdin)[0]
+    return output, proc.returncode
diff --git a/tests/scripts/cram/_run.py b/tests/scripts/cram/_run.py

new file mode 100644 (file)

index 0000000..9111c0f
--- /dev/null
+++ b/tests/scripts/cram/_run.py
@@ -0,0 +1,77 @@
+"""The test runner"""
+
+import os
+import sys
+
+from cram._encoding import b, fsdecode, fsencode
+from cram._test import testfile
+
+__all__ = ['runtests']
+
+# Directory walker used by _findtests(). os.walk is used directly except on
+# win32, where top is passed through fsdecode() before walking and every
+# yielded name is passed through fsencode().
+_walk = os.walk
+if sys.platform == 'win32': # pragma: nocover
+    def _walk(top):
+        """Walk top with os.walk, yielding fsencode()d (root, dirs, files)"""
+        for root, dirs, files in os.walk(fsdecode(top)):
+            yield (fsencode(root),
+                   [fsencode(d) for d in dirs],
+                   [fsencode(f) for f in files])
+
+def _findtests(paths):
+    """Yield the normalized path of every test found under paths.
+
+    A path that is not a directory is yielded as given (normalized). A
+    directory is walked recursively: files ending in '.t' whose names do not
+    start with '.' are yielded, sorted within each directory, and files
+    sitting directly in a directory whose name starts with '.' are ignored.
+    """
+    dot = b('.')
+    suffix = b('.t')
+    for p in paths:
+        if not os.path.isdir(p):
+            yield os.path.normpath(p)
+            continue
+        for root, dirs, files in _walk(p):
+            if os.path.basename(root).startswith(dot):
+                continue
+            for name in sorted(files):
+                if name.startswith(dot) or not name.endswith(suffix):
+                    continue
+                yield os.path.normpath(os.path.join(root, name))
+
+def runtests(paths, tmpdir, shell, indent=2, cleanenv=True, debug=False):
+    """Run tests and yield results.
+
+    This yields a sequence of 2-tuples containing the following:
+
+        (test path, test function)
+
+    The test function, when called, runs the test in a temporary directory
+    and returns a 3-tuple:
+
+        (list of lines in the test, same list with actual output, diff)
+
+    Test files that resolve to the same absolute path are only yielded once.
+    For an empty test file the test function does nothing and returns
+    (None, None, None).
+
+    Each test function is bound to its own test file when it is created,
+    so it may be called at any time, including after this generator has
+    moved on to later tests.
+    """
+    cwd = os.getcwd()
+    seen = set()
+    basenames = set()
+    for i, path in enumerate(_findtests(paths)):
+        abspath = os.path.abspath(path)
+        if abspath in seen:
+            continue
+        seen.add(abspath)
+
+        if not os.stat(path).st_size:
+            yield (path, lambda: (None, None, None))
+            continue
+
+        # Each test runs in tmpdir/<basename>. When two test files share a
+        # basename, later ones get the enumeration index appended so their
+        # directories don't clash.
+        basename = os.path.basename(path)
+        if basename in basenames:
+            basename = basename + b('-%s' % i)
+        else:
+            basenames.add(basename)
+
+        # Bind the per-iteration values as defaults. A plain closure would
+        # look them up when called and see whatever the loop assigned last.
+        def test(path=path, abspath=abspath, basename=basename):
+            """Run test file"""
+            testdir = os.path.join(tmpdir, basename)
+            os.mkdir(testdir)
+            try:
+                os.chdir(testdir)
+                return testfile(abspath, shell, indent=indent,
+                                cleanenv=cleanenv, debug=debug,
+                                testname=path)
+            finally:
+                # Always return to the directory we started in.
+                os.chdir(cwd)
+
+        yield (path, test)
diff --git a/tests/scripts/cram/_test.py b/tests/scripts/cram/_test.py

new file mode 100644 (file)

index 0000000..27ef99c
--- /dev/null
+++ b/tests/scripts/cram/_test.py
@@ -0,0 +1,230 @@
+"""Utilities for running individual tests"""
+
+import itertools
+import os
+import re
+import time
+
+from cram._encoding import b, bchr, bytestype, envencode, unicodetype
+from cram._diff import esc, glob, regex, unified_diff
+from cram._process import PIPE, STDOUT, execute
+
+__all__ = ['test', 'testfile']
+
+# Searches for any byte that must be shown escaped in test output: control
+# bytes other than newline (\x0a), plus \x7f and everything above it.
+_needescape = re.compile(b(r'[\x00-\x09\x0b-\x1f\x7f-\xff]')).search
+# Substitution over the same byte class plus the backslash itself.
+_escapesub = re.compile(b(r'[\x00-\x09\x0b-\x1f\\\x7f-\xff]')).sub
+# Replacement text for each byte value: \xNN by default, with short forms
+# for backslash, carriage return, and tab.
+_escapemap = dict((bchr(i), b(r'\x%02x' % i)) for i in range(256))
+_escapemap.update({b('\\'): b('\\\\'), b('\r'): b(r'\r'), b('\t'): b(r'\t')})
+
+def _escape(s):
+    """Escape an output line and tag it with ' (esc)'.
+
+    Works like the string-escape codec but leaves quotes alone. The last
+    character of s is dropped before escaping, and ' (esc)' followed by a
+    newline is appended to the result.
+    """
+    def replace(match):
+        return _escapemap[match.group(0)]
+
+    escaped = _escapesub(replace, s[:-1])
+    return escaped + b(' (esc)\n')
+
+def test(lines, shell='/bin/sh', indent=2, testname=None, env=None,
+         cleanenv=True, debug=False):
+    r"""Run test lines and return input, output, and diff.
+
+    This returns a 3-tuple containing the following:
+
+        (list of lines in test, same list with actual output, diff)
+
+    diff is a generator that yields the diff between the two lists.
+
+    If a test exits with return code 80, the actual output is set to
+    None and diff is set to [].
+
+    Note that the TESTSHELL environment variable is available in the
+    test (set to the specified shell). However, the TESTDIR and
+    TESTFILE environment variables are not available. To run actual
+    test files, see testfile().
+
+    Example usage:
+
+    >>> from cram._encoding import b
+    >>> refout, postout, diff = test([b('  $ echo hi\n'),
+    ...                               b('  [a-z]{2} (re)\n')])
+    >>> refout == [b('  $ echo hi\n'), b('  [a-z]{2} (re)\n')]
+    True
+    >>> postout == [b('  $ echo hi\n'), b('  hi\n')]
+    True
+    >>> bool(diff)
+    False
+
+    lines may also be a single bytes string:
+
+    >>> refout, postout, diff = test(b('  $ echo hi\n  bye\n'))
+    >>> refout == [b('  $ echo hi\n'), b('  bye\n')]
+    True
+    >>> postout == [b('  $ echo hi\n'), b('  hi\n')]
+    True
+    >>> bool(diff)
+    True
+    >>> (b('').join(diff) ==
+    ...  b('--- \n+++ \n@@ -1,2 +1,2 @@\n   $ echo hi\n-  bye\n+  hi\n'))
+    True
+
+    Note that the b() function is internal to Cram. If you're using Python 2,
+    use normal string literals instead. If you're using Python 3, use bytes
+    literals.
+
+    :param lines: Test input
+    :type lines: bytes or collections.Iterable[bytes]
+    :param shell: Shell to run test in
+    :type shell: bytes or str or list[bytes] or list[str]
+    :param indent: Amount of indentation to use for shell commands
+    :type indent: int
+    :param testname: Optional test file name (used in diff output)
+    :type testname: bytes or None
+    :param env: Optional environment variables for the test shell
+    :type env: dict or None
+    :param cleanenv: Whether or not to sanitize the environment
+    :type cleanenv: bool
+    :param debug: Whether or not to run in debug mode (don't capture stdout)
+    :type debug: bool
+    :return: Input, output, and diff iterables
+    :rtype: (list[bytes], list[bytes], collections.Iterable[bytes])
+    """
+    # Prefixes that mark a shell command line ("$ ") and a continuation
+    # line ("> ") at the configured indentation.
+    indent = b(' ') * indent
+    cmdline = indent + b('$ ')
+    conline = indent + b('> ')
+    # Marker echoed between commands so that each command's output and
+    # exit status can be located in the captured stream.
+    usalt = 'CRAM%s' % time.time()
+    salt = b(usalt)
+
+    if env is None:
+        env = os.environ.copy()
+
+    # Pin locale, time zone, and terminal-related variables to fixed values.
+    if cleanenv:
+        for s in ('LANG', 'LC_ALL', 'LANGUAGE'):
+            env[s] = 'C'
+        env['TZ'] = 'GMT'
+        env['CDPATH'] = ''
+        env['COLUMNS'] = '80'
+        env['GREP_OPTIONS'] = ''
+
+    # A single bytes string is split into lines, keeping the line endings.
+    if isinstance(lines, bytestype):
+        lines = lines.splitlines(True)
+
+    # A single shell name becomes a one-element argument list.
+    if isinstance(shell, (bytestype, unicodetype)):
+        shell = [shell]
+    env['TESTSHELL'] = shell[0]
+
+    # Debug mode: feed only the command and continuation lines to the
+    # shell, leave its output uncaptured, and return nothing to compare.
+    if debug:
+        stdin = []
+        for line in lines:
+            if not line.endswith(b('\n')):
+                line += b('\n')
+            if line.startswith(cmdline):
+                stdin.append(line[len(cmdline):])
+            elif line.startswith(conline):
+                stdin.append(line[len(conline):])
+
+        execute(shell + ['-'], stdin=b('').join(stdin), env=env)
+        return ([], [], [])
+
+    # after maps the index of a command line (-1 before the first command)
+    # to the reference lines that are copied into postout once the output
+    # of that command has been emitted.
+    after = {}
+    refout, postout = [], []
+    i = pos = prepos = -1
+    stdin = []
+    for i, line in enumerate(lines):
+        if not line.endswith(b('\n')):
+            line += b('\n')
+        refout.append(line)
+        if line.startswith(cmdline):
+            # The command line itself is queued behind the previous command.
+            # A marker carrying this line's index and the previous exit
+            # status ($?) is echoed before the command runs.
+            after.setdefault(pos, []).append(line)
+            prepos = pos
+            pos = i
+            stdin.append(b('echo %s %s $?\n' % (usalt, i)))
+            stdin.append(line[len(cmdline):])
+        elif line.startswith(conline):
+            # Continuation lines are queued under the same key as their
+            # command line.
+            after.setdefault(prepos, []).append(line)
+            stdin.append(line[len(conline):])
+        elif not line.startswith(indent):
+            # Unindented text is queued behind the current command. Other
+            # indented lines (expected output) are kept in refout only.
+            after.setdefault(pos, []).append(line)
+    # Trailing marker so the last command's output and exit status are
+    # delimited too.
+    stdin.append(b('echo %s %s $?\n' % (usalt, i + 1)))
+
+    output, retcode = execute(shell + ['-'], stdin=b('').join(stdin),
+                              stdout=PIPE, stderr=STDOUT, env=env)
+    # Return code 80: report no actual output and an empty diff.
+    if retcode == 80:
+        return (refout, None, [])
+
+    pos = -1
+    ret = 0
+    # output[:-1] drops the final byte of the captured output before
+    # splitting (presumably the newline written by the last marker echo).
+    for i, line in enumerate(output[:-1].splitlines(True)):
+        out, cmd = line, None
+        # On a marker line, the text before the salt is command output that
+        # lacked a trailing newline, and the text after it is
+        # "<line index> <exit status>".
+        if salt in line:
+            out, cmd = line.split(salt, 1)
+
+        if out:
+            if not out.endswith(b('\n')):
+                out += b(' (no-eol)\n')
+
+            if _needescape(out):
+                out = _escape(out)
+            postout.append(indent + out)
+
+        if cmd:
+            # Report a non-zero exit status of the command that just
+            # finished, emit the reference lines queued behind it, and move
+            # on to the command whose index the marker carries.
+            ret = int(cmd.split()[1])
+            if ret != 0:
+                postout.append(indent + b('[%s]\n' % (ret)))
+            postout += after.pop(pos, [])
+            pos = int(cmd.split()[0])
+
+    postout += after.pop(pos, [])
+
+    if testname:
+        diffpath = testname
+        errpath = diffpath + b('.err')
+    else:
+        diffpath = errpath = b('')
+    diff = unified_diff(refout, postout, diffpath, errpath,
+                        matchers=[esc, glob, regex])
+    # Pull the first item off the diff generator to find out whether there
+    # is any difference at all; otherwise return an empty (falsy) list.
+    for firstline in diff:
+        return refout, postout, itertools.chain([firstline], diff)
+    return refout, postout, []
+
+def testfile(path, shell='/bin/sh', indent=2, env=None, cleanenv=True,
+             debug=False, testname=None):
+    """Run the test file at path and return input, output, and diff.
+
+    The file is opened in binary mode and handed to test(), so the result
+    is the same 3-tuple that test() returns:
+
+        (list of lines in test, same list with actual output, diff)
+
+    diff is a generator that yields the diff between the two lists.
+
+    If a test exits with return code 80, the actual output is set to
+    None and diff is set to [].
+
+    TESTDIR and TESTFILE are set in the environment to the directory and
+    base name of the test file's absolute path, so they are available to
+    the test alongside TESTSHELL. When env is empty or None a copy of
+    os.environ is used; otherwise the dict passed in is updated in place.
+
+    :param path: Path to test file
+    :type path: bytes or str
+    :param shell: Shell to run test in
+    :type shell: bytes or str or list[bytes] or list[str]
+    :param indent: Amount of indentation to use for shell commands
+    :type indent: int
+    :param env: Optional environment variables for the test shell
+    :type env: dict or None
+    :param cleanenv: Whether or not to sanitize the environment
+    :type cleanenv: bool
+    :param debug: Whether or not to run in debug mode (don't capture stdout)
+    :type debug: bool
+    :param testname: Optional test file name (used in diff output)
+    :type testname: bytes or None
+    :return: Input, output, and diff iterables
+    :rtype: (list[bytes], list[bytes], collections.Iterable[bytes])
+    """
+    fileobj = open(path, 'rb')
+    try:
+        testdir, filename = os.path.split(os.path.abspath(path))
+        if not env:
+            env = os.environ.copy()
+        env['TESTDIR'] = envencode(testdir)
+        env['TESTFILE'] = envencode(filename)
+        if testname is None: # pragma: nocover
+            testname = filename
+        return test(fileobj, shell, indent=indent, testname=testname,
+                    env=env, cleanenv=cleanenv, debug=debug)
+    finally:
+        fileobj.close()
diff --git a/tests/scripts/cram/_xunit.py b/tests/scripts/cram/_xunit.py

new file mode 100644 (file)

index 0000000..0b3cb49
--- /dev/null
+++ b/tests/scripts/cram/_xunit.py
@@ -0,0 +1,173 @@
+"""xUnit XML output"""
+
+import locale
+import os
+import re
+import socket
+import sys
+import time
+
+from cram._encoding import u, ul
+
+__all__ = ['runxunit']
+
+# Pattern sources, written as quoted literals that are passed to ul().
+# NOTE(review): ul() comes from cram._encoding and presumably evaluates the
+# quoted text as a unicode literal -- confirm there.
+#
+# The CDATA patterns match either the sequence "]]>" or one character
+# outside tab, LF, CR, U+0020-U+D7FF and U+E000-U+FFFD (the wide variant
+# also accepts U+10000-U+10FFFF).
+_widecdataregex = ul(r"'(?:[^\x09\x0a\x0d\x20-\ud7ff\ue000-\ufffd"
+                     r"\U00010000-\U0010ffff]|]]>)'")
+_narrowcdataregex = ul(r"'(?:[^\x09\x0a\x0d\x20-\ud7ff\ue000-\ufffd]"
+                       r"|]]>)'")
+# The attribute patterns match one character outside the listed ranges.
+# The ranges leave out everything below U+0020 as well as '"' (\x22),
+# '&' (\x26), '<' (\x3c) and '>' (\x3e).
+_widequoteattrregex = ul(r"'[^\x20\x21\x23-\x25\x27-\x3b\x3d"
+                         r"\x3f-\ud7ff\ue000-\ufffd"
+                         r"\U00010000-\U0010ffff]'")
+_narrowquoteattrregex = ul(r"'[^\x20\x21\x23-\x25\x27-\x3b\x3d"
+                           r"\x3f-\ud7ff\ue000-\ufffd]'")
+# Substituted for characters that have no other replacement.
+_replacementchar = ul(r"'\N{REPLACEMENT CHARACTER}'")
+
+# Use the wide patterns when sys.maxunicode reaches 0x10ffff, and the
+# narrow ones otherwise.
+if sys.maxunicode >= 0x10ffff: # pragma: nocover
+    _cdatasub = re.compile(_widecdataregex).sub
+    _quoteattrsub = re.compile(_widequoteattrregex).sub
+else: # pragma: nocover
+    _cdatasub = re.compile(_narrowcdataregex).sub
+    _quoteattrsub = re.compile(_narrowquoteattrregex).sub
+
+def _cdatareplace(m):
+    """Return the substitution for one _cdatasub() match.
+
+    A literal "]]>" becomes text that closes the current CDATA section,
+    emits "]]&gt;" and opens a new section. Any other match is replaced by
+    the replacement character.
+    """
+    if m.group(0) != u(']]>'):
+        return _replacementchar
+    return u(']]>]]&gt;<![CDATA[')
+
+def _cdata(s):
+    r"""Wrap s in an XML CDATA section, substituting what cannot appear in one.
+
+    >>> from cram._encoding import ul
+    >>> (_cdata('1<\'2\'>&"3\x00]]>\t\r\n') ==
+    ...  ul(r"'<![CDATA[1<\'2\'>&\"3\ufffd]]>]]&gt;<![CDATA[\t\r\n]]>'"))
+    True
+    """
+    escaped = _cdatasub(_cdatareplace, s)
+    return u('<![CDATA[%s]]>') % escaped
+
+def _quoteattrreplace(m):
+    """Return the substitution for one _quoteattrsub() match.
+
+    Tab, newline and carriage return become numeric character references;
+    double quote, ampersand, '<' and '>' become named entities. Any other
+    match is replaced by the replacement character.
+    """
+    entities = {
+        u('\t'): u('&#9;'),
+        u('\n'): u('&#10;'),
+        u('\r'): u('&#13;'),
+        u('"'): u('&quot;'),
+        u('&'): u('&amp;'),
+        u('<'): u('&lt;'),
+        u('>'): u('&gt;'),
+    }
+    return entities.get(m.group(0), _replacementchar)
+
+def _quoteattr(s):
+    r"""Return s escaped and wrapped in double quotes as an XML attribute value.
+
+    >>> from cram._encoding import ul
+    >>> (_quoteattr('1<\'2\'>&"3\x00]]>\t\r\n') ==
+    ...  ul(r"'\"1&lt;\'2\'&gt;&amp;&quot;3\ufffd]]&gt;&#9;&#13;&#10;\"'"))
+    True
+    """
+    escaped = _quoteattrsub(_quoteattrreplace, s)
+    return u('"%s"') % escaped
+
+def _timestamp():
+    """Return the current local time in ISO 8601 format with a UTC offset.
+
+    The result looks like 2018-10-10T11:45:02+01:00.
+    """
+    tm = time.localtime()
+    # time.altzone and time.timezone are seconds *west* of UTC, so a
+    # positive value means a negative UTC offset.
+    if tm.tm_isdst == 1: # pragma: nocover
+        tz = time.altzone
+    else: # pragma: nocover
+        tz = time.timezone
+
+    timestamp = time.strftime('%Y-%m-%dT%H:%M:%S', tm)
+    # Split the magnitude into hours and minutes and add the sign
+    # separately. Dividing the signed value floors toward negative infinity
+    # under Python 2 integer division (-03:30 came out as -04:30), and an
+    # offset of less than an hour west of UTC lost its minus sign.
+    if tz > 0:
+        sign = '-'
+    else:
+        sign = '+'
+    tzhours, tzmins = divmod(abs(tz) // 60, 60)
+    timestamp += u('%s%02d:%02d') % (sign, tzhours, tzmins)
+    return timestamp
+
+def runxunit(tests, xmlpath):
+    """Run tests with xUnit XML output.
+
+    tests should be a sequence of 2-tuples containing the following:
+
+        (test path, test function)
+
+    This function yields a new sequence where each test function is wrapped
+    with a function that writes test results to an xUnit XML file.
+
+    Results are collected in memory as the wrapped functions are called. The
+    XML file at xmlpath is only written once the returned generator has been
+    iterated to the end, and it covers the wrapped functions called by then.
+
+    Each wrapper is bound to its own path and test function when it is
+    created, so it may be called after the generator has advanced.
+    """
+    suitestart = time.time()
+    timestamp = _timestamp()
+    hostname = socket.gethostname()
+    # One-element lists so the nested function can update the counters.
+    total, skipped, failed = [0], [0], [0]
+    testcases = []
+
+    for path, test in tests:
+        # Bind path and test as defaults. A plain closure would look them up
+        # when called and see whatever the loop assigned last.
+        def testwrapper(path=path, test=test):
+            """Run test and collect XML output"""
+            total[0] += 1
+
+            start = time.time()
+            refout, postout, diff = test()
+            testtime = time.time() - start
+
+            classname = path.decode(locale.getpreferredencoding(), 'replace')
+            name = os.path.basename(classname)
+
+            if postout is None:
+                # No actual output is reported as a skipped test.
+                skipped[0] += 1
+                testcase = (u('  <testcase classname=%(classname)s\n'
+                              '            name=%(name)s\n'
+                              '            time="%(time).6f">\n'
+                              '    <skipped/>\n'
+                              '  </testcase>\n') %
+                            {'classname': _quoteattr(classname),
+                             'name': _quoteattr(name),
+                             'time': testtime})
+            elif diff:
+                failed[0] += 1
+                # Materialize the diff: it is consumed here for the XML and
+                # also handed back to the caller.
+                diff = list(diff)
+                diffu = u('').join(l.decode(locale.getpreferredencoding(),
+                                            'replace')
+                                   for l in diff)
+                testcase = (u('  <testcase classname=%(classname)s\n'
+                              '            name=%(name)s\n'
+                              '            time="%(time).6f">\n'
+                              '    <failure>%(diff)s</failure>\n'
+                              '  </testcase>\n') %
+                            {'classname': _quoteattr(classname),
+                             'name': _quoteattr(name),
+                             'time': testtime,
+                             'diff': _cdata(diffu)})
+            else:
+                testcase = (u('  <testcase classname=%(classname)s\n'
+                              '            name=%(name)s\n'
+                              '            time="%(time).6f"/>\n') %
+                            {'classname': _quoteattr(classname),
+                             'name': _quoteattr(name),
+                             'time': testtime})
+            testcases.append(testcase)
+
+            return refout, postout, diff
+
+        yield path, testwrapper
+
+    # Reached once the caller has iterated past the last test.
+    suitetime = time.time() - suitestart
+    header = (u('<?xml version="1.0" encoding="utf-8"?>\n'
+                '<testsuite name="cram"\n'
+                '           tests="%(total)d"\n'
+                '           failures="%(failed)d"\n'
+                '           skipped="%(skipped)d"\n'
+                '           timestamp=%(timestamp)s\n'
+                '           hostname=%(hostname)s\n'
+                '           time="%(time).6f">\n') %
+              {'total': total[0],
+               'failed': failed[0],
+               'skipped': skipped[0],
+               'timestamp': _quoteattr(timestamp),
+               'hostname': _quoteattr(hostname),
+               'time': suitetime})
+    footer = u('</testsuite>\n')
+
+    xmlfile = open(xmlpath, 'wb')
+    try:
+        xmlfile.write(header.encode('utf-8'))
+        for testcase in testcases:
+            xmlfile.write(testcase.encode('utf-8'))
+        xmlfile.write(footer.encode('utf-8'))
+    finally:
+        xmlfile.close()
diff --git a/tests/scripts/generate_data.py b/tests/scripts/generate_data.py

new file mode 100755 (executable)

index 0000000..278acb5
--- /dev/null
+++ b/tests/scripts/generate_data.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import os, shutil, sys
+from io import StringIO
+
+# FASTA generation
+fastaSeq_1 = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG"""
+
+fastaSeq_2 = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+AACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAAC"""
+
+fastaSeq_3 = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
+ACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCT"""
+
+# FASTQ generation
+
+fastqSeq_1   = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACAACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG"""
+fastqQuals_1 = """[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[["""
+
+fastqSeq_2   = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACAACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAAC"""
+fastqQuals_2 = """[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[["""
+
+fastqSeq_3   = """TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCT"""
+fastqQuals_3 = """]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]"""
+
+
+# file creation decorator
+def fileMaker(func):
+    """Decorate a maker method so it reports progress on stdout.
+
+    The wrapper prints " - Creating file: <name>..." before the call and
+    "done." after it, where <name> is the second positional argument (the
+    file name when the decorated function is called as a method). All
+    positional and keyword arguments are passed on to func, and its return
+    value is returned unchanged.
+    """
+    def inner(*args, **kwargs):
+        print(" - Creating file: %s..." % args[1], end='')
+        sys.stdout.flush()
+        # Forward keyword arguments as well; they used to be dropped.
+        retval = func(*args, **kwargs)
+        print("done.")
+        sys.stdout.flush()
+        return retval
+    return inner
+
+# symlink creation decorator
+def fileLinker(func):
+    """Decorate a symlink-creating method so it reports progress on stdout.
+
+    The wrapper prints " - Creating symlink: <name>..." before the call and
+    "done." after it, where <name> is the second positional argument (the
+    link name when the decorated function is called as a method). All
+    positional and keyword arguments are passed on to func, and its return
+    value is returned unchanged.
+    """
+    def inner(*args, **kwargs):
+        print(" - Creating symlink: %s..." % args[1], end='')
+        sys.stdout.flush()
+        # Forward keyword arguments as well; they used to be dropped.
+        retval = func(*args, **kwargs)
+        print("done.")
+        sys.stdout.flush()
+        return retval
+    return inner
+
+# return a copy of original, minus any lines that contain an entry in blacklist
+def trimXmlElements(original, blacklist):
+    """Return original without the lines that contain any blacklist entry.
+
+    original is split into lines; a line is kept only when none of the
+    strings in blacklist occurs in it. Every kept line is terminated with a
+    newline in the result.
+    """
+    kept = [line + '\n'
+            for line in original.splitlines()
+            if not any(entry in line for entry in blacklist)]
+    return ''.join(kept)
+
+class TestDataGenerator:
+    """Creates the generated files and symlinks used by the test suite.
+
+    source is the directory holding the existing test data; dest is the
+    directory that receives the generated files and symlinks. Call
+    generate() to create whatever is still missing in dest.
+    """
+
+    def __init__(self, source, dest):
+
+        # source/destination directories
+        # generate() changes the working directory, so relative paths are
+        # anchored at the current directory here. os.path.join() returns an
+        # absolute second argument unchanged.
+        cwd = os.getcwd()
+        self.testDataDir      = os.path.join(cwd, source)
+        self.generatedDataDir = os.path.join(cwd, dest)
+
+        # generated output files/symlinks & 'maker' functions
+        self.outputFiles = {
+            'truncated.bam' : self.makeTruncatedBam,
+            'chunking_emptyfilters.subreadset.xml'   : self.makeChunkingXml,
+            'chunking_missingfilters.subreadset.xml' : self.makeChunkingXml,
+            'normal.fa' : self.makeNormalFasta,
+            'normal.fq' : self.makeNormalFastq
+        }
+        self.outputSymlinks = {
+            'aligned.bam'      : self.makeAlignedBamCopy,
+            'aligned.bam.bai'  : self.makeAlignedBamCopy,
+            'aligned.bam.pbi'  : self.makeAlignedBamCopy,
+            'aligned2.bam'     : self.makeAlignedBamCopy,
+            'aligned2.bam.bai' : self.makeAlignedBamCopy,
+            'aligned2.bam.pbi' : self.makeAlignedBamCopy,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam'     : self.makeChunkingSymlink,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi' : self.makeChunkingSymlink,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam'     : self.makeChunkingSymlink,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.2.subreads.bam.pbi' : self.makeChunkingSymlink,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam'     : self.makeChunkingSymlink,
+            'm150404_101626_42267_c100807920800000001823174110291514_s1_p0.3.subreads.bam.pbi' : self.makeChunkingSymlink,
+            'missing_pbi.bam' : self.makeMissingPbiBam,
+        }
+
+    def editChunkingXml(self, outputFn, removeFiltersNode):
+        """Write a filtered copy of chunking/chunking.subreadset.xml to outputFn.
+
+        Lines containing a filter, properties or property element tag are
+        dropped; with removeFiltersNode set, lines containing the enclosing
+        'pbds:Filters>' tag are dropped as well.
+        """
+        inputXmlFn  = os.path.join(self.testDataDir,'chunking','chunking.subreadset.xml')
+        outputXmlFn = os.path.join(self.generatedDataDir,outputFn)
+
+        blacklist = ['pbds:Filter>', 'pbbase:Properties>', '<pbbase:Property']
+        if removeFiltersNode:
+            blacklist.append('pbds:Filters>')
+
+        with open(inputXmlFn, 'r') as xml_infile:
+            inputXml = xml_infile.read()
+        outputXml = trimXmlElements(inputXml, blacklist)
+        with open(outputXmlFn, 'w') as xml_outfile:
+            xml_outfile.write(outputXml)
+
+    @fileLinker
+    def makeAlignedBamCopy(self, outputFn):
+        """Symlink outputFn in the generated dir to the same name in the test data dir."""
+        source = os.path.join(self.testDataDir,outputFn)
+        dest   = os.path.join(self.generatedDataDir, outputFn)
+        os.symlink(source, dest)
+
+    @fileLinker
+    def makeChunkingSymlink(self, outputFn):
+        """Symlink outputFn in the generated dir to the same name under chunking/."""
+        source = os.path.join(self.testDataDir,'chunking', outputFn)
+        dest   = os.path.join(self.generatedDataDir, outputFn)
+        os.symlink(source, dest)
+
+    @fileLinker
+    def makeMissingPbiBam(self, outputFn):
+        """Symlink outputFn in the generated dir to phi29.bam in the test data dir."""
+        source = os.path.join(self.testDataDir, 'phi29.bam')
+        dest   = os.path.join(self.generatedDataDir, outputFn)
+        os.symlink(source, dest)
+
+    @fileMaker
+    def makeChunkingXml(self, outputFn):
+        """Create one of the chunking XML variants.
+
+        The 'emptyfilters' file keeps the Filters node; any other name has
+        it removed.
+        """
+        removeFiltersNode = (outputFn != 'chunking_emptyfilters.subreadset.xml')
+        self.editChunkingXml(outputFn, removeFiltersNode)
+
+    @fileMaker
+    def makeNormalFasta(self, outputFn):
+        """Write the three FASTA records (named 1, 2, 3) to outputFn in the generated dir."""
+        content = ">1\n" + fastaSeq_1 + "\n>2\n" + fastaSeq_2 + "\n>3\n" + fastaSeq_3
+        dest = os.path.join(self.generatedDataDir, outputFn)
+        # Write to the generated dir explicitly instead of relying on the
+        # current working directory.
+        with open(dest, 'w') as fasta_out:
+            fasta_out.write(content)
+
+    @fileMaker
+    def makeNormalFastq(self, outputFn):
+        """Write the three FASTQ records (named 1, 2, 3) to outputFn in the generated dir."""
+        content = ("@1\n" + fastqSeq_1 + "\n+\n" + fastqQuals_1 + "\n" +
+                   "@2\n" + fastqSeq_2 + "\n+\n" + fastqQuals_2 + "\n" +
+                   "@3\n" + fastqSeq_3 + "\n+\n" + fastqQuals_3 + "\n")
+        dest = os.path.join(self.generatedDataDir, outputFn)
+        # Write to the generated dir explicitly instead of relying on the
+        # current working directory.
+        with open(dest, 'w') as fastq_out:
+            fastq_out.write(content)
+
+    @fileMaker
+    def makeTruncatedBam(self, outputFn):
+        """Copy phi29.bam to outputFn in the generated dir and cut it to 200 bytes."""
+        source = os.path.join(self.testDataDir, 'phi29.bam')
+        dest   = os.path.join(self.generatedDataDir, outputFn)
+        shutil.copyfile(source, dest)
+        with open(dest, 'r+b') as in_file:
+            in_file.truncate(200)
+
+    # main entry point
+    def generate(self):
+        """Create every output file and symlink that does not exist yet.
+
+        Changes the working directory to the generated dir. Prints nothing
+        when everything is already in place.
+        """
+
+        # skip file if it exists
+        os.chdir(self.generatedDataDir)
+        filenames = list(self.outputFiles.keys())
+        for file in filenames:
+            if os.path.exists(file) :
+                del self.outputFiles[file]
+
+        # skip symlink if it exists
+        # (lexists, so a dangling link also counts as existing)
+        symlinks = list(self.outputSymlinks.keys())
+        for link in symlinks:
+            if os.path.lexists(link):
+                del self.outputSymlinks[link]
+
+        # only print message & run makers, if any files/symlinks to be created
+        # else silent success
+        if self.outputFiles or self.outputSymlinks:
+            print('Generating test data in %s ' % self.generatedDataDir)
+            for file, func in self.outputFiles.items():
+                func(file)
+            for link, func in self.outputSymlinks.items():
+                func(link)
+
+# script entry point
+if __name__ == '__main__':
+    # Both directories are required. Exit with a usage message instead of
+    # an IndexError traceback when one is missing.
+    if len(sys.argv) < 3:
+        sys.exit('usage: %s <test data dir> <generated data dir>' % sys.argv[0])
+    g = TestDataGenerator(sys.argv[1], sys.argv[2])
+    g.generate()
diff --git a/tests/src/PbbamTestData.h.in b/tests/src/PbbamTestData.h.in

new file mode 100644 (file)

index 0000000..3a620af
--- /dev/null
+++ b/tests/src/PbbamTestData.h.in
@@ -0,0 +1,23 @@
+// Author: Derek Barnett
+
+#ifndef PBBAMTESTDATA_H
+#define PBBAMTESTDATA_H
+
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+namespace PbbamTestsConfig {
+
+// Paths used by the tests. Each @...@ placeholder stands for a value that
+// is filled in when this template is turned into a header by the build.
+
+// Tests directory and the data directory beneath it.
+const std::string Source_Dir = "@PacBioBAM_TestsDir@";
+const std::string Data_Dir   = "@PacBioBAM_TestsDir@/data";
+
+// Binary directory of the tests build.
+const std::string Bin_Dir    = "@CMAKE_CURRENT_BINARY_DIR@";
+
+// Locations of generated sources and generated test data.
+const std::string Generated_Dir     = "@GeneratedDir@";
+const std::string GeneratedData_Dir = "@GeneratedTestDataDir@";
+
+// Path of the bam2sam executable.
+const std::string Bam2Sam    = "@PacBioBAM_BinDir@/bam2sam";
+
+} // namespace PbbamTestsConfig
+} // namespace BAM
+} // namespace PacBio
+
+#endif // PBBAMTESTDATA_H
diff --git a/tests/src/cram/bam2sam.t.in b/tests/src/cram/bam2sam.t.in

new file mode 100644 (file)

index 0000000..66645c4
--- /dev/null
+++ b/tests/src/cram/bam2sam.t.in
@@ -0,0 +1,63 @@
+Setup:
+
+  $ BAM2SAM="@PacBioBAM_BinDir@/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+
+Normal:
+
+  $ $BAM2SAM < $DATADIR/phi29.bam | head -n 5
+  @HD\tVN:3.0.0\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\tPU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0 (esc)
+  @PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0 (esc)
+  @PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/2067_4072\t4\t*\t0\t255\t*\t*\t0\t0\tAAGTCATGTATAGAGTTATTGGCTCAGCGGTGGCAAGCAGCCAACTCAGCTCCTTTCGGGCTTGTTAGCAGCCGGATCCACACTCTGAAATTCCTGCAGCTCGAGTTATTTGATAGTAAAAGTGGGTCATCAAACCGCAACTACGCCACCCCGGTACCTGAACAGGCTTCGGTTTCATTTTGAGACGAGAAAAACCCACTTTGAAGTTTTCGAAAATCACTTCCTTTTTTGATTTTGTCCGTCATGCCTGCGCATTTCACAAGAGAACTTGATGTCAGTGTTAGTCGTCAGGAGAGCCTCTACCAGTTTGCCGTCTACTTCTTTCATGTAAATATCCTGGATGTAAGGTTTTCTGACGCAGATATTGTTGGCAACGCTTAAAAAGTGGGATTCGTGTGGCCAGTAGCCCAGGTTATCTTTCGGGTCCACGATGTCTTTGATAACGTCCAGAATTTCAGTACCAGTTCAGGTGAATAGAACGGTATCGCAGTTTCAGATAATACGATCATGACAGCCTGCGCTGGCTGTGATGGTCGTATAGCGTGCCCAGGCCGTGATAAAGACACCCATCCGGGGTATATACAGGGTCTTTCGTTTCCTCCCTAGCCTAAGAAAGAAAACCCAGAGCACCGTTCTCTTCAAGGTATGGCACTTTACCGTAACGTCCGGGTTGGAAAGCGAAATTTACCGTACAGGCCTGGTTCAGCATCAGCTTCCCGCCAGCTGTTTGATGGCGTCCCTTCAGAGGTAGTTTTGATATACGTCCATTTTGTCGATAAAGTCCTTGAACAGGCAAGTGGTTCCTTTGAAACTTCAGACCAAGAGGATAATATTCAACATTGTACAGGTCGTAGTGTCTTTCATCAGTTCCAGATCAAACATTAGAACAGCCACAAGGTCAGCAAATTTCACCGCCGGAAGATTTCAGGTAATTCGTTACGCTTGTAAAGAAATTGGGACGCTTGATCTGGGATGGTCGGGATCTAGCCTCCTTCAGTTCAAATTCACTTAACGAATGTGCTGAATGTGCCAGAGGGTAATCCTCGTCCAAAGACGGTATTTACCCTCGAAACGATCGGCTCGCCGTACGGCAACGCAGGCGAGAGTACATTCTGCAGCTGGGTACAGGGAATTAAACATCAAAAACCATACCTTCGCCGATCTCTTTTTCCTTTAAAACGTCATTCAGCCACGGTGAAGACCAACCACGGTAAGGCATAACGAACTTCCTTGTCCAGACCCAGGCTCAGGGTCCGGGAAAACCTTTTTAAACTTCTTGTGGTGATAATGTCTTTGAAGCCTTTCAGAAGTCAGAACCTGCCGTCATACGATCCAAGACCACTGCTTGAACCTGGATCAGCAGACTTCTGCGATAATCTGAATATCGTTTTTAATGTAGGCAATATTTCTTCCGGGTGATTTTGTAACCGACCGGGCGCTCATTATGATAAGTCGATGTCGCCTTTCAGTAGCCGTCAGTTAAAAGTCTTTCGCAATTTTCTTAACGCGGAAAGGCAAGTTTCTAGAGAGTCGTAAGATCACGTGTGGAATCTTGCGTTTACCCTTGTAACCCAGGCAATACAATCATATAACCACTGGCCCAATGCGAGAAATGATGGTGTTGTAGGTATTTTGGCAGACCATCTGCGGACCGCATTTAAAACGTTACGTTCCAGCCAGTTGATGATGAAATGCGCCATCAAAATTTCAGATTTGTGGAAGTACAGGTCCCAGCCTGAAAACTTTGCAGAAACCCAAGCCATAAAATTCATCCAGGGAGTTACCAATCTTATACTCGGAGTGGTCTTCGATGTTCATTGTAACCGTATGCCCATACGCGGCAATCTTCAAGCCTTGGTGTAGTCTCAAAGTCGCAGGAATACATTTACGTG
GCATGTGTTTCATATGTCATTCCTTCTTAAAGTTAAACAAAATTATTTCTAGAGGGGAATTGTTTATCCGCCTCACATTTCCCTATAGTGAGTCGTATTAATTTCGCGGGATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:4072\tqs:i:2067\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+
+Explicit Filename (not stdin):
+
+  $ $BAM2SAM $DATADIR/phi29.bam | head -n 5
+  @HD\tVN:3.0.0\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\tPU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0 (esc)
+  @PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0 (esc)
+  @PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/2067_4072\t4\t*\t0\t255\t*\t*\t0\t0\tAAGTCATGTATAGAGTTATTGGCTCAGCGGTGGCAAGCAGCCAACTCAGCTCCTTTCGGGCTTGTTAGCAGCCGGATCCACACTCTGAAATTCCTGCAGCTCGAGTTATTTGATAGTAAAAGTGGGTCATCAAACCGCAACTACGCCACCCCGGTACCTGAACAGGCTTCGGTTTCATTTTGAGACGAGAAAAACCCACTTTGAAGTTTTCGAAAATCACTTCCTTTTTTGATTTTGTCCGTCATGCCTGCGCATTTCACAAGAGAACTTGATGTCAGTGTTAGTCGTCAGGAGAGCCTCTACCAGTTTGCCGTCTACTTCTTTCATGTAAATATCCTGGATGTAAGGTTTTCTGACGCAGATATTGTTGGCAACGCTTAAAAAGTGGGATTCGTGTGGCCAGTAGCCCAGGTTATCTTTCGGGTCCACGATGTCTTTGATAACGTCCAGAATTTCAGTACCAGTTCAGGTGAATAGAACGGTATCGCAGTTTCAGATAATACGATCATGACAGCCTGCGCTGGCTGTGATGGTCGTATAGCGTGCCCAGGCCGTGATAAAGACACCCATCCGGGGTATATACAGGGTCTTTCGTTTCCTCCCTAGCCTAAGAAAGAAAACCCAGAGCACCGTTCTCTTCAAGGTATGGCACTTTACCGTAACGTCCGGGTTGGAAAGCGAAATTTACCGTACAGGCCTGGTTCAGCATCAGCTTCCCGCCAGCTGTTTGATGGCGTCCCTTCAGAGGTAGTTTTGATATACGTCCATTTTGTCGATAAAGTCCTTGAACAGGCAAGTGGTTCCTTTGAAACTTCAGACCAAGAGGATAATATTCAACATTGTACAGGTCGTAGTGTCTTTCATCAGTTCCAGATCAAACATTAGAACAGCCACAAGGTCAGCAAATTTCACCGCCGGAAGATTTCAGGTAATTCGTTACGCTTGTAAAGAAATTGGGACGCTTGATCTGGGATGGTCGGGATCTAGCCTCCTTCAGTTCAAATTCACTTAACGAATGTGCTGAATGTGCCAGAGGGTAATCCTCGTCCAAAGACGGTATTTACCCTCGAAACGATCGGCTCGCCGTACGGCAACGCAGGCGAGAGTACATTCTGCAGCTGGGTACAGGGAATTAAACATCAAAAACCATACCTTCGCCGATCTCTTTTTCCTTTAAAACGTCATTCAGCCACGGTGAAGACCAACCACGGTAAGGCATAACGAACTTCCTTGTCCAGACCCAGGCTCAGGGTCCGGGAAAACCTTTTTAAACTTCTTGTGGTGATAATGTCTTTGAAGCCTTTCAGAAGTCAGAACCTGCCGTCATACGATCCAAGACCACTGCTTGAACCTGGATCAGCAGACTTCTGCGATAATCTGAATATCGTTTTTAATGTAGGCAATATTTCTTCCGGGTGATTTTGTAACCGACCGGGCGCTCATTATGATAAGTCGATGTCGCCTTTCAGTAGCCGTCAGTTAAAAGTCTTTCGCAATTTTCTTAACGCGGAAAGGCAAGTTTCTAGAGAGTCGTAAGATCACGTGTGGAATCTTGCGTTTACCCTTGTAACCCAGGCAATACAATCATATAACCACTGGCCCAATGCGAGAAATGATGGTGTTGTAGGTATTTTGGCAGACCATCTGCGGACCGCATTTAAAACGTTACGTTCCAGCCAGTTGATGATGAAATGCGCCATCAAAATTTCAGATTTGTGGAAGTACAGGTCCCAGCCTGAAAACTTTGCAGAAACCCAAGCCATAAAATTCATCCAGGGAGTTACCAATCTTATACTCGGAGTGGTCTTCGATGTTCATTGTAACCGTATGCCCATACGCGGCAATCTTCAAGCCTTGGTGTAGTCTCAAAGTCGCAGGAATACATTTACGTG
GCATGTGTTTCATATGTCATTCCTTCTTAAAGTTAAACAAAATTATTTCTAGAGGGGAATTGTTTATCCGCCTCACATTTCCCTATAGTGAGTCGTATTAATTTCGCGGGATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:4072\tqs:i:2067\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+
+Header-Only:
+
+  $ $BAM2SAM --header-only < $DATADIR/phi29.bam | head -n 5
+  @HD\tVN:3.0.0\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\tPU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0 (esc)
+  @PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0 (esc)
+  @PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2 (esc)
+
+No-Header:
+
+  $ $BAM2SAM --no-header < $DATADIR/phi29.bam | head -n 5
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/2067_4072\t4\t*\t0\t255\t*\t*\t0\t0\tAAGTCATGTATAGAGTTATTGGCTCAGCGGTGGCAAGCAGCCAACTCAGCTCCTTTCGGGCTTGTTAGCAGCCGGATCCACACTCTGAAATTCCTGCAGCTCGAGTTATTTGATAGTAAAAGTGGGTCATCAAACCGCAACTACGCCACCCCGGTACCTGAACAGGCTTCGGTTTCATTTTGAGACGAGAAAAACCCACTTTGAAGTTTTCGAAAATCACTTCCTTTTTTGATTTTGTCCGTCATGCCTGCGCATTTCACAAGAGAACTTGATGTCAGTGTTAGTCGTCAGGAGAGCCTCTACCAGTTTGCCGTCTACTTCTTTCATGTAAATATCCTGGATGTAAGGTTTTCTGACGCAGATATTGTTGGCAACGCTTAAAAAGTGGGATTCGTGTGGCCAGTAGCCCAGGTTATCTTTCGGGTCCACGATGTCTTTGATAACGTCCAGAATTTCAGTACCAGTTCAGGTGAATAGAACGGTATCGCAGTTTCAGATAATACGATCATGACAGCCTGCGCTGGCTGTGATGGTCGTATAGCGTGCCCAGGCCGTGATAAAGACACCCATCCGGGGTATATACAGGGTCTTTCGTTTCCTCCCTAGCCTAAGAAAGAAAACCCAGAGCACCGTTCTCTTCAAGGTATGGCACTTTACCGTAACGTCCGGGTTGGAAAGCGAAATTTACCGTACAGGCCTGGTTCAGCATCAGCTTCCCGCCAGCTGTTTGATGGCGTCCCTTCAGAGGTAGTTTTGATATACGTCCATTTTGTCGATAAAGTCCTTGAACAGGCAAGTGGTTCCTTTGAAACTTCAGACCAAGAGGATAATATTCAACATTGTACAGGTCGTAGTGTCTTTCATCAGTTCCAGATCAAACATTAGAACAGCCACAAGGTCAGCAAATTTCACCGCCGGAAGATTTCAGGTAATTCGTTACGCTTGTAAAGAAATTGGGACGCTTGATCTGGGATGGTCGGGATCTAGCCTCCTTCAGTTCAAATTCACTTAACGAATGTGCTGAATGTGCCAGAGGGTAATCCTCGTCCAAAGACGGTATTTACCCTCGAAACGATCGGCTCGCCGTACGGCAACGCAGGCGAGAGTACATTCTGCAGCTGGGTACAGGGAATTAAACATCAAAAACCATACCTTCGCCGATCTCTTTTTCCTTTAAAACGTCATTCAGCCACGGTGAAGACCAACCACGGTAAGGCATAACGAACTTCCTTGTCCAGACCCAGGCTCAGGGTCCGGGAAAACCTTTTTAAACTTCTTGTGGTGATAATGTCTTTGAAGCCTTTCAGAAGTCAGAACCTGCCGTCATACGATCCAAGACCACTGCTTGAACCTGGATCAGCAGACTTCTGCGATAATCTGAATATCGTTTTTAATGTAGGCAATATTTCTTCCGGGTGATTTTGTAACCGACCGGGCGCTCATTATGATAAGTCGATGTCGCCTTTCAGTAGCCGTCAGTTAAAAGTCTTTCGCAATTTTCTTAACGCGGAAAGGCAAGTTTCTAGAGAGTCGTAAGATCACGTGTGGAATCTTGCGTTTACCCTTGTAACCCAGGCAATACAATCATATAACCACTGGCCCAATGCGAGAAATGATGGTGTTGTAGGTATTTTGGCAGACCATCTGCGGACCGCATTTAAAACGTTACGTTCCAGCCAGTTGATGATGAAATGCGCCATCAAAATTTCAGATTTGTGGAAGTACAGGTCCCAGCCTGAAAACTTTGCAGAAACCCAAGCCATAAAATTCATCCAGGGAGTTACCAATCTTATACTCGGAGTGGTCTTCGATGTTCATTGTAACCGTATGCCCATACGCGGCAATCTTCAAGCCTTGGTGTAGTCTCAAAGTCGCAGGAATACATTTACGTG
GCATGTGTTTCATATGTCATTCCTTCTTAAAGTTAAACAAAATTATTTCTAGAGGGGAATTGTTTATCCGCCTCACATTTCCCTATAGTGAGTCGTATTAATTTCGCGGGATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:4072\tqs:i:2067\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/4151_6183\t4\t*\t0\t255\t*\t*\t0\t0\tGATCCCGCGAATTAATTACGACTCACTATAGGGGAATTGTGAGCGGATAACAATTCCCGCCTCTAGAAATAATTTTGTTTAAACTTTTAAGAAAGGAGATATTACATATGAAACACAGCCACGTAAAATGTATTCCTGCGACTTGGAGACTACCACCAAGGTGAAGATTTGCCGCGTAATGGGCATACGGTTTACATGAAACATCGAAGAACAAACTCGAGTATAAGATTGGTAACTCCCCTGGATGAATTATGGCTTGGGTTACTGAAAGTTCGAGGTCTGACCTGTACTTCGCACAAATCTGAAAATTTGATGGCCGCAAATTTCAATTCATCACTGGCTGGAACGTAAACGGTTTTAAATGGTCCGCAGATCGGTCTGTGCCAAATACCCTGATCAACACATCATTTCTTCGCAATGCGGCCAGTGTAATGATTGATATCTTGCCCTGGGTTGACAAGGGGTAAACGCAAGATCCACACCTGTGATCTACGACCTCTCTGAAGAAAACTGCGTTTCCGGTTAAGAAAATTGCGAAAGACTTTAAGCTGAACGGTACTGAAAAGCGACATGACTATCATAATGAGCGCCCGGTCGTTACAAAATCACCCCGGAAAGAATATGCCTACATTTAAAAACGATATTCAGATTATCGCAAGAACTCTGCTGATCAGTTCAAGCAAGGGTCTGGATCGTAAATGACGGCAGGTTCTGACTCTCCTGAAAGGCTTCAAAAGACATTATCACCACCTAAAAGAAGTTTAAAAAGGTTTTTTCACCGACCCTGAGCCTAGGGCTGGACAAGGAAAGTTGTTAATGCCCATACCGTGGTGGTTTCACCTGGCTGAAAGACCGTTTTAAAGAAAAAGAGATCGGCGAAGGTATGGTTTTTGATGTTAATTCCCTGTAACCAAGCCTTCAATGTACTCTCGCCTGCTTGCCGTCACACGGGCGAGCGACGTATTCGAAAGGGTAAAATACGTTCTGGGACGGAGGATTTACCCTCTGCAATTCGGCACATTCCGTTGTGAATTTGGAACTGAAAGGAAGGCTTAGATCCCGACCATCCCAGATCAAGCGTTCCCATTTCTAACAAAGGGTAACGAATACCTGAAATCTTCCAGGCGGTGAAATTGCTGACCTGTGGCTGTCTAAATGTTTGATCTTGGAAACTGATGAAAGAGCACTACGACCTGGTACAATGTTGAATATATCTCTGGTCTGAAGTTCAAAGCAACCACTGGCCTGTTCAAGGACTTTATCGACAATGGACGTATATCAAAAGACTACCTCTGAAGACGCCATCAAACAGCTGGCGAAGCTGATGCTGACAAGCCTGTACGGTAAATTCGCGTCCCACCCGGACGTTTACCGGGTAAAGTGCCATATGCTGAAAGAGAAAGCGGTGCTCTGGTTTTTCGTCTAGGTGGAAGGAGGAAACGAAGACACTGTATATACCGCCGAATGGGTGTCTTTATCCAAGCGGCCTGGCACGCTATACGACCATCACAGGCAAGCGCAGGCTTTTGTTAATGATCGTATTATCTACTGCGATTACCGATTCTACTTCACTGACTGGTACTGAAATCTGGACGTTATCAAAGACATCGTAGACCCGAAGAAACTGGGCTACTGGCACACGAATTCCACTTTAAGCGTGCAAAATATCTGCGTCAGAAAACCTACATCCCAGGATATTTACATGAAAGAAGTAGACGGCAAACTGGTAGAGGGCTCTCCGTGACGACTACACTGACATCAAGTTCTCTGTGAAATGCGCAGGCAAATGACGGCACAAAATCCAAAAAAGGAAGTGACTTTCGAAAACTTCAAAAGTGGGTTCTCGTAAAATGAAACCGAAAGCTGTTCAGGTTTAAACCCGGGTGGCGTAG
TGCCTGGTTGATGAACACTTTTTACTATCAAAATAACTTCGAAAGCTGCAGGAATTCAAGCTGATCCGGCTGCTAACAAAGCCCGAAGGAAGCTGAGTTGGCTGCTGCCACCGTGAGCAATACTCTAAATACATGACTCT\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:6183\tqs:i:4151\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/6234_8214\t4\t*\t0\t255\t*\t*\t0\t0\tAGAGTCATGTATAAGAGTTATTGCTCAGCGGTGGCAGCAGACAACTCAGCTTCCTTTCGGGCCTTTGTTAGCAGCCGGATCCAAGCTTGAATTCCTGCAAGCTCGAGTTATTTGATAGTAAAAGTGTCATCAAACCAGCACTACGGCCGAACCCGGTACCTGAACAGATTCGTTTCATTTTACGAGAAAAACCCACTTTGAAGTTTTGCCGAAAGTCACTTCTTTTTGATTTGTCCGTCATGCTGCGCATTTCACAGAGACTTGAATGTCAGTGTAGTCGTCATCGGGGGGGGGAAGAGCCCTCTACCAGTTTTGCCGTCTACTTCTTTCATGTAAATATCTGGATGTAGGTTTTCTGAACGCAGATATTTGCAGCTTAAAAGTGGATTCGTGTGCCCAGTAGCCCGTTTTCTTCGGGTCCTACGATGTCTTTTGATAACGTCCAGAATTTCAGTACCAGTCAGGTGAATAGAATCGGTATCGCAGTAGATAAATACGATCATAACAAGCCTGCGCTGCCTGTTGATGGTCGTATAGCGTGCCCAGGCCCGTGATAAAGAACCATCGGGGTATATAACAGGGTCTTTCGTTCCTCCTCACCTAGACGAAAAACCCAGAGCACCGTTCTCTTTCAAGGTATGGCCTTTACCGGTAACGTCCGGGTTGGACGCGAATTTAAGCCGTAACAGGCTGTCTCAGCATACAGCTTTCGCCAGCCTGTTTGATGGCCGTCTTCAGAGGTAGTTTTGATATACGTTCCATTTGTCGATAAAGTCCCTTGAGCAGGCCCAGTGGGTTGGCTTTGAACTCAGACGCAGAATATATTCAACATTGTAACAGGTCGTAGTGCTCTTTCATCAGTTCAGATTCAACATTAGACAAGCCACAGGTCAGCATTTCACCGCCGGGAAGAATTTCAAGGTATTCGTTTACCCTTGGTAGAAATGGAACGCTTGTAATCTGGATGGTCGGGATCTAGCCTTTTCAGTTCAAAATTCACACGAATGTGCTGAATGTGCAGAGGGTAATACCTCGTCCCAGACGTATTTACCCTCGAATAAGCGAATCGGCTCGCCGTATCGCAGCAGGCGAGAGTAAACATTTGAGCTGGGTAACAGGGAATTACATCCAAAACCATACCCTTTCGCACGATCTCTTTTTCTTTAAAACGTCATTCAGCCAGGTGAAACCACCAGGTAGGCATAACGAACTTCCTGTCCAGACCCAAGGCTCAGGTCGGGAAAACTTTTTAAACTTCCTTGTGGTGATAATGTCTTTTGAAGCTTTCAGAGAGTCAGAACCTGCGTCATACGATCCAGACCGCTGCTTGAAGCTGGATCAGCAAGCTTCTGCGATAATCTGAATATCGTTTATTAATTAGGCATATTCTTCCGGGTGATTTTGTAACCGACCGGGCGCTCATTATGATAGTCGATGTCGCCCCTTTCAAGTACCGTCAGCTAAAGTCTTTCGCATTTTCTTACCGGAAACGGCAGTTTCTTCAGAGAGTCGTAAGATCACGTGTGGATCTTGCGTTTACCGCTTGTAACCCAGGCAAATATCAATCATATACCACTGGCCATGCGAGAAATGATGGTGTTGTAGGTTATTGGCAGACCATCTGCGGACCATTTTAAAACCGTTACGTTCAGCCAGTTGATGAATGAATGCGCCATGCAAATTTCAGATTGTGGAAGTACAAGGTCAGCCTGACTTTCAGAACCCAAAGCCATAAATTCATCCAGGGAGTTACCATCTTATACTCCGGAGTTGGTCTTCGATGTTCATGTAACCGTATGCCCATACGCGGCAAATCTTCAACCTTGGTGTGTAGTCCTCAAAGTCGCAGGAATACATTTACGTGGCATGTGTTTTCATAATGTATATCTCCT
TCTTAAAGTTAAACAAAATTATTTCTAGAGGGGAATTGTTATCCCGCTCACAATCCCCTATCAGTGAGTCGTATTAATTTCGCGGATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:8214\tqs:i:6234\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/8294_10277\t4\t*\t0\t255\t*\t*\t0\t0\tGATTCCCGCGAAATTAATACGAATCACTATAAGGGGAATTGTGAGCGGATAACAATTCCCCTCTAGAAATAATTTTGTTTAACTTTAAGAGGGACGATATACATATGAACACATGCCTACGTAAAATGTATTCCTGCGAACTGTTGAGACTACCACCAAGGTTGAAGATTTGCCGCGTAATGGGCATACGGTTACATGAACATCGAAGACCACTCCGATATGAAGATTGGTTAACCCCTGGATGAATTTATGGCTTGGGTTCTGAAAGTTCAGGCTGACCTGTACTTCACAATCTGAAATTTGATGGCCGCATTCATCAATCACTGGCTGGAACGTAAAACGGTTTAAAAATGGTCCCGCAGATGGTCTGACAAATTAACTACAACACCATCATTTCTCGCATGGGCCCAGTGGTATATGAAATTGATATTTGCCTGGGTTACAAGGAGGTAAACGCAAGATCCACACGTGGATCTACGACTCTTCTGAAGAAACCTGGCCGTTTCCGTTAAGAAAATGCGAAAGAACTTAAGCTGACGGTAACTGAAAGGCGACATCGACTATCATATAATGAAGCGCCCGTCGTTACAAAATCACCCCGGAAGAATATGCCTTACATTAAAAAACGATATTCAGATTTCGCAGAAGCTCTGCTGATCCAGTTCAAAGCAGGGTCCTGGATCGTAATGACGGCAGGTTCTGACTCTCTGAAAGGCTTCAAAGAACATTATCACCCACCAAGAAGTTTAAAAAGGTTTTCCCGACAACTGAGCCTGGGTCTGGACAAGGAAGTTTCGTTTGCCTACCGTGGTGGTTTTCAACCTGCTGACTGAACCGTTTTAAAAGAAAATAGAGATCGGCGGAAAGGTATGGTTTTTGATGTTAATTCCTGTAACCAGCCTCAAAATGTACTCTCGCCTGCTGCCGTACGGCGGCCGATCGTATTCGAAGGGTAAATACGTCTGGGACCGAGGATAGCCCTCTGCACATTCAGCACATTCGTTGTGAAATTTGAACTGAAGGAAGCTGATCCCGACGCATCCAGATCAAGCGTTCCCATTTTCTACAAGGTAACGAATACCTGAAATCTTCCCGGCGGTGAAATTGCTGCCTGTGGCTGTCTAATGTTGATCTGGAAACTGATGAAAGAGCACTACGAGACCTGTACAATGTTGAATATATCTCTGGTCTGAAGTTCAAAGCAACCACTGGCCTGTTCAAGGACTTTATCGACAAATGGCGTATTATCAAAACTACCTCTGAAGACGCCATCAAACAGCTGGCGAAGCTGATGCTGACAGCCTGTACGGTAAATTCGCGTCGCAACCCGGACGTTTCCGTAAAGTGCCCATACCTGAAAGAGAAACGGTGCTCTGGGTTTTCGTCTAGGTGAGGAGGAAACGAAAGACCCTGTAATATACCCGATGGTGTCTTTTATCACGGCCTGGGCACGCTAGTACGACCAATCACAGCAGCGCAGGCTTGTTATGATCGTATTTCTACTGCGGATACCGATTCTATTCCACCTGACTGGTACTGAAATTCTGGAACGTTATCAAAGACATCGTAGACCCGAAGAAACTGGGCTACTGGGGCACCACGAATCCACTTTTAAGCGTGGCAAAATATCTGACGTCAGAAAACCTACATCCAGGATATTTACATGAAAGAAGTAGACGGCAACTGTAGAGGGCTCTTCCTGACGAACCTACACTGACATCAAGTTCTCTGTGAAATGCGCAGGCATGACGGACCAAAATCAAAAAGGAAGTGAACTTTTCGAAAACTTCAAAGTGGGTTTTCTCGTAAAATGAAACCGAAGCCTGTCAGGTACCGGGTGGCGTAGTGCTGGTTGATCGGACACTTTACTATCAATAACTCGAGCTGCAGA
ATTCCAAGCTTGGATTCCGGCTGCTAACAAAGCCCGAAAGGAAGCTGAGTTGGCTGCTGCACCGCTGAGCAATAACTCTATACATGACTCAT\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:10277\tqs:i:8294\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+  m140918_150013_42139_c100697631700000001823144703261565_s1_p0/30422/10327_12283\t4\t*\t0\t255\t*\t*\t0\t0\tAGAGTCATGTATAGAGTTATTGCTCAGCGGTGGCAGCACCAACTCAGCTTCCTTTCGGCTTTGTTAGCAGCCGATCCAAGCTTGAATTCCTGCAGCTCGGAGTTATTTGATAGTAAAAGTTGTCATCCAAACGCAGCACTACGCCCACCCGTACCTGAACAGGCTTTCGGTTTCATTTTACGAGAAAAACACTTTTGAAAGTTTTCGAAAGTCACTTCCTTTTTTGATTTTGTCCGTCATGCCTGCGCATTTCACAGAGAACTTGATGTCAGTGTAGTCGTCAGGAGAGCCCTCTACCAGTTTGCCGTCTACTTCTTTCATGTAAATATCCTGGAATGTAGGTTTTTCTGACGCAGATTATTTTGCACGCTTAAAAGTGGATTCGTGTGGCCCCAGTAGCCCAGTTTCTTCGGTCTACGATGTCTTTGATACGTCCAGAATTTCAGTAAACAGTCAGGTGAATAGAAATCCGGTATCGCAGTAGAATAATACGATCATAACAACCTGCGCTGCTGTGTGGTCGTATAGCGTGCCCAGGCCGTGATAACAGACACCTCGGGGTAATATACAGGGTCTTTCCGTTCCTCCTCAACCTAGACGAAACCCAGAGCACCGTTCTCTTTTCAGGTATGGCACTTTAACCGGTACGTCCGGGTTGGACGCGAATTTACCGTAGCAGGCTGTTCAGCATCAGCTTTCGCCAGCCTGTTTGATGGCGCTCTTCAGAGGTAGTTTGAATATACGTCCATTTGTCGAATAAAGTCCTTGGAACAGGCCCAGTGGTTGCTTTGAACTTCCAGACCAGAGATATATTTCAACATTGTACAGGTCGTAGTGCTCTTTCCACTCAGTTCCAGATCAACATTAAGACAGCCACAGGTCAGATTTCCCCGCCGGAAGATTCAGGTAATTCTAGTTACCCTTGTAGAAATGGCGACGCTTGATCTGGATGGTCGGGATCCTAGCTTCCCTTCAGTTCAAATTCACAACGAATGTTGCTGAATCTGTGCAGAGGGTAATCCTCGGTCCAGACGTATTTACCCTCGAATACGATGCTCGCCGTACGGCAGCAGCGAGAGTACATTTGAGCTGGTACAGGGAATTAACATCAAAAAACATACTTCGCCGATCTCTTTTTCTTTAAAACGGTCATTCAGCCAGGTGAAACCACCACGGTAGGCATAACGAAACTTCCTGTCCAGACCCAGGCTCAGGTCGGAAAACTTGTTAAACTTCTTGGTGGTGATAATGTCTTTGAAAGCCTTTCAGGAAGTCAGAACCATGCCGTCATCCGATCCAGACCCCTGCTTTGAACTGGAATCAGCAGAGGCTCTGCGATAATCGAATATCGTTTTTAAATGTAGGCATATTTTCTTCGGGGTGATTTGTAACGCGACCGGGCGCTCATTATGATAGTCGATGTCGCCTTTCAGTACCGTCAGCTTAAAGTCTTTCGCAATTTTCTTAACCGACGGCAGTTTCTTCAGAGAGGTCGTAGATCACGGTGTGGATCTTGCGTTTACCCTTGTAACCAGGCAAATATCAATCATATACCACTGGCCCATGCGAGAATGATGGTGTTGTAGGTATTTGGCAGACGCATCTGCGGACCATTTAAACCGTTACGTTCCAGCCAGTTGATGATGAATGCGCCCATCATTTCAGATTTGTGGAAGGTACAGGTCAGCCTGAACTTGTCAGAAACCCAAGCCATAAATTCATCCAGGGAGTACATCTTATAATCTCGAAGTGGTCTTCGATGTTCATGTAACCGTATGCCCATACGCGCAATCTTCACCTTGGTGGTAGTCTGCAGTCGCAGAATAATTTTACGTGGCATGTGTTTCATATGTTATTAGTCTCCTTCTTAAAGTTAAACAAAATTATTTT
TAGAAGGGGAATTGTTATCCGCTCACAATTCCCCTATAGTGGAGTCGTATTAATTTCGCGGGTATC\t*\tRG:Z:a955def6\tbc:B:S,1,1\tbq:i:1\tcx:i:31\tnp:i:1\tqe:i:12283\tqs:i:10327\trq:f:0.88458\tsn:B:f,22.8448,13.8689,14.6461,14.3552\tzm:i:30422 (esc)
+
+Invalid-Args:
+
+  $ $BAM2SAM --header-only --no-header < $DATADIR/phi29.bam 
+  
+  ERROR: conflicting arguments requested: --no-header and --header-only
+  
+  Usage: bam2sam [options] [input]
+  
+  bam2sam converts a BAM file to SAM. It is essentially a stripped-down 'samtools
+  view', mostly useful for testing/debugging without requiring samtools. Input BAM
+  file is read from a file or stdin, and SAM output is written to stdout.
+  
+  Options:
+    -h, --help            show this help message and exit
+    --version             show program's version number and exit
+  
+    Options:
+      input               Input BAM file. If not provided, stdin will be used as input.
+      --no-header         Omit header from output.
+      --header-only       Print only the header (no records).
+  [1]
+
diff --git a/tests/src/cram/pbbamify.t.in b/tests/src/cram/pbbamify.t.in

new file mode 100644 (file)

index 0000000..b2e3fd8
--- /dev/null
+++ b/tests/src/cram/pbbamify.t.in
@@ -0,0 +1,159 @@
+Setup:
+
+  $ PBBAMIFY="@PacBioBAM_BinDir@/pbbamify" && export PBBAMIFY
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+
+  $ GENERATEDDATADIR="@GeneratedDir@/" && export GENERATEDDATADIR
+
+Forward alignments with and without user-specified tags, one alignment with undefined mapq, some alignments with basic CIGAR operations, 2 alignments with hard clipping, and several invalid alignments (1 without a seq field and 1 not present in the dataset) which should be skipped:
+
+  $ $PBBAMIFY --input=$DATADIR/pbbamify/input-aligned-1.bam $DATADIR/pbbamify/synthetic-ref-1.fa $DATADIR/pbbamify/synthetic_movie_1.subreads.bam | $SAMTOOLS view -h
+  [Warning] No records found for query 'synthetic_movie_1/10/0_100'. Skipping.
+  [Warning] Sequence 'synthetic_movie_1/1/0_100' (length 90) is not of the same length as the PacBio BAM sequence (length 100)! Skipping.
+  [Warning] Found 1 alignments without a seq field which were not converted (most likely secondary alignments).
+  [INFO] Done processing 15 alignments in 0 min.
+  @HD\tVN:1.5\tSO:unknown\tpb:3.0.3 (esc)
+  @SQ\tSN:synthetic_ref_1\tLN:150\tM5:e1e940d621d949c9617566ddf3055922 (esc)
+  @RG\tID:8d2370c0\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;BINDINGKIT=100-862-200;SEQUENCINGKIT=100-861-800;BASECALLERVERSION=5.0.0.5552;FRAMERATEHZ=80.000000\tPU:synthetic_movie_1\tPM:SEQUEL (esc)
+  @PG\tID:Handmade\tPN:Handmade (esc)
+  @PG\tID:baz2bam\tPN:baz2bam\tVN:5.0.0.5552\tCL:/opt/pacbio/ppa-5.0.0/bin/baz2bam /data/pb/synthetic_movie_1.baz -o /data/pb/synthetic_movie_1 --metadata /data/pb/.synthetic_movie_1.metadata.xml -j 12 -b 12 --progress --silent --minSubLength 50 --minSnr 3.750000 --adapters /data/pb/synthetic_movie_1.adapters.fasta (esc)
+  @PG\tID:bazFormat\tPN:bazformat\tVN:1.3.0 (esc)
+  @PG\tID:bazwriter\tPN:bazwriter\tVN:5.0.0.5552 (esc)
+  synthetic_movie_1/1/0_100\t0\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t254\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tZE:f:1\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/2/0_101\t0\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/3/0_100\t4\t*\t0\t255\t*\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:3\tRG:Z:8d2370c0 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\tAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t8S1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I1=6S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+
+Reverse alignments: 2 primary alignments and 7 secondary, 6 alignments with extended CIGAR and 2 with basic CIGAR strings, 1 alignment with undefined (255) mapq, 2 alignments with hard clipping, 1 alignment with a user-defined tag. All alignments have a read group assigned that differs from the dataset's read group.
+  $ $PBBAMIFY --input=$DATADIR/pbbamify/input-aligned-2.bam $DATADIR/pbbamify/synthetic-ref-1.fa $DATADIR/pbbamify/synthetic_movie_2.subreads.bam | $SAMTOOLS view -h
+  [INFO] Done processing 9 alignments in 0 min.
+  @HD\tVN:1.5\tSO:unknown\tpb:3.0.3 (esc)
+  @SQ\tSN:synthetic_ref_1\tLN:150\tM5:e1e940d621d949c9617566ddf3055922 (esc)
+  @RG\tID:7a515ee0\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;BINDINGKIT=100-862-200;SEQUENCINGKIT=100-861-800;BASECALLERVERSION=5.0.0.5552;FRAMERATEHZ=80.000000\tPU:synthetic_movie_2\tPM:SEQUEL (esc)
+  @PG\tID:Handmade\tPN:Handmade (esc)
+  @PG\tID:baz2bam\tPN:baz2bam\tVN:5.0.0.5552\tCL:/opt/pacbio/ppa-5.0.0/bin/baz2bam /data/pb/synthetic_movie_2.baz -o /data/pb/synthetic_movie_2 --metadata /data/pb/.synthetic_movie_2.metadata.xml -j 12 -b 12 --progress --silent --minSubLength 50 --minSnr 3.750000 --adapters /data/pb/synthetic_movie_2.adapters.fasta (esc)
+  @PG\tID:bazFormat\tPN:bazformat\tVN:1.3.0 (esc)
+  @PG\tID:bazwriter\tPN:bazwriter\tVN:5.0.0.5552 (esc)
+  synthetic_movie_2/1000001/0_100\t16\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000001/0_100\t272\tsynthetic_ref_1\t23\t254\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000001/0_100\t272\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tZE:f:1\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000001/0_100\t272\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000001/0_100\t272\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000002/0_101\t16\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000002 (esc)
+  synthetic_movie_2/1000002/0_101\t272\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000002 (esc)
+  synthetic_movie_2/1000002/0_101\t272\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000002 (esc)
+  synthetic_movie_2/1000002/0_101\t272\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000002 (esc)
+
+CCS read:
+
+  $ $PBBAMIFY --input=$DATADIR/pbbamify/input-aligned-3.bam $DATADIR/pbbamify/synthetic-ref-1.fa $DATADIR/pbbamify/synthetic_movie_3.subreads.bam | $SAMTOOLS view -h
+  [INFO] Done processing 1 alignments in 0 min.
+  @HD\tVN:1.5\tSO:unknown\tpb:3.0.1 (esc)
+  @SQ\tSN:synthetic_ref_1\tLN:150\tM5:e1e940d621d949c9617566ddf3055922 (esc)
+  @RG\tID:67e06f58\tPL:PACBIO\tDS:READTYPE=CCS;BINDINGKIT=100-862-200;SEQUENCINGKIT=101-093-700;BASECALLERVERSION=5.0.0.5049;FRAMERATEHZ=80.000000\tPU:synthetic_movie_3\tPM:SEQUEL (esc)
+  @PG\tID:Handmade\tPN:Handmade (esc)
+  @PG\tID:ccs-3.0.0\tPN:ccs\tVN:3.0.0\tDS:Generate circular consensus sequences (ccs) from subreads.\tCL:ccs (esc)
+  synthetic_movie_3/3000001/ccs\t0\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\tCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC\tRG:Z:67e06f58\tnp:i:6\trq:f:0.993687\trs:B:i,8,1,0,0,0,0\tsn:B:f,6.21632,11.7596,4.35394,8.45458\tza:f:nan\tzm:i:3000001\tzs:B:f,nan,nan,nan,nan,nan,nan,-inf,nan,nan (esc)
+
+No verbose output:
+
+  $ $PBBAMIFY --input=$DATADIR/pbbamify/input-aligned-1.bam --verbose-level=0 $DATADIR/pbbamify/synthetic-ref-1.fa $DATADIR/pbbamify/synthetic_movie_1.subreads.bam | $SAMTOOLS view -h
+  @HD\tVN:1.5\tSO:unknown\tpb:3.0.3 (esc)
+  @SQ\tSN:synthetic_ref_1\tLN:150\tM5:e1e940d621d949c9617566ddf3055922 (esc)
+  @RG\tID:8d2370c0\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;BINDINGKIT=100-862-200;SEQUENCINGKIT=100-861-800;BASECALLERVERSION=5.0.0.5552;FRAMERATEHZ=80.000000\tPU:synthetic_movie_1\tPM:SEQUEL (esc)
+  @PG\tID:Handmade\tPN:Handmade (esc)
+  @PG\tID:baz2bam\tPN:baz2bam\tVN:5.0.0.5552\tCL:/opt/pacbio/ppa-5.0.0/bin/baz2bam /data/pb/synthetic_movie_1.baz -o /data/pb/synthetic_movie_1 --metadata /data/pb/.synthetic_movie_1.metadata.xml -j 12 -b 12 --progress --silent --minSubLength 50 --minSnr 3.750000 --adapters /data/pb/synthetic_movie_1.adapters.fasta (esc)
+  @PG\tID:bazFormat\tPN:bazformat\tVN:1.3.0 (esc)
+  @PG\tID:bazwriter\tPN:bazwriter\tVN:5.0.0.5552 (esc)
+  synthetic_movie_1/1/0_100\t0\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t254\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tZE:f:1\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/2/0_101\t0\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/3/0_100\t4\t*\t0\t255\t*\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:3\tRG:Z:8d2370c0 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\tAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t8S1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I1=6S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+
+Test on a dataset whose input contains alignments from all subread sets.
+
+  $ $PBBAMIFY --input=$DATADIR/pbbamify/input-aligned-all.bam $DATADIR/pbbamify/synthetic-ref-1.fa $GENERATEDDATADIR/synthetic_movie_all.subreadset.xml | $SAMTOOLS view -h
+  [Warning] No records found for query 'synthetic_movie_1/10/0_100'. Skipping.
+  [Warning] Sequence 'synthetic_movie_1/1/0_100' (length 90) is not of the same length as the PacBio BAM sequence (length 100)! Skipping.
+  [Warning] Found 1 alignments without a seq field which were not converted (most likely secondary alignments).
+  [INFO] Done processing 25 alignments in 0 min.
+  @HD\tVN:1.5\tSO:unknown\tpb:3.0.3 (esc)
+  @SQ\tSN:synthetic_ref_1\tLN:150\tM5:e1e940d621d949c9617566ddf3055922 (esc)
+  @RG\tID:67e06f58\tPL:PACBIO\tDS:READTYPE=CCS;BINDINGKIT=100-862-200;SEQUENCINGKIT=101-093-700;BASECALLERVERSION=5.0.0.5049;FRAMERATEHZ=80.000000\tPU:synthetic_movie_3\tPM:SEQUEL (esc)
+  @RG\tID:7a515ee0\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;BINDINGKIT=100-862-200;SEQUENCINGKIT=100-861-800;BASECALLERVERSION=5.0.0.5552;FRAMERATEHZ=80.000000\tPU:synthetic_movie_2\tPM:SEQUEL (esc)
+  @RG\tID:8d2370c0\tPL:PACBIO\tDS:READTYPE=SUBREAD;Ipd:CodecV1=ip;PulseWidth:CodecV1=pw;BINDINGKIT=100-862-200;SEQUENCINGKIT=100-861-800;BASECALLERVERSION=5.0.0.5552;FRAMERATEHZ=80.000000\tPU:synthetic_movie_1\tPM:SEQUEL (esc)
+  @PG\tID:Handmade\tPN:Handmade (esc)
+  @PG\tID:baz2bam\tPN:baz2bam\tVN:5.0.0.5552\tCL:/opt/pacbio/ppa-5.0.0/bin/baz2bam /data/pb/synthetic_movie_1.baz -o /data/pb/synthetic_movie_1 --metadata /data/pb/.synthetic_movie_1.metadata.xml -j 12 -b 12 --progress --silent --minSubLength 50 --minSnr 3.750000 --adapters /data/pb/synthetic_movie_1.adapters.fasta (esc)
+  @PG\tID:bazFormat\tPN:bazformat\tVN:1.3.0 (esc)
+  @PG\tID:bazwriter\tPN:bazwriter\tVN:5.0.0.5552 (esc)
+  @PG\tID:ccs-3.0.0\tPN:ccs\tVN:3.0.0\tDS:Generate circular consensus sequences (ccs) from subreads.\tCL:ccs (esc)
+  synthetic_movie_1/1/0_100\t0\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t254\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tZE:f:1\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/2/0_101\t0\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_1/3/0_100\t4\t*\t0\t255\t*\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:3\tRG:Z:8d2370c0 (esc)
+  synthetic_movie_1/1/0_100\t256\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\tAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1 (esc)
+  synthetic_movie_1/2/0_101\t256\tsynthetic_ref_1\t30\t60\t8S1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I1=6S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:8d2370c0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:2 (esc)
+  synthetic_movie_2/1000001/0_100\t16\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000001/0_100\t272\tsynthetic_ref_1\t23\t254\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000001/0_100\t272\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tZE:f:1\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000001/0_100\t272\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000001/0_100\t272\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100\tqe:i:100\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000001 (esc)
+  synthetic_movie_2/1000002/0_101\t16\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000002 (esc)
+  synthetic_movie_2/1000002/0_101\t272\tsynthetic_ref_1\t23\t60\t8=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I7=\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000002 (esc)
+  synthetic_movie_2/1000002/0_101\t272\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000002 (esc)
+  synthetic_movie_2/1000002/0_101\t272\tsynthetic_ref_1\t30\t60\t7S1=1X21=3I1X3D13=1D1=1X1=1I9=1X32=1I4=3S\t*\t0\t0\tCGCTATTTTTGAAAATTTTCCGGTTTAAGGAAATTCCGTTCTTCTTCTGAATAACTTAATCTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAAACGAC\t!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\tRG:Z:7a515ee0\tcx:i:3\tip:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tnp:i:1\tpw:B:C,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101\tqe:i:101\tqs:i:0\trq:f:0.8\tsn:B:f,8.34462,15.7107,6.3469,10.3163\tzm:i:1000002 (esc)
+  synthetic_movie_3/3000001/ccs\t0\tsynthetic_ref_1\t23\t60\t100=\t*\t0\t0\tCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGAC\tCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC\tRG:Z:67e06f58\tnp:i:6\trq:f:0.993687\trs:B:i,8,1,0,0,0,0\tsn:B:f,6.21632,11.7596,4.35394,8.45458\tza:f:nan\tzm:i:3000001\tzs:B:f,nan,nan,nan,nan,nan,nan,-inf,nan,nan (esc)
+
+No-args:
+
+  $ $PBBAMIFY
+  
+  ERROR: Exactly two positional arguments must be specified.
+  
+  Usage: pbbamify [options] <ref.fa> <pb.bam>|<pb.fofn>|<pb.xml>
+  
+  pbbamify converts an arbitray aligned BAM file to a PacBio-compatible BAM
+  file.Input BAM file is read from a file or stdin, the raw-reads PacBio BAM is
+  givenas a parameter, and BAM output is written to stdout.
+  
+  Options:
+    -h, --help            show this help message and exit
+    --version             show program's version number and exit
+  
+    Options:
+                          Reference used to align the input.
+      --input=STR         The aligned non-PacBio BAM file. If not provided, stdin
+                          will be used as input.
+      --output=STR        Path to the output BAM file. If not specified, output
+                          will be to the stdout.
+      --verbose-level=INT
+                          Specifies the level of info which will be output
+                          produced onstderr. 0 turns all output off, 1 outputs
+                          only warnings, while levels 2 and above outputs a status
+                          message every 1000000 (2), 100000 (3), 1000 (4), 100
+                          (5), 10 (6) and 1 (7) reads.
+                          A PacBio BAM file containing raw reads.
+  [1]
diff --git a/tests/src/cram/pbindexdump_cpp.t.in b/tests/src/cram/pbindexdump_cpp.t.in

new file mode 100644 (file)

index 0000000..18a210c
--- /dev/null
+++ b/tests/src/cram/pbindexdump_cpp.t.in
@@ -0,0 +1,39 @@
+Setup:
+
+  $ PBINDEXDUMP="@PacBioBAM_BinDir@/pbindexdump" && export PBINDEXDUMP
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+
+Normal C++:
+
+  $ $PBINDEXDUMP --format=cpp $DATADIR/polymerase/production_hq.hqregion.bam.pbi
+  PbiRawData rawData;
+  rawData.Version(PbiFile::Version_3_0_1);
+  rawData.FileSections(PbiFile::BASIC);
+  rawData.NumReads(1);
+  
+  PbiRawBasicData& basicData = rawData.BasicData();
+  basicData.rgId_       = {-898246524};
+  basicData.qStart_     = {2659};
+  basicData.qEnd_       = {7034};
+  basicData.holeNumber_ = {0};
+  basicData.readQual_   = {0.01};
+  basicData.ctxtFlag_   = {0};
+  basicData.fileOffset_ = {20054016};
+  
+  
+--(leave the blank lines above this; they are part of the expected output)--
+
+Request C++, with JSON options (stdout includes usage/help, so we just want to check stderr):
+
+  $ $PBINDEXDUMP --format=cpp --json-indent-level=2 $DATADIR/polymerase/production_hq.hqregion.bam.pbi > /dev/null
+  
+  ERROR: JSON formatting options not valid on non-JSON output
+  
+  [1]
+
+  $ $PBINDEXDUMP --format=cpp --json-raw $DATADIR/polymerase/production_hq.hqregion.bam.pbi > /dev/null
+  
+  ERROR: JSON formatting options not valid on non-JSON output
+  
+  [1]
diff --git a/tests/src/cram/pbindexdump_json.t.in b/tests/src/cram/pbindexdump_json.t.in

new file mode 100644 (file)

index 0000000..0c1cbcd
--- /dev/null
+++ b/tests/src/cram/pbindexdump_json.t.in
@@ -0,0 +1,83 @@
+Setup:
+
+  $ PBINDEXDUMP="@PacBioBAM_BinDir@/pbindexdump" && export PBINDEXDUMP
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+
+Default settings (JSON):
+
+  $ $PBINDEXDUMP $DATADIR/polymerase/production_hq.hqregion.bam.pbi
+  {
+      "fileSections": [
+          "BasicData"
+      ],
+      "numReads": 1,
+      "reads": [
+          {
+              "contextFlag": 0,
+              "fileOffset": 20054016,
+              "holeNumber": 0,
+              "qEnd": 7034,
+              "qStart": 2659,
+              "readQuality": 0.00999999977648258,
+              "rgId": -898246524
+          }
+      ],
+      "version": "3.0.1"
+  }
+
+JSON indent level (2):
+
+  $ $PBINDEXDUMP --json-indent-level=2 $DATADIR/polymerase/production_hq.hqregion.bam.pbi
+  {
+    "fileSections": [
+      "BasicData"
+    ],
+    "numReads": 1,
+    "reads": [
+      {
+        "contextFlag": 0,
+        "fileOffset": 20054016,
+        "holeNumber": 0,
+        "qEnd": 7034,
+        "qStart": 2659,
+        "readQuality": 0.00999999977648258,
+        "rgId": -898246524
+      }
+    ],
+    "version": "3.0.1"
+  }
+
+JSON raw:
+
+  $ $PBINDEXDUMP --json-raw $DATADIR/polymerase/production_hq.hqregion.bam.pbi
+  {
+      "basicData": {
+          "ctxtFlag": [
+              0
+          ],
+          "fileOffset": [
+              20054016
+          ],
+          "holeNumber": [
+              0
+          ],
+          "qEnd": [
+              7034
+          ],
+          "qStart": [
+              2659
+          ],
+          "readQual": [
+              0.00999999977648258
+          ],
+          "rgId": [
+              -898246524
+          ]
+      },
+      "fileSections": [
+          "BasicData"
+      ],
+      "numReads": 1,
+      "version": "3.0.1"
+  }
diff --git a/tests/src/cram/pbmerge_aligned_ordering.t.in b/tests/src/cram/pbmerge_aligned_ordering.t.in

new file mode 100644 (file)

index 0000000..58171bb
--- /dev/null
+++ b/tests/src/cram/pbmerge_aligned_ordering.t.in
@@ -0,0 +1,197 @@
+Setup:
+
+  $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN
+  $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE
+  $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+  $ INPUT_1="$DATADIR/dataset/bam_mapping_1.bam" && export INPUT_1
+  $ INPUT_2="$DATADIR/dataset/bam_mapping_2.bam" && export INPUT_2
+
+  $ MERGED_BAM="@GeneratedTestDataDir@/aligned_ordering_merged.bam" && export MERGED_BAM
+  $ MERGED_BAM_PBI="@GeneratedTestDataDir@/aligned_ordering_merged.bam.pbi" && export MERGED_BAM_PBI
+
+Sanity Check:
+
+  $ $BAM2SAM --header-only $INPUT_1
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tDS:READTYPE=SUBREAD;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;InsertionQV=iq;DeletionQV=dq;SubstitutionQV=sq;MergeQV=mq;SubstitutionTag=st;DeletionTag=dt\tPL:PACBIO\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377 (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+
+  $ $BAM2SAM --no-header $INPUT_1 | cut -f 1,3,4 | head -n 10
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+
+  $ $BAM2SAM --header-only $INPUT_2
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tDS:READTYPE=SUBREAD;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;InsertionQV=iq;DeletionQV=dq;SubstitutionQV=sq;MergeQV=mq;SubstitutionTag=st;DeletionTag=dt\tPL:PACBIO\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377 (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+
+  $ $BAM2SAM --no-header $INPUT_2 | cut -f 1,3,4 | head -n 10
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc)
+
+Normal Merge:
+
+  $ $PBMERGE $INPUT_1 $INPUT_2 > $MERGED_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc)
+
+  $ rm $MERGED_BAM
+
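+Shuffle Input (note: the command below passes $INPUT_2 twice; the expected records match $INPUT_2 alone, each listed once):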
+
+  $ $PBMERGE $INPUT_2 $INPUT_2 > $MERGED_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7046_7293\tlambda_NEB3011\t5136 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/38025/6255_7894\tlambda_NEB3011\t5427 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5311_5508\tlambda_NEB3011\t5943 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/36363/899_1197\tlambda_NEB3011\t6258 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/36363/605_853\tlambda_NEB3011\t6312 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/31174/0_1029\tlambda_NEB3011\t6487 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/31174/1075_1271\tlambda_NEB3011\t6499 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/38025/5743_6211\tlambda_NEB3011\t6606 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/50257/6944_7361\tlambda_NEB3011\t6942 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/50257/6546_6903\tlambda_NEB3011\t7010 (esc)
+
+  $ rm $MERGED_BAM
+
+Explicit Output Filename (also enables PBI):
+
+  $ $PBMERGE -o $MERGED_BAM $INPUT_1 $INPUT_2
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc)
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Found
+
+  $ rm $MERGED_BAM
+  $ rm $MERGED_BAM_PBI
+
+Explicit Output Filename (with disabled PBI):
+
+  $ $PBMERGE -o $MERGED_BAM --no-pbi $INPUT_1 $INPUT_2
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/7247/7338_7831\tlambda_NEB3011\t4904 (esc)
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Not found
+
+  $ rm $MERGED_BAM
diff --git a/tests/src/cram/pbmerge_dataset.t.in b/tests/src/cram/pbmerge_dataset.t.in

new file mode 100644 (file)

index 0000000..1c7cb7a
--- /dev/null
+++ b/tests/src/cram/pbmerge_dataset.t.in
@@ -0,0 +1,144 @@
+Setup:
+
+  $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN
+  $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE
+  $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+  $ INPUT_XML="$DATADIR/polymerase/consolidate.subread.dataset.xml" && export INPUT_XML
+  $ BAM_1="$DATADIR/polymerase/production.subreads.bam" && export BAM_1
+  $ BAM_2="$DATADIR/polymerase/production.scraps.bam" && export BAM_2
+
+  $ MERGED_BAM="@GeneratedTestDataDir@/merged.bam" && export MERGED_BAM
+  $ MERGED_BAM_PBI="@GeneratedTestDataDir@/merged.bam.pbi" && export MERGED_BAM_PBI
+
+Sanity Check:
+
+  $ $BAM2SAM --no-header $BAM_1 | cut -f 1
+  ArminsFakeMovie/0/2659_3025
+  ArminsFakeMovie/0/3116_3628
+  ArminsFakeMovie/0/3722_4267
+  ArminsFakeMovie/0/4356_4864
+  ArminsFakeMovie/0/4960_5477
+  ArminsFakeMovie/0/5571_6087
+  ArminsFakeMovie/0/6199_6719
+  ArminsFakeMovie/0/6812_7034
+
+  $ $BAM2SAM --no-header $BAM_2  | cut -f 1
+  ArminsFakeMovie/0/0_2659
+  ArminsFakeMovie/0/3025_3047
+  ArminsFakeMovie/0/3047_3095
+  ArminsFakeMovie/0/3095_3116
+  ArminsFakeMovie/0/3628_3650
+  ArminsFakeMovie/0/3650_3700
+  ArminsFakeMovie/0/3700_3722
+  ArminsFakeMovie/0/4267_4289
+  ArminsFakeMovie/0/4289_4335
+  ArminsFakeMovie/0/4335_4356
+  ArminsFakeMovie/0/4864_4888
+  ArminsFakeMovie/0/4888_4939
+  ArminsFakeMovie/0/4939_4960
+  ArminsFakeMovie/0/5477_5498
+  ArminsFakeMovie/0/5498_5546
+  ArminsFakeMovie/0/5546_5571
+  ArminsFakeMovie/0/6087_6116
+  ArminsFakeMovie/0/6116_6173
+  ArminsFakeMovie/0/6173_6199
+  ArminsFakeMovie/0/6719_6740
+  ArminsFakeMovie/0/6740_6790
+  ArminsFakeMovie/0/6790_6812
+  ArminsFakeMovie/0/7034_7035
+
+Normal Merge from XML:
+
+  $ $PBMERGE -o $MERGED_BAM $INPUT_XML
+
+  $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found"
+  Found
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Found
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:8aaede36\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:BAZ_FORMAT\tVN:0.3.0 (esc)
+  @PG\tID:PPA-BAZ2BAM\tVN:0.1.0 (esc)
+  @PG\tID:PPA-BAZWRITER\tVN:0.2.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/0/4267_4289
+  ArminsFakeMovie/0/4289_4335
+  ArminsFakeMovie/0/4335_4356
+  ArminsFakeMovie/0/4356_4864
+  ArminsFakeMovie/0/4864_4888
+  ArminsFakeMovie/0/4888_4939
+  ArminsFakeMovie/0/4939_4960
+  ArminsFakeMovie/0/4960_5477
+
+  $ rm $MERGED_BAM
+  $ rm $MERGED_BAM_PBI
+
+Normal Merge from XML (disabled PBI):
+
+  $ $PBMERGE --no-pbi -o $MERGED_BAM $INPUT_XML
+
+  $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found"
+  Found
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Not found
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:8aaede36\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:BAZ_FORMAT\tVN:0.3.0 (esc)
+  @PG\tID:PPA-BAZ2BAM\tVN:0.1.0 (esc)
+  @PG\tID:PPA-BAZWRITER\tVN:0.2.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/0/4267_4289
+  ArminsFakeMovie/0/4289_4335
+  ArminsFakeMovie/0/4335_4356
+  ArminsFakeMovie/0/4356_4864
+  ArminsFakeMovie/0/4864_4888
+  ArminsFakeMovie/0/4888_4939
+  ArminsFakeMovie/0/4939_4960
+  ArminsFakeMovie/0/4960_5477
+
+  $ rm $MERGED_BAM
+
+Write to stdout:
+
+  $ $PBMERGE --no-pbi $INPUT_XML > $MERGED_BAM
+
+  $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found"
+  Found
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Not found
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:8aaede36\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:CodecV1=ip;BINDINGKIT=FakeBindKit;SEQUENCINGKIT=FakeSeqKit;BASECALLERVERSION=0.2.0;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:BAZ_FORMAT\tVN:0.3.0 (esc)
+  @PG\tID:PPA-BAZ2BAM\tVN:0.1.0 (esc)
+  @PG\tID:PPA-BAZWRITER\tVN:0.2.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/0/4267_4289
+  ArminsFakeMovie/0/4289_4335
+  ArminsFakeMovie/0/4335_4356
+  ArminsFakeMovie/0/4356_4864
+  ArminsFakeMovie/0/4864_4888
+  ArminsFakeMovie/0/4888_4939
+  ArminsFakeMovie/0/4939_4960
+  ArminsFakeMovie/0/4960_5477
+
+  $ rm $MERGED_BAM
diff --git a/tests/src/cram/pbmerge_fofn.t.in b/tests/src/cram/pbmerge_fofn.t.in

new file mode 100644 (file)

index 0000000..34e9af6
--- /dev/null
+++ b/tests/src/cram/pbmerge_fofn.t.in
@@ -0,0 +1,120 @@
+Setup:
+
+  $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN
+  $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE
+  $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+  $ INPUT_FOFN="$DATADIR/merge.fofn" && export INPUT_FOFN
+  $ INPUT_1="$DATADIR/aligned.bam" && export INPUT_1
+  $ INPUT_2="$DATADIR/aligned2.bam" && export INPUT_2
+
+  $ MERGED_BAM="@GeneratedTestDataDir@/aligned_ordering_merged.bam" && export MERGED_BAM
+  $ MERGED_BAM_PBI="@GeneratedTestDataDir@/aligned_ordering_merged.bam.pbi" && export MERGED_BAM_PBI
+
+Sanity Check:
+
+  $ $BAM2SAM --header-only $INPUT_1
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.3 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:0d7b28fa\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\tPU:singleInsertion\tPM:SEQUEL (esc)
+  @PG\tID:bwa\tPN:bwa\tVN:0.7.10-r1017-dirty\tCL:bwa mem lambdaNEB.fa singleInsertion.fasta (esc)
+
+  $ $BAM2SAM --no-header $INPUT_1 | cut -f 1,3,4 | head -n 10
+  singleInsertion/100/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/200/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+
+  $ $BAM2SAM --header-only $INPUT_2
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.3 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:b89a4406\tDS:READTYPE=SUBREAD;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;FRAMERATEHZ=100;InsertionQV=iq;DeletionQV=dq;SubstitutionQV=sq;MergeQV=mq;SubstitutionTag=st;DeletionTag=dt\tPL:PACBIO\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+
+  $ $BAM2SAM --no-header $INPUT_2 | cut -f 1,3,4 | head -n 10
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+
+Normal Merge from FOFN (explicit -o output; a PBI index is expected alongside):
+
+  $ $PBMERGE -o $MERGED_BAM $INPUT_FOFN
+
+  $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found"
+  Found
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Found
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.3 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:0d7b28fa\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\tPU:singleInsertion\tPM:SEQUEL (esc)
+  @RG\tID:b89a4406\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;FRAMERATEHZ=100\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:bwa\tPN:bwa\tVN:0.7.10-r1017-dirty\tCL:bwa mem lambdaNEB.fa singleInsertion.fasta (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  singleInsertion/100/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/200/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+
+  $ rm $MERGED_BAM
+  $ rm $MERGED_BAM_PBI
+
+Normal Merge from FOFN (disabled PBI):
+
+  $ $PBMERGE --no-pbi -o $MERGED_BAM $INPUT_FOFN
+
+  $ [ -f $MERGED_BAM ] && echo "Found" || echo "Not found"
+  Found
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Not found
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.3 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:0d7b28fa\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\tPU:singleInsertion\tPM:SEQUEL (esc)
+  @RG\tID:b89a4406\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;FRAMERATEHZ=100\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377\tPM:SEQUEL (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+  @PG\tID:bwa\tPN:bwa\tVN:0.7.10-r1017-dirty\tCL:bwa mem lambdaNEB.fa singleInsertion.fasta (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1,3,4 | head -n 20
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/387_1134\tlambda_NEB3011\t303 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  singleInsertion/100/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/200/0_49\tlambda_NEB3011\t5211 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+  singleInsertion/100/0_111\tlambda_NEB3011\t9378 (esc)
+
+  $ rm $MERGED_BAM
diff --git a/tests/src/cram/pbmerge_mixed_ordering.t.in b/tests/src/cram/pbmerge_mixed_ordering.t.in

new file mode 100644 (file)

index 0000000..83d926c
--- /dev/null
+++ b/tests/src/cram/pbmerge_mixed_ordering.t.in
@@ -0,0 +1,57 @@
+Setup:
+
+  $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN
+  $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE
+  $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+  $ UNALIGNED_BAM="$DATADIR/polymerase/internal.hqregions.bam" && export UNALIGNED_BAM
+  $ ALIGNED_BAM="$DATADIR/dataset/bam_mapping_1.bam" && export ALIGNED_BAM
+
+  $ MERGED_BAM="@GeneratedTestDataDir@/mixed_ordering_merged.bam" && export MERGED_BAM
+
+Sanity Check:
+
+  $ $BAM2SAM --header-only $UNALIGNED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+
+  $ $BAM2SAM --no-header $UNALIGNED_BAM | cut -f 1
+  ArminsFakeMovie/100000/2659_7034
+
+  $ $BAM2SAM --header-only $ALIGNED_BAM
+  @HD\tVN:1.3.1\tSO:coordinate\tpb:3.0.1 (esc)
+  @SQ\tSN:lambda_NEB3011\tLN:48502\tM5:a1319ff90e994c8190a4fe6569d0822a (esc)
+  @RG\tID:a9a22406c5\tDS:READTYPE=SUBREAD;BINDINGKIT=100356300;SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3;InsertionQV=iq;DeletionQV=dq;SubstitutionQV=sq;MergeQV=mq;SubstitutionTag=st;DeletionTag=dt\tPL:PACBIO\tPU:m140905_042212_sidney_c100564852550000001823085912221377_s1_X0\tSM:c100564852550000001823085912221377 (esc)
+  @PG\tID:BLASR\tVN:1.3.1.141565\tCL:/home/UNIXHOME/yli/for_the_people/blasr_bam_out/blasr m140905_042212_sidney_c100564852550000001823085912221377_s1_X0.1.bax.h5 lambdaNEB.fa -out tmp.bam -bam -bestn 10 -minMatch 12 -nproc 8 -minSubreadLength 50 -minReadLength 50 -randomSeed 1 -clipping subread  (esc)
+
+  $ $BAM2SAM --no-header $ALIGNED_BAM | cut -f 1,3,4 | head -n 10
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132\tlambda_NEB3011\t1 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/32328/0_344\tlambda_NEB3011\t676 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/9936_10187\tlambda_NEB3011\t2171 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/6469/10232_10394\tlambda_NEB3011\t2204 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7468_8906\tlambda_NEB3011\t3573 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/5557_7235\tlambda_NEB3011\t4507 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/13473/7285_8657\tlambda_NEB3011\t4508 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/426_1045\tlambda_NEB3011\t4593 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/30983/7064_7421\tlambda_NEB3011\t4670 (esc)
+  m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/19915/0_382\tlambda_NEB3011\t4843 (esc)
+
+Normal Merge - should fail (mixing an unaligned and an aligned input is rejected):
+
+  $ $PBMERGE $UNALIGNED_BAM $ALIGNED_BAM > $MERGED_BAM
+  ERROR: cannot merge different dataset types
+  [1]
+
+Shuffle Input - should fail (same two inputs in reverse order):
+
+  $ $PBMERGE $ALIGNED_BAM $UNALIGNED_BAM > $MERGED_BAM
+  ERROR: cannot merge different dataset types
+  [1]
+
+Cleanup (the file exists because the shell creates the redirect target even though pbmerge fails):
+
+  $ rm $MERGED_BAM
diff --git a/tests/src/cram/pbmerge_pacbio_ordering.t.in b/tests/src/cram/pbmerge_pacbio_ordering.t.in

new file mode 100644 (file)

index 0000000..f52759f
--- /dev/null
+++ b/tests/src/cram/pbmerge_pacbio_ordering.t.in
@@ -0,0 +1,457 @@
+Setup:
+
+  $ TOOLS_BIN="@PacBioBAM_BinDir@" && export TOOLS_BIN
+  $ PBMERGE="$TOOLS_BIN/pbmerge" && export PBMERGE
+  $ BAM2SAM="$TOOLS_BIN/bam2sam" && export BAM2SAM
+
+  $ DATADIR="@PacBioBAM_TestsDir@/data" && export DATADIR
+  $ HQREGION_BAM="$DATADIR/polymerase/internal.hqregions.bam" && export HQREGION_BAM
+  $ SCRAPS_BAM="$DATADIR/polymerase/internal.scraps.bam" && export SCRAPS_BAM
+
+  $ MERGED_BAM="@GeneratedTestDataDir@/pacbio_ordering_merged.bam" && export MERGED_BAM
+  $ MERGED_BAM_PBI="@GeneratedTestDataDir@/pacbio_ordering_merged.bam.pbi" && export MERGED_BAM_PBI
+
+Sanity Check:
+
+  $ $BAM2SAM --header-only $HQREGION_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+
+  $ $BAM2SAM --no-header $HQREGION_BAM | cut -f 1
+  ArminsFakeMovie/100000/2659_7034
+
+  $ $BAM2SAM --header-only $SCRAPS_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+
+  $ $BAM2SAM --no-header $SCRAPS_BAM | cut -f 1
+  ArminsFakeMovie/100000/0_2659
+  ArminsFakeMovie/100000/3025_3047
+  ArminsFakeMovie/100000/3047_3095
+  ArminsFakeMovie/100000/3095_3116
+  ArminsFakeMovie/100000/3628_3650
+  ArminsFakeMovie/100000/3650_3700
+  ArminsFakeMovie/100000/3700_3722
+  ArminsFakeMovie/100000/4267_4289
+  ArminsFakeMovie/100000/4289_4335
+  ArminsFakeMovie/100000/4335_4356
+  ArminsFakeMovie/100000/4864_4888
+  ArminsFakeMovie/100000/4888_4939
+  ArminsFakeMovie/100000/4939_4960
+  ArminsFakeMovie/100000/5477_5498
+  ArminsFakeMovie/100000/5498_5546
+  ArminsFakeMovie/100000/5546_5571
+  ArminsFakeMovie/100000/6087_6116
+  ArminsFakeMovie/100000/6116_6173
+  ArminsFakeMovie/100000/6173_6199
+  ArminsFakeMovie/100000/6719_6740
+  ArminsFakeMovie/100000/6740_6790
+  ArminsFakeMovie/100000/6790_6812
+  ArminsFakeMovie/100000/7034_7035
+  ArminsFakeMovie/200000/0_2659
+  ArminsFakeMovie/200000/3025_3047
+  ArminsFakeMovie/200000/3047_3095
+  ArminsFakeMovie/200000/3095_3116
+  ArminsFakeMovie/200000/3628_3650
+  ArminsFakeMovie/200000/3650_3700
+  ArminsFakeMovie/200000/3700_3722
+  ArminsFakeMovie/200000/4267_4289
+  ArminsFakeMovie/200000/4289_4335
+  ArminsFakeMovie/200000/4335_4356
+  ArminsFakeMovie/200000/4864_4888
+  ArminsFakeMovie/200000/4888_4939
+  ArminsFakeMovie/200000/4939_4960
+  ArminsFakeMovie/200000/5477_5498
+  ArminsFakeMovie/200000/5498_5546
+  ArminsFakeMovie/200000/5546_5571
+  ArminsFakeMovie/200000/6087_6116
+  ArminsFakeMovie/200000/6116_6173
+  ArminsFakeMovie/200000/6173_6199
+  ArminsFakeMovie/200000/6719_6740
+  ArminsFakeMovie/200000/6740_6790
+  ArminsFakeMovie/200000/6790_6812
+  ArminsFakeMovie/200000/7034_7035
+  ArminsFakeMovie/300000/0_2659
+  ArminsFakeMovie/300000/3025_3047
+  ArminsFakeMovie/300000/3047_3095
+  ArminsFakeMovie/300000/3095_3116
+  ArminsFakeMovie/300000/3628_3650
+  ArminsFakeMovie/300000/3650_3700
+  ArminsFakeMovie/300000/3700_3722
+  ArminsFakeMovie/300000/4267_4289
+  ArminsFakeMovie/300000/4289_4335
+  ArminsFakeMovie/300000/4335_4356
+  ArminsFakeMovie/300000/4864_4888
+  ArminsFakeMovie/300000/4888_4939
+  ArminsFakeMovie/300000/4939_4960
+  ArminsFakeMovie/300000/5477_5498
+  ArminsFakeMovie/300000/5498_5546
+  ArminsFakeMovie/300000/5546_5571
+  ArminsFakeMovie/300000/6087_6116
+  ArminsFakeMovie/300000/6116_6173
+  ArminsFakeMovie/300000/6173_6199
+  ArminsFakeMovie/300000/6719_6740
+  ArminsFakeMovie/300000/6740_6790
+  ArminsFakeMovie/300000/6790_6812
+  ArminsFakeMovie/300000/7034_7035
+
+Normal Merge:
+
+  $ $PBMERGE $HQREGION_BAM $SCRAPS_BAM > $MERGED_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/100000/0_2659
+  ArminsFakeMovie/100000/2659_7034
+  ArminsFakeMovie/100000/3025_3047
+  ArminsFakeMovie/100000/3047_3095
+  ArminsFakeMovie/100000/3095_3116
+  ArminsFakeMovie/100000/3628_3650
+  ArminsFakeMovie/100000/3650_3700
+  ArminsFakeMovie/100000/3700_3722
+  ArminsFakeMovie/100000/4267_4289
+  ArminsFakeMovie/100000/4289_4335
+  ArminsFakeMovie/100000/4335_4356
+  ArminsFakeMovie/100000/4864_4888
+  ArminsFakeMovie/100000/4888_4939
+  ArminsFakeMovie/100000/4939_4960
+  ArminsFakeMovie/100000/5477_5498
+  ArminsFakeMovie/100000/5498_5546
+  ArminsFakeMovie/100000/5546_5571
+  ArminsFakeMovie/100000/6087_6116
+  ArminsFakeMovie/100000/6116_6173
+  ArminsFakeMovie/100000/6173_6199
+  ArminsFakeMovie/100000/6719_6740
+  ArminsFakeMovie/100000/6740_6790
+  ArminsFakeMovie/100000/6790_6812
+  ArminsFakeMovie/100000/7034_7035
+  ArminsFakeMovie/200000/0_2659
+  ArminsFakeMovie/200000/3025_3047
+  ArminsFakeMovie/200000/3047_3095
+  ArminsFakeMovie/200000/3095_3116
+  ArminsFakeMovie/200000/3628_3650
+  ArminsFakeMovie/200000/3650_3700
+  ArminsFakeMovie/200000/3700_3722
+  ArminsFakeMovie/200000/4267_4289
+  ArminsFakeMovie/200000/4289_4335
+  ArminsFakeMovie/200000/4335_4356
+  ArminsFakeMovie/200000/4864_4888
+  ArminsFakeMovie/200000/4888_4939
+  ArminsFakeMovie/200000/4939_4960
+  ArminsFakeMovie/200000/5477_5498
+  ArminsFakeMovie/200000/5498_5546
+  ArminsFakeMovie/200000/5546_5571
+  ArminsFakeMovie/200000/6087_6116
+  ArminsFakeMovie/200000/6116_6173
+  ArminsFakeMovie/200000/6173_6199
+  ArminsFakeMovie/200000/6719_6740
+  ArminsFakeMovie/200000/6740_6790
+  ArminsFakeMovie/200000/6790_6812
+  ArminsFakeMovie/200000/7034_7035
+  ArminsFakeMovie/300000/0_2659
+  ArminsFakeMovie/300000/3025_3047
+  ArminsFakeMovie/300000/3047_3095
+  ArminsFakeMovie/300000/3095_3116
+  ArminsFakeMovie/300000/3628_3650
+  ArminsFakeMovie/300000/3650_3700
+  ArminsFakeMovie/300000/3700_3722
+  ArminsFakeMovie/300000/4267_4289
+  ArminsFakeMovie/300000/4289_4335
+  ArminsFakeMovie/300000/4335_4356
+  ArminsFakeMovie/300000/4864_4888
+  ArminsFakeMovie/300000/4888_4939
+  ArminsFakeMovie/300000/4939_4960
+  ArminsFakeMovie/300000/5477_5498
+  ArminsFakeMovie/300000/5498_5546
+  ArminsFakeMovie/300000/5546_5571
+  ArminsFakeMovie/300000/6087_6116
+  ArminsFakeMovie/300000/6116_6173
+  ArminsFakeMovie/300000/6173_6199
+  ArminsFakeMovie/300000/6719_6740
+  ArminsFakeMovie/300000/6740_6790
+  ArminsFakeMovie/300000/6790_6812
+  ArminsFakeMovie/300000/7034_7035
+
+  $ rm $MERGED_BAM
+
+Shuffle Input (inputs given in reverse order; the merged output is expected to be identical):
+
+  $ $PBMERGE $SCRAPS_BAM $HQREGION_BAM  > $MERGED_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/100000/0_2659
+  ArminsFakeMovie/100000/2659_7034
+  ArminsFakeMovie/100000/3025_3047
+  ArminsFakeMovie/100000/3047_3095
+  ArminsFakeMovie/100000/3095_3116
+  ArminsFakeMovie/100000/3628_3650
+  ArminsFakeMovie/100000/3650_3700
+  ArminsFakeMovie/100000/3700_3722
+  ArminsFakeMovie/100000/4267_4289
+  ArminsFakeMovie/100000/4289_4335
+  ArminsFakeMovie/100000/4335_4356
+  ArminsFakeMovie/100000/4864_4888
+  ArminsFakeMovie/100000/4888_4939
+  ArminsFakeMovie/100000/4939_4960
+  ArminsFakeMovie/100000/5477_5498
+  ArminsFakeMovie/100000/5498_5546
+  ArminsFakeMovie/100000/5546_5571
+  ArminsFakeMovie/100000/6087_6116
+  ArminsFakeMovie/100000/6116_6173
+  ArminsFakeMovie/100000/6173_6199
+  ArminsFakeMovie/100000/6719_6740
+  ArminsFakeMovie/100000/6740_6790
+  ArminsFakeMovie/100000/6790_6812
+  ArminsFakeMovie/100000/7034_7035
+  ArminsFakeMovie/200000/0_2659
+  ArminsFakeMovie/200000/3025_3047
+  ArminsFakeMovie/200000/3047_3095
+  ArminsFakeMovie/200000/3095_3116
+  ArminsFakeMovie/200000/3628_3650
+  ArminsFakeMovie/200000/3650_3700
+  ArminsFakeMovie/200000/3700_3722
+  ArminsFakeMovie/200000/4267_4289
+  ArminsFakeMovie/200000/4289_4335
+  ArminsFakeMovie/200000/4335_4356
+  ArminsFakeMovie/200000/4864_4888
+  ArminsFakeMovie/200000/4888_4939
+  ArminsFakeMovie/200000/4939_4960
+  ArminsFakeMovie/200000/5477_5498
+  ArminsFakeMovie/200000/5498_5546
+  ArminsFakeMovie/200000/5546_5571
+  ArminsFakeMovie/200000/6087_6116
+  ArminsFakeMovie/200000/6116_6173
+  ArminsFakeMovie/200000/6173_6199
+  ArminsFakeMovie/200000/6719_6740
+  ArminsFakeMovie/200000/6740_6790
+  ArminsFakeMovie/200000/6790_6812
+  ArminsFakeMovie/200000/7034_7035
+  ArminsFakeMovie/300000/0_2659
+  ArminsFakeMovie/300000/3025_3047
+  ArminsFakeMovie/300000/3047_3095
+  ArminsFakeMovie/300000/3095_3116
+  ArminsFakeMovie/300000/3628_3650
+  ArminsFakeMovie/300000/3650_3700
+  ArminsFakeMovie/300000/3700_3722
+  ArminsFakeMovie/300000/4267_4289
+  ArminsFakeMovie/300000/4289_4335
+  ArminsFakeMovie/300000/4335_4356
+  ArminsFakeMovie/300000/4864_4888
+  ArminsFakeMovie/300000/4888_4939
+  ArminsFakeMovie/300000/4939_4960
+  ArminsFakeMovie/300000/5477_5498
+  ArminsFakeMovie/300000/5498_5546
+  ArminsFakeMovie/300000/5546_5571
+  ArminsFakeMovie/300000/6087_6116
+  ArminsFakeMovie/300000/6116_6173
+  ArminsFakeMovie/300000/6173_6199
+  ArminsFakeMovie/300000/6719_6740
+  ArminsFakeMovie/300000/6740_6790
+  ArminsFakeMovie/300000/6790_6812
+  ArminsFakeMovie/300000/7034_7035
+
+  $ rm $MERGED_BAM
+
+Explicit Output Filename (also enables PBI):
+
+  $ $PBMERGE -o $MERGED_BAM $HQREGION_BAM $SCRAPS_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/100000/0_2659
+  ArminsFakeMovie/100000/2659_7034
+  ArminsFakeMovie/100000/3025_3047
+  ArminsFakeMovie/100000/3047_3095
+  ArminsFakeMovie/100000/3095_3116
+  ArminsFakeMovie/100000/3628_3650
+  ArminsFakeMovie/100000/3650_3700
+  ArminsFakeMovie/100000/3700_3722
+  ArminsFakeMovie/100000/4267_4289
+  ArminsFakeMovie/100000/4289_4335
+  ArminsFakeMovie/100000/4335_4356
+  ArminsFakeMovie/100000/4864_4888
+  ArminsFakeMovie/100000/4888_4939
+  ArminsFakeMovie/100000/4939_4960
+  ArminsFakeMovie/100000/5477_5498
+  ArminsFakeMovie/100000/5498_5546
+  ArminsFakeMovie/100000/5546_5571
+  ArminsFakeMovie/100000/6087_6116
+  ArminsFakeMovie/100000/6116_6173
+  ArminsFakeMovie/100000/6173_6199
+  ArminsFakeMovie/100000/6719_6740
+  ArminsFakeMovie/100000/6740_6790
+  ArminsFakeMovie/100000/6790_6812
+  ArminsFakeMovie/100000/7034_7035
+  ArminsFakeMovie/200000/0_2659
+  ArminsFakeMovie/200000/3025_3047
+  ArminsFakeMovie/200000/3047_3095
+  ArminsFakeMovie/200000/3095_3116
+  ArminsFakeMovie/200000/3628_3650
+  ArminsFakeMovie/200000/3650_3700
+  ArminsFakeMovie/200000/3700_3722
+  ArminsFakeMovie/200000/4267_4289
+  ArminsFakeMovie/200000/4289_4335
+  ArminsFakeMovie/200000/4335_4356
+  ArminsFakeMovie/200000/4864_4888
+  ArminsFakeMovie/200000/4888_4939
+  ArminsFakeMovie/200000/4939_4960
+  ArminsFakeMovie/200000/5477_5498
+  ArminsFakeMovie/200000/5498_5546
+  ArminsFakeMovie/200000/5546_5571
+  ArminsFakeMovie/200000/6087_6116
+  ArminsFakeMovie/200000/6116_6173
+  ArminsFakeMovie/200000/6173_6199
+  ArminsFakeMovie/200000/6719_6740
+  ArminsFakeMovie/200000/6740_6790
+  ArminsFakeMovie/200000/6790_6812
+  ArminsFakeMovie/200000/7034_7035
+  ArminsFakeMovie/300000/0_2659
+  ArminsFakeMovie/300000/3025_3047
+  ArminsFakeMovie/300000/3047_3095
+  ArminsFakeMovie/300000/3095_3116
+  ArminsFakeMovie/300000/3628_3650
+  ArminsFakeMovie/300000/3650_3700
+  ArminsFakeMovie/300000/3700_3722
+  ArminsFakeMovie/300000/4267_4289
+  ArminsFakeMovie/300000/4289_4335
+  ArminsFakeMovie/300000/4335_4356
+  ArminsFakeMovie/300000/4864_4888
+  ArminsFakeMovie/300000/4888_4939
+  ArminsFakeMovie/300000/4939_4960
+  ArminsFakeMovie/300000/5477_5498
+  ArminsFakeMovie/300000/5498_5546
+  ArminsFakeMovie/300000/5546_5571
+  ArminsFakeMovie/300000/6087_6116
+  ArminsFakeMovie/300000/6116_6173
+  ArminsFakeMovie/300000/6173_6199
+  ArminsFakeMovie/300000/6719_6740
+  ArminsFakeMovie/300000/6740_6790
+  ArminsFakeMovie/300000/6790_6812
+  ArminsFakeMovie/300000/7034_7035
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Found
+
+  $ rm $MERGED_BAM
+  $ rm $MERGED_BAM_PBI
+
+Explicit Output Filename (with disabled PBI):
+
+  $ $PBMERGE -o $MERGED_BAM --no-pbi $HQREGION_BAM $SCRAPS_BAM
+
+  $ $BAM2SAM --header-only $MERGED_BAM
+  @HD\tVN:1.1\tSO:unknown\tpb:3.0.1 (esc)
+  @RG\tID:ca75d884\tPL:PACBIO\tDS:READTYPE=HQREGION;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\tPM:SEQUEL (esc)
+  @PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0 (esc)
+  @PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0 (esc)
+  @PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0 (esc)
+  @PG\tID:pbmerge-@PacBioBAM_VERSION@\tPN:pbmerge\tVN:@PacBioBAM_VERSION@ (esc)
+
+  $ $BAM2SAM --no-header $MERGED_BAM | cut -f 1
+  ArminsFakeMovie/100000/0_2659
+  ArminsFakeMovie/100000/2659_7034
+  ArminsFakeMovie/100000/3025_3047
+  ArminsFakeMovie/100000/3047_3095
+  ArminsFakeMovie/100000/3095_3116
+  ArminsFakeMovie/100000/3628_3650
+  ArminsFakeMovie/100000/3650_3700
+  ArminsFakeMovie/100000/3700_3722
+  ArminsFakeMovie/100000/4267_4289
+  ArminsFakeMovie/100000/4289_4335
+  ArminsFakeMovie/100000/4335_4356
+  ArminsFakeMovie/100000/4864_4888
+  ArminsFakeMovie/100000/4888_4939
+  ArminsFakeMovie/100000/4939_4960
+  ArminsFakeMovie/100000/5477_5498
+  ArminsFakeMovie/100000/5498_5546
+  ArminsFakeMovie/100000/5546_5571
+  ArminsFakeMovie/100000/6087_6116
+  ArminsFakeMovie/100000/6116_6173
+  ArminsFakeMovie/100000/6173_6199
+  ArminsFakeMovie/100000/6719_6740
+  ArminsFakeMovie/100000/6740_6790
+  ArminsFakeMovie/100000/6790_6812
+  ArminsFakeMovie/100000/7034_7035
+  ArminsFakeMovie/200000/0_2659
+  ArminsFakeMovie/200000/3025_3047
+  ArminsFakeMovie/200000/3047_3095
+  ArminsFakeMovie/200000/3095_3116
+  ArminsFakeMovie/200000/3628_3650
+  ArminsFakeMovie/200000/3650_3700
+  ArminsFakeMovie/200000/3700_3722
+  ArminsFakeMovie/200000/4267_4289
+  ArminsFakeMovie/200000/4289_4335
+  ArminsFakeMovie/200000/4335_4356
+  ArminsFakeMovie/200000/4864_4888
+  ArminsFakeMovie/200000/4888_4939
+  ArminsFakeMovie/200000/4939_4960
+  ArminsFakeMovie/200000/5477_5498
+  ArminsFakeMovie/200000/5498_5546
+  ArminsFakeMovie/200000/5546_5571
+  ArminsFakeMovie/200000/6087_6116
+  ArminsFakeMovie/200000/6116_6173
+  ArminsFakeMovie/200000/6173_6199
+  ArminsFakeMovie/200000/6719_6740
+  ArminsFakeMovie/200000/6740_6790
+  ArminsFakeMovie/200000/6790_6812
+  ArminsFakeMovie/200000/7034_7035
+  ArminsFakeMovie/300000/0_2659
+  ArminsFakeMovie/300000/3025_3047
+  ArminsFakeMovie/300000/3047_3095
+  ArminsFakeMovie/300000/3095_3116
+  ArminsFakeMovie/300000/3628_3650
+  ArminsFakeMovie/300000/3650_3700
+  ArminsFakeMovie/300000/3700_3722
+  ArminsFakeMovie/300000/4267_4289
+  ArminsFakeMovie/300000/4289_4335
+  ArminsFakeMovie/300000/4335_4356
+  ArminsFakeMovie/300000/4864_4888
+  ArminsFakeMovie/300000/4888_4939
+  ArminsFakeMovie/300000/4939_4960
+  ArminsFakeMovie/300000/5477_5498
+  ArminsFakeMovie/300000/5498_5546
+  ArminsFakeMovie/300000/5546_5571
+  ArminsFakeMovie/300000/6087_6116
+  ArminsFakeMovie/300000/6116_6173
+  ArminsFakeMovie/300000/6173_6199
+  ArminsFakeMovie/300000/6719_6740
+  ArminsFakeMovie/300000/6740_6790
+  ArminsFakeMovie/300000/6790_6812
+  ArminsFakeMovie/300000/7034_7035
+
+  $ [ -f $MERGED_BAM_PBI ] && echo "Found" || echo "Not found"
+  Not found
+
+  $ rm $MERGED_BAM
diff --git a/tests/src/meson.build b/tests/src/meson.build

new file mode 100644 (file)

index 0000000..bcd8202
--- /dev/null
+++ b/tests/src/meson.build
@@ -0,0 +1,69 @@
+pbbam_test_cpp_sources = files([  # C++ unit-test sources, one test_<Component>.cpp per entry
+  'test_Accuracy.cpp',
+  'test_AlignmentPrinter.cpp',
+  'test_BamFile.cpp',
+  'test_BamHeader.cpp',
+  'test_BamRecord.cpp',
+  'test_BamRecordBuilder.cpp',
+  'test_BamRecordClipping.cpp',
+  'test_BamRecordImplCore.cpp',
+  'test_BamRecordImplTags.cpp',
+  'test_BamRecordImplVariableData.cpp',
+  'test_BamRecordMapping.cpp',
+  'test_BamWriter.cpp',
+  'test_BarcodeQuery.cpp',
+  'test_Cigar.cpp',
+  'test_Compare.cpp',
+  'test_DataSetCore.cpp',
+  'test_DataSetIO.cpp',
+  'test_DataSetQuery.cpp',
+  'test_DataSetXsd.cpp',
+  'test_EndToEnd.cpp',
+  'test_EntireFileQuery.cpp',
+  'test_Fasta.cpp',
+  'test_Fastq.cpp',
+  'test_FileUtils.cpp',
+  'test_Frames.cpp',
+  'test_GenomicIntervalQuery.cpp',
+  'test_IndexedBamWriter.cpp',
+  'test_IndexedFastaReader.cpp',
+  'test_Intervals.cpp',
+  'test_LongCigar.cpp',
+  'test_PacBioIndex.cpp',
+  'test_PbiFilter.cpp',
+  'test_PbiFilterQuery.cpp',
+  'test_QNameQuery.cpp',
+  'test_QualityValues.cpp',
+  'test_Pulse2BaseCache.cpp',
+  'test_ReadAccuracyQuery.cpp',
+  'test_ReadGroupInfo.cpp',
+  'test_SamWriter.cpp',
+  'test_SequenceUtils.cpp',
+  'test_StringUtils.cpp',
+  'test_SubreadLengthQuery.cpp',
+  'test_Tags.cpp',
+  'test_TimeUtils.cpp',
+  'test_Validator.cpp',
+  'test_VcfFile.cpp',
+  'test_VcfFormat.cpp',
+  'test_VcfHeader.cpp',
+  'test_VcfReader.cpp',
+  'test_VcfSort.cpp',
+  'test_VcfQuery.cpp',
+  'test_VcfVariant.cpp',
+  'test_VcfWriter.cpp',
+  'test_Version.cpp',
+  'test_WhitelistedZmwReadStitcher.cpp',
+  'test_ZmwReadStitcher.cpp',
+  'test_ZmwQuery.cpp'])
+
+# cram test templates (*.t.in); each files() handle is named after the template it points to
+pbbam_cram_bam2sam_t_in = files('cram/bam2sam.t.in')
+pbbam_cram_pbindexdump_json_t_in = files('cram/pbindexdump_json.t.in')
+pbbam_cram_pbindexdump_cpp_t_in = files('cram/pbindexdump_cpp.t.in')
+pbbam_cram_pbmerge_pacbio_ordering_t_in = files('cram/pbmerge_pacbio_ordering.t.in')
+pbbam_cram_pbmerge_aligned_ordering_t_in = files('cram/pbmerge_aligned_ordering.t.in')
+pbbam_cram_pbmerge_mixed_ordering_t_in = files('cram/pbmerge_mixed_ordering.t.in')
+pbbam_cram_pbmerge_dataset_t_in = files('cram/pbmerge_dataset.t.in')
+pbbam_cram_pbmerge_fofn_t_in = files('cram/pbmerge_fofn.t.in')
+pbbam_cram_pbbamify_t_in = files('cram/pbbamify.t.in')
diff --git a/tests/src/test_Accuracy.cpp b/tests/src/test_Accuracy.cpp

new file mode 100644 (file)

index 0000000..3242d27
--- /dev/null
+++ b/tests/src/test_Accuracy.cpp
@@ -0,0 +1,25 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+
+#include <pbbam/Accuracy.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(AccuracyTest, ClampValues)
+{
+    // each input is paired with the value expected back: out-of-range inputs land on 0.0 / 1.0
+    Accuracy zero(0.0);
+    EXPECT_FLOAT_EQ(0.0, zero);
+    Accuracy negative(-0.5);
+    EXPECT_FLOAT_EQ(0.0, negative);
+    Accuracy lowerBound(0.0);
+    EXPECT_FLOAT_EQ(0.0, lowerBound);
+    Accuracy inRange(0.9);
+    EXPECT_FLOAT_EQ(0.9, inRange);
+    Accuracy upperBound(1.0);
+    EXPECT_FLOAT_EQ(1.0, upperBound);
+    Accuracy tooLarge(1.1);
+    EXPECT_FLOAT_EQ(1.0, tooLarge);
+}
diff --git a/tests/src/test_AlignmentPrinter.cpp b/tests/src/test_AlignmentPrinter.cpp

new file mode 100644 (file)

index 0000000..a88deab
--- /dev/null
+++ b/tests/src/test_AlignmentPrinter.cpp
@@ -0,0 +1,120 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/AlignmentPrinter.h>
+#include <pbbam/BamFile.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/IndexedFastaReader.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace AlignmentPrinterTests {
+// inputs of the Print test: reference FASTA (IndexedFastaReader) and aligned BAM (BamFile)
+const std::string lambdaFasta = PbbamTestsConfig::Data_Dir + "/lambdaNEB.fa";
+const std::string singleInsertionBam = PbbamTestsConfig::Data_Dir + "/aligned.bam";
+
+}  // namespace AlignmentPrinterTests
+
+TEST(AlignmentPrinterTest, Print)  // compares Print() output of the first four records verbatim
+{
+    IndexedFastaReader r(AlignmentPrinterTests::lambdaFasta);
+    AlignmentPrinter pretty(r);
+
+    BamFile bamFile(AlignmentPrinterTests::singleInsertionBam);
+    EntireFileQuery bamQuery(bamFile);
+    auto it = bamQuery.begin();  // NOTE(review): dereferenced below without an end() check
+
+    // expected output (verbatim); the \x1B[...m runs are ANSI escape sequences around some '|'
+    auto expected = std::string{
+        "Read        : singleInsertion/100/0_49\n"
+        "Reference   : lambda_NEB3011\n"
+        "\n"
+        "Read-length : 49\n"
+        "Concordance : 0.96\n"
+        "\n"
+        "5210 : GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGG : 5249\n"
+        "       \x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||| "
+        "|\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||| "
+        "||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||||\n"
+        "   0 : GGCTGCAG-GTACAGCGGTCAGGAGGCCAATTGATGCCGG :   39\n"
+        "\n"
+        "5249 : ACTGGCTGAT : 5259\n"
+        "       |\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||||\n"
+        "  39 : ACTGGCTGAT :   49\n"
+        "\n"};
+
+    auto record = *it++;  // 1st record
+    EXPECT_EQ(expected, pretty.Print(record, Orientation::GENOMIC));
+
+    expected = std::string{
+        "Read        : singleInsertion/200/0_49\n"
+        "Reference   : lambda_NEB3011\n"
+        "\n"
+        "Read-length : 49\n"
+        "Concordance : 0.96\n"
+        "\n"
+        "5210 : GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGG : 5249\n"
+        "       \x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||| "
+        "|\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||| "
+        "||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||||\n"
+        "   0 : GGCTGCAG-GTACAGCGGTCAGGAGGCCAATTGATGCCGG :   39\n"
+        "\n"
+        "5249 : ACTGGCTGAT : 5259\n"
+        "       |\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||||\n"
+        "  39 : ACTGGCTGAT :   49\n"
+        "\n"};
+
+    record = *it++;  // 2nd record
+    EXPECT_EQ(expected, pretty.Print(record, Orientation::GENOMIC));
+
+    expected = std::string{
+        "Read        : singleInsertion/100/0_111\n"
+        "Reference   : lambda_NEB3011\n"
+        "\n"
+        "Read-length : 59\n"
+        "Concordance : 0.951\n"
+        "\n"
+        "9377 : AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCG : 9417\n"
+        "       "
+        "|||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||"
+        "\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||  |\n"
+        "   0 : AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGA--G :   38\n"
+        "\n"
+        "9417 : CAGCACGGT-AACAGCGGCAA : 9437\n"
+        "       |||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||| "
+        "||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||\n"
+        "  38 : CAGCACGGTAAACAGCGGCAA :   59\n"
+        "\n"};
+
+    record = *it++;  // 3rd record
+    EXPECT_EQ(expected, pretty.Print(record, Orientation::GENOMIC));
+
+    expected = std::string{
+        "Read        : singleInsertion/100/0_111\n"
+        "Reference   : lambda_NEB3011\n"
+        "\n"
+        "Read-length : 59\n"
+        "Concordance : 0.951\n"
+        "\n"
+        "9377 : AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCG : 9417\n"
+        "       "
+        "|||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||"
+        "\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||||||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m|||  |\n"
+        "   0 : AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGA--G :   38\n"
+        "\n"
+        "9417 : CAGCACGGT-AACAGCGGCAA : 9437\n"
+        "       |||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||| "
+        "||||\x1B[1m\x1B[31m|\x1B[0m\x1B[39;49m||||||\n"
+        "  38 : CAGCACGGTAAACAGCGGCAA :   59\n"
+        "\n"};
+
+    record = *it++;  // 4th record (same expected text as the 3rd)
+    EXPECT_EQ(expected, pretty.Print(record, Orientation::GENOMIC));
+}
diff --git a/tests/src/test_BamFile.cpp b/tests/src/test_BamFile.cpp

new file mode 100644 (file)

index 0000000..bff5fd7
--- /dev/null
+++ b/tests/src/test_BamFile.cpp
@@ -0,0 +1,105 @@
+// Author: Derek Barnett
+
+#include <unistd.h>
+#include <cstddef>
+#include <cstdlib>
+#include <stdexcept>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/../../src/FileUtils.h>
+#include <pbbam/BamFile.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/Unused.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace BamFileTests {
+
+// Walks an EntireFileQuery built from 'input' and expects to visit exactly
+// 'expectedCount' records (the record contents themselves are not inspected).
+template <typename T>
+void CheckFile(const T& input, const size_t expectedCount)
+{
+    EntireFileQuery query(input);
+    size_t numRecords = 0;
+    for (auto it = query.begin(); it != query.end(); ++it, ++numRecords)
+        UNUSED(*it);
+    EXPECT_EQ(expectedCount, numRecords);
+}
+
+}  // namespace BamFileTests
+
+TEST(BamFileTest, NonExistentFileThrows)  // constructing from a missing path must throw
+{
+    EXPECT_THROW(BamFile{"does_not_exist.bam"}, std::runtime_error);
+}
+
+TEST(BamFileTest, NonBamFileThrows)  // an existing non-BAM input (a .fai file) must throw too
+{
+    EXPECT_THROW(BamFile{PbbamTestsConfig::Data_Dir + "/lambdaNEB.fa.fai"}, std::runtime_error);
+}
+
+TEST(BamFileTest, RelativePathBamOk)
+{
+    // cache current working directory, then drill down so we can point to
+    // BAMs using relative path
+    const std::string cwd = internal::FileUtils::CurrentWorkingDirectory();
+    ASSERT_EQ(0, chdir(PbbamTestsConfig::Data_Dir.c_str()));
+    ASSERT_EQ(0, chdir("relative/a"));
+
+    // BamFile from relative BAM fn
+    BamFileTests::CheckFile(BamFile{"../b/test1.bam"}, 3);
+
+    // dataset from relative BAM fn
+    BamFileTests::CheckFile(DataSet{"../b/test1.bam"}, 3);
+
+    // dataset from BamFile object (itself from relative BAM fn)
+    {
+        auto file = BamFile{"../b/test1.bam"};
+        BamFileTests::CheckFile(DataSet{file}, 3);
+    }
+
+    // restore working directory
+    ASSERT_EQ(0, chdir(cwd.c_str()));
+}
+
+TEST(BamFileTest, RelativePathXmlOk)
+{
+    // restore the working directory on every exit path (fatal assertion or exception included)
+    struct CwdGuard
+    {
+        const std::string dir = internal::FileUtils::CurrentWorkingDirectory();
+        ~CwdGuard() { EXPECT_EQ(0, chdir(dir.c_str())); }
+    } cwdGuard;
+    ASSERT_EQ(0, chdir(PbbamTestsConfig::Data_Dir.c_str()));
+
+    // dataset from XML containing relative paths
+    BamFileTests::CheckFile(DataSet{"relative/relative.xml"}, 9);
+}
+
+TEST(BamFileTest, RelativePathFofnOk)
+{
+    // restore the working directory on every exit path (fatal assertion or exception included)
+    struct CwdGuard
+    {
+        const std::string dir = internal::FileUtils::CurrentWorkingDirectory();
+        ~CwdGuard() { EXPECT_EQ(0, chdir(dir.c_str())); }
+    } cwdGuard;
+    ASSERT_EQ(0, chdir(PbbamTestsConfig::Data_Dir.c_str()));
+
+    // dataset from FOFN containing relative paths
+    BamFileTests::CheckFile(DataSet{"relative/relative.fofn"}, 9);
+
+    // NOTE: doesn't yet support a FOFN containing an XML with relative paths
+    //       BamFileTests::CheckFile(DataSet{ "relative/relative2.fofn" }, 60);
+}
+
+TEST(BamFileTest, TruncatedFileThrowsOk)  // truncated.bam (generated-data dir) must be rejected
+{
+    EXPECT_THROW(BamFile{PbbamTestsConfig::GeneratedData_Dir + "/truncated.bam"},
+                 std::runtime_error);
+}
diff --git a/tests/src/test_BamHeader.cpp b/tests/src/test_BamHeader.cpp

new file mode 100644 (file)

index 0000000..f6db965
--- /dev/null
+++ b/tests/src/test_BamHeader.cpp
@@ -0,0 +1,392 @@
+// Author: Derek Barnett
+
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include <htslib/sam.h>
+
+#include <pbbam/BamHeader.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace BamHeaderTests {
+
+struct BamHdrDeleter
+{
+    // Deleter for smart pointers owning an htslib header; 'hdr' is a by-value copy, nothing to reset.
+    void operator()(bam_hdr_t* hdr) const
+    {
+        if (hdr) bam_hdr_destroy(hdr);
+    }
+};
+
+}  // namespace BamHeaderTests
+
+TEST(BamHeaderTest, DefaultConstruction)
+{
+    BamHeader blank;  // default-constructed: every field and list below should be empty
+    EXPECT_TRUE(blank.Version().empty());
+    EXPECT_TRUE(blank.SortOrder().empty());  // default to unknown ?
+    EXPECT_TRUE(blank.ReadGroups().empty());
+    EXPECT_TRUE(blank.Sequences().empty());
+    EXPECT_TRUE(blank.Programs().empty());
+    EXPECT_TRUE(blank.Comments().empty());
+
+    EXPECT_THROW(blank.Program("foo"), std::exception);  // lookups have nothing to find
+    EXPECT_THROW(blank.ReadGroup("foo"), std::exception);
+    EXPECT_THROW(blank.SequenceId("foo"), std::exception);
+    EXPECT_THROW(blank.SequenceLength(42), std::exception);
+    EXPECT_THROW(blank.SequenceName(42), std::exception);
+}
+
+TEST(BamHeaderTest, DecodeTest)
+{
+    const std::string text{
+        "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"
+        "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n"
+        "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n"
+        "@RG\tID:rg1\tSM:control\n"
+        "@RG\tID:rg2\tSM:condition1\n"
+        "@RG\tID:rg3\tSM:condition1\n"
+        "@PG\tID:_foo_\tPN:ide\n"
+        "@CO\tipsum and so on\n"
+        "@CO\tcitation needed\n"};
+
+    BamHeader header = BamHeader(text);
+
+    EXPECT_EQ(std::string("1.1"), header.Version());
+    EXPECT_EQ(std::string("queryname"), header.SortOrder());
+    EXPECT_EQ(std::string("3.0.1"), header.PacBioBamVersion());
+
+    EXPECT_EQ(3, header.ReadGroups().size());
+    EXPECT_TRUE(header.HasReadGroup("rg1"));
+    EXPECT_TRUE(header.HasReadGroup("rg2"));
+    EXPECT_TRUE(header.HasReadGroup("rg3"));
+
+    EXPECT_EQ(std::string("control"), header.ReadGroup("rg1").Sample());
+    EXPECT_EQ(std::string("condition1"), header.ReadGroup("rg2").Sample());
+    EXPECT_EQ(std::string("condition1"), header.ReadGroup("rg3").Sample());
+
+    EXPECT_EQ(2, header.Sequences().size());
+    EXPECT_TRUE(header.HasSequence("chr1"));
+    EXPECT_TRUE(header.HasSequence("chr2"));
+    EXPECT_EQ(std::string("chocobo"), header.Sequence("chr1").Species());
+    EXPECT_EQ(std::string("chocobo"), header.Sequence("chr2").Species());
+    EXPECT_EQ(std::string("2038"), header.Sequence("chr1").Length());
+    EXPECT_EQ(std::string("3042"), header.Sequence("chr2").Length());
+
+    EXPECT_EQ(1, header.Programs().size());
+    EXPECT_TRUE(header.HasProgram("_foo_"));
+    EXPECT_EQ(std::string("ide"), header.Program("_foo_").Name());
+
+    EXPECT_EQ(2, header.Comments().size());
+    EXPECT_EQ(std::string("ipsum and so on"), header.Comments().at(0));
+    EXPECT_EQ(std::string("citation needed"), header.Comments().at(1));
+}
+
+TEST(BamHeaderTest, VersionCheckOk)
+{
+    const auto rejects = [](const std::string& what, const std::string& headerText) {
+        SCOPED_TRACE(what);
+        EXPECT_THROW(BamHeader{headerText}, std::runtime_error);
+    };
+    rejects("empty version", "@HD\tVN:1.1\tSO:queryname\tpb:\n");
+    rejects("old beta version", "@HD\tVN:1.1\tSO:queryname\tpb:3.0b3\n");
+    rejects("old beta version", "@HD\tVN:1.1\tSO:queryname\tpb:3.0b7\n");
+    rejects("invalid value", "@HD\tVN:1.1\tSO:queryname\tpb:3.0.should_not_work\n");
+    rejects("earlier than minimum", "@HD\tVN:1.1\tSO:queryname\tpb:3.0.0\n");
+
+    // a well-formed 'pb' version (3.0.1) is accepted
+    EXPECT_NO_THROW(BamHeader{"@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"});
+}
+
+TEST(BamHeaderTest, EncodeTest)  // builds a header via chained setters and checks ToSam() text
+{
+    ReadGroupInfo rg1("rg1");
+    rg1.Sample("control");
+    ReadGroupInfo rg2("rg2");
+    rg2.Sample("condition1");
+    ReadGroupInfo rg3("rg3");
+    rg3.Sample("condition1");
+
+    SequenceInfo seq1("chr1");
+    seq1.Length("2038").Species("chocobo");
+    SequenceInfo seq2("chr2");
+    seq2.Length("3042").Species("chocobo");
+
+    ProgramInfo prog1("_foo_");
+    prog1.Name("ide");
+
+    BamHeader header;
+    header.Version("1.1")
+        .SortOrder("queryname")
+        .PacBioBamVersion("3.0.1")
+        .AddReadGroup(rg1)
+        .AddReadGroup(rg2)
+        .AddReadGroup(rg3)
+        .AddSequence(seq1)
+        .AddSequence(seq2)
+        .AddProgram(prog1)
+        .AddComment("ipsum and so on")
+        .AddComment("citation needed");
+
+    const std::string expectedText{  // note: @RG rows carry PL/DS/PM, which this test never sets
+        "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"
+        "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n"
+        "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n"
+        "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL\n"
+        "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+        "@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+        "@PG\tID:_foo_\tPN:ide\n"
+        "@CO\tipsum and so on\n"
+        "@CO\tcitation needed\n"};
+
+    const std::string text = header.ToSam();
+    EXPECT_EQ(expectedText, text);
+}
+
+TEST(BamHeaderTest, ConvertToRawDataOk)
+{
+    ReadGroupInfo rg1("rg1");
+    rg1.Sample("control");
+    ReadGroupInfo rg2("rg2");
+    rg2.Sample("condition1");
+    ReadGroupInfo rg3("rg3");
+    rg3.Sample("condition1");
+
+    SequenceInfo seq1("chr1");
+    seq1.Length("2038").Species("chocobo");
+    SequenceInfo seq2("chr2");
+    seq2.Length("3042").Species("chocobo");
+
+    ProgramInfo prog1("_foo_");
+    prog1.Name("ide");
+
+    BamHeader header;
+    header.Version("1.1")
+        .SortOrder("queryname")
+        .PacBioBamVersion("3.0.1")
+        .AddReadGroup(rg1)
+        .AddReadGroup(rg2)
+        .AddReadGroup(rg3)
+        .AddSequence(seq1)
+        .AddSequence(seq2)
+        .AddProgram(prog1)
+        .AddComment("ipsum and so on")
+        .AddComment("citation needed");
+
+    const std::string expectedText{
+        "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"
+        "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n"
+        "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n"
+        "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL\n"
+        "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+        "@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+        "@PG\tID:_foo_\tPN:ide\n"
+        "@CO\tipsum and so on\n"
+        "@CO\tcitation needed\n"};
+
+    const std::string text = header.ToSam();
+    std::shared_ptr<bam_hdr_t> rawData(sam_hdr_parse(text.size(), text.c_str()),
+                                       BamHeaderTests::BamHdrDeleter());
+    rawData->ignore_sam_err = 0;
+    rawData->cigar_tab = nullptr;
+    rawData->l_text = text.size();
+    rawData->text = static_cast<char*>(calloc(rawData->l_text + 1, 1));
+    memcpy(rawData->text, text.c_str(), rawData->l_text);
+
+    const std::string rawText(rawData->text, rawData->l_text);
+    EXPECT_EQ(expectedText, rawText);
+}
+
+TEST(BamHeaderTest, ExtractFromRawDataOk)
+{
+    ReadGroupInfo rg1("rg1");
+    rg1.Sample("control");
+    ReadGroupInfo rg2("rg2");
+    rg2.Sample("condition1");
+    ReadGroupInfo rg3("rg3");
+    rg3.Sample("condition1");
+
+    SequenceInfo seq1("chr1");
+    seq1.Length("2038").Species("chocobo");
+    SequenceInfo seq2("chr2");
+    seq2.Length("3042").Species("chocobo");
+
+    ProgramInfo prog1("_foo_");
+    prog1.Name("ide");
+
+    BamHeader header;
+    header.Version("1.1")
+        .SortOrder("queryname")
+        .PacBioBamVersion("3.0.1")
+        .AddReadGroup(rg1)
+        .AddReadGroup(rg2)
+        .AddReadGroup(rg3)
+        .AddSequence(seq1)
+        .AddSequence(seq2)
+        .AddProgram(prog1)
+        .AddComment("ipsum and so on")
+        .AddComment("citation needed");
+
+    const std::string expectedText{
+        "@HD\tVN:1.1\tSO:queryname\tpb:3.0.1\n"
+        "@SQ\tSN:chr1\tLN:2038\tSP:chocobo\n"
+        "@SQ\tSN:chr2\tLN:3042\tSP:chocobo\n"
+        "@RG\tID:rg1\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:control\tPM:SEQUEL\n"
+        "@RG\tID:rg2\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+        "@RG\tID:rg3\tPL:PACBIO\tDS:READTYPE=UNKNOWN\tSM:condition1\tPM:SEQUEL\n"
+        "@PG\tID:_foo_\tPN:ide\n"
+        "@CO\tipsum and so on\n"
+        "@CO\tcitation needed\n"};
+
+    std::string text = header.ToSam();
+    std::shared_ptr<bam_hdr_t> rawData(sam_hdr_parse(text.size(), text.c_str()),
+                                       BamHeaderTests::BamHdrDeleter());
+    rawData->ignore_sam_err = 0;
+    rawData->cigar_tab = nullptr;
+    rawData->l_text = text.size();
+    rawData->text = static_cast<char*>(calloc(rawData->l_text + 1, 1));
+    memcpy(rawData->text, text.c_str(), rawData->l_text);
+
+    const BamHeader newHeader = BamHeader(std::string(rawData->text, rawData->l_text));
+
+    EXPECT_EQ(header.Version(), newHeader.Version());
+    EXPECT_EQ(header.SortOrder(), newHeader.SortOrder());
+    EXPECT_EQ(header.PacBioBamVersion(), newHeader.PacBioBamVersion());
+
+    text = newHeader.ToSam();
+    EXPECT_EQ(expectedText, text);
+}
+
+TEST(BamHeaderTest, MergeOk)  // operator+ and operator+= must both yield 'mergedText'
+{
+    const std::string hdrText1{  // first input: one @RG, two @PG, one @CO
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+        "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+        "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\t"
+        "PU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0\t"
+        "PM:SEQUEL\n"
+        "@PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0\n"
+        "@PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\n"
+        "@CO\tcomment1\n"};
+
+    const std::string hdrText2{  // second input: a different @RG, three @PG, one @CO
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;"
+        "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;"
+        "PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;"
+        "PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;"
+        "BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;"
+        "FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\t"
+        "PM:SEQUEL\n"
+        "@PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0\n"
+        "@PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0\n"
+        "@PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0\n"
+        "@CO\tcomment2\n"};
+
+    const std::string mergedText{  // expected: union of both inputs, grouped by record type
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+        "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+        "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\t"
+        "PU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0\t"
+        "PM:SEQUEL\n"
+        "@RG\tID:e83fc9c6\tPL:PACBIO\tDS:READTYPE=SCRAP;DeletionQV=dq;DeletionTag=dt;"
+        "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;SubstitutionTag=st;Ipd:Frames=ip;"
+        "PulseWidth:Frames=pw;PkMid=pm;PkMean=pa;LabelQV=pq;AltLabel=pt;AltLabelQV=pv;"
+        "PulseMergeQV=pg;PulseCall=pc;PrePulseFrames=pd;PulseCallWidth=px;"
+        "BINDINGKIT=100372700;SEQUENCINGKIT=100356200;BASECALLERVERSION=0.1;"
+        "FRAMERATEHZ=100.000000\tPU:ArminsFakeMovie\t"
+        "PM:SEQUEL\n"
+        "@PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0\n"
+        "@PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\n"
+        "@PG\tID:baz2bam-0.15.0\tPN:baz2bam\tVN:0.15.0\n"
+        "@PG\tID:bazFormat-0.3.0\tPN:bazFormat\tVN:0.3.0\n"
+        "@PG\tID:bazwriter-0.15.0\tPN:bazwriter\tVN:0.15.0\n"
+        "@CO\tcomment1\n"
+        "@CO\tcomment2\n"};
+
+    {  // operator+
+
+        const BamHeader header1(hdrText1);
+        const BamHeader header2(hdrText2);
+        const BamHeader merged = header1 + header2;
+        EXPECT_EQ(mergedText, merged.ToSam());
+
+        // also make sure inputs not changed
+        EXPECT_EQ(hdrText1, header1.ToSam());
+        EXPECT_EQ(hdrText2, header2.ToSam());
+    }
+
+    {  // operator+= (merges the right-hand header into header1 in place)
+
+        BamHeader header1(hdrText1);
+        header1 += BamHeader(hdrText2);
+        EXPECT_EQ(mergedText, header1.ToSam());
+    }
+}
+
+TEST(BamHeaderTest, MergeHandlesDuplicateReadGroups)
+{
+    const std::string hdrText{
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:a955def6\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+        "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+        "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.3.0.0.140018;FRAMERATEHZ=75.000000\t"
+        "PU:m140918_150013_42139_c100697631700000001823144703261565_s1_p0\tPM:SEQUEL\n"
+        "@PG\tID:bam2bam-0.20.0\tPN:bam2bam\tVN:0.20.0\n"
+        "@PG\tID:bax2bam-0.0.2\tPN:bax2bam\tVN:0.0.2\n"};
+
+    // duplicate @RG:IDs handled ok (i.e. not duplicated in output)
+    const BamHeader header1(hdrText);
+    const BamHeader header2(hdrText);
+    const BamHeader merged = header1 + header2;
+    EXPECT_EQ(hdrText, merged.ToSam());
+}
+
+TEST(BamHeaderTest, MergeCompatibilityOk)
+{
+    {  // different @HD:VN - this IS allowed (as of SAT-465, pbbam v0.7.2)
+        const std::string hdrText1 = {"@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"};
+        const std::string hdrText2 = {"@HD\tVN:1.0\tSO:unknown\tpb:3.0.1\n"};
+        const BamHeader header1(hdrText1);
+        const BamHeader header2(hdrText2);
+        EXPECT_NO_THROW(header1 + header2);
+    }
+
+    {  // different @HD:SO
+        const std::string hdrText1 = {"@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"};
+        const std::string hdrText2 = {"@HD\tVN:1.1\tSO:coordinate\tpb:3.0.1\n"};
+        const BamHeader header1(hdrText1);
+        const BamHeader header2(hdrText2);
+        EXPECT_THROW(header1 + header2, std::runtime_error);
+    }
+
+    {  // different @HD:pb - this IS allowed (as of SAT-529, pbbam 0.7.4)
+        const std::string hdrText1 = {"@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"};
+        const std::string hdrText2 = {"@HD\tVN:1.1\tSO:unknown\tpb:3.0.3\n"};
+        const BamHeader header1(hdrText1);
+        const BamHeader header2(hdrText2);
+        EXPECT_NO_THROW(header1 + header2);
+    }
+
+    {  // @SQ list clash
+        const std::string hdrText1{
+            "@HD\tVN:1.1\tSO:coordinate\tpb:3.0.1\n"
+            "@SQ\tSN:foo\tLN:42\n"
+            "@SQ\tSN:bar\tLN:24\n"};
+        const std::string hdrText2{
+            "@HD\tVN:1.1\tSO:coordinate\tpb:3.0.1\n"
+            "@SQ\tSN:foo\tLN:42\n"
+            "@SQ\tSN:baz\tLN:99\n"};
+        const BamHeader header1(hdrText1);
+        const BamHeader header2(hdrText2);
+        EXPECT_THROW(header1 + header2, std::runtime_error);
+    }
+}
diff --git a/tests/src/test_BamRecord.cpp b/tests/src/test_BamRecord.cpp

new file mode 100644 (file)

index 0000000..fe2d99d
--- /dev/null
+++ b/tests/src/test_BamRecord.cpp
@@ -0,0 +1,2725 @@
+// Author: Derek Barnett
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <initializer_list>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BamRecord.h>
+#include <pbbam/BamTagCodec.h>
+#include "../src/MemoryUtils.h"
+
+// clang-format off
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace BamRecordTests {
+
+// Builds the BamRecordImpl fixture shared by the tests below: three tags
+// ("HX" string marked as hex string, "CA" uint8 array, "XY" int32) and the
+// value 42 in each of the core fields set here.
+static BamRecordImpl CreateBamImpl()
+{
+    TagCollection fixtureTags;
+
+    // "HX": string payload, then flagged with the hex-string modifier
+    auto& hexTag = fixtureTags["HX"];
+    hexTag = std::string{"1abc75"};
+    hexTag.Modifier(TagModifier::HEX_STRING);
+
+    fixtureTags["CA"] = std::vector<uint8_t>{34, 5, 125};
+    fixtureTags["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Bin(42);
+    record.Flag(42);
+    record.InsertSize(42);
+    record.MapQuality(42);
+    record.MatePosition(42);
+    record.MateReferenceId(42);
+    record.Position(42);
+    record.ReferenceId(42);
+    record.Tags(fixtureTags);
+    return record;
+}
+
+// Convenience wrapper: the CreateBamImpl() fixture wrapped in a BamRecord.
+static inline BamRecord CreateBam()
+{
+    return BamRecord{ CreateBamImpl() };
+}
+
+static
+void CheckRawData(const BamRecordImpl& bam)
+{
+    // Recompute the expected raw-record lengths from the API-facing accessors, then compare them with the length fields of the raw data.
+    const uint32_t expectedNameBytes = bam.Name().size() + 1;  // include NULL term
+    const uint32_t expectedNameNulls = 4 - (expectedNameBytes % 4);  // padding; evaluates to 4 (not 0) when the byte count is already a multiple of 4
+    const uint32_t expectedNameLength = expectedNameBytes + expectedNameNulls;  // NULL-terminated name plus padding
+    const uint32_t expectedNumCigarOps = bam.CigarData().size();
+    const int32_t expectedSeqLength = bam.Sequence().length();
+    const size_t expectedTagsLength = BamTagCodec::Encode(bam.Tags()).size();  // size of the re-encoded tag data
+
+    //  Name        CIGAR         Sequence       Quals      Tags
+    // l_qname + (n_cigar * 4) + (l_qseq+1)/2 + l_qseq + <encoded length>
+    const int expectedTotalDataLength = expectedNameLength + (expectedNumCigarOps * 4) +
+                                        (expectedSeqLength + 1) / 2 + expectedSeqLength +
+                                        expectedTagsLength;
+
+    const auto rawData = PacBio::BAM::internal::BamRecordMemory::GetRawData(bam);
+    ASSERT_TRUE(static_cast<bool>(rawData));  // fatal assertion: do not dereference rawData below if it tests false
+
+    EXPECT_EQ(expectedNameNulls, rawData->core.l_extranul);
+    EXPECT_EQ(expectedNameLength, rawData->core.l_qname);
+    EXPECT_EQ(expectedNumCigarOps, rawData->core.n_cigar);
+    EXPECT_EQ(expectedSeqLength, rawData->core.l_qseq);
+    EXPECT_EQ(expectedTotalDataLength, rawData->l_data);
+}
+
+// BamRecord overload: forwards to the BamRecordImpl overload via Impl().
+static inline void CheckRawData(const BamRecord& bam)
+{
+    CheckRawData(bam.Impl());
+}
+
+static
+BamRecordImpl MakeCigaredImpl(const std::string& seq,
+                              const std::string& cigar,
+                              const Strand strand)
+{
+    BamRecordImpl impl;
+    impl.SetMapped(true).ReferenceId(0).Position(0).MapQuality(0);
+    impl.CigarData(Cigar::FromStdString(cigar));
+    impl.MateReferenceId(-1).MatePosition(-1).InsertSize(0);
+    impl.SetSequenceAndQualities(seq, std::string(seq.size(), '*'));
+    impl.SetReverseStrand(strand == Strand::REVERSE);
+    return impl;
+}
+
+// Same as MakeCigaredImpl(), with the result wrapped in a BamRecord.
+static inline BamRecord MakeCigaredRecord(const std::string& seq,
+                                          const std::string& cigar,
+                                          const Strand strand)
+{
+    return BamRecord{ MakeCigaredImpl(seq, cigar, strand) };
+}
+
+static
+BamRecord MakeCigaredBaseRecord(const std::string& bases,
+                                const std::string& cigar,
+                                const Strand strand)
+{
+    TagCollection tags;
+    tags["dt"] = bases;
+    tags["st"] = bases;
+
+    const std::string seq = std::string(bases.size(), 'N');
+    BamRecordImpl impl = MakeCigaredImpl(seq, cigar, strand);
+    impl.Tags(tags);
+    return BamRecord(std::move(impl));
+}
+
+static
+BamRecord MakeCigaredFrameRecord(const std::vector<uint16_t>& frames,
+                                 const std::string& cigar,
+                                 const Strand strand)
+{
+    TagCollection tags;
+    tags["ip"] = frames;
+    tags["pw"] = frames;
+
+    const std::string seq = std::string(frames.size(), 'N');
+    BamRecordImpl impl = MakeCigaredImpl(seq, cigar, strand);
+    impl.Tags(tags);
+    return BamRecord(std::move(impl));
+}
+
+static
+BamRecord MakeCigaredQualRecord(const std::string& quals,
+                                const std::string& cigar,
+                                const Strand strand)
+{
+    TagCollection tags;
+    tags["dq"] = quals;
+    tags["iq"] = quals;
+    tags["mq"] = quals;
+    tags["sq"] = quals;
+
+    const std::string seq = std::string(quals.size(), 'N');
+    BamRecordImpl impl = MakeCigaredImpl(seq, cigar, strand);
+    impl.Tags(tags);
+    return BamRecord(std::move(impl));
+}
+
+static
+BamRecord MakeCigaredPulseBaseRecord(const std::string& seqBases,
+                                     const std::string& pulseCalls,
+                                     const std::string& pulseBases,
+                                     const std::string& cigar,
+                                     const Strand strand)
+{
+    TagCollection tags;
+    tags["pc"] = pulseCalls; // PulseCall
+    tags["pt"] = pulseBases; // AltLabelTag
+
+    BamRecordImpl impl = MakeCigaredImpl(seqBases, cigar, strand);
+    impl.Tags(tags);
+    return BamRecord(std::move(impl));
+}
+
+static
+BamRecord MakeCigaredPulseQualRecord(const std::string& seqBases,
+                                     const std::string& pulseCalls,
+                                     const std::string& pulseQuals,
+                                     const std::string& cigar,
+                                     const Strand strand)
+{
+    TagCollection tags;
+    tags["pc"] = pulseCalls;
+    tags["pv"] = pulseQuals; // AltLabelQV
+    tags["pq"] = pulseQuals; // LabelQV
+    tags["pg"] = pulseQuals; // PulseMergeQV
+
+    BamRecordImpl impl = MakeCigaredImpl(seqBases, cigar, strand);
+    impl.Tags(tags);
+    return BamRecord(std::move(impl));
+}
+
+static
+BamRecord MakeCigaredPulseFrameRecord(const std::string& seqBases,
+                                     const std::string& pulseCalls,
+                                     const std::vector<uint16_t>& pulseFrames,
+                                     const std::string& cigar,
+                                     const Strand strand)
+{
+    TagCollection tags;
+    tags["pc"] = pulseCalls;
+    tags["pd"] = pulseFrames; // PrePulseFrames
+    tags["px"] = pulseFrames; // PulseCallWidth
+
+    BamRecordImpl impl = MakeCigaredImpl(seqBases, cigar, strand);
+    impl.Tags(tags);
+    return BamRecord(std::move(impl));
+}
+
+static
+BamRecord MakeCigaredPulseUIntRecord(const std::string& seqBases,
+                                     const std::string& pulseCalls,
+                                     const std::vector<uint32_t>& pulseUInts,
+                                     const std::string& cigar,
+                                     const Strand strand)
+{
+    TagCollection tags;
+    tags["pc"] = pulseCalls;
+    tags["sf"] = pulseUInts; // StartFrame
+
+    BamRecordImpl impl = MakeCigaredImpl(seqBases, cigar, strand);
+    impl.Tags(tags);
+    return BamRecord(std::move(impl));
+}
+
+// ----------------------------------------------------------
+// helper structs and methods for checking combinations of:
+//   aligned strand, orientation requested, alignment, clipping
+// ----------------------------------------------------------
+
+// generic result holder for various requested states
+template<typename T>
+struct ExpectedResult
+{
+public:
+    ExpectedResult(std::initializer_list<T> init)
+        : d_(init)
+    {
+        assert(12 == init.size());
+    }
+
+    T ForwardGenomic() const               { return d_.at(0); }
+    T ForwardNative() const                { return d_.at(1); }
+    T ForwardGenomicAligned() const        { return d_.at(2); }
+    T ForwardNativeAligned() const         { return d_.at(3); }
+    T ForwardGenomicAlignedClipped() const { return d_.at(4); }
+    T ForwardNativeAlignedClipped() const  { return d_.at(5); }
+    T ReverseGenomic() const               { return d_.at(6); }
+    T ReverseNative() const                { return d_.at(7); }
+    T ReverseGenomicAligned() const        { return d_.at(8); }
+    T ReverseNativeAligned() const         { return d_.at(9); }
+    T ReverseGenomicAlignedClipped() const { return d_.at(10); }
+    T ReverseNativeAlignedClipped() const  { return d_.at(11); }
+
+private:
+    std::vector<T> d_;
+};
+
+// Generic checker: builds one record per strand via makeRecord(input, cigar, strand), then compares fetchData(record, orientation, aligned, exciseSoftClips) with the matching entry of `e`.
+template<typename DataType, typename MakeRecordType, typename FetchDataType>
+void CheckAlignAndClip(const std::string& cigar,
+                       const DataType& input,
+                       const BamRecordTests::ExpectedResult<DataType>& e,
+                       const MakeRecordType& makeRecord,
+                       const FetchDataType& fetchData)
+{
+    {   // map to forward strand
+        const BamRecord b = makeRecord(input, cigar, Strand::FORWARD);
+        EXPECT_EQ(e.ForwardGenomic(),               fetchData(b, Orientation::GENOMIC, false, false));
+        EXPECT_EQ(e.ForwardNative(),                fetchData(b, Orientation::NATIVE,  false, false));
+        EXPECT_EQ(e.ForwardGenomicAligned(),        fetchData(b, Orientation::GENOMIC, true,  false));
+        EXPECT_EQ(e.ForwardNativeAligned(),         fetchData(b, Orientation::NATIVE,  true,  false));
+        EXPECT_EQ(e.ForwardGenomicAlignedClipped(), fetchData(b, Orientation::GENOMIC, true,  true));
+        EXPECT_EQ(e.ForwardNativeAlignedClipped(),  fetchData(b, Orientation::NATIVE,  true,  true));
+    }
+    {   // map to reverse strand
+        const BamRecord b = makeRecord(input, cigar, Strand::REVERSE);
+        EXPECT_EQ(e.ReverseGenomic(),               fetchData(b, Orientation::GENOMIC, false, false));
+        EXPECT_EQ(e.ReverseNative(),                fetchData(b, Orientation::NATIVE,  false, false));
+        EXPECT_EQ(e.ReverseGenomicAligned(),        fetchData(b, Orientation::GENOMIC, true,  false));
+        EXPECT_EQ(e.ReverseNativeAligned(),         fetchData(b, Orientation::NATIVE,  true,  false));
+        EXPECT_EQ(e.ReverseGenomicAlignedClipped(), fetchData(b, Orientation::GENOMIC, true,  true));
+        EXPECT_EQ(e.ReverseNativeAlignedClipped(),  fetchData(b, Orientation::NATIVE,  true,  true));
+    }
+}
+
+template<typename DataType, typename MakeRecordType, typename FetchDataType>
+void CheckPulseDataAlignAndClip(const std::string& cigar,
+                                const std::string& seqBases,
+                                const std::string& pulseCalls,
+                                const DataType& input,
+                                const BamRecordTests::ExpectedResult<DataType>& allPulses,
+                                const BamRecordTests::ExpectedResult<DataType>& basecallsOnly,
+                                const MakeRecordType& makeRecord,
+                                const FetchDataType& fetchData)
+{   // Pulse-data checker: per strand, builds a record via makeRecord(seqBases, pulseCalls, input, cigar, strand) and checks fetchData under PulseBehavior::ALL and ::BASECALLS_ONLY.
+    {   // map to forward strand
+        const BamRecord b = makeRecord(seqBases, pulseCalls, input, cigar, Strand::FORWARD);
+
+        EXPECT_EQ(allPulses.ForwardGenomic(),               fetchData(b, Orientation::GENOMIC, false, false, PulseBehavior::ALL));
+        EXPECT_EQ(allPulses.ForwardNative(),                fetchData(b, Orientation::NATIVE,  false, false, PulseBehavior::ALL));
+        // no align/clipping operations available on ALL pulses, so only the two unaligned allPulses entries are used
+
+        EXPECT_EQ(basecallsOnly.ForwardGenomic(),               fetchData(b, Orientation::GENOMIC, false, false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ForwardNative(),                fetchData(b, Orientation::NATIVE,  false, false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ForwardGenomicAligned(),        fetchData(b, Orientation::GENOMIC, true,  false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ForwardNativeAligned(),         fetchData(b, Orientation::NATIVE,  true,  false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ForwardGenomicAlignedClipped(), fetchData(b, Orientation::GENOMIC, true,  true,  PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ForwardNativeAlignedClipped(),  fetchData(b, Orientation::NATIVE,  true,  true,  PulseBehavior::BASECALLS_ONLY));
+    }
+    {   // map to reverse strand
+        const BamRecord b = makeRecord(seqBases, pulseCalls, input, cigar, Strand::REVERSE);
+
+        EXPECT_EQ(allPulses.ReverseGenomic(),               fetchData(b, Orientation::GENOMIC, false, false, PulseBehavior::ALL));
+        EXPECT_EQ(allPulses.ReverseNative(),                fetchData(b, Orientation::NATIVE,  false, false, PulseBehavior::ALL));
+        // no align/clipping operations available on ALL pulses, so only the two unaligned allPulses entries are used
+
+        EXPECT_EQ(basecallsOnly.ReverseGenomic(),               fetchData(b, Orientation::GENOMIC, false, false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ReverseNative(),                fetchData(b, Orientation::NATIVE,  false, false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ReverseGenomicAligned(),        fetchData(b, Orientation::GENOMIC, true,  false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ReverseNativeAligned(),         fetchData(b, Orientation::NATIVE,  true,  false, PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ReverseGenomicAlignedClipped(), fetchData(b, Orientation::GENOMIC, true,  true,  PulseBehavior::BASECALLS_ONLY));
+        EXPECT_EQ(basecallsOnly.ReverseNativeAlignedClipped(),  fetchData(b, Orientation::NATIVE,  true,  true,  PulseBehavior::BASECALLS_ONLY));
+    }
+}
+
+static
+void CheckBaseTagsClippedAndAligned(const std::string& cigar,
+                                    const std::string& input,
+                                    const ExpectedResult<std::string>& e)
+{
+    // aligned record + DeletionTag, SubstitutionTag
+    auto makeRecord = [](const std::string& newBases,
+                         const std::string& newCigar,
+                         const Strand newStrand)
+    { return MakeCigaredBaseRecord(newBases, newCigar, newStrand); };
+
+    // DeletionTag
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.DeletionTag(orientation, aligned, exciseSoftClips); }
+    );
+
+    // SubstitutionTag
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.SubstitutionTag(orientation, aligned, exciseSoftClips); }
+    );
+}
+
+static
+void CheckFrameTagsClippedAndAligned(const std::string& cigar,
+                                     const std::vector<uint16_t>& input,
+                                     const ExpectedResult<std::vector<uint16_t> >& e)
+{
+
+    // aligned record + IPD, PulseWidth
+    auto makeRecord = [](const std::vector<uint16_t>& newFrames,
+                         const std::string& newCigar,
+                         const Strand newStrand)
+    { return BamRecordTests::MakeCigaredFrameRecord(newFrames, newCigar, newStrand); };
+
+    // IPD
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.IPD(orientation, aligned, exciseSoftClips).Data(); }
+    );
+
+    // PulseWidth
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.PulseWidth(orientation, aligned, exciseSoftClips).Data(); }
+    );
+}
+
+static
+void CheckQualityTagsClippedAndAligned(const std::string& cigar,
+                                       const std::string& input,
+                                       const ExpectedResult<std::string>& e)
+{
+    // aligned record + DeletionQV, InsertionQV, MergeQV, SubstitutionQV
+    auto makeRecord = [](const std::string& newQuals,
+                         const std::string& newCigar,
+                         const Strand newStrand)
+    { return BamRecordTests::MakeCigaredQualRecord(newQuals, newCigar, newStrand); };
+
+    // DeletionQV
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.DeletionQV(orientation, aligned, exciseSoftClips).Fastq(); }
+    );
+
+    // InsertionQV
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.InsertionQV(orientation, aligned, exciseSoftClips).Fastq(); }
+    );
+
+    // MergeQV
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.MergeQV(orientation, aligned, exciseSoftClips).Fastq(); }
+    );
+
+    // SubstitutionQV
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.SubstitutionQV(orientation, aligned, exciseSoftClips).Fastq(); }
+    );
+}
+
+static
+void CheckQualitiesClippedAndAligned(const std::string& cigar,
+                                     const std::string& input,
+                                     const ExpectedResult<std::string>& e)
+{
+    // aligned record w/ dummy SEQ & QUALs under test
+    auto makeRecord = [](const std::string& newQuals,
+                         const std::string& newCigar,
+                         const Strand newStrand)
+    {
+        const std::string seq = std::string(newQuals.size(), 'N');
+        auto record = BamRecordTests::MakeCigaredRecord(seq, newCigar, newStrand);
+        record.Impl().SetSequenceAndQualities(seq, newQuals);
+        return record;
+    };
+
+    // QUAL
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.Qualities(orientation, aligned, exciseSoftClips).Fastq(); }
+    );
+}
+
+static
+void CheckSequenceClippedAndAligned(const std::string& cigar,
+                                    const std::string& input,
+                                    const ExpectedResult<std::string>& e)
+{
+    // aligned record w/ SEQ
+    auto makeRecord = [](const std::string& newSeq,
+                         const std::string& newCigar,
+                         const Strand newStrand)
+    { return BamRecordTests::MakeCigaredRecord(newSeq, newCigar, newStrand); };
+
+    // SEQ
+    CheckAlignAndClip(cigar, input, e, makeRecord,
+                      [](const BamRecord& b,
+                         Orientation orientation,
+                         bool aligned,
+                         bool exciseSoftClips)
+                      { return b.Sequence(orientation, aligned, exciseSoftClips); }
+    );
+}
+
+static
+void CheckPulseBaseTags(const std::string& cigar,
+                        const std::string& seqBases,
+                        const std::string& pulseCalls,
+                        const std::string& pulseBases,
+                        const ExpectedResult<std::string>& allPulses,
+                        const ExpectedResult<std::string>& basecallsOnly)
+{
+    // aligned record + AltLabelTag
+    auto makeRecord = [](const std::string& newSeqBases,
+                         const std::string& newPulseCalls,
+                         const std::string& newPulseBases,
+                         const std::string& newCigar,
+                         const Strand newStrand)
+    { return MakeCigaredPulseBaseRecord(newSeqBases, newPulseCalls, newPulseBases, newCigar, newStrand); };
+
+    // AltLabelTag
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseBases, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.AltLabelTag(orientation, aligned, exciseSoftClips, pulseBehavior); }
+    );
+    // PulseCall
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseBases, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.PulseCall(orientation, aligned, exciseSoftClips, pulseBehavior); }
+    );
+}
+
+static
+void CheckPulseFrameTags(const std::string& cigar,
+                         const std::string& seqBases,
+                         const std::string& pulseCalls,
+                         const std::vector<uint16_t>& pulseFrames,
+                         const ExpectedResult<std::vector<uint16_t>>& allPulses,
+                         const ExpectedResult<std::vector<uint16_t>>& basecallsOnly)
+{
+    // aligned record + PrePulseFrames
+    auto makeRecord = [](const std::string& newSeqBases,
+                         const std::string& newPulseCalls,
+                         const std::vector<uint16_t>& newPulseFrames,
+                         const std::string& newCigar,
+                         const Strand newStrand)
+    { return MakeCigaredPulseFrameRecord(newSeqBases, newPulseCalls, newPulseFrames, newCigar, newStrand); };
+
+    // PrePulseFrame
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseFrames, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.PrePulseFrames(orientation, aligned, exciseSoftClips, pulseBehavior).Data(); }
+    );
+    // PulseCallWidth
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseFrames, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.PulseCallWidth(orientation, aligned, exciseSoftClips, pulseBehavior).Data(); }
+    );
+}
+
+/*
+
+    { BamRecordTag::PKMEAN,            {"pa", true}  },   photons (vector<float>)
+    { BamRecordTag::PKMEAN_2,          {"ps", true}  },   photons
+    { BamRecordTag::PKMID,             {"pm", true}  },   photons
+    { BamRecordTag::PKMID_2,           {"pi", true}  },   photons
+*/
+
+static
+void CheckPulseQualityTags(const std::string& cigar,
+                           const std::string& seqBases,
+                           const std::string& pulseCalls,
+                           const std::string& pulseQuals,
+                           const ExpectedResult<std::string>& allPulses,
+                           const ExpectedResult<std::string>& basecallsOnly)
+{
+    // aligned record + AltLabelQV
+    auto makeRecord = [](const std::string& newSeqBases,
+                         const std::string& newPulseCalls,
+                         const std::string& newPulseQuals,
+                         const std::string& newCigar,
+                         const Strand newStrand)
+    { return MakeCigaredPulseQualRecord(newSeqBases, newPulseCalls, newPulseQuals, newCigar, newStrand); };
+
+    // AltLabelQV
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseQuals, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.AltLabelQV(orientation, aligned, exciseSoftClips, pulseBehavior).Fastq(); }
+    );
+    // LabelQV
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseQuals, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.LabelQV(orientation, aligned, exciseSoftClips, pulseBehavior).Fastq(); }
+    );
+    // PulseMergeQV
+    CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, pulseQuals, allPulses, basecallsOnly, makeRecord,
+                              [](const BamRecord& b,
+                                 Orientation orientation,
+                                 bool aligned,
+                                 bool exciseSoftClips,
+                                 PulseBehavior pulseBehavior)
+                              { return b.PulseMergeQV(orientation, aligned, exciseSoftClips, pulseBehavior).Fastq(); }
+    );
+}
+
+static
+void CheckPulseUIntTags(const std::string& cigar,
+                        const std::string& seqBases,
+                        const std::string& pulseCalls,
+                        const std::vector<uint32_t>& startFrames,
+                        const ExpectedResult<std::vector<uint32_t>>& allPulses,
+                        const ExpectedResult<std::vector<uint32_t>>& basecallsOnly)
+{
+   // aligned record + StartFrame
+   auto makeRecord = [](const std::string& newSeqBases,
+                        const std::string& newPulseCalls,
+                        const std::vector<uint32_t>& newStartFrames,
+                        const std::string& newCigar,
+                        const Strand newStrand)
+   { return MakeCigaredPulseUIntRecord(newSeqBases, newPulseCalls, newStartFrames, newCigar, newStrand); };
+
+   // StartFrame
+   CheckPulseDataAlignAndClip(cigar, seqBases, pulseCalls, startFrames, allPulses, basecallsOnly, makeRecord,
+                             [](const BamRecord& b,
+                                Orientation orientation,
+                                bool aligned,
+                                bool exciseSoftClips,
+                                PulseBehavior pulseBehavior)
+                             { return b.StartFrame(orientation, aligned, exciseSoftClips, pulseBehavior); }
+   );
+}
+
+
+
+} // namespace BamRecordTests
+
+TEST(BamRecordTest, DefaultValues)
+{   // Pins the state of a default-constructed BamRecord: core fields, flag queries, PacBio tag accessors, and raw-data lengths.
+    BamRecord bam;
+    const std::string emptyString;
+
+    // BamRecordImpl data
+    EXPECT_EQ(0, bam.Impl().Bin());
+    EXPECT_EQ(BamRecordImpl::UNMAPPED, bam.Impl().Flag());  // forced init unmapped
+    EXPECT_EQ(0, bam.Impl().InsertSize());
+    EXPECT_EQ(255, bam.Impl().MapQuality());  // 255 is the SAM "mapping quality not available" value
+    EXPECT_EQ(-1, bam.Impl().MateReferenceId());
+    EXPECT_EQ(-1, bam.Impl().MatePosition());
+    EXPECT_EQ(-1, bam.Impl().Position());
+    EXPECT_EQ(-1, bam.Impl().ReferenceId());
+    EXPECT_EQ(0, bam.Impl().Tags().size());
+
+    EXPECT_FALSE(bam.Impl().IsDuplicate());
+    EXPECT_FALSE(bam.Impl().IsFailedQC());
+    EXPECT_FALSE(bam.Impl().IsFirstMate());
+    EXPECT_FALSE(bam.Impl().IsMapped());             // forced init unmapped
+    EXPECT_TRUE(bam.Impl().IsMateMapped());
+    EXPECT_FALSE(bam.Impl().IsMateReverseStrand());
+    EXPECT_FALSE(bam.Impl().IsPaired());
+    EXPECT_TRUE(bam.Impl().IsPrimaryAlignment());
+    EXPECT_FALSE(bam.Impl().IsProperPair());
+    EXPECT_FALSE(bam.Impl().IsReverseStrand());
+    EXPECT_FALSE(bam.Impl().IsSecondMate());
+    EXPECT_FALSE(bam.Impl().IsSupplementaryAlignment());
+
+    EXPECT_EQ(emptyString, bam.Impl().Name());
+    EXPECT_EQ(emptyString, bam.Impl().CigarData().ToStdString());
+    EXPECT_EQ(emptyString, bam.Impl().Sequence());
+    EXPECT_EQ(emptyString, bam.Impl().Qualities().Fastq());
+
+    // PacBio data
+    EXPECT_EQ(-1, bam.AlignedStart());
+    EXPECT_EQ(-1, bam.AlignedEnd());
+
+    EXPECT_FALSE(bam.HasHoleNumber());
+    EXPECT_FALSE(bam.HasNumPasses());
+    EXPECT_FALSE(bam.HasQueryEnd());
+    EXPECT_FALSE(bam.HasQueryStart());
+    EXPECT_FALSE(bam.HasReadAccuracy());
+    // the accessors for absent data throw, except QueryEnd/QueryStart, which report 0
+    EXPECT_THROW(bam.HoleNumber(), std::exception);
+    EXPECT_THROW(bam.NumPasses(), std::exception);
+    EXPECT_EQ(int32_t{0}, bam.QueryEnd());
+    EXPECT_EQ(int32_t{0}, bam.QueryStart());
+    EXPECT_THROW(bam.ReadAccuracy(), std::exception);
+
+    EXPECT_FALSE(bam.HasDeletionQV());
+    EXPECT_FALSE(bam.HasDeletionTag());
+    EXPECT_FALSE(bam.HasInsertionQV());
+    EXPECT_FALSE(bam.HasMergeQV());
+    EXPECT_FALSE(bam.HasSubstitutionQV());
+    EXPECT_FALSE(bam.HasSubstitutionTag());
+    // the per-base QV/tag accessors all throw when their data is absent
+    EXPECT_THROW(bam.DeletionQV(),      std::exception);
+    EXPECT_THROW(bam.DeletionTag(),     std::exception);
+    EXPECT_THROW(bam.InsertionQV(),     std::exception);
+    EXPECT_THROW(bam.MergeQV(),         std::exception);
+    EXPECT_THROW(bam.SubstitutionQV(),  std::exception);
+    EXPECT_THROW(bam.SubstitutionTag(), std::exception);
+
+    // raw data
+    BamRecordTests::CheckRawData(bam);
+}
+
+TEST(BamRecordTest, FromBamRecordImpl)
+{
+    // check generic data
+    BamRecordImpl genericBam = BamRecordTests::CreateBamImpl();
+
+    EXPECT_EQ(42, genericBam.Bin());
+    EXPECT_EQ(42, genericBam.Flag());
+    EXPECT_EQ(42, genericBam.InsertSize());
+    EXPECT_EQ(42, genericBam.MapQuality());
+    EXPECT_EQ(42, genericBam.MateReferenceId());
+    EXPECT_EQ(42, genericBam.MatePosition());
+    EXPECT_EQ(42, genericBam.Position());
+    EXPECT_EQ(42, genericBam.ReferenceId());
+
+    const TagCollection genericTags = genericBam.Tags();
+    EXPECT_TRUE(genericTags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), genericTags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, genericTags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), genericTags.at("CA").ToUInt8Array());
+
+    // copy ctor
+    BamRecord bam1(genericBam);
+
+    EXPECT_EQ(42, bam1.Impl().Bin());
+    EXPECT_EQ(42, bam1.Impl().Flag());
+    EXPECT_EQ(42, bam1.Impl().InsertSize());
+    EXPECT_EQ(42, bam1.Impl().MapQuality());
+    EXPECT_EQ(42, bam1.Impl().MateReferenceId());
+    EXPECT_EQ(42, bam1.Impl().MatePosition());
+    EXPECT_EQ(42, bam1.Impl().Position());
+    EXPECT_EQ(42, bam1.Impl().ReferenceId());
+
+    const TagCollection bam1Tags = bam1.Impl().Tags();
+    EXPECT_TRUE(bam1Tags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), bam1Tags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, bam1Tags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), bam1Tags.at("CA").ToUInt8Array());
+
+    // copy assignment
+    BamRecord bam2;
+    bam2 = genericBam;
+
+    EXPECT_EQ(42, bam2.Impl().Bin());
+    EXPECT_EQ(42, bam2.Impl().Flag());
+    EXPECT_EQ(42, bam2.Impl().InsertSize());
+    EXPECT_EQ(42, bam2.Impl().MapQuality());
+    EXPECT_EQ(42, bam2.Impl().MateReferenceId());
+    EXPECT_EQ(42, bam2.Impl().MatePosition());
+    EXPECT_EQ(42, bam2.Impl().Position());
+    EXPECT_EQ(42, bam2.Impl().ReferenceId());
+
+    const TagCollection bam2Tags = bam2.Impl().Tags();
+    EXPECT_TRUE(bam2Tags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), bam2Tags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, bam2Tags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), bam2Tags.at("CA").ToUInt8Array());
+
+    // change genericBam, make sure we deep copied bam1 & bam2
+    genericBam.Position(2000);
+
+    EXPECT_EQ(2000, genericBam.Position());
+    EXPECT_EQ(42, bam1.Impl().Position());
+    EXPECT_EQ(42, bam2.Impl().Position());
+
+    // move ctor
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    BamRecord bam3(std::move(BamRecordTests::CreateBamImpl()));
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+    EXPECT_EQ(42, bam3.Impl().Bin());
+    EXPECT_EQ(42, bam3.Impl().Flag());
+    EXPECT_EQ(42, bam3.Impl().InsertSize());
+    EXPECT_EQ(42, bam3.Impl().MapQuality());
+    EXPECT_EQ(42, bam3.Impl().MateReferenceId());
+    EXPECT_EQ(42, bam3.Impl().MatePosition());
+    EXPECT_EQ(42, bam3.Impl().Position());
+    EXPECT_EQ(42, bam3.Impl().ReferenceId());
+
+    const TagCollection bam3Tags = bam3.Impl().Tags();
+    EXPECT_TRUE(bam3Tags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), bam3Tags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, bam3Tags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), bam3Tags.at("CA").ToUInt8Array());
+
+    // move assignment
+    BamRecord bam4;
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    bam4 = std::move(BamRecordTests::CreateBamImpl());
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+    EXPECT_EQ(42, bam4.Impl().Bin());
+    EXPECT_EQ(42, bam4.Impl().Flag());
+    EXPECT_EQ(42, bam4.Impl().InsertSize());
+    EXPECT_EQ(42, bam4.Impl().MapQuality());
+    EXPECT_EQ(42, bam4.Impl().MateReferenceId());
+    EXPECT_EQ(42, bam4.Impl().MatePosition());
+    EXPECT_EQ(42, bam4.Impl().Position());
+    EXPECT_EQ(42, bam4.Impl().ReferenceId());
+
+    const TagCollection bam4Tags = bam4.Impl().Tags();
+    EXPECT_TRUE(bam4Tags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), bam4Tags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, bam4Tags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), bam4Tags.at("CA").ToUInt8Array());
+}
+
+TEST(BamRecordTest, SelfAssignmentTolerated)
+{
+    BamRecord bam1;
+    bam1.Impl().Bin(42);
+    bam1.Impl().Flag(42);
+    bam1.Impl().InsertSize(42);
+    bam1.Impl().MapQuality(42);
+    bam1.Impl().MatePosition(42);
+    bam1.Impl().MateReferenceId(42);
+    bam1.Impl().Position(42);
+    bam1.Impl().ReferenceId(42);
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    bam1.Impl().Tags(tags);
+
+    bam1 = bam1;
+
+    EXPECT_EQ(42, bam1.Impl().Bin());
+    EXPECT_EQ(42, bam1.Impl().Flag());
+    EXPECT_EQ(42, bam1.Impl().InsertSize());
+    EXPECT_EQ(42, bam1.Impl().MapQuality());
+    EXPECT_EQ(42, bam1.Impl().MateReferenceId());
+    EXPECT_EQ(42, bam1.Impl().MatePosition());
+    EXPECT_EQ(42, bam1.Impl().Position());
+    EXPECT_EQ(42, bam1.Impl().ReferenceId());
+
+    const TagCollection fetchedTags1 = bam1.Impl().Tags();
+    EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags1.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, fetchedTags1.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array());
+
+    BamRecordTests::CheckRawData(bam1);
+}
+
+TEST(BamRecordTest, CoreSetters)
+{
+    // create basic BAM with (generic) data
+    BamRecord bam = BamRecordTests::CreateBam();
+
+    QualityValues testQVs;
+    testQVs.push_back(0);
+    testQVs.push_back(1);
+
+    const std::string testTags = "GATTACA";
+
+    // now set PacBio data
+//    bam.AlignedStart(42);
+//    bam.AlignedEnd(42);
+//    bam.DeletionQVs(testQVs);
+//    bam.DeletionTags(testTags);
+//    bam.HoleNumber(42);
+//    bam.InsertionQVs(testQVs);
+//    bam.MergeQVs(testQVs);
+//    bam.NumPasses(42);
+//    bam.QueryEnd(42);
+//    bam.QueryStart(42);
+//    bam.ReadAccuracy(42);
+//    bam.ReferenceEnd(42);
+//    bam.ReferenceStart(42);
+//    bam.SubstitutionQVs(testQVs);
+//    bam.SubstitutionTags(testTags);
+
+    // check generic data
+    EXPECT_EQ(42, bam.Impl().Bin());
+    EXPECT_EQ(42, bam.Impl().Flag());
+    EXPECT_EQ(42, bam.Impl().InsertSize());
+    EXPECT_EQ(42, bam.Impl().MapQuality());
+    EXPECT_EQ(42, bam.Impl().MateReferenceId());
+    EXPECT_EQ(42, bam.Impl().MatePosition());
+    EXPECT_EQ(42, bam.Impl().Position());
+    EXPECT_EQ(42, bam.Impl().ReferenceId());
+
+    // check PacBio data
+//    EXPECT_EQ(42, bam.AlignedStart());
+//    EXPECT_EQ(42, bam.AlignedEnd());
+//    EXPECT_EQ(testQVs, bam.DeletionQVs());
+//    EXPECT_EQ(testTags, bam.DeletionTags());
+//    EXPECT_EQ(42, bam.HoleNumber());
+//    EXPECT_EQ(testQVs, bam.InsertionQVs());
+//    EXPECT_EQ(testQVs, bam.MergeQVs());
+
+//    EXPECT_EQ(42, bam.NumPasses());
+//    EXPECT_EQ(42, bam.QueryEnd());
+//    EXPECT_EQ(42, bam.QueryStart());
+//    EXPECT_EQ(42, bam.ReadAccuracy());
+//    EXPECT_EQ(42, bam.ReferenceEnd());
+//    EXPECT_EQ(42, bam.ReferenceStart());
+//    EXPECT_EQ(testQVs, bam.SubstitutionQVs());
+//    EXPECT_EQ(testTags, bam.SubstitutionTags());
+
+    // check tags
+    const TagCollection fetchedTags = bam.Impl().Tags();
+    EXPECT_TRUE(fetchedTags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, fetchedTags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags.at("CA").ToUInt8Array());
+
+    BamRecordTests::CheckRawData(bam);
+}
+
+TEST(BamRecordTest, SequenceOrientation)
+{   // Sequence expectations for an all-match CIGAR: strand x orientation x aligned/clipped.
+    {   // reverse-strand "native" entries are the reverse complement of the input
+        SCOPED_TRACE("Simple CIGAR Sequence");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "13=",                  // CIGAR
+            "ATATATCCCGGCG",        // input
+            {
+                "ATATATCCCGGCG",    // forward strand, genomic
+                "ATATATCCCGGCG",    // forward strand, native
+                "ATATATCCCGGCG",    // forward strand, genomic, aligned
+                "ATATATCCCGGCG",    // forward strand, native,  aligned
+                "ATATATCCCGGCG",    // forward strand, genomic, aligned + clipped
+                "ATATATCCCGGCG",    // forward strand, native,  aligned + clipped
+                "ATATATCCCGGCG",    // reverse strand, genomic
+                "CGCCGGGATATAT",    // reverse strand, native
+                "ATATATCCCGGCG",    // reverse strand, genomic, aligned
+                "CGCCGGGATATAT",    // reverse strand, native,  aligned
+                "ATATATCCCGGCG",    // reverse strand, genomic, aligned + clipped
+                "CGCCGGGATATAT"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, QualitiesOrientation)
+{   // Quality-string expectations for an all-match CIGAR: strand x orientation x aligned/clipped.
+    {   // reverse-strand "native" entries are the input string reversed
+        SCOPED_TRACE("Simple CIGAR Qualities");
+        BamRecordTests::CheckQualitiesClippedAndAligned(
+            "13=",                  // CIGAR
+            "?]?]?]?]?]?]*",        // input
+            {
+                "?]?]?]?]?]?]*",    // forward strand, genomic
+                "?]?]?]?]?]?]*",    // forward strand, native
+                "?]?]?]?]?]?]*",    // forward strand, genomic, aligned
+                "?]?]?]?]?]?]*",    // forward strand, native,  aligned
+                "?]?]?]?]?]?]*",    // forward strand, genomic, aligned + clipped
+                "?]?]?]?]?]?]*",    // forward strand, native,  aligned + clipped
+                "?]?]?]?]?]?]*",    // reverse strand, genomic
+                "*]?]?]?]?]?]?",    // reverse strand, native
+                "?]?]?]?]?]?]*",    // reverse strand, genomic, aligned
+                "*]?]?]?]?]?]?",    // reverse strand, native,  aligned
+                "?]?]?]?]?]?]*",    // reverse strand, genomic, aligned + clipped
+                "*]?]?]?]?]?]?"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, SequenceTagsOrientation)
+{   // Base-tag expectations for an all-match CIGAR: strand x orientation x aligned/clipped.
+    {   // here the reverse-strand "genomic" entries are the reverse complement; native == input
+        SCOPED_TRACE("Simple CIGAR Base Tags");
+        BamRecordTests::CheckBaseTagsClippedAndAligned(
+            "13=",                  // CIGAR
+            "ATATATCCCGGCG",        // input
+            {
+                "ATATATCCCGGCG",    // forward strand, genomic
+                "ATATATCCCGGCG",    // forward strand, native
+                "ATATATCCCGGCG",    // forward strand, genomic, aligned
+                "ATATATCCCGGCG",    // forward strand, native, aligned
+                "ATATATCCCGGCG",    // forward strand, genomic, aligned, clipped
+                "ATATATCCCGGCG",    // forward strand, native, aligned, clipped
+                "CGCCGGGATATAT",    // reverse strand, genomic
+                "ATATATCCCGGCG",    // reverse strand, native
+                "CGCCGGGATATAT",    // reverse strand, genomic, aligned
+                "ATATATCCCGGCG",    // reverse strand, native, aligned
+                "CGCCGGGATATAT",    // reverse strand, genomic, aligned, clipped
+                "ATATATCCCGGCG"     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, FrameTagsOrientation)
+{   // Frame-tag expectations for an all-match CIGAR: strand x orientation x aligned/clipped.
+    {   // reverse-strand "genomic" entries are the input reversed; native == input
+        SCOPED_TRACE("Simple CIGAR Frames");
+        BamRecordTests::CheckFrameTagsClippedAndAligned(
+            "5=",                   // CIGAR
+            { 0, 1, 2, 3, 4 },      // input
+            {
+                { 0, 1, 2, 3, 4 },  // forward strand, genomic
+                { 0, 1, 2, 3, 4 },  // forward strand, native
+                { 0, 1, 2, 3, 4 },  // forward strand, genomic, aligned
+                { 0, 1, 2, 3, 4 },  // forward strand, native, aligned
+                { 0, 1, 2, 3, 4 },  // forward strand, genomic, aligned, clipped
+                { 0, 1, 2, 3, 4 },  // forward strand, native, aligned, clipped
+                { 4, 3, 2, 1, 0 },  // reverse strand, genomic
+                { 0, 1, 2, 3, 4 },  // reverse strand, native
+                { 4, 3, 2, 1, 0 },  // reverse strand, genomic, aligned
+                { 0, 1, 2, 3, 4 },  // reverse strand, native, aligned
+                { 4, 3, 2, 1, 0 },  // reverse strand, genomic, aligned, clipped
+                { 0, 1, 2, 3, 4 }   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, QualityTagsOrientation)
+{   // Quality-tag expectations for an all-match CIGAR: strand x orientation x aligned/clipped.
+    {   // reverse-strand "genomic" entries are the input reversed; native == input
+        SCOPED_TRACE("Simple CIGAR Quality Tags");
+        BamRecordTests::CheckQualityTagsClippedAndAligned(
+            "13=",                  // CIGAR
+            "?]?]?]?]?]?]*",        // input
+            {
+                "?]?]?]?]?]?]*",    // forward strand, genomic
+                "?]?]?]?]?]?]*",    // forward strand, native
+                "?]?]?]?]?]?]*",    // forward strand, genomic, aligned
+                "?]?]?]?]?]?]*",    // forward strand, native,  aligned
+                "?]?]?]?]?]?]*",    // forward strand, genomic, aligned + clipped
+                "?]?]?]?]?]?]*",    // forward strand, native,  aligned + clipped
+                "*]?]?]?]?]?]?",    // reverse strand, genomic
+                "?]?]?]?]?]?]*",    // reverse strand, native
+                "*]?]?]?]?]?]?",    // reverse strand, genomic, aligned
+                "?]?]?]?]?]?]*",    // reverse strand, native,  aligned
+                "*]?]?]?]?]?]?",    // reverse strand, genomic, aligned + clipped
+                "?]?]?]?]?]?]*"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, SequenceClippedAndAligned)
+{   // Sequence expectations for seven CIGARs using only '=', 'N', 'S', 'H' and 'I' operations.
+    {   // all-match baseline
+        SCOPED_TRACE("CIGAR: 10=");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "10=",              // CIGAR
+            "ATCCGCGGTT",       // input
+            {
+                "ATCCGCGGTT",   // forward strand, genomic
+                "ATCCGCGGTT",   // forward strand, native
+                "ATCCGCGGTT",   // forward strand, genomic, aligned
+                "ATCCGCGGTT",   // forward strand, native,  aligned
+                "ATCCGCGGTT",   // forward strand, genomic, aligned + clipped
+                "ATCCGCGGTT",   // forward strand, native,  aligned + clipped
+                "ATCCGCGGTT",   // reverse strand, genomic
+                "AACCGCGGAT",   // reverse strand, native
+                "ATCCGCGGTT",   // reverse strand, genomic, aligned
+                "AACCGCGGAT",   // reverse strand, native,  aligned
+                "ATCCGCGGTT",   // reverse strand, genomic, aligned + clipped
+                "AACCGCGGAT"    // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {   // 'N' operation: expected strings contain no gap characters for it
+        SCOPED_TRACE("CIGAR: 3=4N3=");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "3=4N3=",       // CIGAR
+            "ACGTTT",        // input
+            {
+                "ACGTTT",    // forward strand, genomic
+                "ACGTTT",    // forward strand, native
+                "ACGTTT",    // forward strand, genomic, aligned
+                "ACGTTT",    // forward strand, native,  aligned
+                "ACGTTT",    // forward strand, genomic, aligned + clipped
+                "ACGTTT",    // forward strand, native,  aligned + clipped
+                "ACGTTT",    // reverse strand, genomic
+                "AAACGT",    // reverse strand, native
+                "ACGTTT",    // reverse strand, genomic, aligned
+                "AAACGT",    // reverse strand, native,  aligned
+                "ACGTTT",    // reverse strand, genomic, aligned + clipped
+                "AAACGT"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {   // 'S' flanks: the "aligned + clipped" entries drop one base at each end
+        SCOPED_TRACE("CIGAR: 1S8=1S");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "1S8=1S",           // CIGAR
+            "ACCCGCGGTT",       // input
+            {
+                "ACCCGCGGTT",   // forward strand, genomic
+                "ACCCGCGGTT",   // forward strand, native
+                "ACCCGCGGTT",   // forward strand, genomic, aligned
+                "ACCCGCGGTT",   // forward strand, native,  aligned
+                "CCCGCGGT",     // forward strand, genomic, aligned + clipped
+                "CCCGCGGT",     // forward strand, native,  aligned + clipped
+                "ACCCGCGGTT",   // reverse strand, genomic
+                "AACCGCGGGT",   // reverse strand, native
+                "ACCCGCGGTT",   // reverse strand, genomic, aligned
+                "AACCGCGGGT",   // reverse strand, native,  aligned
+                "CCCGCGGT",     // reverse strand, genomic, aligned + clipped
+                "ACCGCGGG"      // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {   // 'H' flanks: the "aligned + clipped" entries equal the unclipped ones
+        SCOPED_TRACE("CIGAR: 1H8=1H");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "1H8=1H",           // CIGAR
+            "ATCGCGGT",         // input
+            {
+                "ATCGCGGT",     // forward strand, genomic
+                "ATCGCGGT",     // forward strand, native
+                "ATCGCGGT",     // forward strand, genomic, aligned
+                "ATCGCGGT",     // forward strand, native,  aligned
+                "ATCGCGGT",     // forward strand, genomic, aligned + clipped
+                "ATCGCGGT",     // forward strand, native,  aligned + clipped
+                "ATCGCGGT",     // reverse strand, genomic
+                "ACCGCGAT",     // reverse strand, native
+                "ATCGCGGT",     // reverse strand, genomic, aligned
+                "ACCGCGAT",     // reverse strand, native,  aligned
+                "ATCGCGGT",     // reverse strand, genomic, aligned + clipped
+                "ACCGCGAT"      // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {   // 'S' flanks of length two: the "aligned + clipped" entries drop two bases at each end
+        SCOPED_TRACE("CIGAR: 2S6=2S");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "2S6=2S",           // CIGAR
+            "AGCCGCGGTT",       // input
+            {
+                "AGCCGCGGTT",   // forward strand, genomic
+                "AGCCGCGGTT",   // forward strand, native
+                "AGCCGCGGTT",   // forward strand, genomic, aligned
+                "AGCCGCGGTT",   // forward strand, native,  aligned
+                "CCGCGG",       // forward strand, genomic, aligned + clipped
+                "CCGCGG",       // forward strand, native,  aligned + clipped
+                "AGCCGCGGTT",   // reverse strand, genomic
+                "AACCGCGGCT",   // reverse strand, native
+                "AGCCGCGGTT",   // reverse strand, genomic, aligned
+                "AACCGCGGCT",   // reverse strand, native,  aligned
+                "CCGCGG",       // reverse strand, genomic, aligned + clipped
+                "CCGCGG"        // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {   // 'S' flanks around an insertion: inserted "NN" bases stay in every expected string
+        SCOPED_TRACE("CIGAR: 2S3=2I3=2S");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "2S3=2I3=2S",           // CIGAR
+            "ATCCGNNCGGTT",         // input
+            {
+                "ATCCGNNCGGTT",     // forward strand, genomic
+                "ATCCGNNCGGTT",     // forward strand, native
+                "ATCCGNNCGGTT",     // forward strand, genomic, aligned
+                "ATCCGNNCGGTT",     // forward strand, native,  aligned
+                "CCGNNCGG",         // forward strand, genomic, aligned + clipped
+                "CCGNNCGG",         // forward strand, native,  aligned + clipped
+                "ATCCGNNCGGTT",     // reverse strand, genomic
+                "AACCGNNCGGAT",     // reverse strand, native
+                "ATCCGNNCGGTT",     // reverse strand, genomic, aligned
+                "AACCGNNCGGAT",     // reverse strand, native,  aligned
+                "CCGNNCGG",         // reverse strand, genomic, aligned + clipped
+                "CCGNNCGG"          // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {   // 'H' flanks of length two: the "aligned + clipped" entries equal the unclipped ones
+        SCOPED_TRACE("CIGAR: 2H6=2H");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "2H6=2H",       // CIGAR
+            "CAGCGG",       // input
+            {
+                "CAGCGG",   // forward strand, genomic
+                "CAGCGG",   // forward strand, native
+                "CAGCGG",   // forward strand, genomic, aligned
+                "CAGCGG",   // forward strand, native,  aligned
+                "CAGCGG",   // forward strand, genomic, aligned + clipped
+                "CAGCGG",   // forward strand, native,  aligned + clipped
+                "CAGCGG",   // reverse strand, genomic
+                "CCGCTG",   // reverse strand, native
+                "CAGCGG",   // reverse strand, genomic, aligned
+                "CCGCTG",   // reverse strand, native,  aligned
+                "CAGCGG",   // reverse strand, genomic, aligned + clipped
+                "CCGCTG"    // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, ClippingOrientationAndAlignment)
+{   // Sequence expectations for six CIGARs that add 'D' and 'P' operations to the mix.
+    {   // each 'D' base shows up as '-' in the "aligned" entries
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "4=3D4=",           // CIGAR
+            "AACCGTTA",         // input
+            {
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native,  aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned + clipped
+                "AACC---GTTA",  // forward strand, native,  aligned + clipped
+                "AACCGTTA",     // reverse strand, genomic
+                "TAACGGTT",     // reverse strand, native
+                "AACC---GTTA",  // reverse strand, genomic, aligned
+                "TAAC---GGTT",  // reverse strand, native,  aligned
+                "AACC---GTTA",  // reverse strand, genomic, aligned + clipped
+                "TAAC---GGTT"   // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {   // deletions on both sides of an insertion; gap layout is mirrored in reverse native
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "4=1D2I2D4=",           // CIGAR
+            "ATCCTAGGTT",           // input
+            {
+                "ATCCTAGGTT",       // forward strand, genomic
+                "ATCCTAGGTT",       // forward strand, native
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned
+                "ATCC-TA--GGTT",    // forward strand, native,  aligned
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned + clipped
+                "ATCC-TA--GGTT",    // forward strand, native,  aligned + clipped
+                "ATCCTAGGTT",       // reverse strand, genomic
+                "AACCTAGGAT",       // reverse strand, native
+                "ATCC-TA--GGTT",    // reverse strand, genomic, aligned
+                "AACC--TA-GGAT",    // reverse strand, native,  aligned
+                "ATCC-TA--GGTT",    // reverse strand, genomic, aligned + clipped
+                "AACC--TA-GGAT"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {   // each 'P' base shows up as '*' in the "aligned" entries
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "4=1D2P2I2P2D4=",           // CIGAR
+            "ATCCTAGGTT",               // input
+            {
+                "ATCCTAGGTT",           // forward strand, genomic
+                "ATCCTAGGTT",           // forward strand, native
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned
+                "ATCC-**TA**--GGTT",    // forward strand, native,  aligned
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned + clipped
+                "ATCC-**TA**--GGTT",    // forward strand, native,  aligned + clipped
+                "ATCCTAGGTT",           // reverse strand, genomic
+                "AACCTAGGAT",           // reverse strand, native
+                "ATCC-**TA**--GGTT",    // reverse strand, genomic, aligned
+                "AACC--**TA**-GGAT",    // reverse strand, native,  aligned
+                "ATCC-**TA**--GGTT",    // reverse strand, genomic, aligned + clipped
+                "AACC--**TA**-GGAT"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {   // 'S' flanks (2 and 3 bases) are dropped only from the "aligned + clipped" entries
+        SCOPED_TRACE("CIGAR: 2S4=3D4=3S");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "2S4=3D4=3S",               // CIGAR
+            "TTAACCGTTACCG",            // input
+            {
+                "TTAACCGTTACCG",        // forward strand, genomic
+                "TTAACCGTTACCG",        // forward strand, native
+                "TTAACC---GTTACCG",     // forward strand, genomic, aligned
+                "TTAACC---GTTACCG",     // forward strand, native,  aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned + clipped
+                "AACC---GTTA",          // forward strand, native,  aligned + clipped
+                "TTAACCGTTACCG",        // reverse strand, genomic
+                "CGGTAACGGTTAA",        // reverse strand, native
+                "TTAACC---GTTACCG",     // reverse strand, genomic, aligned
+                "CGGTAAC---GGTTAA",     // reverse strand, native,  aligned
+                "AACC---GTTA",          // reverse strand, genomic, aligned + clipped
+                "TAAC---GGTT"           // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {   // 'H' flanks: expectations are identical to the plain "4=3D4=" case above
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // input
+            {
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native,  aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned + clipped
+                "AACC---GTTA",  // forward strand, native,  aligned + clipped
+                "AACCGTTA",     // reverse strand, genomic
+                "TAACGGTT",     // reverse strand, native
+                "AACC---GTTA",  // reverse strand, genomic, aligned
+                "TAAC---GGTT",  // reverse strand, native,  aligned
+                "AACC---GTTA",  // reverse strand, genomic, aligned + clipped
+                "TAAC---GGTT"   // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {   // 'H' outside 'S': expectations are identical to the "2S4=3D4=3S" case above
+        SCOPED_TRACE("CIGAR: 2H2S4=3D4=3S3H");
+        BamRecordTests::CheckSequenceClippedAndAligned(
+            "2H2S4=3D4=3S3H",           // CIGAR
+            "TTAACCGTTACCG",            // input
+            {
+                "TTAACCGTTACCG",        // forward strand, genomic
+                "TTAACCGTTACCG",        // forward strand, native
+                "TTAACC---GTTACCG",     // forward strand, genomic, aligned
+                "TTAACC---GTTACCG",     // forward strand, native,  aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned + clipped
+                "AACC---GTTA",          // forward strand, native,  aligned + clipped
+                "TTAACCGTTACCG",        // reverse strand, genomic
+                "CGGTAACGGTTAA",        // reverse strand, native
+                "TTAACC---GTTACCG",     // reverse strand, genomic, aligned
+                "CGGTAAC---GGTTAA",     // reverse strand, native,  aligned
+                "AACC---GTTA",          // reverse strand, genomic, aligned + clipped
+                "TAAC---GGTT"           // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, QualityTagsClippedAndAligned)
+{   // Quality-tag expectations for six CIGARs containing 'D', 'I', 'P', 'S' and 'H' operations.
+    {   // each 'D' base shows up as '!' in the "aligned" entries
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        BamRecordTests::CheckQualityTagsClippedAndAligned(
+            "4=3D4=",           // CIGAR
+            "?]?]?]?@",         // input
+            {
+                "?]?]?]?@",     // forward strand, genomic
+                "?]?]?]?@",     // forward strand, native
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned
+                "?]?]!!!?]?@",  // forward strand, native,  aligned
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned + clipped
+                "?]?]!!!?]?@",  // forward strand, native,  aligned + clipped
+                "@?]?]?]?",     // reverse strand, genomic
+                "?]?]?]?@",     // reverse strand, native
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned
+                "?]?]!!!?]?@",  // reverse strand, native,  aligned
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned + clipped
+                "?]?]!!!?]?@"   // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {   // deletions on both sides of an insertion ("87" are the inserted qualities)
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        BamRecordTests::CheckQualityTagsClippedAndAligned(
+            "4=1D2I2D4=",           // CIGAR
+            "?]?]87?]?@",           // input
+            {
+                "?]?]87?]?@",       // forward strand, genomic
+                "?]?]87?]?@",       // forward strand, native
+                "?]?]!87!!?]?@",    // forward strand, genomic, aligned
+                "?]?]!87!!?]?@",    // forward strand, native,  aligned
+                "?]?]!87!!?]?@",    // forward strand, genomic, aligned + clipped
+                "?]?]!87!!?]?@",    // forward strand, native,  aligned + clipped
+                "@?]?78]?]?",       // reverse strand, genomic
+                "?]?]87?]?@",       // reverse strand, native
+                "@?]?!78!!]?]?",    // reverse strand, genomic, aligned
+                "?]?]!!87!?]?@",    // reverse strand, native,  aligned
+                "@?]?!78!!]?]?",    // reverse strand, genomic, aligned + clipped
+                "?]?]!!87!?]?@"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {   // each 'P' base also shows up as '!' in the "aligned" entries
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        BamRecordTests::CheckQualityTagsClippedAndAligned(
+            "4=1D2P2I2P2D4=",       // CIGAR
+            "?]?]87?]?@",           // input
+        {
+            "?]?]87?]?@",           // forward strand, genomic
+            "?]?]87?]?@",           // forward strand, native
+            "?]?]!!!87!!!!?]?@",    // forward strand, genomic, aligned
+            "?]?]!!!87!!!!?]?@",    // forward strand, native,  aligned
+            "?]?]!!!87!!!!?]?@",    // forward strand, genomic, aligned + clipped
+            "?]?]!!!87!!!!?]?@",    // forward strand, native,  aligned + clipped
+            "@?]?78]?]?",           // reverse strand, genomic
+            "?]?]87?]?@",           // reverse strand, native
+            "@?]?!!!78!!!!]?]?",    // reverse strand, genomic, aligned
+            "?]?]!!!!87!!!?]?@",    // reverse strand, native,  aligned
+            "@?]?!!!78!!!!]?]?",    // reverse strand, genomic, aligned + clipped
+            "?]?]!!!!87!!!?]?@"     // reverse strand, native,  aligned + clipped
+        }
+        );
+    }
+    {   // 'S' flanks ("vvv"/"xxx") are dropped only from the "aligned, clipped" entries
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        BamRecordTests::CheckQualityTagsClippedAndAligned(
+            "3S4=3D4=3S",               // CIGAR
+            "vvv?]?]?]?@xxx",           // input
+            {
+                "vvv?]?]?]?@xxx",       // forward strand, genomic
+                "vvv?]?]?]?@xxx",       // forward strand, native
+                "vvv?]?]!!!?]?@xxx",    // forward strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // forward strand, native, aligned
+                "?]?]!!!?]?@",          // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",          // forward strand, native, aligned, clipped
+                "xxx@?]?]?]?vvv",       // reverse strand, genomic
+                "vvv?]?]?]?@xxx",       // reverse strand, native
+                "xxx@?]?!!!]?]?vvv",    // reverse strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // reverse strand, native, aligned
+                "@?]?!!!]?]?",          // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {   // 'H' flanks: expectations are identical to the plain "4=3D4=" case above
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        BamRecordTests::CheckQualityTagsClippedAndAligned(
+            "2H4=3D4=3H",       // CIGAR
+            "?]?]?]?@",         // input
+            {
+                "?]?]?]?@",     // forward strand, genomic
+                "?]?]?]?@",     // forward strand, native
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned
+                "?]?]!!!?]?@",  // forward strand, native, aligned
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",  // forward strand, native, aligned, clipped
+                "@?]?]?]?",     // reverse strand, genomic
+                "?]?]?]?@",     // reverse strand, native
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned
+                "?]?]!!!?]?@",  // reverse strand, native, aligned
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {   // 'H' outside 'S': expectations are identical to the "3S4=3D4=3S" case above
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        BamRecordTests::CheckQualityTagsClippedAndAligned(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "vvv?]?]?]?@xxx",           // input
+            {
+                "vvv?]?]?]?@xxx",       // forward strand, genomic
+                "vvv?]?]?]?@xxx",       // forward strand, native
+                "vvv?]?]!!!?]?@xxx",    // forward strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // forward strand, native, aligned
+                "?]?]!!!?]?@",          // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",          // forward strand, native, aligned, clipped
+                "xxx@?]?]?]?vvv",       // reverse strand, genomic
+                "vvv?]?]?]?@xxx",       // reverse strand, native
+                "xxx@?]?!!!]?]?vvv",    // reverse strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // reverse strand, native, aligned
+                "@?]?!!!]?]?",          // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, BaseTagsClippedAndAligned)
+{   // One scope per CIGAR: each passes the CIGAR, the input bases, and 12 expected strings to the checker.
+    {
+        SCOPED_TRACE("CIGAR: 4=3D4=");  // deletion only: the 3D span is expected as --- once aligned
+        BamRecordTests::CheckBaseTagsClippedAndAligned(
+            "4=3D4=",           // CIGAR
+            "AACCGTTA",         // input
+            {
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native, aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",  // forward strand, native, aligned, clipped
+                "TAACGGTT",     // reverse strand, genomic
+                "AACCGTTA",     // reverse strand, native
+                "TAAC---GGTT",  // reverse strand, genomic, aligned
+                "AACC---GTTA",  // reverse strand, native, aligned
+                "TAAC---GGTT",  // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");  // insertion between deletions: reverse-strand native gaps mirror the forward layout
+        BamRecordTests::CheckBaseTagsClippedAndAligned(
+            "4=1D2I2D4=",           // CIGAR
+            "ATCCTAGGTT",           // input
+            {
+                "ATCCTAGGTT",       // forward strand, genomic
+                "ATCCTAGGTT",       // forward strand, native
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned
+                "ATCC-TA--GGTT",    // forward strand, native, aligned
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned, clipped
+                "ATCC-TA--GGTT",    // forward strand, native, aligned, clipped
+                "AACCTAGGAT",       // reverse strand, genomic
+                "ATCCTAGGTT",       // reverse strand, native
+                "AACC-TA--GGAT",    // reverse strand, genomic, aligned
+                "ATCC--TA-GGTT",    // reverse strand, native, aligned
+                "AACC-TA--GGAT",    // reverse strand, genomic, aligned, clipped
+                "ATCC--TA-GGTT"     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");  // same as the previous scope plus padding: each P position is expected as *
+        BamRecordTests::CheckBaseTagsClippedAndAligned(
+            "4=1D2P2I2P2D4=",           // CIGAR
+            "ATCCTAGGTT",               // input
+            {
+                "ATCCTAGGTT",           // forward strand, genomic
+                "ATCCTAGGTT",           // forward strand, native
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned
+                "ATCC-**TA**--GGTT",    // forward strand, native, aligned
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned, clipped
+                "ATCC-**TA**--GGTT",    // forward strand, native, aligned, clipped
+                "AACCTAGGAT",           // reverse strand, genomic
+                "ATCCTAGGTT",           // reverse strand, native
+                "AACC-**TA**--GGAT",    // reverse strand, genomic, aligned
+                "ATCC--**TA**-GGTT",    // reverse strand, native, aligned
+                "AACC-**TA**--GGAT",    // reverse strand, genomic, aligned, clipped
+                "ATCC--**TA**-GGTT"     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");  // soft clips: kept in the aligned expectations, removed in the aligned + clipped ones
+        BamRecordTests::CheckBaseTagsClippedAndAligned(
+            "3S4=3D4=3S",               // CIGAR
+            "TTTAACCGTTACCG",           // input
+            {
+                "TTTAACCGTTACCG",       // forward strand, genomic
+                "TTTAACCGTTACCG",       // forward strand, native
+                "TTTAACC---GTTACCG",    // forward strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // forward strand, native, aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",          // forward strand, native, aligned, clipped
+                "CGGTAACGGTTAAA",       // reverse strand, genomic
+                "TTTAACCGTTACCG",       // reverse strand, native
+                "CGGTAAC---GGTTAAA",    // reverse strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // reverse strand, native, aligned
+                "TAAC---GGTT",          // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");  // hard clips: expectations are identical to the 4=3D4= scope
+        BamRecordTests::CheckBaseTagsClippedAndAligned(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // input
+            {
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native, aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",  // forward strand, native, aligned, clipped
+                "TAACGGTT",     // reverse strand, genomic
+                "AACCGTTA",     // reverse strand, native
+                "TAAC---GGTT",  // reverse strand, genomic, aligned
+                "AACC---GTTA",  // reverse strand, native, aligned
+                "TAAC---GGTT",  // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");  // hard + soft clips: expectations are identical to the 3S4=3D4=3S scope
+        BamRecordTests::CheckBaseTagsClippedAndAligned(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "TTTAACCGTTACCG",           // input
+            {
+                "TTTAACCGTTACCG",       // forward strand, genomic
+                "TTTAACCGTTACCG",       // forward strand, native
+                "TTTAACC---GTTACCG",    // forward strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // forward strand, native, aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",          // forward strand, native, aligned, clipped
+                "CGGTAACGGTTAAA",       // reverse strand, genomic
+                "TTTAACCGTTACCG",       // reverse strand, native
+                "CGGTAAC---GGTTAAA",    // reverse strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // reverse strand, native, aligned
+                "TAAC---GGTT",          // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, FrameTagsClippedAndAligned)
+{   // One scope per CIGAR: each passes the CIGAR, the input frame values, and 12 expected frame lists to the checker.
+    {
+        SCOPED_TRACE("CIGAR: 4=3D4=");  // deletion only: the 3D span is expected as three 0 frames once aligned
+        BamRecordTests::CheckFrameTagsClippedAndAligned(
+            "4=3D4=",                                           // CIGAR
+            { 10, 20, 10, 20, 10, 20, 10, 30 },                 // input
+            {
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");  // insertion between deletions: reverse-strand native zeros mirror the forward layout
+        BamRecordTests::CheckFrameTagsClippedAndAligned(
+            "4=1D2I2D4=",                                               // CIGAR
+            { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                 // input
+            {
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");  // same as the previous scope plus padding: each P position is also expected as a 0 frame
+        BamRecordTests::CheckFrameTagsClippedAndAligned(
+            "4=1D2P2I2P2D4=",                                                   // CIGAR
+            { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // input
+        {
+            { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, genomic
+            { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, native
+            { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+            { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+            { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+            { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+            { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },                         // reverse strand, genomic
+            { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // reverse strand, native
+            { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+            { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+            { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+            { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+        }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");  // soft clips: kept in the aligned expectations, removed in the aligned + clipped ones
+        BamRecordTests::CheckFrameTagsClippedAndAligned(
+            "3S4=3D4=3S",                                                               // CIGAR
+            { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },                 // input
+            {
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");  // hard clips: expectations are identical to the 4=3D4= scope
+        BamRecordTests::CheckFrameTagsClippedAndAligned(
+            "2H4=3D4=3H",                                       // CIGAR
+            { 10, 20, 10, 20, 10, 20, 10, 30 },                 // input
+            {
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");  // hard + soft clips: expectations are identical to the 3S4=3D4=3S scope
+        BamRecordTests::CheckFrameTagsClippedAndAligned(
+            "2H3S4=3D4=3S3H",                                                           // CIGAR
+            { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },                 // input
+            {
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, PulseBaseTags)
+{   // One scope per CIGAR; arguments are CIGAR, seqBases, pulseCalls, tag data, all-pulses expectations, basecalls-only expectations.
+    {
+        SCOPED_TRACE("CIGAR: 4=3D4=");  // deletion only
+        BamRecordTests::CheckPulseBaseTags(
+            "4=3D4=",           // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls (its uppercase letters spell seqBases)
+            "AAaaCCGggTTA",     // tag data
+
+            {   // all pulses (aligned entries are empty; NOTE(review): presumably unsupported or unchecked for pulse data - confirm in CheckPulseBaseTags)
+
+                "AAaaCCGggTTA",     // forward strand, genomic
+                "AAaaCCGggTTA",     // forward strand, native
+                "",  // forward strand, genomic, aligned
+                "",  // forward strand, native, aligned
+                "",  // forward strand, genomic, aligned, clipped
+                "",  // forward strand, native, aligned, clipped
+                "TAAccCGGttTT",     // reverse strand, genomic
+                "AAaaCCGggTTA",     // reverse strand, native
+                "",  // reverse strand, genomic, aligned
+                "",  // reverse strand, native, aligned
+                "",  // reverse strand, genomic, aligned, clipped
+                ""   // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native, aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",  // forward strand, native, aligned, clipped
+                "TAACGGTT",     // reverse strand, genomic
+                "AACCGTTA",     // reverse strand, native
+                "TAAC---GGTT",  // reverse strand, genomic, aligned
+                "AACC---GTTA",  // reverse strand, native, aligned
+                "TAAC---GGTT",  // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");  // insertion between deletions
+        BamRecordTests::CheckPulseBaseTags(
+            "4=1D2I2D4=",       // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            "ATttCCTtAGGggTT",  // tag data
+
+            {   // all pulses
+
+                "ATttCCTtAGGggTT",       // forward strand, genomic
+                "ATttCCTtAGGggTT",       // forward strand, native
+                "",    // forward strand, genomic, aligned
+                "",    // forward strand, native, aligned
+                "",    // forward strand, genomic, aligned, clipped
+                "",    // forward strand, native, aligned, clipped
+                "AAccCCTaAGGaaAT",       // reverse strand, genomic
+                "ATttCCTtAGGggTT",       // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native, aligned
+                "",    // reverse strand, genomic, aligned, clipped
+                ""     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "ATCCTAGGTT",       // forward strand, genomic
+                "ATCCTAGGTT",       // forward strand, native
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned
+                "ATCC-TA--GGTT",    // forward strand, native, aligned
+                "ATCC-TA--GGTT",    // forward strand, genomic, aligned, clipped
+                "ATCC-TA--GGTT",    // forward strand, native, aligned, clipped
+                "AACCTAGGAT",       // reverse strand, genomic
+                "ATCCTAGGTT",       // reverse strand, native
+                "AACC-TA--GGAT",    // reverse strand, genomic, aligned
+                "ATCC--TA-GGTT",    // reverse strand, native, aligned
+                "AACC-TA--GGAT",    // reverse strand, genomic, aligned, clipped
+                "ATCC--TA-GGTT"     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");  // same as the previous scope plus padding (P)
+        BamRecordTests::CheckPulseBaseTags(
+            "4=1D2P2I2P2D4=",   // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            "ATttCCTtAGGggTT",  // tag data
+            {   // all pulses
+                "ATttCCTtAGGggTT",           // forward strand, genomic
+                "ATttCCTtAGGggTT",           // forward strand, native
+                "",    // forward strand, genomic, aligned
+                "",    // forward strand, native, aligned
+                "",    // forward strand, genomic, aligned, clipped
+                "",    // forward strand, native, aligned, clipped
+                "AAccCCTaAGGaaAT",           // reverse strand, genomic
+                "ATttCCTtAGGggTT",           // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native, aligned
+                "",    // reverse strand, genomic, aligned, clipped
+                ""     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+                "ATCCTAGGTT",           // forward strand, genomic
+                "ATCCTAGGTT",           // forward strand, native
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned
+                "ATCC-**TA**--GGTT",    // forward strand, native, aligned
+                "ATCC-**TA**--GGTT",    // forward strand, genomic, aligned, clipped
+                "ATCC-**TA**--GGTT",    // forward strand, native, aligned, clipped
+                "AACCTAGGAT",           // reverse strand, genomic
+                "ATCCTAGGTT",           // reverse strand, native
+                "AACC-**TA**--GGAT",    // reverse strand, genomic, aligned
+                "ATCC--**TA**-GGTT",    // reverse strand, native, aligned
+                "AACC-**TA**--GGAT",    // reverse strand, genomic, aligned, clipped
+                "ATCC--**TA**-GGTT"     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");  // soft clips: basecalls-only expectations keep them when aligned, drop them when clipped
+        BamRecordTests::CheckPulseBaseTags(
+            "3S4=3D4=3S",               // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            "TTTttAACCccGTTAaaCCG",     // tag data
+
+            {   // all pulses
+
+                "TTTttAACCccGTTAaaCCG",       // forward strand, genomic
+                "TTTttAACCccGTTAaaCCG",       // forward strand, native
+                "",         // forward strand, genomic, aligned
+                "",         // forward strand, native, aligned
+                "",          // forward strand, genomic, aligned, clipped
+                "",          // forward strand, native, aligned, clipped
+                "CGGttTAACggGGTTaaAAA",       // reverse strand, genomic
+                "TTTttAACCccGTTAaaCCG",       // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native, aligned
+                "",     // reverse strand, genomic, aligned, clipped
+                ""           // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "TTTAACCGTTACCG",       // forward strand, genomic
+                "TTTAACCGTTACCG",       // forward strand, native
+                "TTTAACC---GTTACCG",    // forward strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // forward strand, native, aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",          // forward strand, native, aligned, clipped
+                "CGGTAACGGTTAAA",       // reverse strand, genomic
+                "TTTAACCGTTACCG",       // reverse strand, native
+                "CGGTAAC---GGTTAAA",    // reverse strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // reverse strand, native, aligned
+                "TAAC---GGTT",          // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");  // hard clips: expectations are identical to the 4=3D4= scope
+        BamRecordTests::CheckPulseBaseTags(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls
+            "AAaaCCGggTTA",     // tag data
+
+            {   // all pulses
+
+                "AAaaCCGggTTA",     // forward strand, genomic
+                "AAaaCCGggTTA",     // forward strand, native
+                "",  // forward strand, genomic, aligned
+                "",  // forward strand, native, aligned
+                "",  // forward strand, genomic, aligned, clipped
+                "",  // forward strand, native, aligned, clipped
+                "TAAccCGGttTT",     // reverse strand, genomic
+                "AAaaCCGggTTA",     // reverse strand, native
+                "",  // reverse strand, genomic, aligned
+                "",  // reverse strand, native, aligned
+                "",  // reverse strand, genomic, aligned, clipped
+                ""   // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "AACCGTTA",     // forward strand, genomic
+                "AACCGTTA",     // forward strand, native
+                "AACC---GTTA",  // forward strand, genomic, aligned
+                "AACC---GTTA",  // forward strand, native, aligned
+                "AACC---GTTA",  // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",  // forward strand, native, aligned, clipped
+                "TAACGGTT",     // reverse strand, genomic
+                "AACCGTTA",     // reverse strand, native
+                "TAAC---GGTT",  // reverse strand, genomic, aligned
+                "AACC---GTTA",  // reverse strand, native, aligned
+                "TAAC---GGTT",  // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");  // hard + soft clips: expectations are identical to the 3S4=3D4=3S scope
+        BamRecordTests::CheckPulseBaseTags(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            "TTTttAACCccGTTAaaCCG",     // tag data
+
+            {   // all pulses
+
+                "TTTttAACCccGTTAaaCCG",       // forward strand, genomic
+                "TTTttAACCccGTTAaaCCG",       // forward strand, native
+                "",         // forward strand, genomic, aligned
+                "",         // forward strand, native, aligned
+                "",          // forward strand, genomic, aligned, clipped
+                "",          // forward strand, native, aligned, clipped
+                "CGGttTAACggGGTTaaAAA",       // reverse strand, genomic
+                "TTTttAACCccGTTAaaCCG",       // reverse strand, native
+                "",         // reverse strand, genomic, aligned
+                "",         // reverse strand, native, aligned
+                "",          // reverse strand, genomic, aligned, clipped
+                ""           // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "TTTAACCGTTACCG",       // forward strand, genomic
+                "TTTAACCGTTACCG",       // forward strand, native
+                "TTTAACC---GTTACCG",    // forward strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // forward strand, native, aligned
+                "AACC---GTTA",          // forward strand, genomic, aligned, clipped
+                "AACC---GTTA",          // forward strand, native, aligned, clipped
+                "CGGTAACGGTTAAA",       // reverse strand, genomic
+                "TTTAACCGTTACCG",       // reverse strand, native
+                "CGGTAAC---GGTTAAA",    // reverse strand, genomic, aligned
+                "TTTAACC---GTTACCG",    // reverse strand, native, aligned
+                "TAAC---GGTT",          // reverse strand, genomic, aligned, clipped
+                "AACC---GTTA"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, PulseQualityTags)  // quality-string pulse tags: one SCOPED_TRACE case per CIGAR
+{   // each case hands its inputs and two expected-value tables to CheckPulseQualityTags (helper not visible here)
+    {   // case 1: 4 matches, 3-base deletion, 4 matches
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        BamRecordTests::CheckPulseQualityTags(
+            "4=3D4=",           // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls
+            "?]!!?]?!!]?@",     // tag data
+            // expected values: two tables of 12 entries, ordered as labelled on each line
+            {   // all pulses
+                // every aligned / clipped slot in this table is an empty string
+                "?]!!?]?!!]?@",     // forward strand, genomic
+                "?]!!?]?!!]?@",     // forward strand, native
+                "",  // forward strand, genomic, aligned
+                "",  // forward strand, native,  aligned
+                "",  // forward strand, genomic, aligned + clipped
+                "",  // forward strand, native,  aligned + clipped
+                "@?]!!?]?!!]?",     // reverse strand, genomic
+                "?]!!?]?!!]?@",     // reverse strand, native
+                "",  // reverse strand, genomic, aligned
+                "",  // reverse strand, native,  aligned
+                "",  // reverse strand, genomic, aligned + clipped
+                ""   // reverse strand, native,  aligned + clipped
+            },
+            {   // basecalls only
+                // unaligned slots equal the tag data with the lowercase-pulseCalls positions removed
+                "?]?]?]?@",     // forward strand, genomic
+                "?]?]?]?@",     // forward strand, native
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned
+                "?]?]!!!?]?@",  // forward strand, native,  aligned
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned + clipped
+                "?]?]!!!?]?@",  // forward strand, native,  aligned + clipped
+                "@?]?]?]?",     // reverse strand, genomic
+                "?]?]?]?@",     // reverse strand, native
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned
+                "?]?]!!!?]?@",  // reverse strand, native,  aligned
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned + clipped
+                "?]?]!!!?]?@"   // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {   // case 2: deletions on both sides of a 2-base insertion
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        BamRecordTests::CheckPulseQualityTags(
+            "4=1D2I2D4=",       // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            "?]!!?]8!7?]!!?@",  // tag data
+
+            {   // all pulses
+
+                "?]!!?]8!7?]!!?@",       // forward strand, genomic
+                "?]!!?]8!7?]!!?@",       // forward strand, native
+                "",    // forward strand, genomic, aligned
+                "",    // forward strand, native,  aligned
+                "",    // forward strand, genomic, aligned + clipped
+                "",    // forward strand, native,  aligned + clipped
+                "@?!!]?7!8]?!!]?",       // reverse strand, genomic
+                "?]!!?]8!7?]!!?@",       // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native,  aligned
+                "",    // reverse strand, genomic, aligned + clipped
+                ""     // reverse strand, native,  aligned + clipped
+            },
+            {   // basecalls only
+
+                "?]?]87?]?@",       // forward strand, genomic
+                "?]?]87?]?@",       // forward strand, native
+                "?]?]!87!!?]?@",    // forward strand, genomic, aligned
+                "?]?]!87!!?]?@",    // forward strand, native,  aligned
+                "?]?]!87!!?]?@",    // forward strand, genomic, aligned + clipped
+                "?]?]!87!!?]?@",    // forward strand, native,  aligned + clipped
+                "@?]?78]?]?",       // reverse strand, genomic
+                "?]?]87?]?@",       // reverse strand, native
+                "@?]?!78!!]?]?",    // reverse strand, genomic, aligned
+                "?]?]!!87!?]?@",    // reverse strand, native,  aligned
+                "@?]?!78!!]?]?",    // reverse strand, genomic, aligned + clipped
+                "?]?]!!87!?]?@"     // reverse strand, native,  aligned + clipped
+            }
+        );
+    }
+    {   // case 3: same seqBases / pulseCalls / tag data as case 2, with 2P padding ops around the insertion
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        BamRecordTests::CheckPulseQualityTags(
+            "4=1D2P2I2P2D4=",   // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            "?]!!?]8!7?]!!?@",  // tag data
+        {   // all pulses
+            "?]!!?]8!7?]!!?@",           // forward strand, genomic
+            "?]!!?]8!7?]!!?@",           // forward strand, native
+            "",    // forward strand, genomic, aligned
+            "",    // forward strand, native,  aligned
+            "",    // forward strand, genomic, aligned + clipped
+            "",    // forward strand, native,  aligned + clipped
+            "@?!!]?7!8]?!!]?",           // reverse strand, genomic
+            "?]!!?]8!7?]!!?@",           // reverse strand, native
+            "",    // reverse strand, genomic, aligned
+            "",    // reverse strand, native,  aligned
+            "",    // reverse strand, genomic, aligned + clipped
+            ""     // reverse strand, native,  aligned + clipped
+        },
+        {   // basecalls only
+            "?]?]87?]?@",           // forward strand, genomic
+            "?]?]87?]?@",           // forward strand, native
+            "?]?]!!!87!!!!?]?@",    // forward strand, genomic, aligned
+            "?]?]!!!87!!!!?]?@",    // forward strand, native,  aligned
+            "?]?]!!!87!!!!?]?@",    // forward strand, genomic, aligned + clipped
+            "?]?]!!!87!!!!?]?@",    // forward strand, native,  aligned + clipped
+            "@?]?78]?]?",           // reverse strand, genomic
+            "?]?]87?]?@",           // reverse strand, native
+            "@?]?!!!78!!!!]?]?",    // reverse strand, genomic, aligned
+            "?]?]!!!!87!!!?]?@",    // reverse strand, native,  aligned
+            "@?]?!!!78!!!!]?]?",    // reverse strand, genomic, aligned + clipped
+            "?]?]!!!!87!!!?]?@"     // reverse strand, native,  aligned + clipped
+        }
+        );
+    }
+    {   // case 4: the case-1 alignment flanked by 3 soft-clipped bases on each end
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        BamRecordTests::CheckPulseQualityTags(
+            "3S4=3D4=3S",               // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            "vvv!!?]?]!!?]?@!!xxx",     // tag data
+
+            {   // all pulses
+
+                "vvv!!?]?]!!?]?@!!xxx",       // forward strand, genomic
+                "vvv!!?]?]!!?]?@!!xxx",       // forward strand, native
+                "",    // forward strand, genomic, aligned
+                "",    // forward strand, native, aligned
+                "",          // forward strand, genomic, aligned, clipped
+                "",          // forward strand, native, aligned, clipped
+                "xxx!!@?]?!!]?]?!!vvv",       // reverse strand, genomic
+                "vvv!!?]?]!!?]?@!!xxx",       // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native, aligned
+                "",          // reverse strand, genomic, aligned, clipped
+                ""           // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "vvv?]?]?]?@xxx",       // forward strand, genomic
+                "vvv?]?]?]?@xxx",       // forward strand, native
+                "vvv?]?]!!!?]?@xxx",    // forward strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // forward strand, native, aligned
+                "?]?]!!!?]?@",          // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",          // forward strand, native, aligned, clipped
+                "xxx@?]?]?]?vvv",       // reverse strand, genomic
+                "vvv?]?]?]?@xxx",       // reverse strand, native
+                "xxx@?]?!!!]?]?vvv",    // reverse strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // reverse strand, native, aligned
+                "@?]?!!!]?]?",          // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {   // case 5: hard clips only; seqBases, pulseCalls, tag data and expected values match case 1
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        BamRecordTests::CheckPulseQualityTags(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls
+            "?]!!?]?!!]?@",     // tag data
+
+            {   // all pulses
+
+                "?]!!?]?!!]?@",     // forward strand, genomic
+                "?]!!?]?!!]?@",     // forward strand, native
+                "",  // forward strand, genomic, aligned
+                "",  // forward strand, native, aligned
+                "",  // forward strand, genomic, aligned, clipped
+                "",  // forward strand, native, aligned, clipped
+                "@?]!!?]?!!]?",     // reverse strand, genomic
+                "?]!!?]?!!]?@",     // reverse strand, native
+                "",  // reverse strand, genomic, aligned
+                "",  // reverse strand, native, aligned
+                "",  // reverse strand, genomic, aligned, clipped
+                ""   // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "?]?]?]?@",     // forward strand, genomic
+                "?]?]?]?@",     // forward strand, native
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned
+                "?]?]!!!?]?@",  // forward strand, native, aligned
+                "?]?]!!!?]?@",  // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",  // forward strand, native, aligned, clipped
+                "@?]?]?]?",     // reverse strand, genomic
+                "?]?]?]?@",     // reverse strand, native
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned
+                "?]?]!!!?]?@",  // reverse strand, native, aligned
+                "@?]?!!!]?]?",  // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"   // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {   // case 6: hard + soft clips; seqBases, pulseCalls, tag data and expected values match case 4
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        BamRecordTests::CheckPulseQualityTags(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            "vvv!!?]?]!!?]?@!!xxx",     // tag data
+
+            {   // all pulses
+
+                "vvv!!?]?]!!?]?@!!xxx",       // forward strand, genomic
+                "vvv!!?]?]!!?]?@!!xxx",       // forward strand, native
+                "",    // forward strand, genomic, aligned
+                "",    // forward strand, native, aligned
+                "",          // forward strand, genomic, aligned, clipped
+                "",          // forward strand, native, aligned, clipped
+                "xxx!!@?]?!!]?]?!!vvv",       // reverse strand, genomic
+                "vvv!!?]?]!!?]?@!!xxx",       // reverse strand, native
+                "",    // reverse strand, genomic, aligned
+                "",    // reverse strand, native, aligned
+                "",          // reverse strand, genomic, aligned, clipped
+                ""           // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                "vvv?]?]?]?@xxx",       // forward strand, genomic
+                "vvv?]?]?]?@xxx",       // forward strand, native
+                "vvv?]?]!!!?]?@xxx",    // forward strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // forward strand, native, aligned
+                "?]?]!!!?]?@",          // forward strand, genomic, aligned, clipped
+                "?]?]!!!?]?@",          // forward strand, native, aligned, clipped
+                "xxx@?]?]?]?vvv",       // reverse strand, genomic
+                "vvv?]?]?]?@xxx",       // reverse strand, native
+                "xxx@?]?!!!]?]?vvv",    // reverse strand, genomic, aligned
+                "vvv?]?]!!!?]?@xxx",    // reverse strand, native, aligned
+                "@?]?!!!]?]?",          // reverse strand, genomic, aligned, clipped
+                "?]?]!!!?]?@"           // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, PulseFrameTags)  // numeric pulse tags: one SCOPED_TRACE case per CIGAR
+{   // each case hands its inputs and two expected-value tables to CheckPulseFrameTags (helper not visible here)
+    {   // case 1: 4 matches, 3-base deletion, 4 matches
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        BamRecordTests::CheckPulseFrameTags(
+            "4=3D4=",       // CIGAR
+            "AACCGTTA",     // seqBases
+            "AAaaCCGggTTA", // pulseCalls
+            { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },   // tag data
+            // expected values: two tables of 12 entries, ordered as labelled on each line
+            {   // all pulses
+                // every aligned / clipped slot in this table is an empty list
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+                // unaligned slots equal the tag data with the lowercase-pulseCalls positions removed
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {   // case 2: deletions on both sides of a 2-base insertion
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        BamRecordTests::CheckPulseFrameTags(
+            "4=1D2I2D4=",       // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 0, 0, 20, 10, 70, 0, 80, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {   // case 3: same seqBases / pulseCalls / tag data as case 2, with 2P padding ops around the insertion
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        BamRecordTests::CheckPulseFrameTags(
+            "4=1D2P2I2P2D4=",   // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 0, 0, 20, 10, 70, 0, 80, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },                         // reverse strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {   // case 4: the case-1 alignment flanked by 3 soft-clipped bases on each end
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        BamRecordTests::CheckPulseFrameTags(
+            "3S4=3D4=3S",               // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },   // tag data
+
+            {   // all pulses
+
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 50, 50, 50, 0, 0, 30, 10, 20, 10, 0, 0, 20, 10, 20, 10, 0, 0, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {   // case 5: hard clips only; seqBases, pulseCalls, tag data and expected values match case 1
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        BamRecordTests::CheckPulseFrameTags(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {   // case 6: hard + soft clips; seqBases, pulseCalls, tag data and expected values match case 4
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        BamRecordTests::CheckPulseFrameTags(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },                 // tag data
+
+            {   // all pulses
+
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 50, 50, 50, 0, 0, 30, 10, 20, 10, 0, 0, 20, 10, 20, 10, 0, 0, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, PulseUIntTags)
+{
+    {
+        SCOPED_TRACE("CIGAR: 4=3D4=");
+        BamRecordTests::CheckPulseUIntTags(
+            "4=3D4=",       // CIGAR
+            "AACCGTTA",     // seqBases
+            "AAaaCCGggTTA", // pulseCalls
+            { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },   // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0,0, 10, 20, 10, 0,0, 20, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2I2D4=");
+        BamRecordTests::CheckPulseUIntTags(
+            "4=1D2I2D4=",       // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 0, 0, 20, 10, 70, 0, 80, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 80, 70, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 70, 80, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 80, 70, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 4=1D2P2I2P2D4=");
+        BamRecordTests::CheckPulseUIntTags(
+            "4=1D2P2I2P2D4=",   // CIGAR
+            "ATCCTAGGTT",       // seqBases
+            "ATttCCTtAGGggTT",  // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 0, 0, 20, 10, 70, 0, 80, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 80, 0, 70, 10, 20, 0, 0, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 80, 70, 0, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 70, 80, 20, 10, 20, 10 },                         // reverse strand, genomic
+                { 10, 20, 10, 20, 80, 70, 10, 20, 10, 30 },                         // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 70, 80, 0, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 0, 80, 70, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 3S4=3D4=3S");
+        BamRecordTests::CheckPulseUIntTags(
+            "3S4=3D4=3S",               // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },   // tag data
+
+            {   // all pulses
+
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 50, 50, 50, 0, 0, 30, 10, 20, 10, 0, 0, 20, 10, 20, 10, 0, 0, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H4=3D4=3H");
+        BamRecordTests::CheckPulseUIntTags(
+            "2H4=3D4=3H",       // CIGAR
+            "AACCGTTA",         // seqBases
+            "AAaaCCGggTTA",     // pulseCalls
+            { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 }, // tag data
+
+            {   // all pulses
+
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 0, 0, 10, 20, 10, 0, 0, 20, 10, 30 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // forward strand, native
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // forward strand, native, aligned, clipped
+                { 30, 10, 20, 10, 20, 10, 20, 10 },             // reverse strand, genomic
+                { 10, 20, 10, 20, 10, 20, 10, 30 },             // reverse strand, native
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },    // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }     // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+    {
+        SCOPED_TRACE("CIGAR: 2H3S4=3D4=3S3H");
+        BamRecordTests::CheckPulseUIntTags(
+            "2H3S4=3D4=3S3H",           // CIGAR
+            "TTTAACCGTTACCG",           // seqBases
+            "TTTttAACCccGTTAaaCCG",     // pulseCalls
+            { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },                 // tag data
+
+            {   // all pulses
+
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // forward strand, native
+                { },    // forward strand, genomic, aligned
+                { },    // forward strand, native, aligned
+                { },    // forward strand, genomic, aligned, clipped
+                { },    // forward strand, native, aligned, clipped
+                { 50, 50, 50, 0, 0, 30, 10, 20, 10, 0, 0, 20, 10, 20, 10, 0, 0, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 0, 0, 10, 20, 10, 20, 0, 0, 10, 20, 10, 30, 0, 0, 50, 50, 50 },             // reverse strand, native
+                { },    // reverse strand, genomic, aligned
+                { },    // reverse strand, native, aligned
+                { },    // reverse strand, genomic, aligned, clipped
+                { }     // reverse strand, native, aligned, clipped
+            },
+            {   // basecalls only
+
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // forward strand, native
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // forward strand, native, aligned
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 },                            // forward strand, native, aligned, clipped
+                { 50, 50, 50, 30, 10, 20, 10, 20, 10, 20, 10, 40, 40, 40 },             // reverse strand, genomic
+                { 40, 40, 40, 10, 20, 10, 20, 10, 20, 10, 30, 50, 50, 50 },             // reverse strand, native
+                { 50, 50, 50, 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10, 40, 40, 40 },    // reverse strand, genomic, aligned
+                { 40, 40, 40, 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30, 50, 50, 50 },    // reverse strand, native, aligned
+                { 30, 10, 20, 10, 0, 0, 0, 20, 10, 20, 10 },                            // reverse strand, genomic, aligned, clipped
+                { 10, 20, 10, 20, 0, 0, 0, 10, 20, 10, 30 }                             // reverse strand, native, aligned, clipped
+            }
+        );
+    }
+}
+
+TEST(BamRecordTest, PulseExclusionTag)
+{
+    // Round trip: reasons written to the record must be read back unchanged.
+    using Reason = PacBio::BAM::PulseExclusionReason;
+    const std::vector<Reason> expected{
+        Reason::BASE,
+        Reason::PAUSE,
+        Reason::SHORT_PULSE,
+        Reason::BURST,
+        Reason::BASE,
+        Reason::PAUSE
+    };
+
+    auto record = BamRecordTests::CreateBam();
+    record.PulseExclusionReason(expected);
+
+    // once set, the tag must be reported as present and compare equal
+    EXPECT_TRUE(record.HasPulseExclusion());
+    EXPECT_EQ(expected, record.PulseExclusionReason());
+}
+
+TEST(BamRecordTest, TranscriptRecord)
+{
+    // NOTE(review): 'readGroup' is built but never attached to 'header' or to the
+    // record, so the checks below appear to depend on the record name only - confirm.
+    const std::string transcriptType{"TRANSCRIPT"};
+    ReadGroupInfo readGroup{MakeReadGroupId("transcript", transcriptType)};
+    readGroup.ReadType(transcriptType);
+
+    BamHeader header;
+    header.Version("1.1");
+    header.SortOrder("queryname");
+    header.PacBioBamVersion("3.0.1");
+
+    BamRecord record{header};
+    record.Impl().Name("transcript/1234");
+
+    EXPECT_EQ(RecordType::TRANSCRIPT, record.Type());
+    EXPECT_EQ(1234, record.HoleNumber());
+    EXPECT_THROW({ record.QueryStart(); }, std::runtime_error);
+    EXPECT_THROW({ record.QueryEnd(); }, std::runtime_error);
+}
+
+// clang-format on
diff --git a/tests/src/test_BamRecordBuilder.cpp b/tests/src/test_BamRecordBuilder.cpp

new file mode 100644 (file)

index 0000000..09594a4
--- /dev/null
+++ b/tests/src/test_BamRecordBuilder.cpp
@@ -0,0 +1,174 @@
+// Author: Derek Barnett
+
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BamRecordBuilder.h>
+#include <pbbam/BamTagCodec.h>
+#include "../src/MemoryUtils.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace BamRecordBuilderTests {
+
+static void CheckRawData(const BamRecordImpl& bam)
+{
+    // ensure raw data (lengths at least) matches API-facing data
+
+    const uint32_t expectedNameLength = bam.Name().size() + 1;
+    const uint32_t expectedNumCigarOps = bam.CigarData().size();
+    const int32_t expectedSeqLength = bam.Sequence().length();
+    const size_t expectedTagsLength = BamTagCodec::Encode(bam.Tags()).size();
+
+    //  Name        CIGAR         Sequence       Quals      Tags
+    // l_qname + (n_cigar * 4) + (l_qseq+1)/2 + l_qseq + << TAGS >>
+
+    const int expectedTotalDataLength = expectedNameLength + (expectedNumCigarOps * 4) +
+                                        (expectedSeqLength + 1) / 2 + expectedSeqLength +
+                                        expectedTagsLength;
+
+    const auto rawData = PacBio::BAM::internal::BamRecordMemory::GetRawData(bam);
+    ASSERT_TRUE(static_cast<bool>(rawData));
+    EXPECT_EQ(expectedNameLength, rawData->core.l_qname);
+    EXPECT_EQ(expectedNumCigarOps, rawData->core.n_cigar);
+    EXPECT_EQ(expectedSeqLength, rawData->core.l_qseq);
+    EXPECT_EQ(expectedTotalDataLength, rawData->l_data);
+}
+
+static void CheckRawData(const BamRecord& bam) { CheckRawData(bam.Impl()); }
+
+}  // namespace BamRecordBuilderTests
+
+TEST(BamRecordBuilderTest, DefaultValues)
+{
+    BamRecordBuilder builder;
+    BamRecord bam = builder.Build();
+
+    const auto rawData = PacBio::BAM::internal::BamRecordMemory::GetRawData(bam);
+    ASSERT_TRUE(static_cast<bool>(rawData));
+
+    // fixed-length (core) data
+    EXPECT_EQ(0, rawData->core.tid);
+    EXPECT_EQ(0, rawData->core.pos);
+    EXPECT_EQ(0, rawData->core.bin);
+    EXPECT_EQ(0, rawData->core.qual);
+    EXPECT_EQ(1, rawData->core.l_qname);  // initialized w/ NULL-term
+    EXPECT_EQ(0, rawData->core.flag);
+    EXPECT_EQ(0, rawData->core.n_cigar);
+    EXPECT_EQ(0, rawData->core.l_qseq);
+    EXPECT_EQ(0, rawData->core.mtid);
+    EXPECT_EQ(0, rawData->core.mpos);
+    EXPECT_EQ(0, rawData->core.isize);
+
+    // variable length data
+    EXPECT_TRUE(rawData->data != nullptr);
+    EXPECT_EQ(1, rawData->l_data);
+    EXPECT_EQ(int{0x800}, rawData->m_data);  // check this if we change or tune later
+
+    // -------------------------------
+    // check data via API calls
+    // -------------------------------
+
+    EXPECT_EQ(0, bam.Impl().Bin());
+    EXPECT_EQ(0, bam.Impl().Flag());
+    EXPECT_EQ(0, bam.Impl().InsertSize());
+    EXPECT_EQ(0, bam.Impl().MapQuality());
+    EXPECT_EQ(0, bam.Impl().MateReferenceId());
+    EXPECT_EQ(0, bam.Impl().MatePosition());
+    EXPECT_EQ(0, bam.Impl().Position());
+    EXPECT_EQ(0, bam.Impl().ReferenceId());
+    EXPECT_EQ(0, bam.Impl().Tags().size());
+
+    EXPECT_FALSE(bam.Impl().IsDuplicate());
+    EXPECT_FALSE(bam.Impl().IsFailedQC());
+    EXPECT_FALSE(bam.Impl().IsFirstMate());
+    EXPECT_TRUE(bam.Impl().IsMapped());
+    EXPECT_TRUE(bam.Impl().IsMateMapped());
+    EXPECT_FALSE(bam.Impl().IsMateReverseStrand());
+    EXPECT_FALSE(bam.Impl().IsPaired());
+    EXPECT_TRUE(bam.Impl().IsPrimaryAlignment());
+    EXPECT_FALSE(bam.Impl().IsProperPair());
+    EXPECT_FALSE(bam.Impl().IsReverseStrand());
+    EXPECT_FALSE(bam.Impl().IsSecondMate());
+    EXPECT_FALSE(bam.Impl().IsSupplementaryAlignment());
+
+    const std::string emptyString = "";
+    EXPECT_EQ(emptyString, bam.Impl().Name());
+    EXPECT_EQ(emptyString, bam.Impl().CigarData().ToStdString());
+    EXPECT_EQ(emptyString, bam.Impl().Sequence());
+    EXPECT_EQ(emptyString, bam.Impl().Qualities().Fastq());
+    BamRecordBuilderTests::CheckRawData(bam);
+}
+
+TEST(BamRecordBuilderTest, CheckSetters)
+{
+    // should be 28 bytes, encoded
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordBuilder builder;
+    builder.Bin(42)
+        .Flag(42)
+        .InsertSize(42)
+        .MapQuality(42)
+        .MatePosition(42)
+        .MateReferenceId(42)
+        .Position(42)
+        .ReferenceId(42)
+        .Tags(tags);
+
+    BamRecord bam = builder.Build();
+
+    // -------------------------------
+    // check raw data
+    // -------------------------------
+
+    const auto rawData = PacBio::BAM::internal::BamRecordMemory::GetRawData(bam);
+    ASSERT_TRUE(static_cast<bool>(rawData));
+
+    // fixed-length (core) data
+    EXPECT_EQ(42, rawData->core.tid);
+    EXPECT_EQ(42, rawData->core.pos);
+    EXPECT_EQ(42, rawData->core.bin);
+    EXPECT_EQ(42, rawData->core.qual);
+    EXPECT_EQ(1, rawData->core.l_qname);  // initialized w/ NULL-term
+    EXPECT_EQ(42, rawData->core.flag);
+    EXPECT_EQ(0, rawData->core.n_cigar);
+    EXPECT_EQ(0, rawData->core.l_qseq);
+    EXPECT_EQ(42, rawData->core.mtid);
+    EXPECT_EQ(42, rawData->core.mpos);
+    EXPECT_EQ(42, rawData->core.isize);
+
+    // variable length data
+    EXPECT_TRUE(rawData->data != nullptr);
+    EXPECT_EQ(29, rawData->l_data);          // NULL-term qname + tags
+    EXPECT_EQ(int{0x800}, rawData->m_data);  // check this if we change or tune later
+
+    // -------------------------------
+    // check data via API calls
+    // -------------------------------
+
+    EXPECT_EQ(42, bam.Impl().Bin());
+    EXPECT_EQ(42, bam.Impl().Flag());
+    EXPECT_EQ(42, bam.Impl().InsertSize());
+    EXPECT_EQ(42, bam.Impl().MapQuality());
+    EXPECT_EQ(42, bam.Impl().MateReferenceId());
+    EXPECT_EQ(42, bam.Impl().MatePosition());
+    EXPECT_EQ(42, bam.Impl().Position());
+    EXPECT_EQ(42, bam.Impl().ReferenceId());
+
+    const TagCollection fetchedTags = bam.Impl().Tags();
+
+    EXPECT_TRUE(fetchedTags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, fetchedTags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags.at("CA").ToUInt8Array());
+}
diff --git a/tests/src/test_BamRecordClipping.cpp b/tests/src/test_BamRecordClipping.cpp

new file mode 100644 (file)

index 0000000..a98dc49
--- /dev/null
+++ b/tests/src/test_BamRecordClipping.cpp
@@ -0,0 +1,2020 @@
+// Author: Derek Barnett
+
+#include <chrono>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BamRecord.h>
+#include <pbbam/BamRecordView.h>
+#include <pbbam/BamTagCodec.h>
+
+// clang-format off
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+using f_data = std::vector<uint16_t>;  // frame/pulse data container used by these tests
+
+namespace BamRecordClippingTests {
+
+static 
+ReadGroupInfo MakeReadGroup(const FrameCodec codec,
+                            const std::string& movieName,
+                            const std::string& readType)
+{
+    ReadGroupInfo rg{movieName, readType};
+    rg.IpdCodec(codec);
+    rg.PulseWidthCodec(codec);
+    return rg;
+}
+
+static
+BamRecord MakeRecord(const Position qStart,
+                     const Position qEnd,
+                     const std::string& seq,
+                     const std::string& quals,
+                     const std::string& tagBases,
+                     const std::string& tagQuals,
+                     const f_data& frames,
+                     const std::string& pulseCall = "",
+                     const std::string& pulseBases = "",
+                     const std::string& pulseQuals = "",
+                     const f_data& pulseFrames = f_data(),
+                     const FrameCodec codec = FrameCodec::RAW)
+{
+    BamRecordImpl impl;
+    impl.SetSequenceAndQualities(seq, quals);
+
+    TagCollection tags;
+    tags["qs"] = qStart;        // qStart
+    tags["qe"] = qEnd;          // qEnd
+    tags["dt"] = tagBases;      // deletionTag
+    tags["st"] = tagBases;      // substitutionTag
+    tags["dq"] = tagQuals;      // deletionQV
+    tags["iq"] = tagQuals;      // insertionQV
+    tags["mq"] = tagQuals;      // mergeQV
+    tags["sq"] = tagQuals;      // substitutionQV
+    tags["ip"] = frames;        // IPD
+    tags["pw"] = frames;        // pulseWidth
+    tags["pc"] = pulseCall;     // pulseCall
+    tags["pt"] = pulseBases;    // altLabelTag
+    tags["pq"] = pulseQuals;    // labelQV
+    tags["pv"] = pulseQuals;    // altLabelQV
+    tags["pg"] = pulseQuals;    // pulseMergeQV
+    tags["pa"] = pulseFrames;   // pkmean
+    tags["pm"] = pulseFrames;   // pkmid
+    impl.Tags(tags);
+
+    const auto rg = MakeReadGroup(codec, "movie", "SUBREAD");
+
+    BamRecord bam(std::move(impl));
+    bam.header_.AddReadGroup(rg);
+    bam.ReadGroup(rg);
+    return bam;
+}
+
+static
+BamRecord MakeCCSRecord(const std::string& seq,
+                        const std::string& quals,
+                        const std::string& tagBases,
+                        const std::string& tagQuals,
+                        const f_data& frames,
+                        const std::string& pulseCall = "",
+                        const std::string& pulseBases = "",
+                        const std::string& pulseQuals = "",
+                        const f_data& pulseFrames = f_data(),
+                        const FrameCodec codec = FrameCodec::RAW)
+{
+    BamRecordImpl impl;
+    impl.Name("movie/42/ccs");
+    impl.SetSequenceAndQualities(seq, quals);
+
+    TagCollection tags;
+    tags["dt"] = tagBases;      // deletionTag
+    tags["st"] = tagBases;      // substitutionTag
+    tags["dq"] = tagQuals;      // deletionQV
+    tags["iq"] = tagQuals;      // insertionQV
+    tags["mq"] = tagQuals;      // mergeQV
+    tags["sq"] = tagQuals;      // substitutionQV
+    tags["ip"] = frames;        // IPD
+    tags["pw"] = frames;        // pulseWidth
+    tags["pc"] = pulseCall;     // pulseCall
+    tags["pt"] = pulseBases;    // altLabelTag
+    tags["pq"] = pulseQuals;    // labelQV
+    tags["pv"] = pulseQuals;    // altLabelQV
+    tags["pg"] = pulseQuals;    // pulseMergeQV
+    tags["pa"] = pulseFrames;   // pkmean
+    tags["pm"] = pulseFrames;   // pkmid
+    impl.Tags(tags);
+
+    const auto rg = MakeReadGroup(codec, "movie", "CCS");
+
+    BamRecord bam(std::move(impl));
+    bam.header_.AddReadGroup(rg);
+    bam.ReadGroup(rg);
+    return bam;
+}
+
+} // namespace BamRecordClippingTests
+
+TEST(BamRecordClippingTest, ClipToQuery_Basic)
+{
+    const Position qStart  = 500;
+    const Position qEnd    = 510;
+    const std::string seq       = "AACCGTTAGC";
+    const std::string quals     = "?]?]?]?]?*";
+    const std::string tagBases  = "AACCGTTAGC";
+    const std::string tagQuals  = "?]?]?]?]?*";
+    const f_data frames    = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    const std::string pulseCall   = "ttAaAtaCCGggatTTAcatGCt";
+    const std::string pulseBases  = pulseCall;
+    const std::string pulseQuals  = "==?=]==?]?====]?]===?*=";
+    const f_data pulseFrames = { 0,0,10,0,10,0,0,20,20,30,0,0,0,0,40,40,10,0,0,0,30,20,0 };
+
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+
+    const Position clipStart = 502;
+    const Position clipEnd   = 509;
+
+    const std::string seq_clipped      = "CCGTTAG";
+    const std::string quals_clipped    = "?]?]?]?";
+    const std::string tagBases_clipped = "CCGTTAG";
+    const std::string tagQuals_clipped = "?]?]?]?";
+    const f_data frames_clipped   = { 20, 20, 30, 40, 40, 10, 30 };
+
+    const std::string pulseCall_clipped = "CCGggatTTAcatG";
+    const std::string pulseQuals_clipped = "?]?====]?]===?";
+    const f_data pulseFrames_clipped = { 20,20,30,0,0,0,0,40,40,10,0,0,0,30 };
+
+    const std::string seq_rev       = "GCTAACGGTT";
+    const std::string pulseCall_rev = "aGCatgTAAatccCGGtaTtTaa";
+    const std::string quals_rev     = "*?]?]?]?]?";
+    const std::string tagQuals_rev  = quals_rev;
+    const f_data frames_rev    = { 20, 30, 10, 40, 40, 30, 20, 20, 10, 10 };
+
+    const std::string seq_rev_clipped   = "CTAACGG";
+    const std::string quals_rev_clipped = "?]?]?]?";
+    const std::string tagBases_rev_clipped = seq_rev_clipped;
+    const std::string tagQuals_rev_clipped = quals_rev_clipped;
+    const f_data frames_rev_clipped = { 30, 10, 40, 40, 30, 20, 20 };
+
+    const std::string pulseCall_rev_clipped = "CatgTAAatccCGG";
+    const std::string pulseQuals_rev_clipped    = "?===]?]====?]?";
+    const f_data pulseFrames_rev_clipped = { 30,0,0,0,10,40,40,0,0,0,0,30,20,20 };
+
+    const std::string s1_cigar = "10=";
+    const std::string s2_cigar = "5=3D5=";
+    const std::string s3_cigar = "4=1D2I2D4=";
+
+    const std::string s1_cigar_clipped = "7=";
+    const std::string s2_cigar_clipped = "3=3D4=";
+    const std::string s3_cigar_clipped = "2=1D2I2D3=";
+
+    const BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                                  pulseCall, pulseBases, pulseQuals, pulseFrames);
+
+    BamRecord s0 = prototype; // unmapped record
+    BamRecord s1 = prototype.Mapped(tId, tPos, Strand::FORWARD, s1_cigar, mapQual);
+    BamRecord s2 = prototype.Mapped(tId, tPos, Strand::FORWARD, s2_cigar, mapQual);
+    BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+    BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual);
+    BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual);
+    BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual);
+
+    s0.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s1.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s2.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s3.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s1_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s2_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s3_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+    {   // s0
+
+        EXPECT_FALSE(s0.IsMapped());
+        EXPECT_EQ(clipStart, s0.QueryStart());
+        EXPECT_EQ(clipEnd,   s0.QueryEnd());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.AlignedStart());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.AlignedEnd());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.ReferenceStart());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.ReferenceEnd());
+
+        const BamRecordView view
+        {
+            s0,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,       view.Sequence());
+        EXPECT_EQ(quals_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_clipped, view.PulseCalls());
+    }
+
+    {   // s1 - FORWARD
+
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(clipStart, s1.QueryStart());
+        EXPECT_EQ(clipEnd,   s1.QueryEnd());
+        EXPECT_EQ(clipStart, s1.AlignedStart());   // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s1.AlignedEnd());     // alignStart + seqLength
+        EXPECT_EQ(102, s1.ReferenceStart());       // 100 + startOffset
+        EXPECT_EQ(109, s1.ReferenceEnd());         // RefStart + 7=
+
+        EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,       view.Sequence());
+        EXPECT_EQ(quals_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_clipped, view.PulseCalls());
+    }
+
+    {   // s1 - REVERSE
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s1_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s1_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s1_rev.AlignedStart());    // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s1_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(102, s1_rev.ReferenceStart());        // 100 + startOffset
+        EXPECT_EQ(109, s1_rev.ReferenceEnd());          // RefStart + 7=
+
+        EXPECT_EQ(s1_cigar_clipped, s1_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_rev_clipped,       view.Sequence());
+        EXPECT_EQ(quals_rev_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_rev_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls());
+    }
+
+    {   // s2 - FORWARD
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(clipStart, s2.QueryStart());
+        EXPECT_EQ(clipEnd,   s2.QueryEnd());
+        EXPECT_EQ(clipStart, s2.AlignedStart());   // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s2.AlignedEnd());     // alignStart + seqLength
+        EXPECT_EQ(102, s2.ReferenceStart());       // 100 + startOffset
+        EXPECT_EQ(112, s2.ReferenceEnd());         // RefStart + 7= + 3D
+
+        EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,      view.Sequence());
+        EXPECT_EQ(quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - REVERSE
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s2_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s2_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s2_rev.AlignedStart());    // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s2_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(102, s2_rev.ReferenceStart());        // 100 + startOffset
+        EXPECT_EQ(112, s2_rev.ReferenceEnd());          // RefStart + 7= + 3D
+
+        EXPECT_EQ(s2_cigar_clipped, s2_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_rev_clipped,       view.Sequence());
+        EXPECT_EQ(quals_rev_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_rev_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls());
+    }
+
+    {   // s3 - FORWARD
+
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(clipStart, s3.QueryStart());
+        EXPECT_EQ(clipEnd,   s3.QueryEnd());
+        EXPECT_EQ(clipStart, s3.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s3.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(102, s3.ReferenceStart());         // 100 + startOffset
+        EXPECT_EQ(110, s3.ReferenceEnd());           // RefStart + 5= + 3D
+
+        EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,       view.Sequence());
+        EXPECT_EQ(quals_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_clipped, view.PulseCalls());
+    }
+
+    {   // s3 - REVERSE
+
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s3_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s3_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s3_rev.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s3_rev.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(102, s3_rev.ReferenceStart());         // 100 + startOffset
+        EXPECT_EQ(110, s3_rev.ReferenceEnd());           // RefStart + 5= + 3D
+
+        EXPECT_EQ(s3_cigar_clipped, s3_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_rev_clipped,       view.Sequence());
+        EXPECT_EQ(quals_rev_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_rev_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls());
+    }
+}
+
+TEST(BamRecordClippingTest, ClipToQuery_WithSoftClips)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 515;
+    const std::string seq      = "TTAACCGTTAGCAAA";
+    const std::string seq_rev  = "TTTGCTAACGGTTAA";
+    const std::string quals    = "--?]?]?]?]?*+++";
+    const std::string tagBases = "TTAACCGTTAGCAAA";
+    const std::string tagQuals = "--?]?]?]?]?*+++";
+    const std::string tagQuals_rev = "+++*?]?]?]?]?--";
+    const f_data frames   = { 40, 40, 10, 10, 20, 20, 30, 40, 40, 10, 30, 20, 10, 10, 10 };
+    const f_data frames_rev = { 10, 10, 10, 20, 30, 10, 40, 40, 30, 20, 20, 10, 10, 40, 40 };
+
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+
+    const Position clipStart = 502;
+    const Position clipEnd   = 509;
+
+    const std::string s1_cigar = "2S10=3S";
+    const std::string s1_cigar_clipped = "7=";
+    const std::string s1_seq_clipped      = "AACCGTT";
+    const std::string s1_quals_clipped    = "?]?]?]?";
+    const std::string s1_tagBases_clipped = s1_seq_clipped;
+    const std::string s1_tagQuals_clipped = s1_quals_clipped;
+    const f_data s1_frames_clipped   = { 10, 10, 20, 20, 30, 40, 40 };
+    const std::string s1_seq_rev_clipped   = "AACGGTT";
+    const std::string s1_quals_rev_clipped = "?]?]?]?";
+    const std::string s1_tagBases_rev_clipped = s1_seq_rev_clipped;
+    const std::string s1_tagQuals_rev_clipped = s1_quals_rev_clipped;
+    const f_data s1_frames_rev_clipped = { 40, 40, 30, 20, 20, 10, 10 };
+
+    const std::string s2_cigar = "2S5=3D5=3S";
+    const std::string s2_cigar_clipped = "5=3D2=";
+    const std::string s2_seq_clipped      = "AACCGTT";
+    const std::string s2_quals_clipped    = "?]?]?]?";
+    const std::string s2_tagBases_clipped = s2_seq_clipped;
+    const std::string s2_tagQuals_clipped = s2_quals_clipped;
+    const f_data s2_frames_clipped   = { 10, 10, 20, 20, 30, 40, 40 };
+    const std::string s2_seq_rev_clipped   = "AACGGTT";
+    const std::string s2_quals_rev_clipped = "?]?]?]?";
+    const std::string s2_tagBases_rev_clipped = s2_seq_rev_clipped;
+    const std::string s2_tagQuals_rev_clipped = s2_quals_rev_clipped;
+    const f_data s2_frames_rev_clipped = { 40, 40, 30, 20, 20, 10, 10 };
+
+    const std::string s3_cigar = "2S4=1D2I2D4=3S";
+    const std::string s3_cigar_clipped = "4=1D2I2D1=";
+    const std::string s3_seq_clipped      = "AACCGTT";
+    const std::string s3_quals_clipped    = "?]?]?]?";
+    const std::string s3_tagBases_clipped = s3_seq_clipped;
+    const std::string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 10, 10, 20, 20, 30, 40, 40 };
+    const std::string s3_seq_rev_clipped   = "AACGGTT";
+    const std::string s3_quals_rev_clipped = "?]?]?]?";
+    const std::string s3_tagBases_rev_clipped = s3_seq_rev_clipped;
+    const std::string s3_tagQuals_rev_clipped = s3_quals_rev_clipped;
+    const f_data s3_frames_rev_clipped = { 40, 40, 30, 20, 20, 10, 10 };
+
+    const BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                                  seq, tagBases, tagQuals, frames);
+    BamRecord s1 = prototype.Mapped(tId, tPos, Strand::FORWARD, s1_cigar, mapQual);
+    BamRecord s2 = prototype.Mapped(tId, tPos, Strand::FORWARD, s2_cigar, mapQual);
+    BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+    BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual);
+    BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual);
+    BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual);
+
+    // sanity checks before clipping
+    EXPECT_TRUE(s1.IsMapped());
+    EXPECT_EQ(tPos, s1.ReferenceStart());
+    EXPECT_EQ(tPos + 10, s1.ReferenceEnd()); // 10=
+
+    EXPECT_TRUE(s1_rev.IsMapped());
+    EXPECT_EQ(tPos, s1_rev.ReferenceStart());
+    EXPECT_EQ(tPos + 10, s1_rev.ReferenceEnd()); // 10=
+
+    EXPECT_TRUE(s2.IsMapped());
+    EXPECT_EQ(tPos, s2.ReferenceStart());
+    EXPECT_EQ(tPos + 13, s2.ReferenceEnd());   // 5= + 3D + 5=
+
+    EXPECT_TRUE(s2_rev.IsMapped());
+    EXPECT_EQ(tPos, s2_rev.ReferenceStart());
+    EXPECT_EQ(tPos + 13, s2_rev.ReferenceEnd());   // 5= + 3D + 5=
+
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(tPos, s3.ReferenceStart());
+    EXPECT_EQ(tPos + 11, s3.ReferenceEnd());   // 4= + 1D + 2D + 4=
+
+    EXPECT_TRUE(s3_rev.IsMapped());
+    EXPECT_EQ(tPos, s3_rev.ReferenceStart());
+    EXPECT_EQ(tPos + 11, s3_rev.ReferenceEnd());   // 4= + 1D + 2D + 4=
+
+    s1.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s2.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s3.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s1_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s2_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s3_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+    {   // s1 - FORWARD
+
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(clipStart, s1.QueryStart());
+        EXPECT_EQ(clipEnd,   s1.QueryEnd());
+        EXPECT_EQ(clipStart, s1.AlignedStart());    // queryStart (no soft clips left)
+        EXPECT_EQ(clipEnd,   s1.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s1.ReferenceStart());  // tPos
+        EXPECT_EQ(tPos + 7,  s1.ReferenceEnd());    // RefStart + 7=
+
+        EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s1 - REVERSE
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s1_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s1_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s1_rev.AlignedStart());    // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s1_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s1_rev.ReferenceStart());  // 100 + startOffset
+        EXPECT_EQ(tPos + 7,  s1_rev.ReferenceEnd());    // RefStart + 7=
+
+        EXPECT_EQ(s1_cigar_clipped, s1_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_frames_rev_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - FORWARD
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(clipStart, s2.QueryStart());
+        EXPECT_EQ(clipEnd,   s2.QueryEnd());
+        EXPECT_EQ(clipStart, s2.AlignedStart());    // queryStart (no soft clips left)
+        EXPECT_EQ(clipEnd,   s2.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s2.ReferenceStart());  // 100 + startOffset
+        EXPECT_EQ(tPos + 10, s2.ReferenceEnd());    // RefStart + 5=3D2=
+
+        EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - REVERSE
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s2_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s2_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s2_rev.AlignedStart());    // queryStart (no soft clips left)
+        EXPECT_EQ(clipEnd,   s2_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s2_rev.ReferenceStart());  // 100 + startOffset
+        EXPECT_EQ(tPos + 10, s2_rev.ReferenceEnd());    // RefStart + 5=3D2=
+
+        EXPECT_EQ(s2_cigar_clipped, s2_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s2_frames_rev_clipped,   view.IPD().Data());
+    }
+
+    {   // s3 - FORWARD
+
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(clipStart, s3.QueryStart());
+        EXPECT_EQ(clipEnd,   s3.QueryEnd());
+        EXPECT_EQ(clipStart, s3.AlignedStart());    // queryStart (no soft clips left)
+        EXPECT_EQ(clipEnd,   s3.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s3.ReferenceStart());  // 100 + startOffset
+        EXPECT_EQ(tPos + 8,  s3.ReferenceEnd());    // RefStart + 4=1D2D1=
+
+        EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s3 - REVERSE
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s3_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s3_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s3_rev.AlignedStart());    // queryStart (no soft clips left)
+        EXPECT_EQ(clipEnd,   s3_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(tPos,      s3_rev.ReferenceStart());  // 100 + startOffset
+        EXPECT_EQ(tPos + 8,  s3_rev.ReferenceEnd());    // RefStart + 4=1D2D1=
+
+        EXPECT_EQ(s3_cigar_clipped, s3_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_rev_clipped,   view.IPD().Data());
+    }
+}
+
+TEST(BamRecordClippingTest, ClipToReference_Basic)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const std::string tagQuals_rev = "*?]?]?]?]?";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+
+    const Position clipStart = 102;
+    const Position clipEnd   = 107;
+
+    const std::string s1_cigar = "10=";
+    const std::string s1_cigar_clipped = "5=";
+    const std::string s1_seq_clipped      = "CCGTT";
+    const std::string s1_quals_clipped    = "?]?]?";
+    const std::string s1_tagBases_clipped = s1_seq_clipped;
+    const std::string s1_tagQuals_clipped = s1_quals_clipped;
+    const f_data s1_frames_clipped   = { 20, 20, 30, 40, 40 };
+    const std::string s1_seq_rev_clipped   = "TAACG";
+    const std::string s1_quals_rev_clipped = "]?]?]";
+    const std::string s1_tagBases_rev_clipped = s1_seq_rev_clipped;
+    const std::string s1_tagQuals_rev_clipped = s1_quals_rev_clipped;
+    const f_data s1_frames_rev_clipped = { 10, 40, 40, 30, 20 };
+
+    const std::string s2_cigar = "5=3D5=";
+    const std::string s2_cigar_clipped = "3=2D";
+    const std::string s2_seq_clipped      = "CCG";
+    const std::string s2_quals_clipped    = "?]?";
+    const std::string s2_tagBases_clipped = s2_seq_clipped;
+    const std::string s2_tagQuals_clipped = s2_quals_clipped;
+    const f_data s2_frames_clipped   = { 20, 20, 30 };
+    const std::string s2_seq_rev_clipped   = "TAA";
+    const std::string s2_quals_rev_clipped = "]?]";
+    const std::string s2_tagBases_rev_clipped = s2_seq_rev_clipped;
+    const std::string s2_tagQuals_rev_clipped = s2_quals_rev_clipped;
+    const f_data s2_frames_rev_clipped = { 10, 40, 40 };
+
+    const std::string s3_cigar = "4=1D2I2D4=";
+    const std::string s3_cigar_clipped = "2=1D2I2D";
+    const std::string s3_seq_clipped      = "CCGT";
+    const std::string s3_quals_clipped    = "?]?]";
+    const std::string s3_tagBases_clipped = s3_seq_clipped;
+    const std::string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 20, 20, 30, 40 };
+    const std::string s3_seq_rev_clipped   = "TAAC";
+    const std::string s3_quals_rev_clipped = "]?]?";
+    const std::string s3_tagBases_rev_clipped = s3_seq_rev_clipped;
+    const std::string s3_tagQuals_rev_clipped = s3_quals_rev_clipped;
+    const f_data s3_frames_rev_clipped = { 10, 40, 40, 30};
+
+    const BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                                  seq, tagBases, tagQuals, frames);
+    BamRecord s0 = prototype;
+    BamRecord s1 = prototype.Mapped(tId, tPos, Strand::FORWARD, s1_cigar, mapQual);
+    BamRecord s2 = prototype.Mapped(tId, tPos, Strand::FORWARD, s2_cigar, mapQual);
+    BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+    BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual);
+    BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual);
+    BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual);
+
+    s0.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s1.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s2.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s3.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s1_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s2_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s3_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+
+    {   // s0 - no clipping should have been done to unmapped record
+
+        EXPECT_FALSE(s0.IsMapped());
+        EXPECT_EQ(prototype.QueryStart(),     s0.QueryStart());
+        EXPECT_EQ(prototype.QueryEnd(),       s0.QueryEnd());
+        EXPECT_EQ(prototype.AlignedStart(),   s0.AlignedStart());
+        EXPECT_EQ(prototype.AlignedEnd(),     s0.AlignedEnd());
+        EXPECT_EQ(prototype.ReferenceStart(), s0.ReferenceStart());
+        EXPECT_EQ(prototype.ReferenceEnd(),   s0.ReferenceEnd());
+
+        const BamRecordView protoView
+        {
+            prototype,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        const BamRecordView view
+        {
+            s0,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(protoView.Sequence(),       view.Sequence());
+        EXPECT_EQ(protoView.Qualities(),      view.Qualities());
+        EXPECT_EQ(protoView.DeletionTags(),    view.DeletionTags());
+        EXPECT_EQ(protoView.DeletionQVs(),     view.DeletionQVs());
+        EXPECT_EQ(protoView.LabelQVs(),        view.LabelQVs());
+        EXPECT_EQ(protoView.AltLabelQVs(),     view.AltLabelQVs());
+        EXPECT_EQ(protoView.IPD(),            view.IPD());
+    }
+
+    {   // s1 - FORWARD
+
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(502,   s1.QueryStart());
+        EXPECT_EQ(507,   s1.QueryEnd());
+        EXPECT_EQ(502,   s1.AlignedStart());       // queryStart (no soft clips)
+        EXPECT_EQ(507,   s1.AlignedEnd());         // alignStart + seqLength
+        EXPECT_EQ(clipStart, s1.ReferenceStart()); // clipStart
+        EXPECT_EQ(clipEnd,   s1.ReferenceEnd());   // clipEnd
+
+        EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s1_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s1 - REVERSE
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(503, s1_rev.QueryStart());
+        EXPECT_EQ(508, s1_rev.QueryEnd());
+        EXPECT_EQ(503, s1_rev.AlignedStart());          // queryStart (no soft clips)
+        EXPECT_EQ(508, s1_rev.AlignedEnd());            // alignStart + seqLength
+        EXPECT_EQ(clipStart, s1_rev.ReferenceStart());  // clipStart
+        EXPECT_EQ(clipEnd,   s1_rev.ReferenceEnd());    // clipEnd
+
+        EXPECT_EQ(s1_cigar_clipped, s1_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s1_frames_rev_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - FORWARD
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(502, s2.QueryStart());
+        EXPECT_EQ(505, s2.QueryEnd());
+        EXPECT_EQ(502, s2.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(505, s2.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(clipStart, s2.ReferenceStart());   // clipStart
+        EXPECT_EQ(clipEnd,   s2.ReferenceEnd());     // clipEnd
+
+        EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s2_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - REVERSE
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(505, s2_rev.QueryStart());
+        EXPECT_EQ(508, s2_rev.QueryEnd());
+        EXPECT_EQ(505, s2_rev.AlignedStart());    // queryStart (no soft clips)
+        EXPECT_EQ(508, s2_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(clipStart, s2_rev.ReferenceStart());  // clipStart
+        EXPECT_EQ(clipEnd,   s2_rev.ReferenceEnd());    // clipEnd
+
+        EXPECT_EQ(s2_cigar_clipped, s2_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s2_frames_rev_clipped,   view.IPD().Data());
+    }
+
+    {   // s3 - FORWARD
+
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(502, s3.QueryStart());
+        EXPECT_EQ(506, s3.QueryEnd());
+        EXPECT_EQ(502, s3.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(506, s3.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(clipStart, s3.ReferenceStart());   // clipStart
+        EXPECT_EQ(clipEnd,   s3.ReferenceEnd());     // clipEnd
+
+        EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s3 - REVERSE
+
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(504, s3_rev.QueryStart());
+        EXPECT_EQ(508, s3_rev.QueryEnd());
+        EXPECT_EQ(504, s3_rev.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(508, s3_rev.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(clipStart, s3_rev.ReferenceStart());  // clipStart
+        EXPECT_EQ(clipEnd,   s3_rev.ReferenceEnd());    // clipEnd
+
+        EXPECT_EQ(s3_cigar_clipped, s3_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_rev_clipped,   view.IPD().Data());
+    }
+}
+
+TEST(BamRecordClippingTest, ClipToReference_WithSoftClips)
+{   // One 15-base record, three soft-clipped CIGARs, both strands: clip to reference clipStart..clipEnd, then verify coordinates, CIGAR and per-base data
+    const Position qStart = 500;
+    const Position qEnd   = 515;
+    const std::string seq      = "TTAACCGTTAGCAAA";
+    const std::string quals    = "--?]?]?]?]?*+++";
+    const std::string tagBases = "TTAACCGTTAGCAAA";
+    const std::string tagQuals = "--?]?]?]?]?*+++";
+    const std::string tagQuals_rev = "+++*?]?]?]?]?--";  // NOTE(review): not referenced again in this test
+    const f_data frames   = { 40, 40, 10, 10, 20, 20, 30, 40, 40, 10, 30, 20, 10, 10, 10 };
+
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+
+    const Position clipStart = 102;
+    const Position clipEnd   = 107;
+
+    const std::string seq_rev      = "TTTGCTAACGGTTAA";  // NOTE(review): seq_rev, quals_rev and frames_rev are not referenced again in this test
+    const std::string quals_rev    = "+++*?]?]?]?]?--";
+    const f_data frames_rev   = { 10, 10, 10, 20, 30, 10, 40, 40, 30, 20, 20, 10, 10, 40, 40 };
+
+    const std::string s1_cigar = "2S10=3S";
+    const std::string s1_cigar_clipped = "5=";
+    const std::string s1_seq_clipped      = "CCGTT";
+    const std::string s1_quals_clipped    = "?]?]?";
+    const std::string s1_tagBases_clipped = s1_seq_clipped;
+    const std::string s1_tagQuals_clipped = s1_quals_clipped;
+    const f_data s1_frames_clipped   = { 20, 20, 30, 40, 40 };
+    const std::string s1_seq_rev_clipped   = "CTAAC";
+    const std::string s1_quals_rev_clipped = "?]?]?";
+    const std::string s1_tagBases_rev_clipped = s1_seq_rev_clipped;
+    const std::string s1_tagQuals_rev_clipped = s1_quals_rev_clipped;
+    const f_data s1_frames_rev_clipped = { 30, 10, 40, 40, 30 };
+
+    const std::string s2_cigar = "2S5=3D5=3S";
+    const std::string s2_cigar_clipped = "3=2D";
+    const std::string s2_seq_clipped      = "CCG";
+    const std::string s2_quals_clipped    = "?]?";
+    const std::string s2_tagBases_clipped = s2_seq_clipped;
+    const std::string s2_tagQuals_clipped = s2_quals_clipped;
+    const f_data s2_frames_clipped   = { 20, 20, 30 };
+    const std::string s2_seq_rev_clipped   = "CTA";
+    const std::string s2_quals_rev_clipped = "?]?";
+    const std::string s2_tagBases_rev_clipped = s2_seq_rev_clipped;
+    const std::string s2_tagQuals_rev_clipped = s2_quals_rev_clipped;
+    const f_data s2_frames_rev_clipped = { 30, 10, 40 };
+
+    const std::string s3_cigar = "2S4=1D2I2D4=3S";
+    const std::string s3_cigar_clipped = "2=1D2I2D";
+    const std::string s3_seq_clipped      = "CCGT";
+    const std::string s3_quals_clipped    = "?]?]";
+    const std::string s3_tagBases_clipped = s3_seq_clipped;
+    const std::string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 20, 20, 30, 40 };
+    const std::string s3_seq_rev_clipped   = "CTAA";
+    const std::string s3_quals_rev_clipped = "?]?]";
+    const std::string s3_tagBases_rev_clipped = s3_seq_rev_clipped;
+    const std::string s3_tagQuals_rev_clipped = s3_quals_rev_clipped;
+    const f_data s3_frames_rev_clipped = { 30, 10, 40, 40 };
+
+    const BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                                  seq, tagBases, tagQuals, frames);
+    BamRecord s0 = prototype;
+    BamRecord s1 = prototype.Mapped(tId, tPos, Strand::FORWARD, s1_cigar, mapQual);
+    BamRecord s2 = prototype.Mapped(tId, tPos, Strand::FORWARD, s2_cigar, mapQual);
+    BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+    BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual);
+    BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual);
+    BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual);
+
+    // sanity checks before clipping
+    EXPECT_FALSE(s0.IsMapped());
+
+    EXPECT_TRUE(s1.IsMapped());
+    EXPECT_EQ(500,       s1.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s1.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(502,       s1.AlignedStart());    // queryStart + 2S
+    EXPECT_EQ(512,       s1.AlignedEnd());      // alignedStart + 10=
+    EXPECT_EQ(tPos,      s1.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 10, s1.ReferenceEnd());    // tPos + 10=
+
+    EXPECT_TRUE(s1_rev.IsMapped());
+    EXPECT_EQ(500,       s1_rev.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s1_rev.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(503,       s1_rev.AlignedStart());    // queryStart + 3S
+    EXPECT_EQ(513,       s1_rev.AlignedEnd());      // alignedStart + 10=
+    EXPECT_EQ(tPos,      s1_rev.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 10, s1_rev.ReferenceEnd());    // tPos + 10=
+
+    EXPECT_TRUE(s2.IsMapped());
+    EXPECT_EQ(500,       s2.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s2.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(502,       s2.AlignedStart());    // queryStart + 2S
+    EXPECT_EQ(512,       s2.AlignedEnd());      // alignedStart + 5=5=
+    EXPECT_EQ(tPos,      s2.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 13, s2.ReferenceEnd());    // tPos + 5=3D5=
+
+    EXPECT_TRUE(s2_rev.IsMapped());
+    EXPECT_EQ(500,       s2_rev.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s2_rev.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(503,       s2_rev.AlignedStart());    // queryStart + 3S
+    EXPECT_EQ(513,       s2_rev.AlignedEnd());      // alignedStart + 5=5=
+    EXPECT_EQ(tPos,      s2_rev.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 13, s2_rev.ReferenceEnd());    // tPos + 5=3D5=
+
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(500,       s3.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s3.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(502,       s3.AlignedStart());    // queryStart + 2S
+    EXPECT_EQ(512,       s3.AlignedEnd());      // alignedStart + 4=2I4=
+    EXPECT_EQ(tPos,      s3.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 11, s3.ReferenceEnd());    // tPos + 4=1D2D4=
+
+    EXPECT_TRUE(s3_rev.IsMapped());
+    EXPECT_EQ(500,       s3_rev.QueryStart());      // queryStart
+    EXPECT_EQ(515,       s3_rev.QueryEnd());        // queryStart + seqLength
+    EXPECT_EQ(503,       s3_rev.AlignedStart());    // queryStart + 3S
+    EXPECT_EQ(513,       s3_rev.AlignedEnd());      // alignedStart + 4=2I4=
+    EXPECT_EQ(tPos,      s3_rev.ReferenceStart());  // tPos
+    EXPECT_EQ(tPos + 11, s3_rev.ReferenceEnd());    // tPos + 4=1D2D4=
+
+    s0.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s1.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s2.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s3.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s1_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s2_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+    s3_rev.Clip(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+
+    {   // s0 - no clipping should have been done to unmapped record
+
+        EXPECT_FALSE(s0.IsMapped());
+        EXPECT_EQ(prototype.QueryStart(),     s0.QueryStart());
+        EXPECT_EQ(prototype.QueryEnd(),       s0.QueryEnd());
+        EXPECT_EQ(prototype.AlignedStart(),   s0.AlignedStart());
+        EXPECT_EQ(prototype.AlignedEnd(),     s0.AlignedEnd());
+        EXPECT_EQ(prototype.ReferenceStart(), s0.ReferenceStart());
+        EXPECT_EQ(prototype.ReferenceEnd(),   s0.ReferenceEnd());
+
+        const BamRecordView protoView
+        {
+            prototype,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        const BamRecordView view
+        {
+            s0,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(protoView.Sequence(),      view.Sequence());
+        EXPECT_EQ(protoView.Qualities(),     view.Qualities());
+        EXPECT_EQ(protoView.DeletionTags(),  view.DeletionTags());
+        EXPECT_EQ(protoView.DeletionQVs(),   view.DeletionQVs());
+        EXPECT_EQ(protoView.LabelQVs(),      view.LabelQVs());
+        EXPECT_EQ(protoView.AltLabelQVs(),   view.AltLabelQVs());
+        EXPECT_EQ(protoView.IPD(),           view.IPD());
+    }
+
+    {   // s1 - FORWARD
+
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(504,   s1.QueryStart());         // new queryStart
+        EXPECT_EQ(509,   s1.QueryEnd());           // queryStart + new seqLength
+        EXPECT_EQ(504,   s1.AlignedStart());       // queryStart (no soft clips remaining)
+        EXPECT_EQ(509,   s1.AlignedEnd());         // alignStart + new seqLength
+        EXPECT_EQ(clipStart, s1.ReferenceStart()); // clipStart
+        EXPECT_EQ(clipEnd,   s1.ReferenceEnd());   // clipEnd
+
+        EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s1_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s1 - REVERSE
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(506,   s1_rev.QueryStart());         // new queryStart
+        EXPECT_EQ(511,   s1_rev.QueryEnd());           // queryStart + new seqLength
+        EXPECT_EQ(506,   s1_rev.AlignedStart());       // queryStart (no soft clips remaining)
+        EXPECT_EQ(511,   s1_rev.AlignedEnd());         // alignStart + new seqLength
+        EXPECT_EQ(clipStart, s1_rev.ReferenceStart()); // clipStart
+        EXPECT_EQ(clipEnd,   s1_rev.ReferenceEnd());   // clipEnd
+
+        EXPECT_EQ(s1_cigar_clipped, s1_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s1_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s1_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s1_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s1_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s1_frames_rev_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - FORWARD
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(504, s2.QueryStart());       // new queryStart
+        EXPECT_EQ(507, s2.QueryEnd());         // queryStart + new seqLength
+        EXPECT_EQ(504, s2.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(507, s2.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(clipStart, s2.ReferenceStart());   // clipStart
+        EXPECT_EQ(clipEnd,   s2.ReferenceEnd());     // clipEnd
+
+        EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s2_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s2 - REVERSE
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(508,   s2_rev.QueryStart());         // new queryStart
+        EXPECT_EQ(511,   s2_rev.QueryEnd());           // queryStart + new seqLength
+        EXPECT_EQ(508,   s2_rev.AlignedStart());       // queryStart (no soft clips remaining)
+        EXPECT_EQ(511,   s2_rev.AlignedEnd());         // alignStart + new seqLength
+        EXPECT_EQ(clipStart, s2_rev.ReferenceStart()); // clipStart
+        EXPECT_EQ(clipEnd,   s2_rev.ReferenceEnd());   // clipEnd
+
+        EXPECT_EQ(s2_cigar_clipped, s2_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s2_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s2_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s2_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s2_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s2_frames_rev_clipped,   view.IPD().Data());
+    }
+
+    {   // s3 - FORWARD
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(504, s3.QueryStart());       // new queryStart
+        EXPECT_EQ(508, s3.QueryEnd());         // queryStart + new seqLength
+        EXPECT_EQ(504, s3.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(508, s3.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(clipStart, s3.ReferenceStart());   // clipStart
+        EXPECT_EQ(clipEnd,   s3.ReferenceEnd());     // clipEnd
+
+        EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+    }
+
+    {   // s3 - REVERSE
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(507,   s3_rev.QueryStart());         // new queryStart
+        EXPECT_EQ(511,   s3_rev.QueryEnd());           // queryStart + new seqLength
+        EXPECT_EQ(507,   s3_rev.AlignedStart());       // queryStart (no soft clips remaining)
+        EXPECT_EQ(511,   s3_rev.AlignedEnd());         // alignStart + new seqLength
+        EXPECT_EQ(clipStart, s3_rev.ReferenceStart()); // clipStart
+        EXPECT_EQ(clipEnd,   s3_rev.ReferenceEnd());   // clipEnd
+
+        EXPECT_EQ(s3_cigar_clipped, s3_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(s3_seq_rev_clipped,      view.Sequence());
+        EXPECT_EQ(s3_quals_rev_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(s3_tagBases_rev_clipped, view.DeletionTags());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(s3_tagQuals_rev_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(s3_frames_rev_clipped,   view.IPD().Data());
+    }
+}
+
+TEST(BamRecordClippingTest, ClippedToQueryCopy)
+{   // Checks the record returned by the member Clipped(CLIP_TO_QUERY, ...) called on a forward-mapped record
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 502;
+    const Position clipEnd   = 509;
+
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    const std::string seq_clipped      = "CCGTTAG";
+    const std::string quals_clipped    = "?]?]?]?";
+    const std::string tagBases_clipped = "CCGTTAG";
+    const std::string tagQuals_clipped = "?]?]?]?";
+    const f_data frames_clipped   = { 20, 20, 30, 40, 40, 10, 30 };
+
+    const std::string s3_cigar = "4=1D2I2D4=";
+    const std::string s3_cigar_clipped = "2=1D2I2D3=";
+
+    BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                            seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    BamRecord s3 = prototype.Clipped(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(clipStart, s3.QueryStart());
+    EXPECT_EQ(clipEnd,   s3.QueryEnd());
+    EXPECT_EQ(clipStart, s3.AlignedStart());     // queryStart (no soft clips)
+    EXPECT_EQ(clipEnd,   s3.AlignedEnd());       // alignStart + seqLength
+    EXPECT_EQ(102, s3.ReferenceStart());         // tPos + 2 (the two leading '=' bases removed)
+    EXPECT_EQ(110, s3.ReferenceEnd());           // RefStart + 5= + 3D (2= 1D 2D 3= of the clipped CIGAR)
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(seq_clipped,      view.Sequence());
+    EXPECT_EQ(quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(frames_clipped,   view.IPD().Data());
+}
+
+TEST(BamRecordClippingTest, ClippedToReferenceCopy)
+{   // Checks the record returned by the member Clipped(CLIP_TO_REFERENCE, ...) called on a forward-mapped record
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 102;
+    const Position clipEnd   = 107;
+
+    const std::string s3_cigar = "4=1D2I2D4=";
+    const std::string s3_cigar_clipped = "2=1D2I2D";
+    const std::string s3_seq_clipped      = "CCGT";
+    const std::string s3_quals_clipped    = "?]?]";
+    const std::string s3_tagBases_clipped = s3_seq_clipped;
+    const std::string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 20, 20, 30, 40 };
+
+    BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                            seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    const BamRecord s3 = prototype.Clipped(ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+
+    // s3 - FORWARD
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(502, s3.QueryStart());       // qStart + 2 (the two leading '=' bases removed)
+    EXPECT_EQ(506, s3.QueryEnd());         // queryStart + new seqLength (2= + 2I)
+    EXPECT_EQ(502, s3.AlignedStart());     // queryStart (no soft clips)
+    EXPECT_EQ(506, s3.AlignedEnd());       // alignStart + seqLength
+    EXPECT_EQ(clipStart, s3.ReferenceStart());   // clipStart
+    EXPECT_EQ(clipEnd,   s3.ReferenceEnd());     // clipEnd
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+    EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+}
+
+TEST(BamRecordClippingTest, StaticClippedToQuery)
+{   // Checks the record returned by the static BamRecord::Clipped(record, CLIP_TO_QUERY, ...) overload
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 502;
+    const Position clipEnd   = 509;
+
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    const std::string seq_clipped      = "CCGTTAG";
+    const std::string quals_clipped    = "?]?]?]?";
+    const std::string tagBases_clipped = "CCGTTAG";
+    const std::string tagQuals_clipped = "?]?]?]?";
+    const f_data frames_clipped   = { 20, 20, 30, 40, 40, 10, 30 };
+
+    const std::string s3_cigar = "4=1D2I2D4=";
+    const std::string s3_cigar_clipped = "2=1D2I2D3=";
+
+    BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                            seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    const BamRecord s3 = BamRecord::Clipped(prototype, ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(clipStart, s3.QueryStart());
+    EXPECT_EQ(clipEnd,   s3.QueryEnd());
+    EXPECT_EQ(clipStart, s3.AlignedStart());     // queryStart (no soft clips)
+    EXPECT_EQ(clipEnd,   s3.AlignedEnd());       // alignStart + seqLength
+    EXPECT_EQ(102, s3.ReferenceStart());         // tPos + 2 (the two leading '=' bases removed)
+    EXPECT_EQ(110, s3.ReferenceEnd());           // RefStart + 5= + 3D (2= 1D 2D 3= of the clipped CIGAR)
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(seq_clipped,      view.Sequence());
+    EXPECT_EQ(quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(frames_clipped,   view.IPD().Data());
+}
+
+TEST(BamRecordClippingTest, StaticClippedToReference)
+{   // Checks the record returned by the static BamRecord::Clipped(record, CLIP_TO_REFERENCE, ...) overload
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 102;
+    const Position clipEnd   = 107;
+
+    const std::string s3_cigar = "4=1D2I2D4=";
+    const std::string s3_cigar_clipped = "2=1D2I2D";
+    const std::string s3_seq_clipped      = "CCGT";
+    const std::string s3_quals_clipped    = "?]?]";
+    const std::string s3_tagBases_clipped = s3_seq_clipped;
+    const std::string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 20, 20, 30, 40 };
+
+    BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                            seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    const BamRecord s3 = BamRecord::Clipped(prototype, ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+
+    // s3 - FORWARD
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(502, s3.QueryStart());       // qStart + 2 (the two leading '=' bases removed)
+    EXPECT_EQ(506, s3.QueryEnd());         // queryStart + new seqLength (2= + 2I)
+    EXPECT_EQ(502, s3.AlignedStart());     // queryStart (no soft clips)
+    EXPECT_EQ(506, s3.AlignedEnd());       // alignStart + seqLength
+    EXPECT_EQ(clipStart, s3.ReferenceStart());   // clipStart
+    EXPECT_EQ(clipEnd,   s3.ReferenceEnd());     // clipEnd
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+    EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+}
+
+TEST(BamRecordTest, ClipCigarData)
+{   // Compares CigarData() with CigarData(true) on a record whose CIGAR carries hard (H) and soft (S) clips
+    const Position qStart = 500;
+    const Position qEnd   = 515;
+    const std::string seq      = "TTAACCGTTAGCAAA";
+    const std::string quals    = "--?]?]?]?]?*+++";
+    const std::string tagBases = "TTAACCGTTAGCAAA";
+    const std::string tagQuals = "--?]?]?]?]?*+++";
+    const f_data frames   = { 40, 40, 10, 10, 20, 20, 30, 40, 40, 10, 30, 20, 10, 10, 10 };
+    const uint8_t mapQual = 80;
+    BamRecord s3 = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                     seq, tagBases, tagQuals, frames);
+    BamRecord s3_rev = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                         seq, tagBases, tagQuals, frames);
+
+    const std::string s3_cigar = "5H2S4=1D2I2D4=3S7H";
+    s3.Map(0, 100, Strand::FORWARD, s3_cigar, mapQual);
+    s3_rev.Map(0, 100, Strand::REVERSE, s3_cigar, mapQual);  // NOTE(review): s3_rev is mapped but no assertion below inspects it
+
+    const Cigar s3_cigar_raw     = s3.CigarData();       // expected below to equal the CIGAR passed to Map()
+    const Cigar s3_cigar_clipped = s3.CigarData(true);   // expected below to have the leading/trailing H and S ops removed
+
+    EXPECT_EQ(s3_cigar, s3_cigar_raw.ToStdString());
+    EXPECT_EQ(std::string("4=1D2I2D4="), s3_cigar_clipped.ToStdString());
+}
+
+TEST(BamRecordTest, CCS_ClipToQuery)
+{   // CLIP_TO_QUERY on a forward-mapped record built with MakeCCSRecord (no qStart/qEnd supplied)
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 2;   // NOTE(review): 2/9 here vs 502/509 in the non-CCS tests - presumably offsets into the read; confirm
+    const Position clipEnd   = 9;
+
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    const std::string seq_clipped      = "CCGTTAG";
+    const std::string quals_clipped    = "?]?]?]?";
+    const std::string tagBases_clipped = "CCGTTAG";
+    const std::string tagQuals_clipped = "?]?]?]?";
+    const f_data frames_clipped   = { 20, 20, 30, 40, 40, 10, 30 };
+
+    const std::string s3_cigar = "4=1D2I2D4=";
+    const std::string s3_cigar_clipped = "2=1D2I2D3=";
+
+    BamRecord prototype = BamRecordClippingTests::MakeCCSRecord(seq, quals, tagBases, tagQuals, frames,
+                                               seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    BamRecord s3 = prototype.Clipped(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(0,   s3.AlignedStart());     // record start (no soft clips)
+    EXPECT_EQ(7,   s3.AlignedEnd());       // alignStart + clipped seqLength
+    EXPECT_EQ(102, s3.ReferenceStart());         // tPos + 2 (the two leading '=' bases removed)
+    EXPECT_EQ(110, s3.ReferenceEnd());           // RefStart + 5= + 3D (2= 1D 2D 3= of the clipped CIGAR)
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(seq_clipped,      view.Sequence());
+    EXPECT_EQ(quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(frames_clipped,   view.IPD().Data());
+}
+
+TEST(BamRecordTest, CCS_ClipToReference)
+{   // CLIP_TO_REFERENCE via the static BamRecord::Clipped on a forward-mapped record built with MakeCCSRecord
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    const Position clipStart = 102;
+    const Position clipEnd   = 107;
+
+    const std::string s3_cigar = "4=1D2I2D4=";
+    const std::string s3_cigar_clipped = "2=1D2I2D";
+    const std::string s3_seq_clipped      = "CCGT";
+    const std::string s3_quals_clipped    = "?]?]";
+    const std::string s3_tagBases_clipped = s3_seq_clipped;
+    const std::string s3_tagQuals_clipped = s3_quals_clipped;
+    const f_data s3_frames_clipped   = { 20, 20, 30, 40 };
+
+    BamRecord prototype = BamRecordClippingTests::MakeCCSRecord(seq, quals, tagBases, tagQuals, frames,
+                                               seq, tagBases, tagQuals, frames);
+    prototype.Map(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+
+    const BamRecord s3 = BamRecord::Clipped(prototype, ClipType::CLIP_TO_REFERENCE, clipStart, clipEnd);
+
+    EXPECT_TRUE(s3.IsMapped());
+    EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+    EXPECT_EQ(0, s3.AlignedStart());     // record start (no soft clips)
+    EXPECT_EQ(4, s3.AlignedEnd());       // alignStart + clipped seqLength (4)
+    EXPECT_EQ(clipStart, s3.ReferenceStart());   // clipStart
+    EXPECT_EQ(clipEnd,   s3.ReferenceEnd());     // clipEnd
+
+    EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+    const BamRecordView view
+    {
+        s3,
+        Orientation::GENOMIC,
+        false,
+        false,
+        PulseBehavior::ALL
+    };
+
+    EXPECT_EQ(s3_seq_clipped,      view.Sequence());
+    EXPECT_EQ(s3_quals_clipped,    view.Qualities().Fastq());
+    EXPECT_EQ(s3_tagBases_clipped, view.DeletionTags());
+    EXPECT_EQ(s3_tagQuals_clipped, view.DeletionQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.LabelQVs().Fastq());
+    EXPECT_EQ(s3_tagQuals_clipped, view.AltLabelQVs().Fastq());
+    EXPECT_EQ(s3_frames_clipped,   view.IPD().Data());
+}
+
+TEST(BamRecordTest, ClipEncodedFrames)
+{   // CLIP_TO_QUERY applied to unmapped + mapped (forward/reverse) copies of one FrameCodec::V1 record
+    const Position qStart  = 500;   // query interval of the unclipped record
+    const Position qEnd    = 510;
+    const std::string seq       = "AACCGTTAGC";
+    const std::string quals     = "?]?]?]?]?*";
+    const std::string tagBases  = "AACCGTTAGC";
+    const std::string tagQuals  = "?]?]?]?]?*";
+    const f_data frames    = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    // pulse-level inputs: the uppercase calls spell out seq; lowercase calls carry '=' quals and 0 frames
+    const std::string pulseCall   = "ttAaAtaCCGggatTTAcatGCt";
+    const std::string pulseBases  = pulseCall;
+    const std::string pulseQuals  = "==?=]==?]?====]?]===?*=";
+    const f_data pulseFrames = { 0,0,10,0,10,0,0,20,20,30,0,0,0,0,40,40,10,0,0,0,30,20,0 };
+    // mapping parameters shared by every mapped copy
+    const int32_t  tId     = 0;
+    const Position tPos    = 100;
+    const uint8_t  mapQual = 80;
+    // requested query window: 7 positions wide, 2 past qStart and 1 short of qEnd
+    const Position clipStart = 502;
+    const Position clipEnd   = 509;
+    // expected base-level data after clipping: elements [2, 9) of the inputs above
+    const std::string seq_clipped      = "CCGTTAG";
+    const std::string quals_clipped    = "?]?]?]?";
+    const std::string tagBases_clipped = "CCGTTAG";
+    const std::string tagQuals_clipped = "?]?]?]?";
+    const f_data frames_clipped   = { 20, 20, 30, 40, 40, 10, 30 };
+    // expected pulse-level data after clipping: first through last retained uppercase call
+    const std::string pulseCall_clipped = "CCGggatTTAcatG";
+    const std::string pulseQuals_clipped = "?]?====]?]===?";
+    const f_data pulseFrames_clipped = { 20,20,30,0,0,0,0,40,40,10,0,0,0,30 };  // not referenced by any assertion below
+    // reverse-complemented (reversed, for quals/frames) forms of the unclipped inputs; no assertion below uses them
+    const std::string seq_rev       = "GCTAACGGTT";
+    const std::string pulseCall_rev = "aGCatgTAAatccCGGtaTtTaa";
+    const std::string quals_rev     = "*?]?]?]?]?";
+    const std::string tagQuals_rev  = quals_rev;
+    const f_data frames_rev    = { 20, 30, 10, 40, 40, 30, 20, 20, 10, 10 };
+    // expected data for the clipped reverse-strand records: the clipped forward values, reverse(-complement)ed
+    const std::string seq_rev_clipped   = "CTAACGG";
+    const std::string quals_rev_clipped = "?]?]?]?";
+    const std::string tagBases_rev_clipped = seq_rev_clipped;
+    const std::string tagQuals_rev_clipped = quals_rev_clipped;
+    const f_data frames_rev_clipped = { 30, 10, 40, 40, 30, 20, 20 };
+
+    const std::string pulseCall_rev_clipped = "CatgTAAatccCGG";
+    const std::string pulseQuals_rev_clipped    = "?===]?]====?]?";
+    const f_data pulseFrames_rev_clipped = { 30,0,0,0,10,40,40,0,0,0,0,30,20,20 };  // not referenced by any assertion below
+    // alignments used for the mapped copies (s1: matches only, s2: one deletion, s3: deletions + insertion)
+    const std::string s1_cigar = "10=";
+    const std::string s2_cigar = "5=3D5=";
+    const std::string s3_cigar = "4=1D2I2D4=";
+    // CIGARs expected after clipping (the same strings are expected for both strands)
+    const std::string s1_cigar_clipped = "7=";
+    const std::string s2_cigar_clipped = "3=3D4=";
+    const std::string s3_cigar_clipped = "2=1D2I2D3=";
+    // NOTE(review): FrameCodec::V1 presumably makes the helper store frames in encoded form — confirm in MakeRecord
+    const BamRecord prototype = BamRecordClippingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames,
+                                                  pulseCall, pulseBases, pulseQuals, pulseFrames, FrameCodec::V1);
+
+    BamRecord s0 = prototype; // unmapped record
+    BamRecord s1 = prototype.Mapped(tId, tPos, Strand::FORWARD, s1_cigar, mapQual);
+    BamRecord s2 = prototype.Mapped(tId, tPos, Strand::FORWARD, s2_cigar, mapQual);
+    BamRecord s3 = prototype.Mapped(tId, tPos, Strand::FORWARD, s3_cigar, mapQual);
+    BamRecord s1_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s1_cigar, mapQual);
+    BamRecord s2_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s2_cigar, mapQual);
+    BamRecord s3_rev = prototype.Mapped(tId, tPos, Strand::REVERSE, s3_cigar, mapQual);
+    // clip every copy to the same query window; all checks below run on the clipped records
+    s0.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s1.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s2.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s3.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s1_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s2_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+    s3_rev.Clip(ClipType::CLIP_TO_QUERY, clipStart, clipEnd);
+
+    {   // s0
+
+        EXPECT_FALSE(s0.IsMapped());
+        EXPECT_EQ(clipStart, s0.QueryStart());
+        EXPECT_EQ(clipEnd,   s0.QueryEnd());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.AlignedStart());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.AlignedEnd());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.ReferenceStart());
+        EXPECT_EQ(PacBio::BAM::UnmappedPosition, s0.ReferenceEnd());
+
+        const BamRecordView view
+        {
+            s0,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,       view.Sequence());
+        EXPECT_EQ(quals_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_clipped, view.PulseCalls());
+    }
+
+    {   // s1 - FORWARD
+
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(clipStart, s1.QueryStart());
+        EXPECT_EQ(clipEnd,   s1.QueryEnd());
+        EXPECT_EQ(clipStart, s1.AlignedStart());   // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s1.AlignedEnd());     // alignStart + seqLength
+        EXPECT_EQ(102, s1.ReferenceStart());       // 100 + startOffset
+        EXPECT_EQ(109, s1.ReferenceEnd());         // RefStart + 7=
+
+        EXPECT_EQ(s1_cigar_clipped, s1.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,       view.Sequence());
+        EXPECT_EQ(quals_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_clipped, view.PulseCalls());
+    }
+
+    {   // s1 - REVERSE
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s1_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s1_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s1_rev.AlignedStart());    // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s1_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(102, s1_rev.ReferenceStart());        // 100 + startOffset
+        EXPECT_EQ(109, s1_rev.ReferenceEnd());          // RefStart + 7=
+
+        EXPECT_EQ(s1_cigar_clipped, s1_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_rev_clipped,       view.Sequence());
+        EXPECT_EQ(quals_rev_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_rev_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls());
+    }
+
+    {   // s2 - FORWARD
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(clipStart, s2.QueryStart());
+        EXPECT_EQ(clipEnd,   s2.QueryEnd());
+        EXPECT_EQ(clipStart, s2.AlignedStart());   // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s2.AlignedEnd());     // alignStart + seqLength
+        EXPECT_EQ(102, s2.ReferenceStart());       // 100 + startOffset
+        EXPECT_EQ(112, s2.ReferenceEnd());         // RefStart + 7= + 3D
+
+        EXPECT_EQ(s2_cigar_clipped, s2.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,      view.Sequence());
+        EXPECT_EQ(quals_clipped,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped, view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped, view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,   view.IPD().Data());
+    }   // NOTE(review): unlike every other block in this test, PulseCalls() is not checked for s2 FORWARD
+
+    {   // s2 - REVERSE
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s2_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s2_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s2_rev.AlignedStart());    // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s2_rev.AlignedEnd());      // alignStart + seqLength
+        EXPECT_EQ(102, s2_rev.ReferenceStart());        // 100 + startOffset
+        EXPECT_EQ(112, s2_rev.ReferenceEnd());          // RefStart + 7= + 3D
+
+        EXPECT_EQ(s2_cigar_clipped, s2_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_rev_clipped,       view.Sequence());
+        EXPECT_EQ(quals_rev_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_rev_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls());
+    }
+
+    {   // s3 - FORWARD
+
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(clipStart, s3.QueryStart());
+        EXPECT_EQ(clipEnd,   s3.QueryEnd());
+        EXPECT_EQ(clipStart, s3.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s3.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(102, s3.ReferenceStart());         // 100 + startOffset
+        EXPECT_EQ(110, s3.ReferenceEnd());           // RefStart + 5= + 3D
+
+        EXPECT_EQ(s3_cigar_clipped, s3.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_clipped,       view.Sequence());
+        EXPECT_EQ(quals_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_clipped, view.PulseCalls());
+    }
+
+    {   // s3 - REVERSE
+
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(clipStart, s3_rev.QueryStart());
+        EXPECT_EQ(clipEnd,   s3_rev.QueryEnd());
+        EXPECT_EQ(clipStart, s3_rev.AlignedStart());     // queryStart (no soft clips)
+        EXPECT_EQ(clipEnd,   s3_rev.AlignedEnd());       // alignStart + seqLength
+        EXPECT_EQ(102, s3_rev.ReferenceStart());         // 100 + startOffset
+        EXPECT_EQ(110, s3_rev.ReferenceEnd());           // RefStart + 5= + 3D
+
+        EXPECT_EQ(s3_cigar_clipped, s3_rev.CigarData().ToStdString());
+
+        const BamRecordView view
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq_rev_clipped,       view.Sequence());
+        EXPECT_EQ(quals_rev_clipped,     view.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev_clipped,  view.DeletionTags());
+        EXPECT_EQ(tagQuals_rev_clipped,  view.DeletionQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.LabelQVs().Fastq());
+        EXPECT_EQ(pulseQuals_rev_clipped,  view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev_clipped,    view.IPD().Data());
+        EXPECT_EQ(pulseCall_rev_clipped, view.PulseCalls());
+    }
+
+}
+
+// clang-format on
diff --git a/tests/src/test_BamRecordImplCore.cpp b/tests/src/test_BamRecordImplCore.cpp

new file mode 100644 (file)

index 0000000..9d4bb52
--- /dev/null
+++ b/tests/src/test_BamRecordImplCore.cpp
@@ -0,0 +1,597 @@
+// Author: Derek Barnett
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BamRecordImpl.h>
+#include <pbbam/BamTagCodec.h>
+#include <pbbam/Tag.h>
+#include <pbbam/TagCollection.h>
+#include "../src/MemoryUtils.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace BamRecordImplCoreTests {
+
+struct Bam1Deleter
+{
+    void operator()(bam1_t* b)
+    {
+        if (b) bam_destroy1(b);
+        b = nullptr;
+    }
+};
+
+static BamRecordImpl CreateBamImpl()
+{
+    BamRecordImpl bam;
+    bam.Bin(42);
+    bam.Flag(42);
+    bam.InsertSize(42);
+    bam.MapQuality(42);
+    bam.MatePosition(42);
+    bam.MateReferenceId(42);
+    bam.Position(42);
+    bam.ReferenceId(42);
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    bam.Tags(tags);
+
+    return bam;
+}
+
+static void CheckRawData(const BamRecordImpl& bam)
+{
+    // ensure raw data (lengths at least) matches API-facing data
+    const uint32_t expectedNameBytes = bam.Name().size() + 1;  // include NULL term
+    const uint32_t expectedNameNulls = 4 - (expectedNameBytes % 4);
+    const uint32_t expectedNameLength = expectedNameBytes + expectedNameNulls;
+    const uint32_t expectedNumCigarOps = bam.CigarData().size();
+    const int32_t expectedSeqLength = bam.Sequence().length();
+    const size_t expectedTagsLength = BamTagCodec::Encode(bam.Tags()).size();
+
+    //  Name        CIGAR         Sequence       Quals      Tags
+    // l_qname + (n_cigar * 4) + (l_qseq+1)/2 + l_qseq + << TAGS >>
+    const int expectedTotalDataLength = expectedNameLength + (expectedNumCigarOps * 4) +
+                                        (expectedSeqLength + 1) / 2 + expectedSeqLength +
+                                        expectedTagsLength;
+
+    const auto rawData = PacBio::BAM::internal::BamRecordMemory::GetRawData(bam);
+    ASSERT_TRUE(static_cast<bool>(rawData));
+
+    EXPECT_EQ(expectedNameNulls, rawData->core.l_extranul);
+    EXPECT_EQ(expectedNameLength, rawData->core.l_qname);
+    EXPECT_EQ(expectedNumCigarOps, rawData->core.n_cigar);
+    EXPECT_EQ(expectedSeqLength, rawData->core.l_qseq);
+    EXPECT_EQ(expectedTotalDataLength, rawData->l_data);
+}
+
+}  // namespace BamRecordImplCoreTests
+
+TEST(BamRecordImplCoreTestsTest, RawDataDefaultValues)
+{
+    std::shared_ptr<bam1_t> rawData(bam_init1(), BamRecordImplCoreTests::Bam1Deleter());
+    ASSERT_TRUE(static_cast<bool>(rawData));
+
+    // fixed-length (core) data
+    EXPECT_EQ(0, rawData->core.tid);
+    EXPECT_EQ(0, rawData->core.pos);
+    EXPECT_EQ(0, rawData->core.bin);
+    EXPECT_EQ(0, rawData->core.qual);
+    EXPECT_EQ(0, rawData->core.l_qname);
+    EXPECT_EQ(0, rawData->core.flag);
+    EXPECT_EQ(0, rawData->core.n_cigar);
+    EXPECT_EQ(0, rawData->core.l_qseq);
+    EXPECT_EQ(0, rawData->core.mtid);
+    EXPECT_EQ(0, rawData->core.mpos);
+    EXPECT_EQ(0, rawData->core.isize);
+
+    // variable length data
+    EXPECT_EQ(0, rawData->data);
+    EXPECT_EQ(0, rawData->l_data);  // initial aligned QNAME
+    EXPECT_EQ(0, rawData->m_data);  // check this if we change or tune later
+}
+
+TEST(BamRecordImplCoreTestsTest, DefaultValues)
+{   // Checks a default-constructed BamRecordImpl, first through its raw bam1_t and then through the public API.
+    BamRecordImpl bam;
+
+    // -------------------------------
+    // check raw data
+    // -------------------------------
+
+    const auto rawData = PacBio::BAM::internal::BamRecordMemory::GetRawData(bam);
+    ASSERT_TRUE(static_cast<bool>(rawData));
+
+    // fixed-length (core) data
+    // (forced init unmapped, with NULL-term as QNAME)
+    EXPECT_EQ(-1, rawData->core.tid);
+    EXPECT_EQ(-1, rawData->core.pos);
+    EXPECT_EQ(0, rawData->core.bin);
+    EXPECT_EQ(255, rawData->core.qual);
+    EXPECT_EQ(3, rawData->core.l_extranul);  // alignment nulls
+    EXPECT_EQ(4, rawData->core.l_qname);     // normal null term + alignment nulls
+    EXPECT_EQ(BamRecordImpl::UNMAPPED, rawData->core.flag);
+    EXPECT_EQ(0, rawData->core.n_cigar);
+    EXPECT_EQ(0, rawData->core.l_qseq);
+    EXPECT_EQ(-1, rawData->core.mtid);
+    EXPECT_EQ(-1, rawData->core.mpos);
+    EXPECT_EQ(0, rawData->core.isize);
+
+    // variable length data
+    EXPECT_TRUE(rawData->data != nullptr);
+    EXPECT_EQ(4, rawData->l_data);           // initial aligned QNAME
+    EXPECT_EQ(int{0x800}, rawData->m_data);  // check this if we change or tune later
+
+    // -------------------------------
+    // check data via API calls
+    // -------------------------------
+
+    EXPECT_EQ(0, bam.Bin());
+    EXPECT_EQ(BamRecordImpl::UNMAPPED, bam.Flag());
+    EXPECT_EQ(0, bam.InsertSize());
+    EXPECT_EQ(255, bam.MapQuality());
+    EXPECT_EQ(-1, bam.MateReferenceId());
+    EXPECT_EQ(-1, bam.MatePosition());
+    EXPECT_EQ(-1, bam.Position());
+    EXPECT_EQ(-1, bam.ReferenceId());
+    EXPECT_EQ(0, bam.Tags().size());
+    // flag-derived predicates; the flag word was checked above to equal BamRecordImpl::UNMAPPED
+    EXPECT_FALSE(bam.IsDuplicate());
+    EXPECT_FALSE(bam.IsFailedQC());
+    EXPECT_FALSE(bam.IsFirstMate());
+    EXPECT_FALSE(bam.IsMapped());
+    EXPECT_TRUE(bam.IsMateMapped());
+    EXPECT_FALSE(bam.IsMateReverseStrand());
+    EXPECT_FALSE(bam.IsPaired());
+    EXPECT_TRUE(bam.IsPrimaryAlignment());
+    EXPECT_FALSE(bam.IsProperPair());
+    EXPECT_FALSE(bam.IsReverseStrand());
+    EXPECT_FALSE(bam.IsSecondMate());
+    EXPECT_FALSE(bam.IsSupplementaryAlignment());
+    // every variable-length field is expected to read back as an empty string
+    const std::string emptyString = "";
+    EXPECT_EQ(emptyString, bam.Name());
+    EXPECT_EQ(emptyString, bam.CigarData().ToStdString());
+    EXPECT_EQ(emptyString, bam.Sequence());
+    EXPECT_EQ(emptyString, bam.Qualities().Fastq());
+    BamRecordImplCoreTests::CheckRawData(bam);  // cross-check raw length fields against the API-derived sizes
+}
+
+TEST(BamRecordImplCoreTestsTest, CoreSetters)
+{   // Sets every core field to 42 plus three tags, then checks the raw bam1_t and the API getters.
+    BamRecordImpl bam;
+    bam.Bin(42);
+    bam.Flag(42);
+    bam.InsertSize(42);
+    bam.MapQuality(42);
+    bam.MatePosition(42);
+    bam.MateReferenceId(42);
+    bam.Position(42);
+    bam.ReferenceId(42);
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    bam.Tags(tags);  // (28 bytes encoded)
+
+    // -------------------------------
+    // check raw data
+    // -------------------------------
+
+    const auto rawData = PacBio::BAM::internal::BamRecordMemory::GetRawData(bam);
+    ASSERT_TRUE(static_cast<bool>(rawData));
+
+    // fixed-length (core) data
+    EXPECT_EQ(42, rawData->core.tid);
+    EXPECT_EQ(42, rawData->core.pos);
+    EXPECT_EQ(42, rawData->core.bin);
+    EXPECT_EQ(42, rawData->core.qual);
+    EXPECT_EQ(3, rawData->core.l_extranul);  // alignment nulls
+    EXPECT_EQ(4, rawData->core.l_qname);     // normal null term + alignment nulls
+    EXPECT_EQ(42, rawData->core.flag);
+    EXPECT_EQ(0, rawData->core.n_cigar);
+    EXPECT_EQ(0, rawData->core.l_qseq);
+    EXPECT_EQ(42, rawData->core.mtid);
+    EXPECT_EQ(42, rawData->core.mpos);
+    EXPECT_EQ(42, rawData->core.isize);
+
+    // variable length data
+    EXPECT_TRUE(rawData->data != nullptr);
+    EXPECT_EQ(32, rawData->l_data);          // aligned qname (4) + tags (28)
+    EXPECT_EQ(int{0x800}, rawData->m_data);  // check this if we change or tune later
+
+    // -------------------------------
+    // check data via API calls
+    // -------------------------------
+
+    EXPECT_EQ(42, bam.Bin());
+    EXPECT_EQ(42, bam.Flag());
+    EXPECT_EQ(42, bam.InsertSize());
+    EXPECT_EQ(42, bam.MapQuality());
+    EXPECT_EQ(42, bam.MateReferenceId());
+    EXPECT_EQ(42, bam.MatePosition());
+    EXPECT_EQ(42, bam.Position());
+    EXPECT_EQ(42, bam.ReferenceId());
+    // tags are read back from the record, so these checks cover the store/fetch round trip
+    const TagCollection fetchedTags = bam.Tags();
+
+    EXPECT_TRUE(fetchedTags.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, fetchedTags.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags.at("CA").ToUInt8Array());
+}
+
+TEST(BamRecordImplCoreTestsTest, DeepCopyFromRawData)
+{   // Fills a raw bam1_t, copies it into a BamRecordImpl with bam_copy1, and checks the two are independent.
+    // init raw data
+    std::shared_ptr<bam1_t> rawData(bam_init1(), BamRecordImplCoreTests::Bam1Deleter());
+    ASSERT_TRUE(static_cast<bool>(rawData));
+
+    rawData->core.tid = 42;
+    rawData->core.pos = 42;
+    rawData->core.bin = 42;
+    rawData->core.qual = 42;
+    rawData->core.flag = 42;
+    rawData->core.mtid = 42;
+    rawData->core.mpos = 42;
+    rawData->core.isize = 42;
+    // copy the bytes of x into a char buffer and append them to the raw record as an 'i'-typed "XY" aux tag
+    const int32_t x = 42;
+    char valueBytes[sizeof x];
+    std::copy(static_cast<const char*>(static_cast<const void*>(&x)),
+              static_cast<const char*>(static_cast<const void*>(&x)) + sizeof x, valueBytes);
+    bam_aux_append(rawData.get(), "XY", 'i', sizeof(x), reinterpret_cast<uint8_t*>(&valueBytes[0]));
+    // sanity-check the raw record before copying it
+    EXPECT_EQ(42, rawData->core.tid);
+    EXPECT_EQ(42, rawData->core.pos);
+    EXPECT_EQ(42, rawData->core.bin);
+    EXPECT_EQ(42, rawData->core.qual);
+    EXPECT_EQ(0, rawData->core.l_qname);  // l_qname was never assigned on the raw record
+    EXPECT_EQ(42, rawData->core.flag);
+    EXPECT_EQ(0, rawData->core.n_cigar);
+    EXPECT_EQ(0, rawData->core.l_qseq);
+    EXPECT_EQ(42, rawData->core.mtid);
+    EXPECT_EQ(42, rawData->core.mpos);
+    EXPECT_EQ(42, rawData->core.isize);
+    const int32_t fetchedX = bam_aux2i(bam_aux_get(rawData.get(), "XY"));
+    EXPECT_EQ(42, fetchedX);
+
+    // create from raw data
+    BamRecordImpl bam = [&rawData]() {
+        BamRecordImpl result;
+        bam_copy1(PacBio::BAM::internal::BamRecordMemory::GetRawData(result).get(), rawData.get());  // destination first, source second
+        return result;
+    }();
+
+    // make sure raw data is still valid
+    EXPECT_EQ(42, rawData->core.tid);
+    EXPECT_EQ(42, rawData->core.pos);
+    EXPECT_EQ(42, rawData->core.bin);
+    EXPECT_EQ(42, rawData->core.qual);
+    EXPECT_EQ(0, rawData->core.l_qname);
+    EXPECT_EQ(42, rawData->core.flag);
+    EXPECT_EQ(0, rawData->core.n_cigar);
+    EXPECT_EQ(0, rawData->core.l_qseq);
+    EXPECT_EQ(42, rawData->core.mtid);
+    EXPECT_EQ(42, rawData->core.mpos);
+    EXPECT_EQ(42, rawData->core.isize);
+    EXPECT_TRUE(rawData->data != nullptr);
+    EXPECT_TRUE(0 != rawData->l_data);
+    EXPECT_TRUE(0 != rawData->m_data);
+
+    // check new record
+    EXPECT_EQ(42, bam.Bin());
+    EXPECT_EQ(42, bam.Flag());
+    EXPECT_EQ(42, bam.InsertSize());
+    EXPECT_EQ(42, bam.MapQuality());
+    EXPECT_EQ(42, bam.MateReferenceId());
+    EXPECT_EQ(42, bam.MatePosition());
+    EXPECT_EQ(42, bam.Position());
+    EXPECT_EQ(42, bam.ReferenceId());
+    EXPECT_EQ(x, bam.Tags()["XY"].ToInt32());  // the aux tag appended to the raw record is visible through the API
+
+    const auto newBamRawData = PacBio::BAM::internal::BamRecordMemory::GetRawData(bam);
+    ASSERT_TRUE(static_cast<bool>(newBamRawData));
+
+    EXPECT_TRUE(newBamRawData->data != nullptr);
+    EXPECT_TRUE(newBamRawData->m_data >= int{0x800});  // check this if we change or tune later
+
+    // tweak raw data, make sure we've done a deep copy (so BamRecordImpl isn't changed)
+    rawData->core.pos = 37;
+    EXPECT_EQ(37, rawData->core.pos);
+    EXPECT_EQ(42, bam.Position());
+    EXPECT_EQ(42, newBamRawData->core.pos);
+}
+
+TEST(BamRecordImplCoreTestsTest, CopyAssignment)
+{   // After 'bam2 = bam1', both records must report the same core fields and tags.
+    BamRecordImpl bam1;
+    bam1.Bin(42);
+    bam1.Flag(42);
+    bam1.InsertSize(42);
+    bam1.MapQuality(42);
+    bam1.MatePosition(42);
+    bam1.MateReferenceId(42);
+    bam1.Position(42);
+    bam1.ReferenceId(42);
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    bam1.Tags(tags);
+
+    BamRecordImpl bam2;
+    bam2 = bam1;  // copy assignment: the operation under test
+    // the source record must be unchanged by the assignment
+    EXPECT_EQ(42, bam1.Bin());
+    EXPECT_EQ(42, bam1.Flag());
+    EXPECT_EQ(42, bam1.InsertSize());
+    EXPECT_EQ(42, bam1.MapQuality());
+    EXPECT_EQ(42, bam1.MateReferenceId());
+    EXPECT_EQ(42, bam1.MatePosition());
+    EXPECT_EQ(42, bam1.Position());
+    EXPECT_EQ(42, bam1.ReferenceId());
+
+    const TagCollection fetchedTags1 = bam1.Tags();
+    EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags1.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, fetchedTags1.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array());
+    // the assigned-to record must carry the same values
+    EXPECT_EQ(42, bam2.Bin());
+    EXPECT_EQ(42, bam2.Flag());
+    EXPECT_EQ(42, bam2.InsertSize());
+    EXPECT_EQ(42, bam2.MapQuality());
+    EXPECT_EQ(42, bam2.MateReferenceId());
+    EXPECT_EQ(42, bam2.MatePosition());
+    EXPECT_EQ(42, bam2.Position());
+    EXPECT_EQ(42, bam2.ReferenceId());
+
+    const TagCollection fetchedTags2 = bam2.Tags();
+    EXPECT_TRUE(fetchedTags2.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags2.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, fetchedTags2.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags2.at("CA").ToUInt8Array());
+    // raw length bookkeeping must be consistent for both records
+    BamRecordImplCoreTests::CheckRawData(bam1);
+    BamRecordImplCoreTests::CheckRawData(bam2);
+}
+
+TEST(BamRecordImplCoreTestsTest, SelfAssignmentTolerated)
+{   // Assigning a record to itself must leave its core fields, tags and raw bookkeeping intact.
+    BamRecordImpl bam1;
+    bam1.Bin(42);
+    bam1.Flag(42);
+    bam1.InsertSize(42);
+    bam1.MapQuality(42);
+    bam1.MatePosition(42);
+    bam1.MateReferenceId(42);
+    bam1.Position(42);
+    bam1.ReferenceId(42);
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    bam1.Tags(tags);
+
+    bam1 = bam1;  // deliberate self-assignment: the operation under test
+
+    EXPECT_EQ(42, bam1.Bin());
+    EXPECT_EQ(42, bam1.Flag());
+    EXPECT_EQ(42, bam1.InsertSize());
+    EXPECT_EQ(42, bam1.MapQuality());
+    EXPECT_EQ(42, bam1.MateReferenceId());
+    EXPECT_EQ(42, bam1.MatePosition());
+    EXPECT_EQ(42, bam1.Position());
+    EXPECT_EQ(42, bam1.ReferenceId());
+
+    const TagCollection fetchedTags1 = bam1.Tags();
+    EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags1.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, fetchedTags1.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array());
+
+    BamRecordImplCoreTests::CheckRawData(bam1);
+}
+
+TEST(BamRecordImplCoreTestsTest, CopyConstructor)
+{   // A record copy-constructed from bam1 must report the same core fields and tags as bam1.
+    BamRecordImpl bam1;
+    bam1.Bin(42);
+    bam1.Flag(42);
+    bam1.InsertSize(42);
+    bam1.MapQuality(42);
+    bam1.MatePosition(42);
+    bam1.MateReferenceId(42);
+    bam1.Position(42);
+    bam1.ReferenceId(42);
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    bam1.Tags(tags);
+
+    BamRecordImpl bam2(bam1);  // copy construction: the operation under test
+    // the source record must be unchanged by the copy
+    EXPECT_EQ(42, bam1.Bin());
+    EXPECT_EQ(42, bam1.Flag());
+    EXPECT_EQ(42, bam1.InsertSize());
+    EXPECT_EQ(42, bam1.MapQuality());
+    EXPECT_EQ(42, bam1.MateReferenceId());
+    EXPECT_EQ(42, bam1.MatePosition());
+    EXPECT_EQ(42, bam1.Position());
+    EXPECT_EQ(42, bam1.ReferenceId());
+
+    const TagCollection fetchedTags1 = bam1.Tags();
+    EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags1.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, fetchedTags1.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array());
+    // the new record must carry the same values
+    EXPECT_EQ(42, bam2.Bin());
+    EXPECT_EQ(42, bam2.Flag());
+    EXPECT_EQ(42, bam2.InsertSize());
+    EXPECT_EQ(42, bam2.MapQuality());
+    EXPECT_EQ(42, bam2.MateReferenceId());
+    EXPECT_EQ(42, bam2.MatePosition());
+    EXPECT_EQ(42, bam2.Position());
+    EXPECT_EQ(42, bam2.ReferenceId());
+
+    const TagCollection fetchedTags2 = bam2.Tags();
+    EXPECT_TRUE(fetchedTags2.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags2.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, fetchedTags2.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags2.at("CA").ToUInt8Array());
+    // raw length bookkeeping must be consistent for both records
+    BamRecordImplCoreTests::CheckRawData(bam1);
+    BamRecordImplCoreTests::CheckRawData(bam2);
+}
+
+TEST(BamRecordImplCoreTestsTest, CreateRecord_InternalTest)
+{   // Sanity check of the CreateBamImpl() fixture helper that the move tests rely on.
+    BamRecordImpl bam = BamRecordImplCoreTests::CreateBamImpl();
+
+    EXPECT_EQ(42, bam.Bin());
+    EXPECT_EQ(42, bam.Flag());
+    EXPECT_EQ(42, bam.InsertSize());
+    EXPECT_EQ(42, bam.MapQuality());
+    EXPECT_EQ(42, bam.MateReferenceId());
+    EXPECT_EQ(42, bam.MatePosition());
+    EXPECT_EQ(42, bam.Position());
+    EXPECT_EQ(42, bam.ReferenceId());
+    // NOTE(review): the tags are assigned again here rather than read back, so the tags set by CreateBamImpl() are not verified
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+    bam.Tags(tags);
+
+    BamRecordImplCoreTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplCoreTestsTest, MoveAssignment)
+{   // Assigns from an explicitly moved temporary and checks that the fixture values arrive intact.
+    BamRecordImpl bam;
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    bam = std::move(BamRecordImplCoreTests::CreateBamImpl());  // explicit std::move on a temporary; the clang warning for it is silenced by the pragmas
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+    EXPECT_EQ(42, bam.Bin());
+    EXPECT_EQ(42, bam.Flag());
+    EXPECT_EQ(42, bam.InsertSize());
+    EXPECT_EQ(42, bam.MapQuality());
+    EXPECT_EQ(42, bam.MateReferenceId());
+    EXPECT_EQ(42, bam.MatePosition());
+    EXPECT_EQ(42, bam.Position());
+    EXPECT_EQ(42, bam.ReferenceId());
+
+    const TagCollection fetchedTags1 = bam.Tags();
+    EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags1.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, fetchedTags1.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array());
+
+    BamRecordImplCoreTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplCoreTestsTest, MoveConstructor)
+{   // Constructs from an explicitly moved temporary and checks that the fixture values arrive intact.
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    BamRecordImpl bam(std::move(BamRecordImplCoreTests::CreateBamImpl()));  // explicit std::move on a temporary; the clang warning for it is silenced by the pragmas
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+    EXPECT_EQ(42, bam.Bin());
+    EXPECT_EQ(42, bam.Flag());
+    EXPECT_EQ(42, bam.InsertSize());
+    EXPECT_EQ(42, bam.MapQuality());
+    EXPECT_EQ(42, bam.MateReferenceId());
+    EXPECT_EQ(42, bam.MatePosition());
+    EXPECT_EQ(42, bam.Position());
+    EXPECT_EQ(42, bam.ReferenceId());
+
+    const TagCollection fetchedTags1 = bam.Tags();
+    EXPECT_TRUE(fetchedTags1.at("HX").HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), fetchedTags1.at("HX").ToString());
+    EXPECT_EQ(int32_t{-42}, fetchedTags1.at("XY").ToInt32());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), fetchedTags1.at("CA").ToUInt8Array());
+
+    BamRecordImplCoreTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplCoreTestsTest, AlignmentFlags)
+{   // Builds the same flag word three ways and checks that all three agree, then queries bam1's predicates.
+    // same set of flags, different ways of getting there
+
+    // raw number
+    BamRecordImpl bam1;
+    bam1.Flag(1107);  // 1107 = 1024 + 64 + 16 + 2 + 1, i.e. five bits set
+
+    // enum values
+    BamRecordImpl bam2;
+    bam2.Flag(BamRecordImpl::DUPLICATE | BamRecordImpl::MATE_1 | BamRecordImpl::REVERSE_STRAND |
+              BamRecordImpl::PROPER_PAIR | BamRecordImpl::PAIRED);
+
+    // convenience calls
+    BamRecordImpl bam3;
+    bam3.SetDuplicate(true);
+    bam3.SetFirstMate(true);
+    bam3.SetReverseStrand(true);
+    bam3.SetMapped(true);           // NOTE(review): presumably clears an "unmapped" bit rather than setting one — confirm
+    bam3.SetMateMapped(true);       // NOTE(review): presumably likewise for the mate
+    bam3.SetPaired(true);
+    bam3.SetProperPair(true);
+    bam3.SetPrimaryAlignment(true); // NOTE(review): presumably clears a "secondary" bit — confirm
+
+    // make sure all are same
+    EXPECT_EQ(1107, bam1.Flag());
+    EXPECT_EQ(1107, bam2.Flag());
+    EXPECT_EQ(1107, bam3.Flag());
+
+    // check API calls
+    EXPECT_TRUE(bam1.IsPaired());  // only bam1 is queried; bam2/bam3 are covered via the Flag() equality above
+    EXPECT_TRUE(bam1.IsProperPair());
+    EXPECT_TRUE(bam1.IsMapped());
+    EXPECT_TRUE(bam1.IsMateMapped());
+    EXPECT_TRUE(bam1.IsReverseStrand());
+    EXPECT_FALSE(bam1.IsMateReverseStrand());
+    EXPECT_TRUE(bam1.IsFirstMate());
+    EXPECT_FALSE(bam1.IsSecondMate());
+    EXPECT_TRUE(bam1.IsPrimaryAlignment());
+    EXPECT_FALSE(bam1.IsFailedQC());
+    EXPECT_TRUE(bam1.IsDuplicate());
+    EXPECT_FALSE(bam1.IsSupplementaryAlignment());
+}
diff --git a/tests/src/test_BamRecordImplTags.cpp b/tests/src/test_BamRecordImplTags.cpp

new file mode 100644 (file)

index 0000000..2ab169a
--- /dev/null
+++ b/tests/src/test_BamRecordImplTags.cpp
@@ -0,0 +1,179 @@
+// Author: Derek Barnett
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BamRecordImpl.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+// NOTE: these tests check "high-level" tag query/manipulation via BamRecordImpl.
+//       For raw Tag/TagCollection tests, see test_Tags.cpp
+//       For encoding tests, see test_BamRecordImplVariableData.cpp
+
+TEST(BamRecordImplTagsTest, HasTagTest)
+{
+    // Tag presence must be reported consistently by the record itself and by
+    // the TagCollection fetched back from it.
+    const std::vector<std::string> presentNames{"HX", "CA", "XY"};
+    const std::vector<std::string> absentNames{"zz", "", "some_too_long_name"};
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Tags(inputTags);
+
+    // query through the record
+    for (const std::string& name : presentNames)
+        EXPECT_TRUE(record.HasTag(name)) << name;
+    for (const std::string& name : absentNames)
+        EXPECT_FALSE(record.HasTag(name)) << name;
+
+    // query through the fetched collection
+    const TagCollection roundTripped = record.Tags();
+    for (const std::string& name : presentNames)
+        EXPECT_TRUE(roundTripped.Contains(name)) << name;
+    for (const std::string& name : absentNames)
+        EXPECT_FALSE(roundTripped.Contains(name)) << name;
+}
+
+TEST(BamRecordImplTagsTest, SimpleAddTag)
+{
+    // Start with two tags, add a third through AddTag(), then verify that the
+    // new tag is visible and that invalid or duplicate adds are rejected.
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+
+    EXPECT_TRUE(bam.HasTag("HX"));
+    EXPECT_TRUE(bam.HasTag("CA"));
+    EXPECT_FALSE(bam.HasTag("XY"));
+
+    // the valid add must report success (mirrors the Remove/Edit tests, which
+    // check the returned status of their valid operation)
+    const bool addedOk = bam.AddTag("XY", int32_t{-42});
+    EXPECT_TRUE(addedOk);
+
+    EXPECT_TRUE(bam.HasTag("HX"));
+    EXPECT_TRUE(bam.HasTag("CA"));
+    EXPECT_TRUE(bam.HasTag("XY"));
+
+    const TagCollection fetchedTags = bam.Tags();
+    EXPECT_TRUE(fetchedTags.Contains("HX"));
+    EXPECT_TRUE(fetchedTags.Contains("CA"));
+    EXPECT_TRUE(fetchedTags.Contains("XY"));
+    EXPECT_FALSE(fetchedTags.Contains("zz"));
+    EXPECT_FALSE(fetchedTags.Contains(""));
+    EXPECT_FALSE(fetchedTags.Contains("some_too_long_name"));
+
+    EXPECT_EQ(-42, fetchedTags.at("XY").ToInt32());
+
+    // fail on invalid adds
+    EXPECT_FALSE(bam.AddTag("", int32_t{-42}));
+    EXPECT_FALSE(bam.AddTag("some_too_long_name", int32_t{-42}));
+    EXPECT_FALSE(bam.AddTag("XY", int32_t{-42}));  // reject duplicate
+}
+
+TEST(BamRecordImplTagsTest, SimpleRemoveTag)
+{
+    // Remove one of three tags, confirm the other two survive, then confirm
+    // that removing invalid or unknown names is rejected.
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Tags(inputTags);
+
+    // all three present before removal
+    EXPECT_TRUE(record.HasTag("HX"));
+    EXPECT_TRUE(record.HasTag("CA"));
+    EXPECT_TRUE(record.HasTag("XY"));
+
+    EXPECT_TRUE(record.RemoveTag("XY"));
+
+    // only "XY" is gone afterwards
+    EXPECT_TRUE(record.HasTag("HX"));
+    EXPECT_TRUE(record.HasTag("CA"));
+    EXPECT_FALSE(record.HasTag("XY"));
+
+    const TagCollection remaining = record.Tags();
+    EXPECT_TRUE(remaining.Contains("HX"));
+    EXPECT_TRUE(remaining.Contains("CA"));
+    EXPECT_FALSE(remaining.Contains("XY"));
+    EXPECT_FALSE(remaining.Contains("zz"));
+    EXPECT_FALSE(remaining.Contains(""));
+    EXPECT_FALSE(remaining.Contains("some_too_long_name"));
+
+    // invalid removes must fail
+    EXPECT_FALSE(record.RemoveTag(""));
+    EXPECT_FALSE(record.RemoveTag("some_too_long_name"));
+    EXPECT_FALSE(record.RemoveTag("zz"));  // unknown tag
+}
+
+TEST(BamRecordImplTagsTest, SimpleEditTag)
+{
+    // Change the value of an existing tag and confirm the new value is read
+    // back; edits of invalid or unknown names must be rejected.
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Tags(inputTags);
+
+    EXPECT_TRUE(record.HasTag("XY"));
+
+    // state before the edit
+    const TagCollection before = record.Tags();
+    EXPECT_TRUE(before.Contains("HX"));
+    EXPECT_TRUE(before.Contains("CA"));
+    EXPECT_TRUE(before.Contains("XY"));
+    EXPECT_EQ(-42, before.at("XY").ToInt32());
+
+    EXPECT_TRUE(record.EditTag("XY", int32_t{500}));
+    EXPECT_TRUE(record.HasTag("XY"));
+
+    // state after the edit
+    const TagCollection after = record.Tags();
+    EXPECT_TRUE(after.Contains("HX"));
+    EXPECT_TRUE(after.Contains("CA"));
+    EXPECT_TRUE(after.Contains("XY"));
+    EXPECT_EQ(500, after.at("XY").ToInt32());
+
+    // invalid edits must fail
+    EXPECT_FALSE(record.EditTag("", 500));
+    EXPECT_FALSE(record.EditTag("some_too_long_name", 500));
+    EXPECT_FALSE(record.EditTag("zz", 500));  // unknown tag
+}
+
+TEST(BamRecordImplTagsTest, SimpleQueryTag)
+{
+    // Store three tags and read each one back through TagValue(); lookups of
+    // invalid or unknown names must yield a default-constructed Tag.
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+
+    // each stored tag is checked once ("HX" was previously never checked
+    // because "XY" was asserted twice)
+    EXPECT_TRUE(bam.HasTag("HX"));
+    EXPECT_TRUE(bam.HasTag("CA"));
+    EXPECT_TRUE(bam.HasTag("XY"));
+
+    EXPECT_EQ(std::string("1abc75"), bam.TagValue("HX").ToString());
+    EXPECT_EQ(std::vector<uint8_t>({34, 5, 125}), bam.TagValue("CA").ToUInt8Array());
+    EXPECT_EQ(int32_t{-42}, bam.TagValue("XY").ToInt32());
+
+    EXPECT_FALSE(bam.HasTag("zz"));
+    EXPECT_FALSE(bam.HasTag(""));
+    EXPECT_FALSE(bam.HasTag("some_too_long_name"));
+
+    EXPECT_EQ(Tag(), bam.TagValue("zz"));
+    EXPECT_EQ(Tag(), bam.TagValue(""));
+    EXPECT_EQ(Tag(), bam.TagValue("some_too_long_name"));
+}
diff --git a/tests/src/test_BamRecordImplVariableData.cpp b/tests/src/test_BamRecordImplVariableData.cpp

new file mode 100644 (file)

index 0000000..fa49db3
--- /dev/null
+++ b/tests/src/test_BamRecordImplVariableData.cpp
@@ -0,0 +1,4526 @@
+// Author: Derek Barnett
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BamRecordImpl.h>
+#include <pbbam/BamTagCodec.h>
+#include <pbbam/SamTagCodec.h>
+#include <pbbam/Tag.h>
+#include <pbbam/TagCollection.h>
+#include "../src/MemoryUtils.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+// NOTE: this file has a *TON* of tests. Probably overkill, but I wanted to check
+//       every possible combination of variable data, and then manipulate each
+//       element within each combo to shrink & expand.
+
+namespace BamRecordImplVariableDataTests {
+
+// Cross-checks the length fields stored in a record's raw data against the
+// values derived from its API-facing accessors (name, CIGAR, sequence, tags).
+static void CheckRawData(const BamRecordImpl& bam)
+{
+    // name: characters + NULL terminator, followed by padding NULLs
+    const uint32_t nameBytes = bam.Name().size() + 1;
+    const uint32_t namePadding = 4 - (nameBytes % 4);
+    const uint32_t paddedNameLength = nameBytes + namePadding;
+
+    // remaining variable-length sections
+    const uint32_t numCigarOps = bam.CigarData().size();
+    const int32_t seqLength = bam.Sequence().length();
+    const size_t encodedTagsLength = BamTagCodec::Encode(bam.Tags()).size();
+
+    //  Name        CIGAR         Sequence       Quals      Tags
+    // l_qname + (n_cigar * 4) + (l_qseq+1)/2 + l_qseq + <encoded length>
+    const int totalDataLength = paddedNameLength + (numCigarOps * 4) + (seqLength + 1) / 2 +
+                                seqLength + encodedTagsLength;
+
+    // stop here if the raw data handle is empty; the checks below dereference it
+    const auto raw = PacBio::BAM::internal::BamRecordMemory::GetRawData(bam);
+    ASSERT_TRUE(static_cast<bool>(raw));
+
+    EXPECT_EQ(namePadding, raw->core.l_extranul);
+    EXPECT_EQ(paddedNameLength, raw->core.l_qname);
+    EXPECT_EQ(numCigarOps, raw->core.n_cigar);
+    EXPECT_EQ(seqLength, raw->core.l_qseq);
+    EXPECT_EQ(totalDataLength, raw->l_data);
+}
+
+}  // namespace BamRecordImplVariableDataTests
+
+TEST(BamRecordImplVariableDataTest, InitEmpty)
+{
+    // A default-constructed record must already have consistent raw lengths.
+    BamRecordImpl record;
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, TagOnly_InitEmpty)
+{
+    // Assigning an empty tag collection leaves the record without tags.
+    BamRecordImpl record;
+    record.Tags(TagCollection{});
+
+    EXPECT_EQ(0, record.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, TagOnly_InitNormal)
+{
+    // Store three tags on an otherwise empty record and compare their SAM
+    // text encoding against the expected string.
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    // also verify the raw length bookkeeping, as the sibling tests do
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, TagOnly_ThenOverwriteWithLongerTags)
+{
+    // Replace a two-tag collection with a three-tag one. The initial
+    // collection deliberately omits "XY" so that the overwrite really grows
+    // the tag data (matching CigarTag_ThenOverwriteWithLongerTags).
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    // also verify the raw length bookkeeping after the data grew
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, TagOnly_ThenOverwriteWithShorterTags)
+{
+    // Replace a three-tag collection with a two-tag one and confirm that only
+    // the two remaining tags are encoded.
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    // also verify the raw length bookkeeping after the data shrank
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, TagOnly_ThenOverwriteWithEmptyTags)
+{
+    // Store three tags, then replace them with an empty collection.
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Tags(inputTags);
+    record.Tags(TagCollection{});
+
+    EXPECT_EQ(0, record.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_InitEmpty)
+{
+    // Assigning an empty CIGAR string leaves the record with no CIGAR ops.
+    BamRecordImpl record;
+    record.CigarData(std::string{});
+
+    EXPECT_EQ(0, record.CigarData().size());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_InitNormal_CigarObject)
+{
+    // Set the CIGAR from a Cigar object and read it back both as an object
+    // and as text.
+    Cigar cigar;
+    cigar.push_back(CigarOperation('=', 100));
+
+    BamRecordImpl bam;
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(cigar, bam.CigarData());
+    // EXPECT_EQ (rather than EXPECT_TRUE on ==) reports both strings on failure
+    EXPECT_EQ(std::string("100="), bam.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_InitNormal_StdString)
+{
+    // Set the CIGAR from its text form and read the same text back.
+    const std::string cigarText = "100=";
+
+    BamRecordImpl record;
+    record.CigarData(cigarText);
+
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_ThenOverwriteWithLongerCigar)
+{
+    // Replace a one-op CIGAR with a five-op CIGAR.
+    const std::string shortCigar = "100=";
+    const std::string longCigar = "100=10D100=10I100X";
+
+    BamRecordImpl record;
+    record.CigarData(shortCigar);
+    record.CigarData(longCigar);
+
+    EXPECT_EQ(longCigar, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_ThenOverwriteWithShorterCigar)
+{
+    // Replace a five-op CIGAR with a one-op CIGAR.
+    const std::string shortCigar = "100=";
+    const std::string longCigar = "100=10D100=10I100X";
+
+    BamRecordImpl record;
+    record.CigarData(longCigar);
+    record.CigarData(shortCigar);
+
+    EXPECT_EQ(shortCigar, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarOnly_ThenOverwriteWithEmptyCigar)
+{
+    // Replace a one-op CIGAR with an empty one.
+    const std::string cigarText = "100=";
+    const std::string noCigar = "";
+
+    BamRecordImpl record;
+    record.CigarData(cigarText);
+    record.CigarData(noCigar);
+
+    EXPECT_EQ(noCigar, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_Init_Normal)
+{
+    // CIGAR first, then tags; both must read back unchanged.
+    const std::string cigarText = "100=";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.CigarData(cigarText);
+    record.Tags(inputTags);
+
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+
+    // SAM text expected for the three tags, tab-separated
+    const std::string expectedSam =
+        "CA:B:C,34,5,125\t"
+        "HX:H:1abc75\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_Init_EmptyCigar)
+{
+    // Initialize a record with an empty CIGAR and then tags, with no earlier
+    // CIGAR data. The set-then-clear sequence this test used to repeat is
+    // exercised by CigarTag_ThenOverwriteWithEmptyCigar.
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.CigarData(empty);
+    bam.Tags(tags);
+
+    EXPECT_EQ(empty, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_Init_EmptyTag)
+{
+    // A CIGAR followed by an empty tag collection: CIGAR kept, no tags stored.
+    const std::string cigarText = "100=";
+
+    BamRecordImpl record;
+    record.CigarData(cigarText);
+    record.Tags(TagCollection{});
+
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    EXPECT_EQ(0, record.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithLongerCigar)
+{
+    // With tags already stored, grow the CIGAR; the tags must be unaffected.
+    const std::string shortCigar = "100=";
+    const std::string longCigar = "100=10D100=10I100X";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.CigarData(shortCigar);
+    record.Tags(inputTags);
+    record.CigarData(longCigar);
+
+    EXPECT_EQ(longCigar, record.CigarData().ToStdString());
+
+    // SAM text expected for the three tags, tab-separated
+    const std::string expectedSam =
+        "CA:B:C,34,5,125\t"
+        "HX:H:1abc75\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithShorterCigar)
+{
+    // With tags already stored, shrink the CIGAR; the tags must be unaffected.
+    const std::string shortCigar = "100=";
+    const std::string longCigar = "100=10D100=10I100X";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.CigarData(longCigar);
+    record.Tags(inputTags);
+    record.CigarData(shortCigar);
+
+    EXPECT_EQ(shortCigar, record.CigarData().ToStdString());
+
+    // SAM text expected for the three tags, tab-separated
+    const std::string expectedSam =
+        "CA:B:C,34,5,125\t"
+        "HX:H:1abc75\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithEmptyCigar)
+{
+    // With tags already stored, clear the CIGAR; the tags must be unaffected.
+    const std::string cigarText = "100=";
+    const std::string noCigar = "";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.CigarData(cigarText);
+    record.Tags(inputTags);
+    record.CigarData(noCigar);
+
+    EXPECT_EQ(noCigar, record.CigarData().ToStdString());
+
+    // SAM text expected for the three tags, tab-separated
+    const std::string expectedSam =
+        "CA:B:C,34,5,125\t"
+        "HX:H:1abc75\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithLongerTags)
+{
+    // With a CIGAR stored, replace two tags with three; the CIGAR must survive.
+    const std::string cigarText = "100=";
+
+    TagCollection twoTags;
+    twoTags["HX"] = std::string("1abc75");
+    twoTags["HX"].Modifier(TagModifier::HEX_STRING);
+    twoTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection threeTags;
+    threeTags["HX"] = std::string("1abc75");
+    threeTags["HX"].Modifier(TagModifier::HEX_STRING);
+    threeTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    threeTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.CigarData(cigarText);
+    record.Tags(twoTags);
+    record.Tags(threeTags);
+
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+
+    // SAM text expected for the three tags, tab-separated
+    const std::string expectedSam =
+        "CA:B:C,34,5,125\t"
+        "HX:H:1abc75\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithShorterTags)
+{
+    // With a CIGAR stored, replace three tags with two; the CIGAR must survive.
+    const std::string cigarText = "100=";
+
+    TagCollection twoTags;
+    twoTags["HX"] = std::string("1abc75");
+    twoTags["HX"].Modifier(TagModifier::HEX_STRING);
+    twoTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection threeTags;
+    threeTags["HX"] = std::string("1abc75");
+    threeTags["HX"].Modifier(TagModifier::HEX_STRING);
+    threeTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    threeTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.CigarData(cigarText);
+    record.Tags(threeTags);
+    record.Tags(twoTags);
+
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+
+    // SAM text expected for the two remaining tags, tab-separated
+    const std::string expectedSam =
+        "CA:B:C,34,5,125\t"
+        "HX:H:1abc75";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, CigarTag_ThenOverwriteWithEmptyTags)
+{
+    // With a CIGAR stored, replace two tags with none; the CIGAR must survive.
+    const std::string cigarText = "100=";
+
+    TagCollection twoTags;
+    twoTags["HX"] = std::string("1abc75");
+    twoTags["HX"].Modifier(TagModifier::HEX_STRING);
+    twoTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl record;
+    record.CigarData(cigarText);
+    record.Tags(twoTags);
+    record.Tags(TagCollection{});
+
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    EXPECT_EQ(0, record.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_Empty)
+{
+    // Empty sequence and empty qualities read back as empty.
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(std::string{}, std::string{});
+
+    EXPECT_EQ(0, record.Sequence().size());
+    EXPECT_EQ(0, record.Qualities().Fastq().size());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_NormalQual)
+{
+    // Sequence with matching-length qualities; both must read back unchanged.
+    const std::string bases = "ACGTACGTACGT";
+    const std::string quals = "?]?]?]?]?]?]";
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(bases, quals);
+
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_EmptyQual)
+{
+    // Sequence with empty qualities; the FASTQ text must read back empty.
+    const std::string bases = "ACGTACGTACGT";
+    const std::string quals = "";
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(bases, quals);
+
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_Preencoded)
+{
+    // Packs the sequence two bases per byte (first base of each pair in the
+    // high nibble; A=1, C=2, G=4, T=8), hands the packed buffer to the record,
+    // and checks that the record reads back the original sequence/qualities.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    // Zero-initialized buffer owned by a std::vector: no <cstdlib> dependency,
+    // no unchecked allocation result, and no manual free().
+    std::vector<char> encoded((sequence.size() + 1) / 2, 0);
+
+    for (size_t i = 0; i < sequence.size(); ++i) {
+        uint8_t nucleotideCode{};
+        switch (sequence.at(i)) {
+            case 'A':
+                nucleotideCode = 1;
+                break;
+            case 'C':
+                nucleotideCode = 2;
+                break;
+            case 'G':
+                nucleotideCode = 4;
+                break;
+            case 'T':
+                nucleotideCode = 8;
+                break;
+            default:
+                ADD_FAILURE() << "unexpected base '" << sequence.at(i) << "'";
+                break;
+        }
+
+        // pack the nucleotide code: even index -> high nibble of a new byte,
+        // odd index -> low nibble of that same byte
+        char& packed = encoded.at(i / 2);
+        if (i % 2 == 0)
+            packed = static_cast<char>(nucleotideCode << 4);
+        else
+            packed |= nucleotideCode;
+    }
+
+    BamRecordImpl bam;
+    bam.SetPreencodedSequenceAndQualities(encoded.data(), sequence.size(), qualities.c_str());
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_Init_Preencoded_EmptyQual)
+{
+    // Same as SeqQualOnly_Init_Preencoded, but with an empty quality string:
+    // packs the sequence two bases per byte (first base of each pair in the
+    // high nibble; A=1, C=2, G=4, T=8) and checks the read-back values.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+
+    // Zero-initialized buffer owned by a std::vector: no <cstdlib> dependency,
+    // no unchecked allocation result, and no manual free().
+    std::vector<char> encoded((sequence.size() + 1) / 2, 0);
+
+    for (size_t i = 0; i < sequence.size(); ++i) {
+        uint8_t nucleotideCode{};
+        switch (sequence.at(i)) {
+            case 'A':
+                nucleotideCode = 1;
+                break;
+            case 'C':
+                nucleotideCode = 2;
+                break;
+            case 'G':
+                nucleotideCode = 4;
+                break;
+            case 'T':
+                nucleotideCode = 8;
+                break;
+            default:
+                ADD_FAILURE() << "unexpected base '" << sequence.at(i) << "'";
+                break;
+        }
+
+        // pack the nucleotide code: even index -> high nibble of a new byte,
+        // odd index -> low nibble of that same byte
+        char& packed = encoded.at(i / 2);
+        if (i % 2 == 0)
+            packed = static_cast<char>(nucleotideCode << 4);
+        else
+            packed |= nucleotideCode;
+    }
+
+    BamRecordImpl bam;
+    bam.SetPreencodedSequenceAndQualities(encoded.data(), sequence.size(), qualities.c_str());
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    // Replace a short sequence/quality pair with a longer one.
+    const std::string longBases = "ACGTACGTACGT";
+    const std::string longQuals = "?]?]?]?]?]?]";
+    const std::string shortBases = "ACGT";
+    const std::string shortQuals = "?]?]";
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(shortBases, shortQuals);
+    record.SetSequenceAndQualities(longBases, longQuals);
+
+    EXPECT_EQ(longBases, record.Sequence());
+    EXPECT_EQ(longQuals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    // Replace a short sequence/quality pair with a longer sequence that has
+    // empty qualities.
+    const std::string longBases = "ACGTACGTACGT";
+    const std::string noQuals = "";
+    const std::string shortBases = "ACGT";
+    const std::string shortQuals = "?]?]";
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(shortBases, shortQuals);
+    record.SetSequenceAndQualities(longBases, noQuals);
+
+    EXPECT_EQ(longBases, record.Sequence());
+    EXPECT_EQ(noQuals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    // Replace a long sequence/quality pair with a shorter one.
+    const std::string longBases = "ACGTACGTACGT";
+    const std::string longQuals = "?]?]?]?]?]?]";
+    const std::string shortBases = "ACGT";
+    const std::string shortQuals = "?]?]";
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(longBases, longQuals);
+    record.SetSequenceAndQualities(shortBases, shortQuals);
+
+    EXPECT_EQ(shortBases, record.Sequence());
+    EXPECT_EQ(shortQuals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    // Replace a long sequence/quality pair with a shorter sequence that has
+    // empty qualities.
+    const std::string longBases = "ACGTACGTACGT";
+    const std::string longQuals = "?]?]?]?]?]?]";
+    const std::string shortBases = "ACGT";
+    const std::string noQuals = "";
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(longBases, longQuals);
+    record.SetSequenceAndQualities(shortBases, noQuals);
+
+    EXPECT_EQ(shortBases, record.Sequence());
+    EXPECT_EQ(noQuals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualOnly_ThenOverwriteWithEmptySeq)
+{
+    // Replace a sequence/quality pair with empty strings.
+    const std::string bases = "ACGTACGTACGT";
+    const std::string quals = "?]?]?]?]?]?]";
+    const std::string nothing = "";
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(bases, quals);
+    record.SetSequenceAndQualities(nothing, nothing);
+
+    EXPECT_EQ(nothing, record.Sequence());
+    EXPECT_EQ(nothing, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_Init_Normal)
+{
+    // Sequence/qualities first, then tags; everything must read back unchanged.
+    const std::string bases = "ACGTACGTACGT";
+    const std::string quals = "?]?]?]?]?]?]";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(bases, quals);
+    record.Tags(inputTags);
+
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+
+    // SAM text expected for the three tags, tab-separated
+    const std::string expectedSam =
+        "CA:B:C,34,5,125\t"
+        "HX:H:1abc75\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_Init_EmptySeqQual)
+{
+    // Empty sequence and qualities followed by tags; the tags must be stored.
+    const std::string bases = "";
+    const std::string quals = "";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(bases, quals);
+    record.Tags(inputTags);
+
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+
+    // SAM text expected for the three tags, tab-separated
+    const std::string expectedSam =
+        "CA:B:C,34,5,125\t"
+        "HX:H:1abc75\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_Init_EmptyQual)
+{
+    // Sequence with empty qualities followed by tags.
+    const std::string bases = "ACGTACGTACGT";
+    const std::string quals = "";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(bases, quals);
+    record.Tags(inputTags);
+
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+
+    // SAM text expected for the three tags, tab-separated
+    const std::string expectedSam =
+        "CA:B:C,34,5,125\t"
+        "HX:H:1abc75\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_Init_EmptyTag)
+{
+    // Sequence/qualities followed by an empty tag collection.
+    const std::string bases = "ACGTACGTACGT";
+    const std::string quals = "?]?]?]?]?]?]";
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(bases, quals);
+    record.Tags(TagCollection{});
+
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(0, record.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    // With tags already stored, grow the sequence/qualities; the tags must be
+    // unaffected.
+    const std::string longBases = "ACGTACGTACGT";
+    const std::string longQuals = "?]?]?]?]?]?]";
+    const std::string shortBases = "ACGT";
+    const std::string shortQuals = "?]?]";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(shortBases, shortQuals);
+    record.Tags(inputTags);
+    record.SetSequenceAndQualities(longBases, longQuals);
+
+    EXPECT_EQ(longBases, record.Sequence());
+    EXPECT_EQ(longQuals, record.Qualities().Fastq());
+
+    // SAM text expected for the three tags, tab-separated
+    const std::string expectedSam =
+        "CA:B:C,34,5,125\t"
+        "HX:H:1abc75\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    // With tags already stored, replace the sequence with a longer one that
+    // has empty qualities; the tags must be unaffected.
+    const std::string longBases = "ACGTACGTACGT";
+    const std::string noQuals = "";
+    const std::string shortBases = "ACGT";
+    const std::string shortQuals = "?]?]";
+
+    TagCollection inputTags;
+    inputTags["HX"] = std::string("1abc75");
+    inputTags["HX"].Modifier(TagModifier::HEX_STRING);
+    inputTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    inputTags["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(shortBases, shortQuals);
+    record.Tags(inputTags);
+    record.SetSequenceAndQualities(longBases, noQuals);
+
+    EXPECT_EQ(longBases, record.Sequence());
+    EXPECT_EQ(noQuals, record.Qualities().Fastq());
+
+    // SAM text expected for the three tags, tab-separated
+    const std::string expectedSam =
+        "CA:B:C,34,5,125\t"
+        "HX:H:1abc75\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithEmptySeq)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(empty, empty);
+
+    EXPECT_EQ(empty, bam.Sequence());
+    EXPECT_EQ(empty, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithLongerTags)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithShorterTags)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualTag_ThenOverwriteWithEmptyTags)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(0, bam.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_Init_Normal)
+{
+    // Non-empty sequence, qualities and CIGAR set on a fresh record are all
+    // expected to read back unchanged.
+    const std::string seq{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string cigarText{"100="};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(seq, quals);
+    record.CigarData(cigarText);
+
+    EXPECT_EQ(seq, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_Init_EmptySeqQual)
+{
+    // Empty sequence and qualities combined with a non-empty CIGAR.
+    const std::string noSeq;
+    const std::string noQual;
+    const std::string cigarText{"100="};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(noSeq, noQual);
+    record.CigarData(cigarText);
+
+    EXPECT_EQ(noSeq, record.Sequence());
+    EXPECT_EQ(noQual, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_Init_EmptyQual)
+{
+    // Non-empty sequence with an empty quality string, plus a CIGAR.
+    const std::string seq{"ACGTACGTACGT"};
+    const std::string noQual;
+    const std::string cigarText{"100="};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(seq, noQual);
+    record.CigarData(cigarText);
+
+    EXPECT_EQ(seq, record.Sequence());
+    EXPECT_EQ(noQual, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_Init_EmptyCigar)
+{
+    // Non-empty sequence and qualities combined with an empty CIGAR string.
+    const std::string seq{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string noCigar;
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(seq, quals);
+    record.CigarData(noCigar);
+
+    EXPECT_EQ(seq, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(noCigar, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithEmptySeq)
+{
+    // Start with a 12-base sequence and a CIGAR, then clear both sequence and
+    // qualities; the CIGAR is expected to read back unchanged.
+    const std::string initialSeq{"ACGTACGTACGT"};
+    const std::string initialQual{"?]?]?]?]?]?]"};
+    const std::string cigarText{"100="};
+    const std::string blank;
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(initialSeq, initialQual);
+    record.CigarData(cigarText);
+    record.SetSequenceAndQualities(blank, blank);
+
+    EXPECT_EQ(blank, record.Sequence());
+    EXPECT_EQ(blank, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithLongerCigar)
+{
+    // Assign a one-operation CIGAR, then replace it with a five-operation one;
+    // sequence/qualities are expected to be unaffected.
+    const std::string seq{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string shortCigar{"100="};
+    const std::string longCigar{"100=10D100=10I100X"};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(seq, quals);
+    record.CigarData(shortCigar);
+    record.CigarData(longCigar);
+
+    EXPECT_EQ(seq, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(longCigar, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithShorterCigar)
+{
+    // Assign a five-operation CIGAR, then replace it with a one-operation one;
+    // sequence/qualities are expected to be unaffected.
+    const std::string seq{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string shortCigar{"100="};
+    const std::string longCigar{"100=10D100=10I100X"};
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(seq, quals);
+    record.CigarData(longCigar);
+    record.CigarData(shortCigar);
+
+    EXPECT_EQ(seq, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(shortCigar, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigar_ThenOverwriteWithEmptyCigar)
+{
+    // Assign a CIGAR, then replace it with an empty string; sequence/qualities
+    // are expected to be unaffected.
+    const std::string seq{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string cigarText{"100="};
+    const std::string noCigar;
+
+    BamRecordImpl record;
+    record.SetSequenceAndQualities(seq, quals);
+    record.CigarData(cigarText);
+    record.CigarData(noCigar);
+
+    EXPECT_EQ(seq, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(noCigar, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_Normal)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_EmptySeqQual)
+{
+    const std::string sequence = "";
+    const std::string qualities = "";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_EmptyQual)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_EmptyCigar)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_Init_EmptyTag)
+{
+    // Sets sequence/qualities and a CIGAR, then assigns an empty tag collection.
+    // All three are expected to read back unchanged and zero tags reported.
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    // Unsigned literal: size() presumably returns an unsigned count, and a plain
+    // 0 would make EXPECT_EQ compare signed with unsigned (-Wsign-compare).
+    EXPECT_EQ(0u, bam.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithEmptySeq)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(empty, empty);
+
+    EXPECT_EQ(empty, bam.Sequence());
+    EXPECT_EQ(empty, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithLongerCigar)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(longerCigar);
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(longerCigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithShorterCigar)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(longerCigar);
+    bam.Tags(tags);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithEmptyCigar)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(empty);
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(empty, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithLongerTags)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithShorterTags)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, SeqQualCigarTag_ThenOverwriteWithEmptyTags)
+{
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(0, bam.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameOnly_InitEmpty)
+{
+    // Assigning an empty name to a fresh record is expected to leave the name
+    // with zero length.
+    BamRecordImpl bam;
+    bam.Name(std::string());
+    // Unsigned literal: size() presumably returns an unsigned count, and a plain
+    // 0 would make EXPECT_EQ compare signed with unsigned (-Wsign-compare).
+    EXPECT_EQ(0u, bam.Name().size());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameOnly_InitNormal)
+{
+    // A non-empty name set on a fresh record is expected to read back unchanged.
+    const std::string name{"foo"};
+
+    BamRecordImpl record;
+    record.Name(name);
+
+    EXPECT_EQ(name, record.Name());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameOnly_ThenOverwriteWithLongerName)
+{
+    // Short name first, then a longer replacement; the longer name must be reported.
+    const std::string longName{"this is a long read name"};
+    const std::string shortName{"foo"};
+    BamRecordImpl record;
+    record.Name(shortName);
+    record.Name(longName);
+
+    EXPECT_EQ(longName, record.Name());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameOnly_ThenOverwriteWithShorterName)
+{
+    // Long name first, then a shorter replacement; the shorter name must be reported.
+    const std::string longName{"this is a long read name"};
+    const std::string shortName{"foo"};
+    BamRecordImpl record;
+    record.Name(longName);
+    record.Name(shortName);
+
+    EXPECT_EQ(shortName, record.Name());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameOnly_ThenOverwriteWithEmptyName)
+{
+    // A non-empty name replaced by an empty string; Name() must then equal the empty string.
+    const std::string blankName{""};
+    const std::string name{"foo"};
+    BamRecordImpl record;
+    record.Name(name);
+    record.Name(blankName);
+
+    EXPECT_EQ(blankName, record.Name());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_Init_Normal)
+{
+    // Name followed by three tags on a fresh record; both must read back as written.
+    const std::string name{"foo"};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tagSet["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.Tags(tagSet);
+
+    EXPECT_EQ(name, record.Name());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_Init_EmptyName)
+{
+    // Empty name followed by three tags; both must read back as written.
+    const std::string name{""};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tagSet["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.Tags(tagSet);
+
+    EXPECT_EQ(name, record.Name());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_Init_EmptyTag)
+{
+    // A name plus an explicitly empty tag collection: name intact, zero tags reported.
+    const std::string readName = "foo";
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(0u, bam.Tags().size());  // 0u matches size()'s unsigned type (presumably size_t)
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithLongerName)
+{
+    // Name, tags, then a longer name; the new name and the original tags must be reported.
+    const std::string longName{"this is a long read name"};
+    const std::string shortName{"foo"};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tagSet["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Name(shortName);
+    record.Tags(tagSet);
+    record.Name(longName);
+
+    EXPECT_EQ(longName, record.Name());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithShorterName)
+{
+    // Long name, tags, then a shorter name; the new name and the original tags must be reported.
+    const std::string longName{"this is a long read name"};
+    const std::string shortName{"foo"};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tagSet["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Name(longName);
+    record.Tags(tagSet);
+    record.Name(shortName);
+
+    EXPECT_EQ(shortName, record.Name());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithEmptyName)
+{
+    // Name, tags, then an empty name; the empty name and the original tags must be reported.
+    const std::string blankName{""};
+    const std::string name{"foo"};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tagSet["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.Tags(tagSet);
+    record.Name(blankName);
+
+    EXPECT_EQ(blankName, record.Name());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithLongerTags)
+{
+    // Two tags first, then a three-tag replacement; the three-tag set must be reported.
+    const std::string name{"foo"};
+
+    TagCollection fullTagSet;
+    fullTagSet["HX"] = std::string{"1abc75"};
+    fullTagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    fullTagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    fullTagSet["XY"] = static_cast<int32_t>(-42);
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.Tags(tagSet);
+    record.Tags(fullTagSet);
+
+    EXPECT_EQ(name, record.Name());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithShorterTags)
+{
+    // Three tags first, then a two-tag replacement; only the two tags must be reported.
+    const std::string name{"foo"};
+
+    TagCollection fullTagSet;
+    fullTagSet["HX"] = std::string{"1abc75"};
+    fullTagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    fullTagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    fullTagSet["XY"] = static_cast<int32_t>(-42);
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.Tags(fullTagSet);
+    record.Tags(tagSet);
+
+    EXPECT_EQ(name, record.Name());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameTag_ThenOverwriteWithEmptyTags)
+{
+    // Name, populated tags, then an empty collection: name intact, zero tags reported.
+    const std::string readName = "foo";
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(0u, bam.Tags().size());  // 0u matches size()'s unsigned type (presumably size_t)
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_Init_Normal)
+{
+    // Name followed by CIGAR on a fresh record; both must read back as written.
+    const std::string cigarText{"100="};
+    const std::string name{"foo"};
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_Init_EmptyName)
+{
+    // Empty name followed by CIGAR; both must read back as written.
+    const std::string cigarText{"100="};
+    const std::string name{""};
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_Init_EmptyCigar)
+{
+    // Name followed by an empty CIGAR string; both must read back as written.
+    const std::string cigarText{""};
+    const std::string name{"foo"};
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithLongerName)
+{
+    // Name, CIGAR, then a longer name; the new name and the original CIGAR must be reported.
+    const std::string longName{"this is a long read name"};
+    const std::string shortName{"foo"};
+    const std::string cigarText{"100="};
+    BamRecordImpl record;
+    record.Name(shortName);
+    record.CigarData(cigarText);
+    record.Name(longName);
+
+    EXPECT_EQ(longName, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithShorterName)
+{
+    // Long name, CIGAR, then a shorter name; the new name and the CIGAR must be reported.
+    const std::string longName{"this is a long read name"};
+    const std::string shortName{"foo"};
+    const std::string cigarText{"100="};
+    BamRecordImpl record;
+    record.Name(longName);
+    record.CigarData(cigarText);
+    record.Name(shortName);
+
+    EXPECT_EQ(shortName, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithEmptyName)
+{
+    // Name, CIGAR, then an empty name; the empty name and the CIGAR must be reported.
+    const std::string blankName{""};
+    const std::string name{"foo"};
+    const std::string cigarText{"100="};
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+    record.Name(blankName);
+
+    EXPECT_EQ(blankName, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithLongerCigar)
+{
+    // Name, short CIGAR, then a longer CIGAR; the name and the longer CIGAR must be reported.
+    const std::string longCigar{"100=10D100=10I100X"};
+    const std::string shortCigar{"100="};
+    const std::string name{"foo"};
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(shortCigar);
+    record.CigarData(longCigar);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(longCigar, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithShorterCigar)
+{
+    // Name, long CIGAR, then a shorter CIGAR; the name and the shorter CIGAR must be reported.
+    const std::string longCigar{"100=10D100=10I100X"};
+    const std::string shortCigar{"100="};
+    const std::string name{"foo"};
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(longCigar);
+    record.CigarData(shortCigar);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(shortCigar, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigar_ThenOverwriteWithEmptyCigar)
+{
+    // Name, CIGAR, then an empty CIGAR string; the name and an empty CIGAR must be reported.
+    const std::string noCigar{""};
+    const std::string cigarText{"100="};
+    const std::string name{"foo"};
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+    record.CigarData(noCigar);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(noCigar, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_Init_Normal)
+{
+    // Name, CIGAR, then three tags on a fresh record; all must read back as written.
+    const std::string cigarText{"100="};
+    const std::string name{"foo"};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tagSet["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+    record.Tags(tagSet);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_Init_EmptyName)
+{
+    // Empty name, CIGAR, then three tags; all must read back as written.
+    const std::string cigarText{"100="};
+    const std::string name{""};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tagSet["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+    record.Tags(tagSet);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_Init_EmptyCigar)
+{
+    // Name, empty CIGAR string, then three tags; all must read back as written.
+    const std::string cigarText{""};
+    const std::string name{"foo"};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tagSet["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+    record.Tags(tagSet);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_Init_EmptyTag)
+{
+    // Name, CIGAR, then an explicitly empty tag collection: zero tags must be reported.
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(0u, bam.Tags().size());  // 0u matches size()'s unsigned type (presumably size_t)
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithLongerName)
+{
+    // Name, CIGAR, tags, then a longer name; CIGAR and tags must be reported unchanged.
+    const std::string longName{"this is a long read name"};
+    const std::string shortName{"foo"};
+    const std::string cigarText{"100="};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tagSet["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Name(shortName);
+    record.CigarData(cigarText);
+    record.Tags(tagSet);
+    record.Name(longName);
+
+    EXPECT_EQ(longName, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithShorterName)
+{
+    // Long name, CIGAR, tags, then a shorter name; CIGAR and tags must be reported unchanged.
+    const std::string longName{"this is a long read name"};
+    const std::string shortName{"foo"};
+    const std::string cigarText{"100="};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tagSet["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Name(longName);
+    record.CigarData(cigarText);
+    record.Tags(tagSet);
+    record.Name(shortName);
+
+    EXPECT_EQ(shortName, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithEmptyName)
+{
+    // Name, CIGAR, tags, then an empty name; CIGAR and tags must be reported unchanged.
+    const std::string blankName{""};
+    const std::string name{"foo"};
+    const std::string cigarText{"100="};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tagSet["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+    record.Tags(tagSet);
+    record.Name(blankName);
+
+    EXPECT_EQ(blankName, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithLongerCigar)
+{
+    // Name, short CIGAR, tags, then a longer CIGAR; name and tags must be reported unchanged.
+    const std::string longCigar{"100=10D100=10I100X"};
+    const std::string shortCigar{"100="};
+    const std::string name{"foo"};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tagSet["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(shortCigar);
+    record.Tags(tagSet);
+    record.CigarData(longCigar);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(longCigar, record.CigarData().ToStdString());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithShorterCigar)
+{
+    // Name, long CIGAR, tags, then a shorter CIGAR; name and tags must be reported unchanged.
+    const std::string longCigar{"100=10D100=10I100X"};
+    const std::string shortCigar{"100="};
+    const std::string name{"foo"};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tagSet["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(longCigar);
+    record.Tags(tagSet);
+    record.CigarData(shortCigar);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(shortCigar, record.CigarData().ToStdString());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithEmptyCigar)
+{
+    // Name, CIGAR, tags, then an empty CIGAR string; name and tags must be reported unchanged.
+    const std::string noCigar{""};
+    const std::string cigarText{"100="};
+    const std::string name{"foo"};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tagSet["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+    record.Tags(tagSet);
+    record.CigarData(noCigar);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(noCigar, record.CigarData().ToStdString());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithLongerTags)
+{
+    // Name, CIGAR, two tags, then a three-tag replacement; the three-tag set must be reported.
+    const std::string cigarText{"100="};
+    const std::string name{"foo"};
+
+    TagCollection fullTagSet;
+    fullTagSet["HX"] = std::string{"1abc75"};
+    fullTagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    fullTagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    fullTagSet["XY"] = static_cast<int32_t>(-42);
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+    record.Tags(tagSet);
+    record.Tags(fullTagSet);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithShorterTags)
+{
+    // Name, CIGAR, three tags, then a two-tag replacement; only the two tags must be reported.
+    const std::string cigarText{"100="};
+    const std::string name{"foo"};
+
+    TagCollection fullTagSet;
+    fullTagSet["HX"] = std::string{"1abc75"};
+    fullTagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    fullTagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    fullTagSet["XY"] = static_cast<int32_t>(-42);
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.CigarData(cigarText);
+    record.Tags(fullTagSet);
+    record.Tags(tagSet);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameCigarTag_ThenOverwriteWithEmptyTags)
+{
+    // Name, CIGAR, populated tags, then an empty collection: zero tags must be reported.
+    const std::string readName = "foo";
+    const std::string cigar = "100=";
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(0u, bam.Tags().size());  // 0u matches size()'s unsigned type (presumably size_t)
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_Init_Normal)
+{
+    // Name followed by sequence and qualities; all three must read back as written.
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string name{"foo"};
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_Init_EmptySeqQual)
+{
+    // Name followed by empty sequence and empty qualities; all must read back as written.
+    const std::string bases{""};
+    const std::string quals{""};
+    const std::string name{"foo"};
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_Init_EmptyQual)
+{
+    // Name followed by a sequence with empty qualities; Fastq() must then equal "".
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{""};
+    const std::string name{"foo"};
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithLongerName)
+{
+    // Name, sequence/qualities, then a longer name; sequence and qualities must be unchanged.
+    const std::string longName{"this is a long read name"};
+    const std::string shortName{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    BamRecordImpl record;
+    record.Name(shortName);
+    record.SetSequenceAndQualities(bases, quals);
+    record.Name(longName);
+
+    EXPECT_EQ(longName, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithShorterName)
+{
+    // Long name, sequence/qualities, then a shorter name; sequence/qualities must be unchanged.
+    const std::string longName{"this is a long read name"};
+    const std::string shortName{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    BamRecordImpl record;
+    record.Name(longName);
+    record.SetSequenceAndQualities(bases, quals);
+    record.Name(shortName);
+
+    EXPECT_EQ(shortName, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithEmptyName)
+{
+    // Name, sequence/qualities, then an empty name; sequence and qualities must be unchanged.
+    const std::string blankName{""};
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.Name(blankName);
+
+    EXPECT_EQ(blankName, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    // Short sequence/qualities first, then a longer pair; the longer pair must be reported.
+    const std::string fewBases{"ACGT"};
+    const std::string fewQuals{"?]?]"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string name{"foo"};
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(fewBases, fewQuals);
+    record.SetSequenceAndQualities(bases, quals);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    // Short sequence with qualities first, then a longer sequence with empty qualities.
+    const std::string fewBases{"ACGT"};
+    const std::string fewQuals{"?]?]"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{""};
+    const std::string name{"foo"};
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(fewBases, fewQuals);
+    record.SetSequenceAndQualities(bases, quals);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    // Long sequence/qualities first, then a shorter pair; the shorter pair must be reported.
+    const std::string fewBases{"ACGT"};
+    const std::string fewQuals{"?]?]"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string name{"foo"};
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.SetSequenceAndQualities(fewBases, fewQuals);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(fewBases, record.Sequence());
+    EXPECT_EQ(fewQuals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    // Long sequence with qualities first, then a shorter sequence with empty qualities.
+    const std::string fewBases{"ACGT"};
+    const std::string fewQuals{""};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string name{"foo"};
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.SetSequenceAndQualities(fewBases, fewQuals);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(fewBases, record.Sequence());
+    EXPECT_EQ(fewQuals, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQual_ThenOverwriteWithEmptySeq)
+{
+    // Sequence/qualities replaced by empty strings; both accessors must then equal "".
+    const std::string nothing{""};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string name{"foo"};
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.SetSequenceAndQualities(nothing, nothing);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(nothing, record.Sequence());
+    EXPECT_EQ(nothing, record.Qualities().Fastq());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_Normal)
+{
+    // Name, sequence/qualities, then three tags; everything must read back as written.
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string name{"foo"};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tagSet["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.Tags(tagSet);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_EmptyName)
+{
+    // Empty name, sequence/qualities, then three tags; everything must read back as written.
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string name{""};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string{"1abc75"};
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>{34, 5, 125};
+    tagSet["XY"] = static_cast<int32_t>(-42);
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.Tags(tagSet);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+
+    // Expected text for the stored tags; entries are joined with tabs.
+    std::string wantedSam{"CA:B:C,34,5,125"};
+    wantedSam += "\t";
+    wantedSam += "HX:H:1abc75";
+    wantedSam += "\t";
+    wantedSam += "XY:i:-42";
+    const std::string encoded = SamTagCodec::Encode(record.Tags());
+    EXPECT_EQ(wantedSam, encoded);
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_EmptySeqQual)
+{
+    // Name and tags are set, while sequence and qualities are both empty.
+    const std::string name{"foo"};
+    const std::string bases{""};
+    const std::string quals{""};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string("1abc75");
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tagSet["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.Tags(tagSet);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+
+    // Encoded text expected for the tags read back from the record.
+    const std::string expectedSam =
+        "CA:B:C,34,5,125"
+        "\t"
+        "HX:H:1abc75"
+        "\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_EmptyQual)
+{
+    // A non-empty sequence is stored together with an empty quality string.
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{""};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string("1abc75");
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tagSet["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.Tags(tagSet);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+
+    // Encoded text expected for the tags read back from the record.
+    const std::string expectedSam =
+        "CA:B:C,34,5,125"
+        "\t"
+        "HX:H:1abc75"
+        "\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_Init_EmptyTag)
+{
+    // Name and sequence/qualities are set; the tag collection assigned is empty.
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    // Unsigned literal: avoids comparing a signed int with size(), which is
+    // assumed to return an unsigned type as standard containers do.
+    EXPECT_EQ(0u, bam.Tags().size());
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithLongerName)
+{
+    // After name, sequence/qualities, and tags are stored, replace the name with a longer one.
+    const std::string shortName{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string longName{"this is a long read name"};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string("1abc75");
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tagSet["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(shortName);
+    record.SetSequenceAndQualities(bases, quals);
+    record.Tags(tagSet);
+    record.Name(longName);
+
+    // Expect the new name, with the other fields as originally stored.
+    EXPECT_EQ(longName, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+
+    const std::string expectedSam =
+        "CA:B:C,34,5,125"
+        "\t"
+        "HX:H:1abc75"
+        "\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithShorterName)
+{
+    // Start with the long name, fill in the other fields, then replace it with a shorter name.
+    const std::string shortName{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string longName{"this is a long read name"};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string("1abc75");
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tagSet["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(longName);
+    record.SetSequenceAndQualities(bases, quals);
+    record.Tags(tagSet);
+    record.Name(shortName);
+
+    // Expect the new name, with the other fields as originally stored.
+    EXPECT_EQ(shortName, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+
+    const std::string expectedSam =
+        "CA:B:C,34,5,125"
+        "\t"
+        "HX:H:1abc75"
+        "\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithEmptyName)
+{
+    // After all fields are stored, replace the name with the empty string.
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string blank{""};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string("1abc75");
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tagSet["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.Tags(tagSet);
+    record.Name(blank);
+
+    // Expect an empty name, with the other fields as originally stored.
+    EXPECT_EQ(blank, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+
+    const std::string expectedSam =
+        "CA:B:C,34,5,125"
+        "\t"
+        "HX:H:1abc75"
+        "\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    // Store a 4-base sequence/quality pair plus tags, then replace the pair with a 12-base one.
+    const std::string name{"foo"};
+    const std::string longBases{"ACGTACGTACGT"};
+    const std::string longQuals{"?]?]?]?]?]?]"};
+    const std::string briefBases{"ACGT"};
+    const std::string briefQuals{"?]?]"};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string("1abc75");
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tagSet["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(briefBases, briefQuals);
+    record.Tags(tagSet);
+    record.SetSequenceAndQualities(longBases, longQuals);
+
+    // Expect the longer pair, with name and tags as originally stored.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(longBases, record.Sequence());
+    EXPECT_EQ(longQuals, record.Qualities().Fastq());
+
+    const std::string expectedSam =
+        "CA:B:C,34,5,125"
+        "\t"
+        "HX:H:1abc75"
+        "\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    // Store a 4-base sequence/quality pair plus tags, then replace it with a
+    // 12-base sequence that has an empty quality string.
+    const std::string name{"foo"};
+    const std::string longBases{"ACGTACGTACGT"};
+    const std::string noQuals{""};
+    const std::string briefBases{"ACGT"};
+    const std::string briefQuals{"?]?]"};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string("1abc75");
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tagSet["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(briefBases, briefQuals);
+    record.Tags(tagSet);
+    record.SetSequenceAndQualities(longBases, noQuals);
+
+    // Expect the longer sequence and empty qualities, with name and tags as stored.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(longBases, record.Sequence());
+    EXPECT_EQ(noQuals, record.Qualities().Fastq());
+
+    const std::string expectedSam =
+        "CA:B:C,34,5,125"
+        "\t"
+        "HX:H:1abc75"
+        "\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    // Store a 12-base sequence/quality pair plus tags, then replace the pair with a 4-base one.
+    const std::string name{"foo"};
+    const std::string longBases{"ACGTACGTACGT"};
+    const std::string longQuals{"?]?]?]?]?]?]"};
+    const std::string briefBases{"ACGT"};
+    const std::string briefQuals{"?]?]"};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string("1abc75");
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tagSet["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(longBases, longQuals);
+    record.Tags(tagSet);
+    record.SetSequenceAndQualities(briefBases, briefQuals);
+
+    // Expect the shorter pair, with name and tags as originally stored.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(briefBases, record.Sequence());
+    EXPECT_EQ(briefQuals, record.Qualities().Fastq());
+
+    const std::string expectedSam =
+        "CA:B:C,34,5,125"
+        "\t"
+        "HX:H:1abc75"
+        "\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    // Store a 12-base sequence/quality pair plus tags, then replace it with a
+    // 4-base sequence that has an empty quality string.
+    const std::string name{"foo"};
+    const std::string longBases{"ACGTACGTACGT"};
+    const std::string longQuals{"?]?]?]?]?]?]"};
+    const std::string briefBases{"ACGT"};
+    const std::string noQuals{""};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string("1abc75");
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tagSet["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(longBases, longQuals);
+    record.Tags(tagSet);
+    record.SetSequenceAndQualities(briefBases, noQuals);
+
+    // Expect the shorter sequence and empty qualities, with name and tags as stored.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(briefBases, record.Sequence());
+    EXPECT_EQ(noQuals, record.Qualities().Fastq());
+
+    const std::string expectedSam =
+        "CA:B:C,34,5,125"
+        "\t"
+        "HX:H:1abc75"
+        "\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithEmptySeq)
+{
+    // After all fields are stored, replace sequence and qualities with empty strings.
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string blank{""};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string("1abc75");
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tagSet["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.Tags(tagSet);
+    record.SetSequenceAndQualities(blank, blank);
+
+    // Expect empty sequence and qualities, with name and tags as originally stored.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(blank, record.Sequence());
+    EXPECT_EQ(blank, record.Qualities().Fastq());
+
+    const std::string expectedSam =
+        "CA:B:C,34,5,125"
+        "\t"
+        "HX:H:1abc75"
+        "\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithLongerTags)
+{
+    // Store a two-tag collection, then replace it with a three-tag collection.
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+
+    // Three tags: HX, CA, XY.
+    TagCollection threeTags;
+    threeTags["HX"] = std::string("1abc75");
+    threeTags["HX"].Modifier(TagModifier::HEX_STRING);
+    threeTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    threeTags["XY"] = int32_t{-42};
+
+    // Two tags: HX, CA.
+    TagCollection twoTags;
+    twoTags["HX"] = std::string("1abc75");
+    twoTags["HX"].Modifier(TagModifier::HEX_STRING);
+    twoTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.Tags(twoTags);
+    record.Tags(threeTags);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+
+    // Expect the encoded text of the three-tag collection.
+    const std::string expectedSam =
+        "CA:B:C,34,5,125"
+        "\t"
+        "HX:H:1abc75"
+        "\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithShorterTags)
+{
+    // Store a three-tag collection, then replace it with a two-tag collection.
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+
+    // Three tags: HX, CA, XY.
+    TagCollection threeTags;
+    threeTags["HX"] = std::string("1abc75");
+    threeTags["HX"].Modifier(TagModifier::HEX_STRING);
+    threeTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    threeTags["XY"] = int32_t{-42};
+
+    // Two tags: HX, CA.
+    TagCollection twoTags;
+    twoTags["HX"] = std::string("1abc75");
+    twoTags["HX"].Modifier(TagModifier::HEX_STRING);
+    twoTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.Tags(threeTags);
+    record.Tags(twoTags);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+
+    // Expect the encoded text of the two-tag collection only.
+    const std::string expectedSam =
+        "CA:B:C,34,5,125"
+        "\t"
+        "HX:H:1abc75";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualTag_ThenOverwriteWithEmptyTags)
+{
+    // Store three tags, then replace them with an empty tag collection.
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    // Unsigned literal: avoids comparing a signed int with size(), which is
+    // assumed to return an unsigned type as standard containers do.
+    EXPECT_EQ(0u, bam.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_Normal)
+{
+    // Populate name, sequence/qualities, and CIGAR once; each must read back unchanged.
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string cigarText{"100="};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_EmptyName)
+{
+    // Sequence/qualities and CIGAR are set, while the record name is the empty string.
+    const std::string name{""};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string cigarText{"100="};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_EmptySeqQual)
+{
+    // Name and CIGAR are set, while sequence and qualities are both empty.
+    const std::string name{"foo"};
+    const std::string bases{""};
+    const std::string quals{""};
+    const std::string cigarText{"100="};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_EmptyQual)
+{
+    // A non-empty sequence is stored together with an empty quality string, plus a CIGAR.
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{""};
+    const std::string cigarText{"100="};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_Init_EmptyCigar)
+{
+    // Name and sequence/qualities are set; the CIGAR assigned is the empty string.
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string cigarText{""};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithLongerName)
+{
+    // After name, sequence/qualities, and CIGAR are stored, replace the name with a longer one.
+    const std::string shortName{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string cigarText{"100="};
+    const std::string longName{"this is a long read name"};
+
+    BamRecordImpl record;
+    record.Name(shortName);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    record.Name(longName);
+
+    // Expect the new name, with the other fields as originally stored.
+    EXPECT_EQ(longName, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithShorterName)
+{
+    // Start with the long name, fill in the other fields, then replace it with a shorter name.
+    const std::string shortName{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string cigarText{"100="};
+    const std::string longName{"this is a long read name"};
+
+    BamRecordImpl record;
+    record.Name(longName);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    record.Name(shortName);
+
+    // Expect the new name, with the other fields as originally stored.
+    EXPECT_EQ(shortName, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithEmptyName)
+{
+    // After all fields are stored, replace the name with the empty string.
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string cigarText{"100="};
+    const std::string blank{""};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    record.Name(blank);
+
+    // Expect an empty name, with the other fields as originally stored.
+    EXPECT_EQ(blank, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    // Store a 4-base sequence/quality pair plus CIGAR, then replace the pair with a 12-base one.
+    const std::string name{"foo"};
+    const std::string longBases{"ACGTACGTACGT"};
+    const std::string longQuals{"?]?]?]?]?]?]"};
+    const std::string cigarText{"100="};
+    const std::string briefBases{"ACGT"};
+    const std::string briefQuals{"?]?]"};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(briefBases, briefQuals);
+    record.CigarData(cigarText);
+    record.SetSequenceAndQualities(longBases, longQuals);
+
+    // Expect the longer pair, with name and CIGAR as originally stored.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(longBases, record.Sequence());
+    EXPECT_EQ(longQuals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    // Store a 4-base sequence/quality pair plus CIGAR, then replace it with a
+    // 12-base sequence that has an empty quality string.
+    const std::string name{"foo"};
+    const std::string longBases{"ACGTACGTACGT"};
+    const std::string noQuals{""};
+    const std::string cigarText{"100="};
+    const std::string briefBases{"ACGT"};
+    const std::string briefQuals{"?]?]"};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(briefBases, briefQuals);
+    record.CigarData(cigarText);
+    record.SetSequenceAndQualities(longBases, noQuals);
+
+    // Expect the longer sequence and empty qualities, with name and CIGAR as stored.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(longBases, record.Sequence());
+    EXPECT_EQ(noQuals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    // Store a 12-base sequence/quality pair plus CIGAR, then replace the pair with a 4-base one.
+    const std::string name{"foo"};
+    const std::string longBases{"ACGTACGTACGT"};
+    const std::string longQuals{"?]?]?]?]?]?]"};
+    const std::string cigarText{"100="};
+    const std::string briefBases{"ACGT"};
+    const std::string briefQuals{"?]?]"};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(longBases, longQuals);
+    record.CigarData(cigarText);
+    record.SetSequenceAndQualities(briefBases, briefQuals);
+
+    // Expect the shorter pair, with name and CIGAR as originally stored.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(briefBases, record.Sequence());
+    EXPECT_EQ(briefQuals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    // Store a 12-base sequence/quality pair plus CIGAR, then replace it with a
+    // 4-base sequence that has an empty quality string.
+    const std::string name{"foo"};
+    const std::string longBases{"ACGTACGTACGT"};
+    const std::string longQuals{"?]?]?]?]?]?]"};
+    const std::string cigarText{"100="};
+    const std::string briefBases{"ACGT"};
+    const std::string noQuals{""};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(longBases, longQuals);
+    record.CigarData(cigarText);
+    record.SetSequenceAndQualities(briefBases, noQuals);
+
+    // Expect the shorter sequence and empty qualities, with name and CIGAR as stored.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(briefBases, record.Sequence());
+    EXPECT_EQ(noQuals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithEmptySeq)
+{
+    // After all fields are stored, replace sequence and qualities with empty strings.
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string cigarText{"100="};
+    const std::string blank{""};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    record.SetSequenceAndQualities(blank, blank);
+
+    // Expect empty sequence and qualities, with name and CIGAR as originally stored.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(blank, record.Sequence());
+    EXPECT_EQ(blank, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithLongerCigar)
+{
+    // Store a one-operation CIGAR, then replace it with a five-operation CIGAR.
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string briefCigar{"100="};
+    const std::string longCigar{"100=10D100=10I100X"};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(briefCigar);
+    record.CigarData(longCigar);
+
+    // Expect the longer CIGAR, with the other fields as originally stored.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(longCigar, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithShorterCigar)
+{
+    // Store a five-operation CIGAR, then replace it with a one-operation CIGAR.
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string briefCigar{"100="};
+    const std::string longCigar{"100=10D100=10I100X"};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(longCigar);
+    record.CigarData(briefCigar);
+
+    // Expect the shorter CIGAR, with the other fields as originally stored.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(briefCigar, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigar_ThenOverwriteWithEmptyCigar)
+{
+    // After all fields are stored, replace the CIGAR with the empty string.
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string cigarText{"100="};
+    const std::string blank{""};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    record.CigarData(blank);
+
+    // Expect an empty CIGAR string, with the other fields as originally stored.
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(blank, record.CigarData().ToStdString());
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+// @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_Normal)
+{
+    // Populate name, sequence/qualities, CIGAR, and three tags once; each must read back unchanged.
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string cigarText{"100="};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string("1abc75");
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tagSet["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    record.Tags(tagSet);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+
+    // Encoded text expected for the tags read back from the record.
+    const std::string expectedSam =
+        "CA:B:C,34,5,125"
+        "\t"
+        "HX:H:1abc75"
+        "\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptyName)
+{
+    // All other fields are set, while the record name is the empty string.
+    const std::string name{""};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{"?]?]?]?]?]?]"};
+    const std::string cigarText{"100="};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string("1abc75");
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tagSet["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    record.Tags(tagSet);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+
+    // Encoded text expected for the tags read back from the record.
+    const std::string expectedSam =
+        "CA:B:C,34,5,125"
+        "\t"
+        "HX:H:1abc75"
+        "\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptySeqQual)
+{
+    // Name, CIGAR, and tags are set, while sequence and qualities are both empty.
+    const std::string name{"foo"};
+    const std::string bases{""};
+    const std::string quals{""};
+    const std::string cigarText{"100="};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string("1abc75");
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tagSet["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    record.Tags(tagSet);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+
+    // Encoded text expected for the tags read back from the record.
+    const std::string expectedSam =
+        "CA:B:C,34,5,125"
+        "\t"
+        "HX:H:1abc75"
+        "\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptyQual)
+{
+    // A non-empty sequence is stored with an empty quality string, plus CIGAR and tags.
+    const std::string name{"foo"};
+    const std::string bases{"ACGTACGTACGT"};
+    const std::string quals{""};
+    const std::string cigarText{"100="};
+
+    TagCollection tagSet;
+    tagSet["HX"] = std::string("1abc75");
+    tagSet["HX"].Modifier(TagModifier::HEX_STRING);
+    tagSet["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tagSet["XY"] = int32_t{-42};
+
+    BamRecordImpl record;
+    record.Name(name);
+    record.SetSequenceAndQualities(bases, quals);
+    record.CigarData(cigarText);
+    record.Tags(tagSet);
+
+    EXPECT_EQ(name, record.Name());
+    EXPECT_EQ(bases, record.Sequence());
+    EXPECT_EQ(quals, record.Qualities().Fastq());
+    EXPECT_EQ(cigarText, record.CigarData().ToStdString());
+
+    // Encoded text expected for the tags read back from the record.
+    const std::string expectedSam =
+        "CA:B:C,34,5,125"
+        "\t"
+        "HX:H:1abc75"
+        "\t"
+        "XY:i:-42";
+    EXPECT_EQ(expectedSam, SamTagCodec::Encode(record.Tags()));
+
+    BamRecordImplVariableDataTests::CheckRawData(record);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptyCigar)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_Init_EmptyTag)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(0, bam.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerName)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Name(longerName);
+
+    EXPECT_EQ(longerName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterName)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string longerName = "this is a long read name";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(longerName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Name(readName);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithEmptyName)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Name(empty);
+
+    EXPECT_EQ(empty, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerSeq_NormalQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerSeq_EmptyQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(sequence, qualities);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterSeq_NormalQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "?]?]";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterSeq_EmptyQual)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string shortSeq = "ACGT";
+    const std::string shortQual = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(shortSeq, shortQual);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(shortSeq, bam.Sequence());
+    EXPECT_EQ(shortQual, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithEmptySeq)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.SetSequenceAndQualities(empty, empty);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(empty, bam.Sequence());
+    EXPECT_EQ(empty, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerCigar)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(longerCigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(longerCigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterCigar)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string longerCigar = "100=10D100=10I100X";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(longerCigar);
+    bam.Tags(tags);
+    bam.CigarData(cigar);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithEmptyCigar)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+    const std::string empty = "";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.CigarData(empty);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(empty, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithLongerTags)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(longerTags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithShorterTags)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    TagCollection longerTags;
+    longerTags["HX"] = std::string("1abc75");
+    longerTags["HX"].Modifier(TagModifier::HEX_STRING);
+    longerTags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    longerTags["XY"] = int32_t{-42};
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(longerTags);
+    bam.Tags(tags);
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+
+    const std::string sam = SamTagCodec::Encode(bam.Tags());
+    EXPECT_EQ(expected, sam);
+
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
+
+TEST(BamRecordImplVariableDataTest, NameSeqQualCigarTag_ThenOverwriteWithEmptyTags)
+{
+    const std::string readName = "foo";
+    const std::string sequence = "ACGTACGTACGT";
+    const std::string qualities = "?]?]?]?]?]?]";
+    const std::string cigar = "100=";
+
+    TagCollection tags;
+    tags["HX"] = std::string("1abc75");
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+    tags["CA"] = std::vector<uint8_t>({34, 5, 125});
+    tags["XY"] = int32_t{-42};
+
+    BamRecordImpl bam;
+    bam.Name(readName);
+    bam.SetSequenceAndQualities(sequence, qualities);
+    bam.CigarData(cigar);
+    bam.Tags(tags);
+    bam.Tags(TagCollection());
+
+    EXPECT_EQ(readName, bam.Name());
+    EXPECT_EQ(sequence, bam.Sequence());
+    EXPECT_EQ(qualities, bam.Qualities().Fastq());
+    EXPECT_EQ(cigar, bam.CigarData().ToStdString());
+    EXPECT_EQ(0, bam.Tags().size());
+    BamRecordImplVariableDataTests::CheckRawData(bam);
+}
diff --git a/tests/src/test_BamRecordMapping.cpp b/tests/src/test_BamRecordMapping.cpp

new file mode 100644 (file)

index 0000000..424b4bc
--- /dev/null
+++ b/tests/src/test_BamRecordMapping.cpp
@@ -0,0 +1,714 @@
+// Author: Derek Barnett
+
+#include <chrono>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BamRecord.h>
+#include <pbbam/BamRecordView.h>
+#include <pbbam/BamTagCodec.h>
+
+// clang-format off
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+using f_data = std::vector<uint16_t>;
+
+namespace BamRecordMappingTests {
+
+static
+BamRecord MakeRecord(const Position qStart,
+                     const Position qEnd,
+                     const std::string& seq,
+                     const std::string& quals,
+                     const std::string& tagBases,
+                     const std::string& tagQuals,
+                     const f_data& frames)
+{
+    BamRecordImpl impl;
+    impl.SetSequenceAndQualities(seq, quals);
+
+    TagCollection tags;
+    tags["qs"] = qStart;
+    tags["qe"] = qEnd;
+    tags["ip"] = frames;
+    tags["pw"] = frames;
+    tags["dt"] = tagBases;
+    tags["st"] = tagBases;
+    tags["dq"] = tagQuals;
+    tags["iq"] = tagQuals;
+    tags["mq"] = tagQuals;
+    tags["sq"] = tagQuals;
+    tags["pq"] = tagQuals;
+    tags["pv"] = tagQuals;
+    impl.Tags(tags);
+
+    return BamRecord(std::move(impl));
+}
+
+} // namespace BamRecordMappingTests
+
+TEST(BamRecordMappingTest, BasicMap)
+{   // Map identical records with three CIGARs on both strands; verify coordinates and views.
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const uint8_t mapQual = 80;
+    // Reverse-strand expectations: sequence reverse-complemented, qualities and frames reversed.
+    const std::string seq_rev   = "GCTAACGGTT";
+    const std::string quals_rev = "*?]?]?]?]?";
+    const std::string tagBases_rev = seq_rev;
+    const std::string tagQuals_rev = quals_rev;
+    const f_data frames_rev = { 20, 30, 10, 40, 40, 30, 20, 20, 10, 10 };
+    // CIGAR strings for the three scenarios: s1, s2 and s3.
+    const std::string s1_cigar = "10=";
+    const std::string s2_cigar = "5=3D5=";
+    const std::string s3_cigar = "4=1D2I2D4=";
+    // Six identical records: each CIGAR is applied once per strand.
+    BamRecord s1 = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s2 = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s3 = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s1_rev = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s2_rev = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s3_rev = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    // Every mapping uses reference ID 0 and reference start 100.
+    s1.Map(0, 100, Strand::FORWARD, s1_cigar, mapQual);
+    s2.Map(0, 100, Strand::FORWARD, s2_cigar, mapQual);
+    s3.Map(0, 100, Strand::FORWARD, s3_cigar, mapQual);
+    s1_rev.Map(0, 100, Strand::REVERSE, s1_cigar, mapQual);
+    s2_rev.Map(0, 100, Strand::REVERSE, s2_cigar, mapQual);
+    s3_rev.Map(0, 100, Strand::REVERSE, s3_cigar, mapQual);
+
+    {   // s1 - FORWARD
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(0, s1.ReferenceId());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(mapQual, s1.MapQuality());
+
+        EXPECT_EQ(qStart, s1.QueryStart());
+        EXPECT_EQ(qEnd,   s1.QueryEnd());
+        EXPECT_EQ(500, s1.AlignedStart());
+        EXPECT_EQ(510, s1.AlignedEnd());         // 500 + 10=
+        EXPECT_EQ(100, s1.ReferenceStart());
+        EXPECT_EQ(110, s1.ReferenceEnd());       // 100 + 10=
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq,      view.Sequence());
+        EXPECT_EQ(quals,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases, view.DeletionTags());
+        EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   view.IPD().Data());
+    }
+
+    {   // s1 - REVERSE
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(0, s1_rev.ReferenceId());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(mapQual, s1_rev.MapQuality());
+
+        EXPECT_EQ(qStart, s1_rev.QueryStart());
+        EXPECT_EQ(qEnd,   s1_rev.QueryEnd());
+        EXPECT_EQ(500, s1_rev.AlignedStart());
+        EXPECT_EQ(510, s1_rev.AlignedEnd());         // 500 + 10=
+        EXPECT_EQ(100, s1_rev.ReferenceStart());
+        EXPECT_EQ(110, s1_rev.ReferenceEnd());       // 100 + 10=
+
+        // - native
+        const BamRecordView nativeView
+        {
+            s1_rev,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq,      nativeView.Sequence());
+        EXPECT_EQ(quals,    nativeView.Qualities().Fastq());
+        EXPECT_EQ(tagBases, nativeView.DeletionTags());
+        EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   nativeView.IPD().Data());
+
+        // - genomic
+        const BamRecordView genomicView
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq_rev,      genomicView.Sequence());
+        EXPECT_EQ(quals_rev,    genomicView.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev, genomicView.DeletionTags());
+        EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev,   genomicView.IPD().Data());
+    }
+
+    {   // s2 - FORWARD
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(0, s2.ReferenceId());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(mapQual, s2.MapQuality());
+
+        EXPECT_EQ(qStart, s2.QueryStart());
+        EXPECT_EQ(qEnd,   s2.QueryEnd());
+        EXPECT_EQ(500, s2.AlignedStart());
+        EXPECT_EQ(510, s2.AlignedEnd());         // 500 + 10=
+        EXPECT_EQ(100, s2.ReferenceStart());
+        EXPECT_EQ(113, s2.ReferenceEnd());      // 100 + 10= + 3D
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq,      view.Sequence());
+        EXPECT_EQ(quals,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases, view.DeletionTags());
+        EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   view.IPD().Data());
+    }
+
+    {   // s2 - REVERSE
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(0, s2_rev.ReferenceId());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(mapQual, s2_rev.MapQuality());
+
+        EXPECT_EQ(qStart, s2_rev.QueryStart());
+        EXPECT_EQ(qEnd,   s2_rev.QueryEnd());
+        EXPECT_EQ(500, s2_rev.AlignedStart());
+        EXPECT_EQ(510, s2_rev.AlignedEnd());         // 500 + 10=
+        EXPECT_EQ(100, s2_rev.ReferenceStart());
+        EXPECT_EQ(113, s2_rev.ReferenceEnd());      // 100 + 10= + 3D
+
+        // - native
+        const BamRecordView nativeView
+        {
+            s2_rev,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq,      nativeView.Sequence());
+        EXPECT_EQ(quals,    nativeView.Qualities().Fastq());
+        EXPECT_EQ(tagBases, nativeView.DeletionTags());
+        EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   nativeView.IPD().Data());
+
+        // - genomic
+        const BamRecordView genomicView
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq_rev,      genomicView.Sequence());
+        EXPECT_EQ(quals_rev,    genomicView.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev, genomicView.DeletionTags());
+        EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev,   genomicView.IPD().Data());
+    }
+
+    {   // s3 - FORWARD
+
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(0, s3.ReferenceId());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(mapQual, s3.MapQuality());
+
+        EXPECT_EQ(qStart, s3.QueryStart());
+        EXPECT_EQ(qEnd,   s3.QueryEnd());
+        EXPECT_EQ(500, s3.AlignedStart());
+        EXPECT_EQ(510, s3.AlignedEnd());         // 500 + 8= + 2I
+        EXPECT_EQ(100, s3.ReferenceStart());
+        EXPECT_EQ(111, s3.ReferenceEnd());      // 100 + 8= + 3D
+
+        const BamRecordView view
+        {
+            s3,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq,      view.Sequence());
+        EXPECT_EQ(quals,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases, view.DeletionTags());
+        EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   view.IPD().Data());
+    }
+
+    {   // s3 - REVERSE
+
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(0, s3_rev.ReferenceId());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(mapQual, s3_rev.MapQuality());
+
+        EXPECT_EQ(qStart, s3_rev.QueryStart());
+        EXPECT_EQ(qEnd,   s3_rev.QueryEnd());
+        EXPECT_EQ(500, s3_rev.AlignedStart());
+        EXPECT_EQ(510, s3_rev.AlignedEnd());         // 500 + 8= + 2I
+        EXPECT_EQ(100, s3_rev.ReferenceStart());
+        EXPECT_EQ(111, s3_rev.ReferenceEnd());      // 100 + 8= + 3D
+
+        // - native
+        const BamRecordView nativeView
+        {
+            s3_rev,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq,      nativeView.Sequence());
+        EXPECT_EQ(quals,    nativeView.Qualities().Fastq());
+        EXPECT_EQ(tagBases, nativeView.DeletionTags());
+        EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   nativeView.IPD().Data());
+
+        // - genomic
+        const BamRecordView genomicView
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq_rev,      genomicView.Sequence());
+        EXPECT_EQ(quals_rev,    genomicView.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev, genomicView.DeletionTags());
+        EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev,   genomicView.IPD().Data());
+    }
+}
+
+TEST(BamRecordMappingTest, SoftClipMapping)
+{
+    const Position qStart = 500;
+    const Position qEnd   = 515;
+    const std::string seq      = "TTAACCGTTAGCAAA";
+    const std::string quals    = "--?]?]?]?]?*+++";
+    const std::string tagBases = "TTAACCGTTAGCAAA";
+    const std::string tagQuals = "--?]?]?]?]?*+++";
+    const f_data frames   = { 40, 40, 10, 10, 20, 20, 30, 40, 40, 10, 30, 20, 10, 10, 10 };
+    const uint8_t mapQual = 80;
+
+    const std::string clipped_seq   = "AACCGTTAGC";
+    const std::string clipped_quals = "?]?]?]?]?*";
+    const std::string clipped_tagBases   = "AACCGTTAGC";
+    const std::string clipped_tagQuals = "?]?]?]?]?*";
+    const f_data clipped_frames = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    const std::string seq_rev   = "TTTGCTAACGGTTAA";
+    const std::string quals_rev = "+++*?]?]?]?]?--";
+    const std::string tagBases_rev = seq_rev;
+    const std::string tagQuals_rev = quals_rev;
+    const f_data frames_rev = { 10, 10, 10, 20, 30, 10, 40, 40, 30, 20, 20, 10, 10, 40, 40 };
+
+    const std::string clipped_seq_rev   = "GCTAACGGTT";
+    const std::string clipped_quals_rev = "*?]?]?]?]?";
+    const std::string clipped_tagBases_rev = clipped_seq_rev;
+    const std::string clipped_tagQuals_rev = clipped_quals_rev;
+    const f_data clipped_frames_rev = { 20, 30, 10, 40, 40, 30, 20, 20, 10, 10 };
+
+    const std::string s1_cigar = "2S10=3S";
+    const std::string s2_cigar = "2S5=3D5=3S";
+    const std::string s3_cigar = "2S4=1D2I2D4=3S";
+
+    BamRecord s1 = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s2 = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s3 = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s1_rev = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s2_rev = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    BamRecord s3_rev = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+
+    s1.Map(0, 100, Strand::FORWARD, s1_cigar, mapQual);
+    s2.Map(0, 100, Strand::FORWARD, s2_cigar, mapQual);
+    s3.Map(0, 100, Strand::FORWARD, s3_cigar, mapQual);
+    s1_rev.Map(0, 100, Strand::REVERSE, s1_cigar, mapQual);
+    s2_rev.Map(0, 100, Strand::REVERSE, s2_cigar, mapQual);
+    s3_rev.Map(0, 100, Strand::REVERSE, s3_cigar, mapQual);
+
+    {   // s1 - FORWARD
+
+        EXPECT_TRUE(s1.IsMapped());
+        EXPECT_EQ(0, s1.ReferenceId());
+        EXPECT_EQ(Strand::FORWARD, s1.AlignedStrand());
+        EXPECT_EQ(mapQual, s1.MapQuality());
+
+        EXPECT_EQ(qStart, s1.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s1.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(502, s1.AlignedStart());       // QStart + 2S
+        EXPECT_EQ(512, s1.AlignedEnd());         // AStart + 10=
+        EXPECT_EQ(100, s1.ReferenceStart());     // 100
+        EXPECT_EQ(110, s1.ReferenceEnd());       // RefStart + 10=
+
+        const BamRecordView view
+        {
+            s1,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq,      view.Sequence());
+        EXPECT_EQ(quals,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases, view.DeletionTags());
+        EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   view.IPD().Data());
+    }
+
+    {   // s1 - REVERSE
+
+        EXPECT_TRUE(s1_rev.IsMapped());
+        EXPECT_EQ(0, s1_rev.ReferenceId());
+        EXPECT_EQ(Strand::REVERSE, s1_rev.AlignedStrand());
+        EXPECT_EQ(mapQual, s1_rev.MapQuality());
+
+        EXPECT_EQ(qStart, s1_rev.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s1_rev.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(503, s1_rev.AlignedStart());       // QStart + 3S
+        EXPECT_EQ(513, s1_rev.AlignedEnd());         // AStart + 10=
+        EXPECT_EQ(100, s1_rev.ReferenceStart());     // 100
+        EXPECT_EQ(110, s1_rev.ReferenceEnd());       // RefStart + 10=
+
+        // - native
+        const BamRecordView nativeView
+        {
+            s1_rev,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq,      nativeView.Sequence());
+        EXPECT_EQ(quals,    nativeView.Qualities().Fastq());
+        EXPECT_EQ(tagBases, nativeView.DeletionTags());
+        EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   nativeView.IPD().Data());
+
+        // - genomic
+        const BamRecordView genomicView
+        {
+            s1_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq_rev,      genomicView.Sequence());
+        EXPECT_EQ(quals_rev,    genomicView.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev, genomicView.DeletionTags());
+        EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev,   genomicView.IPD().Data());
+    }
+
+    {   // s2 - FORWARD
+
+        EXPECT_TRUE(s2.IsMapped());
+        EXPECT_EQ(0, s2.ReferenceId());
+        EXPECT_EQ(Strand::FORWARD, s2.AlignedStrand());
+        EXPECT_EQ(mapQual, s2.MapQuality());
+
+        EXPECT_EQ(qStart, s2.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s2.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(502, s2.AlignedStart());       // QStart + 2S
+        EXPECT_EQ(512, s2.AlignedEnd());         // AStart + 10=
+        EXPECT_EQ(100, s2.ReferenceStart());     // 100
+        EXPECT_EQ(113, s2.ReferenceEnd());       // RefStart + 10= + 3D
+
+        const BamRecordView view
+        {
+            s2,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq,      view.Sequence());
+        EXPECT_EQ(quals,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases, view.DeletionTags());
+        EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   view.IPD().Data());
+    }
+
+    {   // s2 - REVERSE
+
+        EXPECT_TRUE(s2_rev.IsMapped());
+        EXPECT_EQ(0, s2_rev.ReferenceId());
+        EXPECT_EQ(Strand::REVERSE, s2_rev.AlignedStrand());
+        EXPECT_EQ(mapQual, s2_rev.MapQuality());
+
+        EXPECT_EQ(qStart, s2_rev.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s2_rev.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(503, s2_rev.AlignedStart());       // QStart + 3S
+        EXPECT_EQ(513, s2_rev.AlignedEnd());         // AStart + 10=
+        EXPECT_EQ(100, s2_rev.ReferenceStart());     // 100
+        EXPECT_EQ(113, s2_rev.ReferenceEnd());       // RefStart + 10= + 3D
+
+        // - native
+        const BamRecordView nativeView
+        {
+            s2_rev,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq,      nativeView.Sequence());
+        EXPECT_EQ(quals,    nativeView.Qualities().Fastq());
+        EXPECT_EQ(tagBases, nativeView.DeletionTags());
+        EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   nativeView.IPD().Data());
+
+        // - genomic
+        const BamRecordView genomicView
+        {
+            s2_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq_rev,      genomicView.Sequence());
+        EXPECT_EQ(quals_rev,    genomicView.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev, genomicView.DeletionTags());
+        EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev,   genomicView.IPD().Data());
+    }
+
+    {   // s3 - FORWARD: soft-clipped read mapped with s3_cigar ("2S4=1D2I2D4=3S")
+
+        EXPECT_TRUE(s3.IsMapped());
+        EXPECT_EQ(0, s3.ReferenceId());
+        EXPECT_EQ(Strand::FORWARD, s3.AlignedStrand());
+        EXPECT_EQ(mapQual, s3.MapQuality());
+
+        EXPECT_EQ(qStart, s3.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s3.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(502, s3.AlignedStart());       // QStart + 2S
+        EXPECT_EQ(512, s3.AlignedEnd());         // AStart + 8= + 2I
+        EXPECT_EQ(100, s3.ReferenceStart());     // 100
+        EXPECT_EQ(111, s3.ReferenceEnd());       // RefStart + 8= + 3D
+
+        const BamRecordView view
+        {
+            s3,                      // record under test in this scope (previously s2: copy/paste slip)
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+
+        EXPECT_EQ(seq,      view.Sequence());
+        EXPECT_EQ(quals,    view.Qualities().Fastq());
+        EXPECT_EQ(tagBases, view.DeletionTags());
+        EXPECT_EQ(tagQuals, view.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, view.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   view.IPD().Data());
+    }
+
+    {   // s3 - REVERSE
+
+        EXPECT_TRUE(s3_rev.IsMapped());
+        EXPECT_EQ(0, s3_rev.ReferenceId());
+        EXPECT_EQ(Strand::REVERSE, s3_rev.AlignedStrand());
+        EXPECT_EQ(mapQual, s3_rev.MapQuality());
+
+        EXPECT_EQ(qStart, s3_rev.QueryStart());      // 500
+        EXPECT_EQ(qEnd,   s3_rev.QueryEnd());        // QStart + seqLength
+        EXPECT_EQ(503, s3_rev.AlignedStart());       // QStart + 3S
+        EXPECT_EQ(513, s3_rev.AlignedEnd());         // AStart + 8= + 2I
+        EXPECT_EQ(100, s3_rev.ReferenceStart());     // 100
+        EXPECT_EQ(111, s3_rev.ReferenceEnd());       // RefStart + 8= + 3D
+
+        // - native
+        const BamRecordView nativeView
+        {
+            s3_rev,
+            Orientation::NATIVE,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq,      nativeView.Sequence());
+        EXPECT_EQ(quals,    nativeView.Qualities().Fastq());
+        EXPECT_EQ(tagBases, nativeView.DeletionTags());
+        EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames,   nativeView.IPD().Data());
+
+        // - genomic
+        const BamRecordView genomicView
+        {
+            s3_rev,
+            Orientation::GENOMIC,
+            false,
+            false,
+            PulseBehavior::ALL
+        };
+        EXPECT_EQ(seq_rev,      genomicView.Sequence());
+        EXPECT_EQ(quals_rev,    genomicView.Qualities().Fastq());
+        EXPECT_EQ(tagBases_rev, genomicView.DeletionTags());
+        EXPECT_EQ(tagQuals_rev, genomicView.DeletionQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.LabelQVs().Fastq());
+        EXPECT_EQ(tagQuals_rev, genomicView.AltLabelQVs().Fastq());
+        EXPECT_EQ(frames_rev,   genomicView.IPD().Data());
+    }
+}
+
+TEST(BamRecordMappingTest, MappedCopy)
+{
+    // read payload handed to MakeRecord and expected back from the view
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    // query interval and alignment parameters
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const std::string cigar    = "4=1D2I2D4=";
+    const uint8_t mapQual = 80;
+
+    // member-function form: called on a const record, returns the mapped result
+    const BamRecord unmapped = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    const BamRecord mapped = unmapped.Mapped(0, 100, Strand::FORWARD, cigar, mapQual);
+
+    // mapping status
+    EXPECT_TRUE(mapped.IsMapped());
+    EXPECT_EQ(0, mapped.ReferenceId());
+    EXPECT_EQ(Strand::FORWARD, mapped.AlignedStrand());
+    EXPECT_EQ(mapQual, mapped.MapQuality());
+
+    // coordinates
+    EXPECT_EQ(qStart, mapped.QueryStart());      // 500
+    EXPECT_EQ(qEnd,   mapped.QueryEnd());        // QStart + seqLength
+    EXPECT_EQ(qStart, mapped.AlignedStart());    // CIGAR has no clipping, so equal to QStart
+    EXPECT_EQ(qEnd,   mapped.AlignedEnd());      // QStart + 8= + 2I
+    EXPECT_EQ(100, mapped.ReferenceStart());     // position passed to Mapped()
+    EXPECT_EQ(111, mapped.ReferenceEnd());       // RefStart + 8= + 3D
+
+    // native-orientation view should hand back the input data as given
+    const BamRecordView nativeView{mapped, Orientation::NATIVE, false, false, PulseBehavior::ALL};
+
+    EXPECT_EQ(seq,      nativeView.Sequence());
+    EXPECT_EQ(quals,    nativeView.Qualities().Fastq());
+    EXPECT_EQ(tagBases, nativeView.DeletionTags());
+    EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq());
+    EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq());
+    EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq());
+    EXPECT_EQ(frames,   nativeView.IPD().Data());
+}
+
+TEST(BamRecordMappingTest, StaticMapped)
+{
+    // bases, qualities, per-base tag data and frames used to build the record
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const f_data frames   = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+
+    // where the read sits in the polymerase read, and how it is aligned
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const std::string cigar    = "4=1D2I2D4=";
+    const uint8_t mapQual = 80;
+
+    // static form: the source record is passed in as the first argument
+    const BamRecord source = BamRecordMappingTests::MakeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+    const BamRecord mapped = BamRecord::Mapped(source, 0, 100, Strand::FORWARD, cigar, mapQual);
+
+    // alignment flags and identifiers
+    EXPECT_TRUE(mapped.IsMapped());
+    EXPECT_EQ(0, mapped.ReferenceId());
+    EXPECT_EQ(Strand::FORWARD, mapped.AlignedStrand());
+    EXPECT_EQ(mapQual, mapped.MapQuality());
+
+    // query, aligned and reference intervals
+    EXPECT_EQ(qStart, mapped.QueryStart());      // 500
+    EXPECT_EQ(qEnd,   mapped.QueryEnd());        // QStart + seqLength
+    EXPECT_EQ(qStart, mapped.AlignedStart());    // CIGAR has no clipping, so equal to QStart
+    EXPECT_EQ(qEnd,   mapped.AlignedEnd());      // QStart + 8= + 2I
+    EXPECT_EQ(100, mapped.ReferenceStart());     // position passed to Mapped()
+    EXPECT_EQ(111, mapped.ReferenceEnd());       // RefStart + 8= + 3D
+
+    // per-base data as seen through a native-orientation view
+    const BamRecordView nativeView{mapped, Orientation::NATIVE, false, false, PulseBehavior::ALL};
+
+    EXPECT_EQ(seq,      nativeView.Sequence());
+    EXPECT_EQ(quals,    nativeView.Qualities().Fastq());
+    EXPECT_EQ(tagBases, nativeView.DeletionTags());
+    EXPECT_EQ(tagQuals, nativeView.DeletionQVs().Fastq());
+    EXPECT_EQ(tagQuals, nativeView.LabelQVs().Fastq());
+    EXPECT_EQ(tagQuals, nativeView.AltLabelQVs().Fastq());
+    EXPECT_EQ(frames,   nativeView.IPD().Data());
+}
+
+// clang-format on
diff --git a/tests/src/test_BamWriter.cpp b/tests/src/test_BamWriter.cpp

new file mode 100644 (file)

index 0000000..a5ac803
--- /dev/null
+++ b/tests/src/test_BamWriter.cpp
@@ -0,0 +1,113 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamHeader.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/BamWriter.h>
+#include <pbbam/EntireFileQuery.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+// clang-format off
+
+namespace BamWriterTests {
+// Round-trips one user-built, unmapped record through BamWriter and checks what is read back.
+void checkSingleRecord(bool useTempFile)  // useTempFile is forwarded to BamWriter::Config::useTempFile
+{
+    const std::string fullName = "test/100/0_5";
+    const std::string rgId = "6002b307";  // same ID as the @RG line in hdrText below
+    const std::vector<float> expectedSnr = {0.2, 0.2, 0.2, 0.2};
+
+    // setup header: one @HD line and one @RG line (adjacent literals form a single string)
+    const std::string hdrText = {
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:6002b307\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;"
+        "SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\t"
+        "PU:test\tPM:SEQUEL\n"};
+    BamHeader inputHeader(hdrText);
+
+    // setup record: every core field is set explicitly, with no reference or position (-1) and not mapped
+    BamRecord bamRecord(inputHeader);
+    bamRecord.Impl().Name(fullName);
+    bamRecord.Impl().SetSequenceAndQualities("ACGTC", 5);
+    bamRecord.Impl().CigarData("");
+    bamRecord.Impl().Bin(0);
+    bamRecord.Impl().Flag(0);
+    bamRecord.Impl().InsertSize(0);
+    bamRecord.Impl().MapQuality(0);
+    bamRecord.Impl().MatePosition(-1);
+    bamRecord.Impl().MateReferenceId(-1);
+    bamRecord.Impl().Position(-1);
+    bamRecord.Impl().ReferenceId(-1);
+    bamRecord.Impl().SetMapped(false);
+
+    TagCollection tags;  // tag values that the record accessors below are expected to return
+    tags["zm"] = int32_t{100};  // read back via HoleNumber()
+    tags["qs"] = int32_t{0};    // read back via QueryStart()
+    tags["qe"] = int32_t{5};    // read back via QueryEnd()
+    tags["np"] = int32_t{1};    // read back via NumPasses()
+    tags["rq"] = static_cast<float>(0.6);  // only presence is checked (HasReadAccuracy)
+    tags["RG"] = rgId;          // read back via ReadGroupId()
+    tags["sn"] = expectedSnr;   // read back via SignalToNoise()
+    bamRecord.Impl().Tags(tags);
+
+    // write record to file
+    const std::string generatedBamFn =
+        PbbamTestsConfig::GeneratedData_Dir + "/bamwriter_generated.bam";
+    {   // inner scope: writer is destroyed here, before the file is opened for reading (presumably finalizing the output)
+        BamWriter::Config config;
+        config.useTempFile = useTempFile;
+        BamWriter writer(generatedBamFn, inputHeader, config);
+        writer.Write(bamRecord);
+    }
+
+    // check written header: values must match the @HD line of hdrText
+    BamFile file(generatedBamFn);
+    const auto header = file.Header();
+    EXPECT_EQ(std::string("1.1"), header.Version());
+    EXPECT_EQ(std::string("unknown"), header.SortOrder());
+    EXPECT_EQ(std::string("3.0.1"), header.PacBioBamVersion());
+
+    // check written record: only the first record yielded by the query is inspected
+    EntireFileQuery entireFile(file);
+    auto firstIter = entireFile.begin();
+    auto record = *firstIter;
+    EXPECT_EQ(std::string("ACGTC"), record.Sequence());
+    EXPECT_EQ(std::string("test/100/0_5"), record.FullName());  // same text as fullName
+    EXPECT_TRUE(record.HasHoleNumber());
+    EXPECT_TRUE(record.HasNumPasses());
+    EXPECT_TRUE(record.HasQueryEnd());
+    EXPECT_TRUE(record.HasQueryStart());
+    EXPECT_TRUE(record.HasReadAccuracy());
+    EXPECT_TRUE(record.HasSignalToNoise());
+    EXPECT_EQ(100, record.HoleNumber());
+    EXPECT_EQ(1, record.NumPasses());
+    EXPECT_EQ(0, record.QueryStart());
+    EXPECT_EQ(5, record.QueryEnd());
+    EXPECT_EQ(expectedSnr, record.SignalToNoise());
+    EXPECT_EQ(rgId, record.ReadGroupId());
+
+    // clean up the generated file; NOTE(review): remove() is declared in <cstdio>, which this file does not include directly
+    remove(generatedBamFn.c_str());
+}
+
+} // namespace BamWriterTests
+
+TEST(BamWriterTest, SingleWrite_UserRecord_WithTempFile)
+{
+    BamWriterTests::checkSingleRecord(true);  // write/read round trip with config.useTempFile = true
+}
+
+TEST(BamWriterTest, SingleWrite_UserRecord_NoTempFile)
+{
+    BamWriterTests::checkSingleRecord(false);  // same round trip with config.useTempFile = false
+}
+
+// clang-format on
diff --git a/tests/src/test_BarcodeQuery.cpp b/tests/src/test_BarcodeQuery.cpp

new file mode 100644 (file)

index 0000000..1827ccf
--- /dev/null
+++ b/tests/src/test_BarcodeQuery.cpp
@@ -0,0 +1,17 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BarcodeQuery.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(BarcodeQueryTest, QueryOk)
+{
+    // TODO: placeholder with no assertions yet - come back once barcoded test data is available
+}
diff --git a/tests/src/test_Cigar.cpp b/tests/src/test_Cigar.cpp

new file mode 100644 (file)

index 0000000..fff3666
--- /dev/null
+++ b/tests/src/test_Cigar.cpp
@@ -0,0 +1,165 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/Cigar.h>
+
+// clang-format off
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(CigarTest, TypeToCar)
+{
+    // each operation type paired with the character TypeToChar() must produce for it
+    struct Case { char symbol; CigarOperationType type; };
+    const Case cases[] = {
+        {'M', CigarOperationType::ALIGNMENT_MATCH}, {'I', CigarOperationType::INSERTION},
+        {'D', CigarOperationType::DELETION},        {'N', CigarOperationType::REFERENCE_SKIP},
+        {'S', CigarOperationType::SOFT_CLIP},       {'H', CigarOperationType::HARD_CLIP},
+        {'P', CigarOperationType::PADDING},         {'=', CigarOperationType::SEQUENCE_MATCH},
+        {'X', CigarOperationType::SEQUENCE_MISMATCH}};
+    for (const Case& c : cases) EXPECT_EQ(c.symbol, CigarOperation::TypeToChar(c.type));
+}
+
+TEST(CigarTest, CharToType)
+{
+    // each operation character paired with the type CharToType() must produce for it
+    struct Case { char symbol; CigarOperationType type; };
+    const Case cases[] = {
+        {'M', CigarOperationType::ALIGNMENT_MATCH}, {'I', CigarOperationType::INSERTION},
+        {'D', CigarOperationType::DELETION},        {'N', CigarOperationType::REFERENCE_SKIP},
+        {'S', CigarOperationType::SOFT_CLIP},       {'H', CigarOperationType::HARD_CLIP},
+        {'P', CigarOperationType::PADDING},         {'=', CigarOperationType::SEQUENCE_MATCH},
+        {'X', CigarOperationType::SEQUENCE_MISMATCH}};
+    for (const Case& c : cases) EXPECT_EQ(c.type, CigarOperation::CharToType(c.symbol));
+}
+
+TEST(CigarTest, SetOperationYieldsCorrectType)
+{
+    // assigning a type through Type(...) must be reflected by the Char() accessor
+    struct Case { CigarOperationType assigned; char expected; };
+    const Case cases[] = {
+        {CigarOperationType::ALIGNMENT_MATCH,   'M'},
+        {CigarOperationType::INSERTION,         'I'},
+        {CigarOperationType::DELETION,          'D'},
+        {CigarOperationType::REFERENCE_SKIP,    'N'},
+        {CigarOperationType::SOFT_CLIP,         'S'},
+        {CigarOperationType::HARD_CLIP,         'H'},
+        {CigarOperationType::PADDING,           'P'},
+        {CigarOperationType::SEQUENCE_MATCH,    '='},
+        {CigarOperationType::SEQUENCE_MISMATCH, 'X'},
+    };
+
+    for (const Case& c : cases) {
+        CigarOperation op;
+        op.Type(c.assigned);
+        EXPECT_EQ(c.expected, op.Char());
+    }
+}
+
+TEST(CigarTest, SetTypeYieldsCorrectOperation)
+{
+    // assigning a character through Char(...) must be reflected by the Type() accessor
+    struct Case { char assigned; CigarOperationType expected; };
+    const Case cases[] = {
+        {'M', CigarOperationType::ALIGNMENT_MATCH},
+        {'I', CigarOperationType::INSERTION},
+        {'D', CigarOperationType::DELETION},
+        {'N', CigarOperationType::REFERENCE_SKIP},
+        {'S', CigarOperationType::SOFT_CLIP},
+        {'H', CigarOperationType::HARD_CLIP},
+        {'P', CigarOperationType::PADDING},
+        {'=', CigarOperationType::SEQUENCE_MATCH},
+        {'X', CigarOperationType::SEQUENCE_MISMATCH},
+    };
+
+    for (const Case& c : cases) {
+        CigarOperation op;
+        op.Char(c.assigned);
+        EXPECT_EQ(c.expected, op.Type());
+    }
+}
+
+TEST(CigarStringTest, FromStdString_Empty)
+{
+    const std::string noOpsText = "";  // CIGAR text without any operation
+    const Cigar parsed = Cigar::FromStdString(noOpsText);
+    EXPECT_TRUE(parsed.empty());
+}
+
+TEST(CigarStringTest, FromStdString_SingleOp)
+{
+    const Cigar parsed = Cigar::FromStdString(std::string{"100="});
+
+    // ASSERT rather than EXPECT: on a size mismatch the test stops before front() is used
+    ASSERT_TRUE(parsed.size() == 1);
+
+    const CigarOperation& only = parsed.front();
+    EXPECT_TRUE('=' == only.Char());
+    EXPECT_TRUE(100 == only.Length());
+}
+
+TEST(CigarStringTest, FromStdString_MultipleOps)
+{
+    // expected (operation char, length) entries, in parse order, for "100=2D34I6=6X6="
+    struct Expected { char symbol; unsigned length; };
+    const Expected expected[] = {
+        {'=', 100},
+        {'D', 2},
+        {'I', 34},
+        {'=', 6},
+        {'X', 6},
+        {'=', 6},
+    };
+
+    const Cigar parsed = Cigar::FromStdString(std::string{"100=2D34I6=6X6="});
+    // stop on a size mismatch so that every at() call below stays in range
+    ASSERT_TRUE(parsed.size() == 6);
+
+    // compare each parsed operation with the expected entry at the same position
+    unsigned index = 0;
+    for (const Expected& e : expected) {
+        const CigarOperation op = parsed.at(index);
+
+        EXPECT_TRUE(op.Char()   == e.symbol);
+        EXPECT_TRUE(op.Length() == e.length);
+        ++index;
+    }
+}
+
+TEST(CigarStringTest, ToStdString_Empty)
+{
+    Cigar untouched;  // default-constructed, nothing appended
+    const std::string expectedText;
+    EXPECT_EQ(expectedText, untouched.ToStdString());
+}
+
+TEST(CigarStringTest, ToStdString_SingleOp)
+{
+    Cigar built;
+    const CigarOperation match(CigarOperationType::SEQUENCE_MATCH, 100);
+    built.push_back(match);
+
+    // a single 100-long sequence match is expected to be written as length followed by op char
+    EXPECT_EQ(std::string{"100="}, built.ToStdString());
+}
+
+TEST(CigarStringTest, ToStdString_MultipleOps)
+{
+    const CigarOperation ops[] = {  // listed in the order they must appear in the text form
+        CigarOperation(CigarOperationType::SEQUENCE_MATCH,  100),
+        CigarOperation(CigarOperationType::DELETION,          2),
+        CigarOperation(CigarOperationType::INSERTION,        34),
+        CigarOperation(CigarOperationType::SEQUENCE_MATCH,    6),
+        CigarOperation(CigarOperationType::SEQUENCE_MISMATCH, 6),
+        CigarOperation(CigarOperationType::SEQUENCE_MATCH,    6),
+    };
+    Cigar built;
+    for (const CigarOperation& op : ops) built.push_back(op);
+    EXPECT_EQ(std::string{"100=2D34I6=6X6="}, built.ToStdString());
+}
+
+// clang-format on
diff --git a/tests/src/test_Compare.cpp b/tests/src/test_Compare.cpp

new file mode 100644 (file)

index 0000000..31b98fc
--- /dev/null
+++ b/tests/src/test_Compare.cpp
@@ -0,0 +1,721 @@
+// Author: Derek Barnett
+
+#include <algorithm>
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/Compare.h>
+
+// clang-format off
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace CompareTests {
+
+// Test helper: creates a BamRecord and attaches `tag` to it under the name `tagName`.
+static inline BamRecord makeRecordWithTag(const std::string& tagName, const Tag& tag)
+{
+    BamRecord record{};
+
+    record.Impl().AddTag(tagName, tag);
+    return record;
+}
+
+// Test helper: builds a BamRecord from sequence/qualities plus a fixed set of two-letter tags.
+static BamRecord makeRecord(const Position qStart,
+                            const Position qEnd,
+                            const std::string& seq,
+                            const std::string& quals,
+                            const std::string& tagBases,
+                            const std::string& tagQuals,
+                            const std::vector<uint16_t>& frames)
+{
+    BamRecordImpl rawRecord;
+    rawRecord.SetSequenceAndQualities(seq, quals);
+
+    // Inputs are reused: frames -> ip/pw, tagBases -> dt/st, tagQuals -> dq/iq/mq/sq/pq/pv.
+    TagCollection tagSet;
+    tagSet["qs"] = qStart;
+    tagSet["qe"] = qEnd;
+    tagSet["ip"] = frames;
+    tagSet["pw"] = frames;
+    tagSet["dt"] = tagBases;
+    tagSet["st"] = tagBases;
+    tagSet["dq"] = tagQuals;
+    tagSet["iq"] = tagQuals;
+    tagSet["mq"] = tagQuals;
+    tagSet["sq"] = tagQuals;
+    tagSet["pq"] = tagQuals;
+    tagSet["pv"] = tagQuals;
+    rawRecord.Tags(tagSet);
+    return BamRecord(std::move(rawRecord));
+}
+
+// Test helper: returns six records built from identical inputs. Each is mapped with
+// Map(0, 100, strand, cigar, mapQual), using three CIGAR strings on the forward strand
+// first and then the same three on the reverse strand.
+static std::vector<BamRecord> makeMappedRecords()
+{
+    // Inputs shared by every record.
+    const Position qStart = 500;
+    const Position qEnd   = 510;
+    const std::string seq      = "AACCGTTAGC";
+    const std::string quals    = "?]?]?]?]?*";
+    const std::string tagBases = "AACCGTTAGC";
+    const std::string tagQuals = "?]?]?]?]?*";
+    const std::vector<uint16_t> frames  = { 10, 10, 20, 20, 30, 40, 40, 10, 30, 20 };
+    const uint8_t mapQual = 80;
+
+    // One record per (strand, CIGAR) pair.
+    const std::string cigars[]  = { "10=", "5=3D5=", "4=1D2I2D2X2=" };
+    const Strand      strands[] = { Strand::FORWARD, Strand::REVERSE };
+
+    // Strand is the outer loop so that the result lists all forward records before the reverse.
+    std::vector<BamRecord> mapped;
+    for (const Strand strand : strands) {
+        for (const std::string& cigar : cigars) {
+            BamRecord record =
+                CompareTests::makeRecord(qStart, qEnd, seq, quals, tagBases, tagQuals, frames);
+            record.Map(0, 100, strand, cigar, mapQual);
+            mapped.push_back(record);
+        }
+    }
+
+    return mapped;
+}
+
+} // namespace CompareTests
+
+TEST(CompareTest, TypeToNameOk)
+{
+    // Every valid Compare::Type is expected to be named "Compare::" followed by its enumerator.
+    EXPECT_EQ(std::string("Compare::EQUAL"),              Compare::TypeToName(Compare::EQUAL));
+    EXPECT_EQ(std::string("Compare::NOT_EQUAL"),          Compare::TypeToName(Compare::NOT_EQUAL));
+    EXPECT_EQ(std::string("Compare::LESS_THAN"),          Compare::TypeToName(Compare::LESS_THAN));
+    EXPECT_EQ(std::string("Compare::LESS_THAN_EQUAL"),    Compare::TypeToName(Compare::LESS_THAN_EQUAL));
+    EXPECT_EQ(std::string("Compare::GREATER_THAN"),       Compare::TypeToName(Compare::GREATER_THAN));
+    EXPECT_EQ(std::string("Compare::GREATER_THAN_EQUAL"), Compare::TypeToName(Compare::GREATER_THAN_EQUAL));
+    EXPECT_EQ(std::string("Compare::CONTAINS"),           Compare::TypeToName(Compare::CONTAINS));
+    EXPECT_EQ(std::string("Compare::NOT_CONTAINS"),       Compare::TypeToName(Compare::NOT_CONTAINS));
+    // 42 is not a Compare::Type enumerator; such a value must raise std::runtime_error.
+    EXPECT_THROW(Compare::TypeToName(static_cast<Compare::Type>(42)), std::runtime_error);
+}
+
+TEST(CompareTest, TypeToOperatorOk)
+{
+    // Checks both operator spellings that Compare::TypeToOperator is expected to produce.
+    // Called with the type only, the symbolic spelling is expected.
+    EXPECT_EQ(Compare::TypeToOperator(Compare::EQUAL),              std::string("=="));
+    EXPECT_EQ(Compare::TypeToOperator(Compare::NOT_EQUAL),          std::string("!="));
+    EXPECT_EQ(Compare::TypeToOperator(Compare::LESS_THAN),          std::string("<"));
+    EXPECT_EQ(Compare::TypeToOperator(Compare::LESS_THAN_EQUAL),    std::string("<="));
+    EXPECT_EQ(Compare::TypeToOperator(Compare::GREATER_THAN),       std::string(">"));
+    EXPECT_EQ(Compare::TypeToOperator(Compare::GREATER_THAN_EQUAL), std::string(">="));
+    EXPECT_EQ(Compare::TypeToOperator(Compare::CONTAINS),           std::string("&"));
+    EXPECT_EQ(Compare::TypeToOperator(Compare::NOT_CONTAINS),       std::string("~"));
+
+    // With the second argument set to true, the alphabetic spelling is expected instead.
+    const bool asAlpha = true;
+    EXPECT_EQ(Compare::TypeToOperator(Compare::EQUAL, asAlpha),              std::string("eq"));
+    EXPECT_EQ(Compare::TypeToOperator(Compare::NOT_EQUAL, asAlpha),          std::string("ne"));
+    EXPECT_EQ(Compare::TypeToOperator(Compare::LESS_THAN, asAlpha),          std::string("lt"));
+    EXPECT_EQ(Compare::TypeToOperator(Compare::LESS_THAN_EQUAL, asAlpha),    std::string("lte"));
+    EXPECT_EQ(Compare::TypeToOperator(Compare::GREATER_THAN, asAlpha),       std::string("gt"));
+    EXPECT_EQ(Compare::TypeToOperator(Compare::GREATER_THAN_EQUAL, asAlpha), std::string("gte"));
+    EXPECT_EQ(Compare::TypeToOperator(Compare::CONTAINS, asAlpha),           std::string("and"));
+    EXPECT_EQ(Compare::TypeToOperator(Compare::NOT_CONTAINS, asAlpha),       std::string("not"));
+
+    // 42 is not a Compare::Type enumerator; such a value must raise std::runtime_error.
+    EXPECT_THROW(Compare::TypeToOperator(static_cast<Compare::Type>(42)), std::runtime_error);
+}
+
+TEST(CompareTest, FromOperatorOk)
+{
+    // Accepted spellings include symbolic, alphabetic and entity-escaped ("&lt;", "&gt;") forms.
+    const auto parse = [](const char* op) { return Compare::TypeFromOperator(op); };
+    EXPECT_EQ(Compare::EQUAL,              parse("=="));
+    EXPECT_EQ(Compare::EQUAL,              parse("="));
+    EXPECT_EQ(Compare::EQUAL,              parse("eq"));
+    EXPECT_EQ(Compare::NOT_EQUAL,          parse("!="));
+    EXPECT_EQ(Compare::NOT_EQUAL,          parse("ne"));
+    EXPECT_EQ(Compare::LESS_THAN,          parse("<"));
+    EXPECT_EQ(Compare::LESS_THAN,          parse("lt"));
+    EXPECT_EQ(Compare::LESS_THAN,          parse("&lt;"));
+    EXPECT_EQ(Compare::LESS_THAN_EQUAL,    parse("<="));
+    EXPECT_EQ(Compare::LESS_THAN_EQUAL,    parse("lte"));
+    EXPECT_EQ(Compare::LESS_THAN_EQUAL,    parse("&lt;="));
+    EXPECT_EQ(Compare::GREATER_THAN,       parse(">"));
+    EXPECT_EQ(Compare::GREATER_THAN,       parse("gt"));
+    EXPECT_EQ(Compare::GREATER_THAN,       parse("&gt;"));
+    EXPECT_EQ(Compare::GREATER_THAN_EQUAL, parse(">="));
+    EXPECT_EQ(Compare::GREATER_THAN_EQUAL, parse("gte"));
+    EXPECT_EQ(Compare::GREATER_THAN_EQUAL, parse("&gt;="));
+    EXPECT_EQ(Compare::CONTAINS,           parse("&"));
+    EXPECT_EQ(Compare::NOT_CONTAINS,       parse("~"));
+    EXPECT_THROW(parse(""),        std::runtime_error);  // an empty operator string must throw
+    EXPECT_THROW(parse("invalid"), std::runtime_error);  // as must an unrecognized spelling
+}
+
+TEST(CompareTest, AlignedEndOk)
+{
+    // Every record is mapped with the same "10=" CIGAR; only the position argument varies.
+    const auto mappedAt = [](const Position refStart) {
+        BamRecord record;
+        record.Map(0, refStart, Strand::FORWARD, Cigar{"10="}, 255);
+        return record;
+    };
+
+    BamRecord r1 = mappedAt(290);
+    BamRecord r2 = mappedAt(190);
+    BamRecord r3 = mappedAt(290);  // same position as r1
+    BamRecord r4 = mappedAt(90);
+
+    std::vector<BamRecord> sorted{ r1, r2, r3, r4 };
+    std::sort(sorted.begin(), sorted.end(), Compare::AlignedEnd());
+
+    EXPECT_EQ(r4.AlignedEnd(), sorted.at(0).AlignedEnd());
+    EXPECT_EQ(r2.AlignedEnd(), sorted.at(1).AlignedEnd());
+    EXPECT_EQ(r1.AlignedEnd(), sorted.at(2).AlignedEnd());
+    EXPECT_EQ(r3.AlignedEnd(), sorted.at(3).AlignedEnd());
+}
+
+TEST(CompareTest, AlignedStartOk)
+{
+    // Every record is mapped with the same "10=" CIGAR; only the position argument varies.
+    const auto mappedAt = [](const Position refStart) {
+        BamRecord record;
+        record.Map(0, refStart, Strand::FORWARD, Cigar{"10="}, 255);
+        return record;
+    };
+
+    BamRecord r1 = mappedAt(300);
+    BamRecord r2 = mappedAt(200);
+    BamRecord r3 = mappedAt(400);
+    BamRecord r4 = mappedAt(100);
+
+    std::vector<BamRecord> sorted{ r1, r2, r3, r4 };
+    std::sort(sorted.begin(), sorted.end(), Compare::AlignedStart());
+
+    EXPECT_EQ(r4.AlignedStart(), sorted.at(0).AlignedStart());
+    EXPECT_EQ(r2.AlignedStart(), sorted.at(1).AlignedStart());
+    EXPECT_EQ(r1.AlignedStart(), sorted.at(2).AlignedStart());
+    EXPECT_EQ(r3.AlignedStart(), sorted.at(3).AlignedStart());
+}
+
+TEST(CompareTest, AlignedStrandOk)
+{
+    // r1/r3 are flagged reverse-strand, r2/r4 are not; forward records are expected to sort first.
+    BamRecord r1, r2, r3, r4;
+    r1.Impl().SetReverseStrand(true);
+    r2.Impl().SetReverseStrand(false);
+    r3.Impl().SetReverseStrand(true);
+    r4.Impl().SetReverseStrand(false);
+    std::vector<BamRecord> sorted{ r1, r2, r3, r4 };
+    std::sort(sorted.begin(), sorted.end(), Compare::AlignedStrand());
+    EXPECT_EQ(Strand::FORWARD, sorted.at(0).AlignedStrand());
+    EXPECT_EQ(Strand::FORWARD, sorted.at(1).AlignedStrand());
+    EXPECT_EQ(Strand::REVERSE, sorted.at(2).AlignedStrand());
+    EXPECT_EQ(Strand::REVERSE, sorted.at(3).AlignedStrand());
+}
+
+TEST(CompareTest, BarcodeForwardOk)
+{
+    // The expected order (r4, r2, r1, r3) follows the first element of each barcode pair.
+    using BarcodePair = std::pair<int16_t, int16_t>;
+    BamRecord r1; r1.Barcodes(BarcodePair(30, 20));
+    BamRecord r2; r2.Barcodes(BarcodePair(20, 30));
+    BamRecord r3; r3.Barcodes(BarcodePair(40, 10));
+    BamRecord r4; r4.Barcodes(BarcodePair(10, 40));
+    std::vector<BamRecord> sorted{ r1, r2, r3, r4 };
+    std::sort(sorted.begin(), sorted.end(), Compare::BarcodeForward());
+    EXPECT_EQ(r4.BarcodeForward(), sorted.at(0).BarcodeForward());
+    EXPECT_EQ(r2.BarcodeForward(), sorted.at(1).BarcodeForward());
+    EXPECT_EQ(r1.BarcodeForward(), sorted.at(2).BarcodeForward());
+    EXPECT_EQ(r3.BarcodeForward(), sorted.at(3).BarcodeForward());
+}
+
+TEST(CompareTest, BarcodeReverseOk)
+{
+    // The expected order (r3, r1, r2, r4) follows the second element of each barcode pair.
+    using BarcodePair = std::pair<int16_t, int16_t>;
+    BamRecord r1; r1.Barcodes(BarcodePair(30, 20));
+    BamRecord r2; r2.Barcodes(BarcodePair(20, 30));
+    BamRecord r3; r3.Barcodes(BarcodePair(40, 10));
+    BamRecord r4; r4.Barcodes(BarcodePair(10, 40));
+    std::vector<BamRecord> sorted{ r1, r2, r3, r4 };
+    std::sort(sorted.begin(), sorted.end(), Compare::BarcodeReverse());
+    EXPECT_EQ(r3.BarcodeReverse(), sorted.at(0).BarcodeReverse());
+    EXPECT_EQ(r1.BarcodeReverse(), sorted.at(1).BarcodeReverse());
+    EXPECT_EQ(r2.BarcodeReverse(), sorted.at(2).BarcodeReverse());
+    EXPECT_EQ(r4.BarcodeReverse(), sorted.at(3).BarcodeReverse());
+}
+
+TEST(CompareTest, BarcodeQualityOk)
+{
+    // Four records, each carrying only a "bq" tag; the values are supplied out of order.
+    const uint8_t q1 = 30;
+    const uint8_t q2 = 20;
+    const uint8_t q3 = 40;
+    const uint8_t q4 = 10;
+
+    std::vector<BamRecord> sorted;
+    sorted.push_back(CompareTests::makeRecordWithTag("bq", Tag(q1)));
+    sorted.push_back(CompareTests::makeRecordWithTag("bq", Tag(q2)));
+    sorted.push_back(CompareTests::makeRecordWithTag("bq", Tag(q3)));
+    sorted.push_back(CompareTests::makeRecordWithTag("bq", Tag(q4)));
+    std::sort(sorted.begin(), sorted.end(), Compare::BarcodeQuality());
+
+    // Ascending order of the values above: q4 < q2 < q1 < q3.
+    EXPECT_EQ(q4, sorted.at(0).BarcodeQuality());
+    EXPECT_EQ(q2, sorted.at(1).BarcodeQuality());
+    EXPECT_EQ(q1, sorted.at(2).BarcodeQuality());
+    EXPECT_EQ(q3, sorted.at(3).BarcodeQuality());
+}
+
+TEST(CompareTest, CustomCompareOk)
+{
+    // A user-defined comparator derived from Compare::MemberFunctionBase, keyed on the bool
+    // returned by BamRecord::HasDeletionTag.
+    struct CustomCompare : public Compare::MemberFunctionBase<bool, &BamRecord::HasDeletionTag> { };
+
+    // Input order: four records carrying a "dt" tag, then four default-constructed records.
+    std::vector<BamRecord> sorted;
+    for (int i = 0; i < 4; ++i) {
+        sorted.push_back(CompareTests::makeRecordWithTag("dt", Tag(std::string("foo"))));
+    }
+    for (int i = 0; i < 4; ++i) {
+        sorted.push_back(BamRecord());
+    }
+    EXPECT_EQ(8, sorted.size());
+
+    std::sort(sorted.begin(), sorted.end(), CustomCompare());
+
+    // After sorting, records without the tag (false) are expected ahead of those with it (true).
+    EXPECT_FALSE(sorted.at(0).HasDeletionTag());
+    EXPECT_FALSE(sorted.at(1).HasDeletionTag());
+    EXPECT_FALSE(sorted.at(2).HasDeletionTag());
+    EXPECT_FALSE(sorted.at(3).HasDeletionTag());
+    EXPECT_TRUE(sorted.at(4).HasDeletionTag());
+    EXPECT_TRUE(sorted.at(5).HasDeletionTag());
+    EXPECT_TRUE(sorted.at(6).HasDeletionTag());
+    EXPECT_TRUE(sorted.at(7).HasDeletionTag());
+}
+
+TEST(CompareTest, FullNameOk)
+{
+    // Names "c", "b", "d", "a" are assigned out of order; expected sorted order is r4, r2, r1, r3.
+    BamRecord r1, r2, r3, r4;
+    r1.Impl().Name("c");
+    r2.Impl().Name("b");
+    r3.Impl().Name("d");
+    r4.Impl().Name("a");
+    std::vector<BamRecord> sorted{ r1, r2, r3, r4 };
+    std::sort(sorted.begin(), sorted.end(), Compare::FullName());
+    EXPECT_EQ(r4.FullName(), sorted.at(0).FullName());
+    EXPECT_EQ(r2.FullName(), sorted.at(1).FullName());
+    EXPECT_EQ(r1.FullName(), sorted.at(2).FullName());
+    EXPECT_EQ(r3.FullName(), sorted.at(3).FullName());
+}
+
+TEST(CompareTest, LocalContextFlagOk)
+{
+    // Expected order: NO_LOCAL_CONTEXT, ADAPTER_AFTER, BARCODE_AFTER, REVERSE_PASS (r4, r2, r1, r3).
+    BamRecord r1, r2, r3, r4;
+    r1.LocalContextFlags(LocalContextFlags::BARCODE_AFTER);
+    r2.LocalContextFlags(LocalContextFlags::ADAPTER_AFTER);
+    r3.LocalContextFlags(LocalContextFlags::REVERSE_PASS);
+    r4.LocalContextFlags(LocalContextFlags::NO_LOCAL_CONTEXT);
+    std::vector<BamRecord> sorted{ r1, r2, r3, r4 };
+    std::sort(sorted.begin(), sorted.end(), Compare::LocalContextFlag());
+    EXPECT_EQ(r4.LocalContextFlags(), sorted.at(0).LocalContextFlags());
+    EXPECT_EQ(r2.LocalContextFlags(), sorted.at(1).LocalContextFlags());
+    EXPECT_EQ(r1.LocalContextFlags(), sorted.at(2).LocalContextFlags());
+    EXPECT_EQ(r3.LocalContextFlags(), sorted.at(3).LocalContextFlags());
+}
+
+TEST(CompareTest, MapQualityOk)
+{
+    // Values 30, 20, 40, 10 are assigned out of order; expected sorted order is r4, r2, r1, r3.
+    BamRecord r1, r2, r3, r4;
+    r1.Impl().MapQuality(30);
+    r2.Impl().MapQuality(20);
+    r3.Impl().MapQuality(40);
+    r4.Impl().MapQuality(10);
+    std::vector<BamRecord> sorted{ r1, r2, r3, r4 };
+    std::sort(sorted.begin(), sorted.end(), Compare::MapQuality());
+    EXPECT_EQ(r4.MapQuality(), sorted.at(0).MapQuality());
+    EXPECT_EQ(r2.MapQuality(), sorted.at(1).MapQuality());
+    EXPECT_EQ(r1.MapQuality(), sorted.at(2).MapQuality());
+    EXPECT_EQ(r3.MapQuality(), sorted.at(3).MapQuality());
+}
+
+TEST(CompareTest, MovieNameOk)
+{
+    // Four read groups whose first argument is "a".."d" (presumably the movie name - confirm).
+    ReadGroupInfo rg1{ "a", "SUBREAD" };
+    ReadGroupInfo rg2{ "b", "SUBREAD" };
+    ReadGroupInfo rg3{ "c", "SUBREAD" };
+    ReadGroupInfo rg4{ "d", "SUBREAD" };
+
+    BamHeader header;
+    header.AddReadGroup(rg1).AddReadGroup(rg2).AddReadGroup(rg3).AddReadGroup(rg4);
+
+    BamRecord r1(header), r2(header), r3(header), r4(header);
+    r1.ReadGroup(rg3);
+    r2.ReadGroup(rg2);
+    r3.ReadGroup(rg4);
+    r4.ReadGroup(rg1);
+
+    std::vector<BamRecord> sorted{ r1, r2, r3, r4 };
+    std::sort(sorted.begin(), sorted.end(), Compare::MovieName());
+
+    // Expected order by MovieName(): r4 ("a"), r2 ("b"), r1 ("c"), r3 ("d").
+    EXPECT_EQ(r4.MovieName(), sorted.at(0).MovieName());
+    EXPECT_EQ(r2.MovieName(), sorted.at(1).MovieName());
+    EXPECT_EQ(r1.MovieName(), sorted.at(2).MovieName());
+    EXPECT_EQ(r3.MovieName(), sorted.at(3).MovieName());
+}
+
+TEST(CompareTest, NoneOk)
+{
+    BamRecord r1; r1.Impl().Name("c");
+    BamRecord r2; r2.Impl().Name("b");
+    BamRecord r3; r3.Impl().Name("d");
+    BamRecord r4; r4.Impl().Name("a");
+    // Compare::None is expected to impose no ordering, so the input order must survive.
+    auto records = std::vector<BamRecord>{ r1, r2, r3, r4 };
+    // stable_sort, not sort: only a stable sort guarantees equivalent elements keep their order.
+    std::stable_sort(records.begin(), records.end(), Compare::None());
+    EXPECT_EQ(r1.FullName(), records.at(0).FullName());
+    EXPECT_EQ(r2.FullName(), records.at(1).FullName());
+    EXPECT_EQ(r3.FullName(), records.at(2).FullName());
+    EXPECT_EQ(r4.FullName(), records.at(3).FullName());
+}
+
+TEST(CompareTest, NumDeletedBasesOk)
+{
+    // Six mapped records from makeMappedRecords(): three CIGARs, forward strand then reverse.
+    auto mapped = CompareTests::makeMappedRecords();
+    const auto deleted = [&mapped](const std::size_t i) { return mapped.at(i).NumDeletedBases(); };
+    // Before sorting, the counts follow construction order.
+    EXPECT_EQ(6, mapped.size());
+    EXPECT_EQ(0, deleted(0));
+    EXPECT_EQ(3, deleted(1));
+    EXPECT_EQ(3, deleted(2));
+    EXPECT_EQ(0, deleted(3));
+    EXPECT_EQ(3, deleted(4));
+    EXPECT_EQ(3, deleted(5));
+
+    // After sorting with Compare::NumDeletedBases, the counts must be non-decreasing.
+    std::sort(mapped.begin(), mapped.end(), Compare::NumDeletedBases());
+    EXPECT_EQ(0, deleted(0));
+    EXPECT_EQ(0, deleted(1));
+    EXPECT_EQ(3, deleted(2));
+    EXPECT_EQ(3, deleted(3));
+    EXPECT_EQ(3, deleted(4));
+    EXPECT_EQ(3, deleted(5));
+}
+
+TEST(CompareTest, NumInsertedBasesOk)
+{
+    // Six mapped records from makeMappedRecords(): three CIGARs, forward strand then reverse.
+    auto mapped = CompareTests::makeMappedRecords();
+    const auto inserted = [&mapped](const std::size_t i) { return mapped.at(i).NumInsertedBases(); };
+    // Before sorting, the counts follow construction order.
+    EXPECT_EQ(6, mapped.size());
+    EXPECT_EQ(0, inserted(0));
+    EXPECT_EQ(0, inserted(1));
+    EXPECT_EQ(2, inserted(2));
+    EXPECT_EQ(0, inserted(3));
+    EXPECT_EQ(0, inserted(4));
+    EXPECT_EQ(2, inserted(5));
+
+    // After sorting with Compare::NumInsertedBases, the counts must be non-decreasing.
+    std::sort(mapped.begin(), mapped.end(), Compare::NumInsertedBases());
+    EXPECT_EQ(0, inserted(0));
+    EXPECT_EQ(0, inserted(1));
+    EXPECT_EQ(0, inserted(2));
+    EXPECT_EQ(0, inserted(3));
+    EXPECT_EQ(2, inserted(4));
+    EXPECT_EQ(2, inserted(5));
+}
+
+TEST(CompareTest, NumMatchesOk)
+{
+    // Six mapped records from makeMappedRecords(): three CIGARs, forward strand then reverse.
+    auto mapped = CompareTests::makeMappedRecords();
+    const auto matches = [&mapped](const std::size_t i) { return mapped.at(i).NumMatches(); };
+    // Before sorting, the counts follow construction order.
+    EXPECT_EQ(6, mapped.size());
+    EXPECT_EQ(10, matches(0));
+    EXPECT_EQ(10, matches(1));
+    EXPECT_EQ(6,  matches(2));
+    EXPECT_EQ(10, matches(3));
+    EXPECT_EQ(10, matches(4));
+    EXPECT_EQ(6,  matches(5));
+
+    // After sorting with Compare::NumMatches, the counts must be non-decreasing.
+    std::sort(mapped.begin(), mapped.end(), Compare::NumMatches());
+    EXPECT_EQ(6,  matches(0));
+    EXPECT_EQ(6,  matches(1));
+    EXPECT_EQ(10, matches(2));
+    EXPECT_EQ(10, matches(3));
+    EXPECT_EQ(10, matches(4));
+    EXPECT_EQ(10, matches(5));
+}
+
+TEST(CompareTest, NumMismatchesOk)
+{
+    // Six mapped records from makeMappedRecords(): three CIGARs, forward strand then reverse.
+    auto mapped = CompareTests::makeMappedRecords();
+    const auto mismatches = [&mapped](const std::size_t i) { return mapped.at(i).NumMismatches(); };
+    // Before sorting, the counts follow construction order.
+    EXPECT_EQ(6, mapped.size());
+    EXPECT_EQ(0, mismatches(0));
+    EXPECT_EQ(0, mismatches(1));
+    EXPECT_EQ(2, mismatches(2));
+    EXPECT_EQ(0, mismatches(3));
+    EXPECT_EQ(0, mismatches(4));
+    EXPECT_EQ(2, mismatches(5));
+
+    // After sorting with Compare::NumMismatches, the counts must be non-decreasing.
+    std::sort(mapped.begin(), mapped.end(), Compare::NumMismatches());
+    EXPECT_EQ(0, mismatches(0));
+    EXPECT_EQ(0, mismatches(1));
+    EXPECT_EQ(0, mismatches(2));
+    EXPECT_EQ(0, mismatches(3));
+    EXPECT_EQ(2, mismatches(4));
+    EXPECT_EQ(2, mismatches(5));
+}
+
+TEST(CompareTest, QueryEndOk)
+{
+    // Four records, each carrying only a "qe" tag; the values are supplied out of order.
+    const Position q1 = 30;
+    const Position q2 = 20;
+    const Position q3 = 40;
+    const Position q4 = 10;
+
+    std::vector<BamRecord> sorted;
+    sorted.push_back(CompareTests::makeRecordWithTag("qe", Tag(q1)));
+    sorted.push_back(CompareTests::makeRecordWithTag("qe", Tag(q2)));
+    sorted.push_back(CompareTests::makeRecordWithTag("qe", Tag(q3)));
+    sorted.push_back(CompareTests::makeRecordWithTag("qe", Tag(q4)));
+    std::sort(sorted.begin(), sorted.end(), Compare::QueryEnd());
+
+    // Ascending order of the values above: q4 < q2 < q1 < q3.
+    EXPECT_EQ(q4, sorted.at(0).QueryEnd());
+    EXPECT_EQ(q2, sorted.at(1).QueryEnd());
+    EXPECT_EQ(q1, sorted.at(2).QueryEnd());
+    EXPECT_EQ(q3, sorted.at(3).QueryEnd());
+}
+
+TEST(CompareTest, QueryStartOk)
+{
+    // Four records, each carrying only a "qs" tag; the values are supplied out of order.
+    const Position q1 = 30;
+    const Position q2 = 20;
+    const Position q3 = 40;
+    const Position q4 = 10;
+
+    std::vector<BamRecord> sorted;
+    sorted.push_back(CompareTests::makeRecordWithTag("qs", Tag(q1)));
+    sorted.push_back(CompareTests::makeRecordWithTag("qs", Tag(q2)));
+    sorted.push_back(CompareTests::makeRecordWithTag("qs", Tag(q3)));
+    sorted.push_back(CompareTests::makeRecordWithTag("qs", Tag(q4)));
+    std::sort(sorted.begin(), sorted.end(), Compare::QueryStart());
+
+    // Ascending order of the values above: q4 < q2 < q1 < q3.
+    EXPECT_EQ(q4, sorted.at(0).QueryStart());
+    EXPECT_EQ(q2, sorted.at(1).QueryStart());
+    EXPECT_EQ(q1, sorted.at(2).QueryStart());
+    EXPECT_EQ(q3, sorted.at(3).QueryStart());
+}
+
+TEST(CompareTest, ReadGroupIdOk)
+{
+    // Four read groups assigned to records out of order; trailing notes give each record's ID.
+    ReadGroupInfo rg1{ "foo", "SUBREAD" };
+    ReadGroupInfo rg2{ "bar", "SUBREAD" };
+    ReadGroupInfo rg3{ "c",   "SUBREAD" };
+    ReadGroupInfo rg4{ "d",   "SUBREAD" };
+
+    BamHeader header;
+    header.AddReadGroup(rg1).AddReadGroup(rg2).AddReadGroup(rg3).AddReadGroup(rg4);
+
+    BamRecord r1(header), r2(header), r3(header), r4(header);
+    r1.ReadGroup(rg3); // -> 99365356
+    r2.ReadGroup(rg2); // -> d9f305e4
+    r3.ReadGroup(rg4); // -> 54397cd6
+    r4.ReadGroup(rg1); // -> a60ddc69
+
+    // The IDs are ordered lexically, NOT numerically: 5... < 9... < a... < d...
+    std::vector<BamRecord> sorted{ r1, r2, r3, r4 };
+    std::sort(sorted.begin(), sorted.end(), Compare::ReadGroupId());
+
+    EXPECT_EQ(r3.ReadGroupId(), sorted.at(0).ReadGroupId());
+    EXPECT_EQ(r1.ReadGroupId(), sorted.at(1).ReadGroupId());
+    EXPECT_EQ(r4.ReadGroupId(), sorted.at(2).ReadGroupId());
+    EXPECT_EQ(r2.ReadGroupId(), sorted.at(3).ReadGroupId());
+}
+
+TEST(CompareTest, ReadGroupNumericIdOk)
+{
+    // Four read groups assigned to records out of order; trailing notes give each numeric ID.
+    ReadGroupInfo rg1{ "a", "SUBREAD" };
+    ReadGroupInfo rg2{ "b", "SUBREAD" };
+    ReadGroupInfo rg3{ "c", "SUBREAD" };
+    ReadGroupInfo rg4{ "d", "SUBREAD" };
+
+    BamHeader header;
+    header.AddReadGroup(rg1).AddReadGroup(rg2).AddReadGroup(rg3).AddReadGroup(rg4);
+
+    BamRecord r1(header), r2(header), r3(header), r4(header);
+    r1.ReadGroup(rg3); // -> -1724492970
+    r2.ReadGroup(rg2); // ->   235381373
+    r3.ReadGroup(rg4); // ->  1413053654
+    r4.ReadGroup(rg1); // ->  1153643386
+
+    // Numeric ordering: the negative ID sorts first, giving r1, r2, r4, r3.
+    std::vector<BamRecord> sorted{ r1, r2, r3, r4 };
+    std::sort(sorted.begin(), sorted.end(), Compare::ReadGroupNumericId());
+
+    EXPECT_EQ(r1.ReadGroupNumericId(), sorted.at(0).ReadGroupNumericId());
+    EXPECT_EQ(r2.ReadGroupNumericId(), sorted.at(1).ReadGroupNumericId());
+    EXPECT_EQ(r4.ReadGroupNumericId(), sorted.at(2).ReadGroupNumericId());
+    EXPECT_EQ(r3.ReadGroupNumericId(), sorted.at(3).ReadGroupNumericId());
+}
+
+TEST(CompareTest, ReadAccuracyOk)
+{
+    // Four records, each carrying only an "rq" tag; the values are supplied out of order.
+    Accuracy a1 = 30;
+    Accuracy a2 = 20;
+    Accuracy a3 = 40;
+    Accuracy a4 = 10;
+    // NOTE(review): if Accuracy limits values to a fixed range, 10..40 may all collapse - confirm.
+    std::vector<BamRecord> sorted;
+    sorted.push_back(CompareTests::makeRecordWithTag("rq", Tag(a1)));
+    sorted.push_back(CompareTests::makeRecordWithTag("rq", Tag(a2)));
+    sorted.push_back(CompareTests::makeRecordWithTag("rq", Tag(a3)));
+    sorted.push_back(CompareTests::makeRecordWithTag("rq", Tag(a4)));
+    std::sort(sorted.begin(), sorted.end(), Compare::ReadAccuracy());
+
+    // Expected order after sorting: a4, a2, a1, a3.
+    EXPECT_EQ(a4, sorted.at(0).ReadAccuracy());
+    EXPECT_EQ(a2, sorted.at(1).ReadAccuracy());
+    EXPECT_EQ(a1, sorted.at(2).ReadAccuracy());
+    EXPECT_EQ(a3, sorted.at(3).ReadAccuracy());
+}
+
+TEST(CompareTest, ReferenceEndOk)
+{
+    // Six mapped records from makeMappedRecords(): three CIGARs, forward strand then reverse.
+    auto mapped = CompareTests::makeMappedRecords();
+    const auto refEnd = [&mapped](const std::size_t i) { return mapped.at(i).ReferenceEnd(); };
+    // Before sorting, the end positions follow construction order.
+    EXPECT_EQ(6, mapped.size());
+    EXPECT_EQ(110, refEnd(0));
+    EXPECT_EQ(113, refEnd(1));
+    EXPECT_EQ(111, refEnd(2));
+    EXPECT_EQ(110, refEnd(3));
+    EXPECT_EQ(113, refEnd(4));
+    EXPECT_EQ(111, refEnd(5));
+
+    // After sorting with Compare::ReferenceEnd, the end positions must be non-decreasing.
+    std::sort(mapped.begin(), mapped.end(), Compare::ReferenceEnd());
+    EXPECT_EQ(110, refEnd(0));
+    EXPECT_EQ(110, refEnd(1));
+    EXPECT_EQ(111, refEnd(2));
+    EXPECT_EQ(111, refEnd(3));
+    EXPECT_EQ(113, refEnd(4));
+    EXPECT_EQ(113, refEnd(5));
+}
+
+TEST(CompareTest, ReferenceIdOk)
+{
+    // IDs 30, 20, 40, 10 are assigned out of order; expected sorted order is r4, r2, r1, r3.
+    BamRecord r1, r2, r3, r4;
+    r1.Impl().ReferenceId(30);
+    r2.Impl().ReferenceId(20);
+    r3.Impl().ReferenceId(40);
+    r4.Impl().ReferenceId(10);
+    std::vector<BamRecord> sorted{ r1, r2, r3, r4 };
+    std::sort(sorted.begin(), sorted.end(), Compare::ReferenceId());
+    EXPECT_EQ(r4.ReferenceId(), sorted.at(0).ReferenceId());
+    EXPECT_EQ(r2.ReferenceId(), sorted.at(1).ReferenceId());
+    EXPECT_EQ(r1.ReferenceId(), sorted.at(2).ReferenceId());
+    EXPECT_EQ(r3.ReferenceId(), sorted.at(3).ReferenceId());
+}
+
+TEST(CompareTest, ReferenceNameOk)
+{
+    // Records refer to sequences by ID; the test expects them ordered by the sequence's name.
+    SequenceInfo seq1{ "seq1" };
+    SequenceInfo seq2{ "seq2" };
+    SequenceInfo seq3{ "seq3" };
+    SequenceInfo seq4{ "seq4" };
+
+    BamHeader header;
+    // Added in order, so seq1..seq4 correspond to reference IDs 0..3.
+    header.AddSequence(seq1).AddSequence(seq2).AddSequence(seq3).AddSequence(seq4);
+
+    const auto mappedTo = [&header](const int32_t refId) {
+        BamRecord record(header);
+        record.Impl().SetMapped(true);
+        record.Impl().ReferenceId(refId);
+        return record;
+    };
+    std::vector<BamRecord> sorted{ mappedTo(2), mappedTo(1), mappedTo(3), mappedTo(0) };
+    std::sort(sorted.begin(), sorted.end(), Compare::ReferenceName());
+
+    EXPECT_EQ(seq1.Name(), sorted.at(0).ReferenceName());
+    EXPECT_EQ(seq2.Name(), sorted.at(1).ReferenceName());
+    EXPECT_EQ(seq3.Name(), sorted.at(2).ReferenceName());
+    EXPECT_EQ(seq4.Name(), sorted.at(3).ReferenceName());
+}
+
+TEST(CompareTest, ReferenceStartOk)
+{
+    // Positions 30, 20, 40, 10 are assigned out of order; expected sorted order is r4, r2, r1, r3.
+    BamRecord r1, r2, r3, r4;
+    r1.Impl().Position(30);
+    r2.Impl().Position(20);
+    r3.Impl().Position(40);
+    r4.Impl().Position(10);
+    std::vector<BamRecord> sorted{ r1, r2, r3, r4 };
+    std::sort(sorted.begin(), sorted.end(), Compare::ReferenceStart());
+    EXPECT_EQ(r4.ReferenceStart(), sorted.at(0).ReferenceStart());
+    EXPECT_EQ(r2.ReferenceStart(), sorted.at(1).ReferenceStart());
+    EXPECT_EQ(r1.ReferenceStart(), sorted.at(2).ReferenceStart());
+    EXPECT_EQ(r3.ReferenceStart(), sorted.at(3).ReferenceStart());
+}
+
+TEST(CompareTest, ZmwOk)
+{
+    // Four records, each carrying only a "zm" tag; the values are read back via HoleNumber().
+    const int32_t z1 = 30;
+    const int32_t z2 = 20;
+    const int32_t z3 = 40;
+    const int32_t z4 = 10;
+
+    std::vector<BamRecord> sorted;
+    sorted.push_back(CompareTests::makeRecordWithTag("zm", Tag(z1)));
+    sorted.push_back(CompareTests::makeRecordWithTag("zm", Tag(z2)));
+    sorted.push_back(CompareTests::makeRecordWithTag("zm", Tag(z3)));
+    sorted.push_back(CompareTests::makeRecordWithTag("zm", Tag(z4)));
+    std::sort(sorted.begin(), sorted.end(), Compare::Zmw());
+
+    // Ascending order of the values above: z4 < z2 < z1 < z3.
+    EXPECT_EQ(z4, sorted.at(0).HoleNumber());
+    EXPECT_EQ(z2, sorted.at(1).HoleNumber());
+    EXPECT_EQ(z1, sorted.at(2).HoleNumber());
+    EXPECT_EQ(z3, sorted.at(3).HoleNumber());
+}
+
+// clang-format on
diff --git a/tests/src/test_DataSetCore.cpp b/tests/src/test_DataSetCore.cpp

new file mode 100644 (file)

index 0000000..86fbdfc
--- /dev/null
+++ b/tests/src/test_DataSetCore.cpp
@@ -0,0 +1,503 @@
+// Author: Derek Barnett
+
+#include <cstddef>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/DataSet.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace DataSetCoreTests {
+
+static inline DataSet CreateDataSet()  // returns a fresh DataSet whose Name is "foo"
+{
+    DataSet named;
+    named.Name("foo");
+    return named;
+}
+
+}  // namespace DataSetCoreTests
+
+TEST(DataSetCoreTest, XmlNameParts)
+{
+    internal::XmlName prefixed("ns:node_name");  // "prefix:local" form
+    EXPECT_EQ(boost::string_ref("ns"), prefixed.Prefix());
+    EXPECT_EQ(boost::string_ref("node_name"), prefixed.LocalName());
+    EXPECT_EQ(boost::string_ref("ns:node_name"), prefixed.QualifiedName());
+    // No colon at all: the prefix is empty and the whole string is the local name.
+    internal::XmlName unprefixed("node_name");
+    EXPECT_EQ(boost::string_ref(""), unprefixed.Prefix());
+    EXPECT_EQ(boost::string_ref("node_name"), unprefixed.LocalName());
+    EXPECT_EQ(boost::string_ref("node_name"), unprefixed.QualifiedName());
+    // A colon in first position is expected to stay part of the local name (empty prefix).
+    internal::XmlName colonFirst(":node_name");
+    EXPECT_EQ(boost::string_ref(""), colonFirst.Prefix());
+    EXPECT_EQ(boost::string_ref(":node_name"), colonFirst.LocalName());
+    EXPECT_EQ(boost::string_ref(":node_name"), colonFirst.QualifiedName());
+}
+
+TEST(DataSetCoreTest, DefaultsOk)
+{
+    DataSet fresh;
+    EXPECT_EQ(DataSet::GENERIC, fresh.Type());
+    EXPECT_FALSE(fresh.CreatedAt().empty());
+    EXPECT_FALSE(fresh.MetaType().empty());
+    EXPECT_FALSE(fresh.TimeStampedName().empty());
+    EXPECT_FALSE(fresh.UniqueId().empty());
+    EXPECT_FALSE(fresh.Version().empty());
+    // The generated time-stamped name must begin with the common prefix.
+    EXPECT_EQ(0, fresh.TimeStampedName().find("pacbio_dataset_"));
+    // Everything below is expected to be empty on a default-constructed DataSet.
+    EXPECT_TRUE(fresh.Format().empty());
+    EXPECT_TRUE(fresh.ModifiedAt().empty());
+    EXPECT_TRUE(fresh.Name().empty());
+    EXPECT_TRUE(fresh.ResourceId().empty());
+    EXPECT_TRUE(fresh.Tags().empty());
+    EXPECT_EQ(0, fresh.ExternalResources().Size());
+    EXPECT_EQ(0, fresh.Filters().Size());
+    EXPECT_EQ(0, fresh.SubDataSets().Size());
+    // Default version string.
+    EXPECT_EQ(std::string{"3.0.1"}, fresh.Version());
+}
+
+TEST(DataSetCoreTest, TimeStampedNamesOk)
+{
+    DataSet dataset;
+    AlignmentSet alignmentSet;
+    BarcodeSet barcodeSet;
+    ContigSet contigSet;
+    ConsensusAlignmentSet consensusAlignmentSet;
+    ConsensusReadSet consensusReadSet;
+    HdfSubreadSet hdfSubreadSet;
+    ReferenceSet referenceSet;
+    SubreadSet subreadSet;
+    TranscriptSet transcriptSet;
+
+    EXPECT_EQ(0, dataset.TimeStampedName().find("pacbio_dataset_dataset-"));
+    EXPECT_EQ(0, alignmentSet.TimeStampedName().find("pacbio_dataset_alignmentset-"));
+    EXPECT_EQ(0, barcodeSet.TimeStampedName().find("pacbio_dataset_barcodeset-"));
+    EXPECT_EQ(0, contigSet.TimeStampedName().find("pacbio_dataset_contigset-"));
+    EXPECT_EQ(
+        0, consensusAlignmentSet.TimeStampedName().find("pacbio_dataset_consensusalignmentset-"));
+    EXPECT_EQ(0, consensusReadSet.TimeStampedName().find("pacbio_dataset_consensusreadset-"));
+    EXPECT_EQ(0, hdfSubreadSet.TimeStampedName().find("pacbio_dataset_hdfsubreadset-"));
+    EXPECT_EQ(0, referenceSet.TimeStampedName().find("pacbio_dataset_referenceset-"));
+    EXPECT_EQ(0, subreadSet.TimeStampedName().find("pacbio_dataset_subreadset-"));
+    EXPECT_EQ(0, transcriptSet.TimeStampedName().find("pacbio_dataset_transcriptset-"));
+}
+
+TEST(DataSetCoreTest, BasicGettersSettersOk)
+{
+    DataSet dataset;
+    dataset.CreatedAt("now");
+    dataset.Format("format");
+    dataset.MetaType("meta");
+    dataset.ModifiedAt("later");
+    dataset.Name("foo");
+    dataset.ResourceId("path/to/file");
+    dataset.Tags("tag tag");
+    dataset.TimeStampedName("now:30");
+    dataset.UniqueId("uuid");
+    dataset.Version("0.0.0");
+
+    EXPECT_EQ(std::string("now"), dataset.CreatedAt());
+    EXPECT_EQ(std::string("format"), dataset.Format());
+    EXPECT_EQ(std::string("meta"), dataset.MetaType());
+    EXPECT_EQ(std::string("later"), dataset.ModifiedAt());
+    EXPECT_EQ(std::string("foo"), dataset.Name());
+    EXPECT_EQ(std::string("path/to/file"), dataset.ResourceId());
+    EXPECT_EQ(std::string("tag tag"), dataset.Tags());
+    EXPECT_EQ(std::string("now:30"), dataset.TimeStampedName());
+    EXPECT_EQ(std::string("uuid"), dataset.UniqueId());
+    EXPECT_EQ(std::string("0.0.0"), dataset.Version());
+}
+
+TEST(DataSetCoreTest, CopyOk)
+{
+    DataSet source;
+    source.Name("foo");
+
+    // copy construction must carry the name over
+    DataSet viaCtor(source);
+    EXPECT_EQ(std::string("foo"), viaCtor.Name());
+
+    // so must copy assignment onto an already-constructed DataSet
+    DataSet viaAssign;
+    viaAssign = source;
+    EXPECT_EQ(std::string("foo"), viaAssign.Name());
+}
+
+TEST(DataSetCoreTest, MoveOk)
+{
+    // Both sources are temporaries from CreateDataSet(). std::move on a temporary makes clang
+    // emit -Wpessimizing-move, hence the pragma guards (presumably kept to force a real move).
+
+    // move ctor
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    DataSet moveConstructed(std::move(DataSetCoreTests::CreateDataSet()));
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+    EXPECT_EQ(std::string("foo"), moveConstructed.Name());
+
+    // move assignment
+    DataSet moveAssigned;
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpessimizing-move"
+#endif
+    moveAssigned = std::move(DataSetCoreTests::CreateDataSet());
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+    EXPECT_EQ(std::string("foo"), moveAssigned.Name());
+}
+
+TEST(DataSetCoreTest, AddExternalResources)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.ExternalResources().Size());
+
+    ExternalResource resource1("metatype", "id");
+    resource1.Name("file1");
+
+    ExternalResource resource2("metatype", "id2");
+    resource2.Name("file2");
+
+    dataset.ExternalResources().Add(resource1);
+    dataset.ExternalResources().Add(resource2);
+    EXPECT_EQ(2, dataset.ExternalResources().Size());
+
+    // disallow duplicates (checking on ResourceId)
+    ExternalResource duplicateResource("metatype", "id");
+    dataset.ExternalResources().Add(duplicateResource);
+    EXPECT_EQ(2, dataset.ExternalResources().Size());
+
+    // direct access
+    const ExternalResources& resources = dataset.ExternalResources();
+    EXPECT_EQ(std::string("file1"), resources[0].Name());
+    EXPECT_EQ(std::string("file2"), resources[1].Name());
+
+    // iterable
+    size_t i = 0;
+    for (auto r : resources) {
+        if (i == 0)
+            EXPECT_EQ(std::string("file1"), r.Name());
+        else
+            EXPECT_EQ(std::string("file2"), r.Name());
+        ++i;
+    }
+}
+
+TEST(DataSetCoreTest, EditExternalResources)
+{
+    DataSet dataset;
+
+    ExternalResource scratch("metatype", "id");
+    scratch.Name("file1");
+    dataset.ExternalResources().Add(scratch);
+    // the same local object is reused: renamed, given a new id, and added a second time
+    scratch.Name("file2").ResourceId("id2");
+    dataset.ExternalResources().Add(scratch);
+    EXPECT_EQ(2, dataset.ExternalResources().Size());
+
+    // rename the first stored entry in place; the second must keep its name
+    dataset.ExternalResources()[0].Name("some new name");
+    EXPECT_EQ(std::string("some new name"), dataset.ExternalResources()[0].Name());
+    EXPECT_EQ(std::string("file2"), dataset.ExternalResources()[1].Name());
+}
+
+TEST(DataSetCoreTest, NestedExternalResources)
+{
+    ExternalResource parent("metatype", "filename");
+    parent.ExternalResources().Add(ExternalResource("metatype.child", "filename.child"));
+    parent.ExternalResources().Add(ExternalResource("metatype.child2", "filename.child2"));
+    // children are read back through a const view, one child at a time
+    const ExternalResources& children = parent.ExternalResources();
+    EXPECT_EQ(2, children.Size());
+    EXPECT_EQ(std::string("metatype.child"), children[0].MetaType());
+    EXPECT_EQ(std::string("filename.child"), children[0].ResourceId());
+    EXPECT_EQ(std::string("metatype.child2"), children[1].MetaType());
+    EXPECT_EQ(std::string("filename.child2"), children[1].ResourceId());
+}
+
+TEST(DataSetCoreTest, AddFilters)  // filters added to a DataSet expose their properties by index and by iteration
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    Filter filter;
+    filter.Properties().Add(Property("rq", "0.85", ">"));  // read back below as Name, Value, Operator
+    filter.Properties().Add(Property("RNAME", "chr1", "=="));
+    EXPECT_EQ(2, filter.Properties().Size());
+
+    Filter filter2;
+    filter2.Properties().Add(Property("rq", "0.50", ">="));
+    filter2.Properties().Add(Property("RNAME", "chr2", "!="));
+    EXPECT_EQ(2, filter2.Properties().Size());
+
+    dataset.Filters().Add(filter);
+    dataset.Filters().Add(filter2);
+
+    const Filters& filters = dataset.Filters();  // const view used for all read-back checks below
+    EXPECT_EQ(2, filters.Size());
+    EXPECT_EQ(2, filters[0].Properties().Size());
+    EXPECT_EQ(2, filters[1].Properties().Size());
+
+    // direct access: filters[f].Properties()[p], in the order they were added
+    const Property& p0 = filters[0].Properties()[0];
+    EXPECT_EQ(std::string("rq"), p0.Name());
+    EXPECT_EQ(std::string("0.85"), p0.Value());
+    EXPECT_EQ(std::string(">"), p0.Operator());
+
+    const Property& p1 = filters[0].Properties()[1];
+    EXPECT_EQ(std::string("RNAME"), p1.Name());
+    EXPECT_EQ(std::string("chr1"), p1.Value());
+    EXPECT_EQ(std::string("=="), p1.Operator());
+
+    const Property& p2 = filters[1].Properties()[0];
+    EXPECT_EQ(std::string("rq"), p2.Name());
+    EXPECT_EQ(std::string("0.50"), p2.Value());
+    EXPECT_EQ(std::string(">="), p2.Operator());
+
+    const Property& p3 = filters[1].Properties()[1];
+    EXPECT_EQ(std::string("RNAME"), p3.Name());
+    EXPECT_EQ(std::string("chr2"), p3.Value());
+    EXPECT_EQ(std::string("!="), p3.Operator());
+
+    // iterable: the same values must be seen through range-for over filters and their properties
+    size_t i = 0;  // index of the current filter
+    size_t j = 0;  // index of the current property within that filter
+    for (const Filter& f : filters) {
+        if (i == 0) {  // first filter added
+            const Properties& properties = f.Properties();
+            for (const Property& p : properties) {
+                if (j == 0) {
+                    EXPECT_EQ(std::string("rq"), p.Name());
+                    EXPECT_EQ(std::string("0.85"), p.Value());
+                    EXPECT_EQ(std::string(">"), p.Operator());
+                } else {
+                    EXPECT_EQ(std::string("RNAME"), p.Name());
+                    EXPECT_EQ(std::string("chr1"), p.Value());
+                    EXPECT_EQ(std::string("=="), p.Operator());
+                }
+                ++j;
+            }
+        } else {  // any later filter; only the second one exists here
+            const Properties& properties = f.Properties();
+            for (const Property& p : properties) {
+                if (j == 0) {
+                    EXPECT_EQ(std::string("rq"), p.Name());
+                    EXPECT_EQ(std::string("0.50"), p.Value());
+                    EXPECT_EQ(std::string(">="), p.Operator());
+                } else {
+                    EXPECT_EQ(std::string("RNAME"), p.Name());
+                    EXPECT_EQ(std::string("chr2"), p.Value());
+                    EXPECT_EQ(std::string("!="), p.Operator());
+                }
+                ++j;
+            }
+        }
+        ++i;
+        j = 0;  // restart the property index for the next filter
+    }
+}
+
+TEST(DataSetCoreTest, EditFilters)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    Filter filter;
+    filter.Properties().Add(Property("rq", "0.85", ">"));
+    filter.Properties().Add(Property("RNAME", "chr1", "=="));
+    EXPECT_EQ(2, filter.Properties().Size());
+
+    Filter filter2;
+    filter2.Properties().Add(Property("rq", "0.50", ">="));
+    filter2.Properties().Add(Property("RNAME", "chr2", "!="));
+    EXPECT_EQ(2, filter2.Properties().Size());
+
+    dataset.Filters().Add(filter);
+    dataset.Filters().Add(filter2);
+    EXPECT_EQ(2, dataset.Filters().Size());
+    EXPECT_EQ(2, dataset.Filters()[0].Properties().Size());
+    EXPECT_EQ(2, dataset.Filters()[1].Properties().Size());
+
+    // edit property in-place
+    Property& p = dataset.Filters()[0].Properties()[0];
+    p.Name("someNewName");
+    p.Value("someNewValue");
+    p.Operator("==");
+
+    const Property& p0 = dataset.Filters()[0].Properties()[0];
+    EXPECT_EQ(std::string("someNewName"), p0.Name());
+    EXPECT_EQ(std::string("someNewValue"), p0.Value());
+    EXPECT_EQ(std::string("=="), p0.Operator());
+
+    const Property& p1 = dataset.Filters()[0].Properties()[1];
+    EXPECT_EQ(std::string("RNAME"), p1.Name());
+    EXPECT_EQ(std::string("chr1"), p1.Value());
+    EXPECT_EQ(std::string("=="), p1.Operator());
+
+    const Property& p2 = dataset.Filters()[1].Properties()[0];
+    EXPECT_EQ(std::string("rq"), p2.Name());
+    EXPECT_EQ(std::string("0.50"), p2.Value());
+    EXPECT_EQ(std::string(">="), p2.Operator());
+
+    const Property& p3 = dataset.Filters()[1].Properties()[1];
+    EXPECT_EQ(std::string("RNAME"), p3.Name());
+    EXPECT_EQ(std::string("chr2"), p3.Value());
+    EXPECT_EQ(std::string("!="), p3.Operator());
+}
+
+TEST(DataSetCoreTest, AddSubDataSets)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    DataSetBase sub1;
+    sub1.Name("subset_1");
+
+    DataSetBase sub2;
+    sub2.Name("subset_2");
+
+    dataset.SubDataSets().Add(sub1);
+    dataset.SubDataSets().Add(sub2);
+    EXPECT_EQ(2, dataset.SubDataSets().Size());
+
+    // direct access
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    EXPECT_EQ(std::string("subset_1"), subdatasets[0].Name());
+    EXPECT_EQ(std::string("subset_2"), subdatasets[1].Name());
+
+    // iterable
+    size_t i = 0;
+    for (const DataSetBase& ds : subdatasets) {
+        if (i == 0)
+            EXPECT_EQ(std::string("subset_1"), ds.Name());
+        else
+            EXPECT_EQ(std::string("subset_2"), ds.Name());
+        ++i;
+    }
+}
+
+TEST(DataSetCoreTest, EditSubDataSets)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    DataSetBase sub1;
+    sub1.Name("subset_1");
+
+    DataSetBase sub2;
+    sub2.Name("subset_2");
+
+    dataset.SubDataSets().Add(sub1);
+    dataset.SubDataSets().Add(sub2);
+    EXPECT_EQ(2, dataset.SubDataSets().Size());
+
+    // edit
+    dataset.SubDataSets()[0].Name("subset_1_edited");
+
+    // direct access
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    EXPECT_EQ(std::string("subset_1_edited"), subdatasets[0].Name());
+    EXPECT_EQ(std::string("subset_2"), subdatasets[1].Name());
+
+    // iterable
+    size_t i = 0;
+    for (const DataSetBase& ds : subdatasets) {
+        if (i == 0)
+            EXPECT_EQ(std::string("subset_1_edited"), ds.Name());
+        else
+            EXPECT_EQ(std::string("subset_2"), ds.Name());
+        ++i;
+    }
+}
+
+TEST(DataSetCoreTest, RemoveExternalResources)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.ExternalResources().Size());
+
+    ExternalResource resource1("metatype", "id");
+    resource1.Name("file1");
+
+    ExternalResource resource2("metatype", "id2");
+    resource2.Name("file2");
+
+    dataset.ExternalResources().Add(resource1);
+    dataset.ExternalResources().Add(resource2);
+    EXPECT_EQ(2, dataset.ExternalResources().Size());
+
+    // remove
+    dataset.ExternalResources().Remove(resource1);
+    EXPECT_EQ(1, dataset.ExternalResources().Size());
+
+    // direct access
+    const ExternalResources& resources = dataset.ExternalResources();
+    EXPECT_EQ(std::string("file2"), resources[0].Name());
+
+    // iterable
+    size_t i = 0;
+    for (auto r : resources) {
+        if (i == 0) {
+            EXPECT_EQ(std::string("file2"), r.Name());
+        }
+        ++i;
+    }
+}
+
+TEST(DataSetCoreTest, RemoveFilters)
+{
+    DataSet dataset;
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    Filter filter;
+    filter.Properties().Add(Property("rq", "0.85", ">"));
+    filter.Properties().Add(Property("RNAME", "chr1", "=="));
+    EXPECT_EQ(2, filter.Properties().Size());
+
+    Filter filter2;
+    filter2.Properties().Add(Property("rq", "0.50", ">="));
+    filter2.Properties().Add(Property("RNAME", "chr2", "!="));
+    EXPECT_EQ(2, filter2.Properties().Size());
+
+    dataset.Filters().Add(filter);
+    dataset.Filters().Add(filter2);
+    EXPECT_EQ(2, dataset.Filters().Size());
+
+    // remove
+    dataset.Filters().Remove(filter);
+    EXPECT_EQ(1, dataset.Filters().Size());
+
+    const Filters& filters = dataset.Filters();
+    EXPECT_EQ(2, filters[0].Properties().Size());
+}
+
+TEST(DataSetCoreTest, RemoveSubDataSets)
+{
+    DataSet parent;
+    EXPECT_EQ(0, parent.SubDataSets().Size());
+
+    DataSetBase keptChild;
+    keptChild.Name("subset_1");
+
+    DataSetBase removedChild;
+    removedChild.Name("subset_2");
+
+    parent.SubDataSets().Add(keptChild);
+    parent.SubDataSets().Add(removedChild);
+    EXPECT_EQ(2, parent.SubDataSets().Size());
+
+    // dropping the second child must shrink the list to one entry
+    parent.SubDataSets().Remove(removedChild);
+    EXPECT_EQ(1, parent.SubDataSets().Size());
+}
diff --git a/tests/src/test_DataSetIO.cpp b/tests/src/test_DataSetIO.cpp

new file mode 100644 (file)

index 0000000..834d111
--- /dev/null
+++ b/tests/src/test_DataSetIO.cpp
@@ -0,0 +1,1615 @@
+// Author: Derek Barnett
+
+#include <unistd.h>
+#include <cstddef>
+#include <fstream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "../src/FileUtils.h"
+#include "PbbamTestData.h"
+
+#include <pbbam/DataSet.h>
+#include <pbbam/internal/DataSetElement.h>
+
+// clang-format off
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace DataSetIOTests {  // shared input paths and helper declarations for the DataSetIOTest cases
+
+const std::string alignedBamFn  = PbbamTestsConfig::Data_Dir + "/aligned.bam";  // NOTE(review): dynamic init; assumes Data_Dir is already initialized here - confirm
+const std::string bamGroupFofn  = PbbamTestsConfig::Generated_Dir + "/group.fofn";  // the only path taken from Generated_Dir rather than Data_Dir
+
+const std::string ali1XmlFn = PbbamTestsConfig::Data_Dir + "/dataset/ali1.xml";  // XML fixtures under Data_Dir/dataset/
+const std::string ali2XmlFn = PbbamTestsConfig::Data_Dir + "/dataset/ali2.xml";
+const std::string ali3XmlFn = PbbamTestsConfig::Data_Dir + "/dataset/ali3.xml";
+const std::string ali4XmlFn = PbbamTestsConfig::Data_Dir + "/dataset/ali4.xml";
+const std::string mappingStaggeredXmlFn = PbbamTestsConfig::Data_Dir + "/dataset/bam_mapping_staggered.xml";
+const std::string barcodeXmlFn = PbbamTestsConfig::Data_Dir + "/dataset/barcode.dataset.xml";
+const std::string ccsReadXmlFn = PbbamTestsConfig::Data_Dir + "/dataset/ccsread.dataset.xml";
+const std::string lambdaContigsXmlFn = PbbamTestsConfig::Data_Dir + "/dataset/lambda_contigs.xml";
+const std::string pbalchemyXmlFn   = PbbamTestsConfig::Data_Dir + "/dataset/pbalchemy10kbp.xml";
+const std::string referenceXmlFn   = PbbamTestsConfig::Data_Dir + "/dataset/reference.dataset.xml";
+const std::string subread1XmlFn    = PbbamTestsConfig::Data_Dir + "/dataset/subread_dataset1.xml";
+const std::string subread2XmlFn    = PbbamTestsConfig::Data_Dir + "/dataset/subread_dataset2.xml";
+const std::string subread3XmlFn    = PbbamTestsConfig::Data_Dir + "/dataset/subread_dataset3.xml";
+const std::string transformedXmlFn = PbbamTestsConfig::Data_Dir + "/dataset/transformed_rs_subread_dataset.xml";
+
+static void TestFromXmlString();  // forward declarations: the FromXml/FromXmlFile tests below call these before their definitions
+static void TestAli1Xml();
+static void TestAli2Xml();
+static void TestAli3Xml();
+static void TestAli4Xml();
+static void TestMappingStaggeredXml();
+static void TestBarcodeXml();
+static void TestCcsReadXml();
+static void TestLambdaContigsXml();
+static void TestPbalchemyXml();
+static void TestReferenceXml();
+static void TestSubread1Xml();
+static void TestSubread2Xml();
+static void TestSubread3Xml();
+static void TestTransformedXml();
+
+static inline
+void changeCurrentDirectory(const std::string& dir)  // chdir() wrapper for tests
+{ ASSERT_EQ(0, chdir(dir.c_str())); }  // records a fatal gtest failure and returns early if chdir() does not return 0
+
+} // namespace DataSetIOTests
+
+TEST(DataSetIOTest, FromBamFilename)
+{
+    // a DataSet built from a single BAM path should list exactly that path as its only resource
+    DataSet fromBam(DataSetIOTests::alignedBamFn);
+    EXPECT_EQ(1, fromBam.ExternalResources().Size());
+
+    const ExternalResource& onlyResource = fromBam.ExternalResources()[0];
+    EXPECT_EQ(DataSetIOTests::alignedBamFn, onlyResource.ResourceId());
+}
+
+TEST(DataSetIOTest, FromBamFilenames)
+{   // collects the non-empty lines of the group FOFN and builds a DataSet from that list
+    std::ifstream fofn(DataSetIOTests::bamGroupFofn);
+    std::vector<std::string> bamPaths;
+    std::string line;
+    while (std::getline(fofn, line)) { if (!line.empty()) { bamPaths.emplace_back(line); } }
+    DataSet fromPaths(bamPaths);
+    EXPECT_EQ(3, fromPaths.ExternalResources().Size());
+}
+
+TEST(DataSetIOTest, FromBamFileObject)
+{
+    // the DataSet is built from the filename reported by an opened BamFile
+    BamFile opened(DataSetIOTests::alignedBamFn);
+    DataSet fromBamFile(opened.Filename());
+    EXPECT_EQ(1, fromBamFile.ExternalResources().Size());
+
+    const ExternalResource& onlyResource = fromBamFile.ExternalResources()[0];
+    EXPECT_EQ(DataSetIOTests::alignedBamFn, onlyResource.ResourceId());
+}
+
+TEST(DataSetIOTest, FromFofn)
+{   // passing the FOFN path itself should yield the same three resources
+    DataSet fromFofn(DataSetIOTests::bamGroupFofn);
+    EXPECT_EQ(3, fromFofn.ExternalResources().Size());
+}
+
+TEST(DataSetIOTest, FromXml)  // parsing a DataSet from an in-memory XML string must not throw
+{
+    EXPECT_NO_THROW(DataSetIOTests::TestFromXmlString());  // the helper carries its own EXPECT_* checks on the parsed result
+}
+
+TEST(DataSetIOTest, FromXmlFile)  // runs every per-fixture helper and requires that none of them throws
+{
+    EXPECT_NO_THROW(DataSetIOTests::TestAli1Xml());  // one call per line so a failure names the helper that threw
+    EXPECT_NO_THROW(DataSetIOTests::TestAli2Xml());
+    EXPECT_NO_THROW(DataSetIOTests::TestAli3Xml());
+    EXPECT_NO_THROW(DataSetIOTests::TestAli4Xml());
+    EXPECT_NO_THROW(DataSetIOTests::TestMappingStaggeredXml());
+    EXPECT_NO_THROW(DataSetIOTests::TestBarcodeXml());
+    EXPECT_NO_THROW(DataSetIOTests::TestCcsReadXml());
+    EXPECT_NO_THROW(DataSetIOTests::TestLambdaContigsXml());
+    EXPECT_NO_THROW(DataSetIOTests::TestPbalchemyXml());
+    EXPECT_NO_THROW(DataSetIOTests::TestReferenceXml());
+    EXPECT_NO_THROW(DataSetIOTests::TestSubread1Xml());
+    EXPECT_NO_THROW(DataSetIOTests::TestSubread2Xml());
+    EXPECT_NO_THROW(DataSetIOTests::TestSubread3Xml());
+    EXPECT_NO_THROW(DataSetIOTests::TestTransformedXml());
+}
+
+TEST(DataSetIOTest, ThrowsOnNonexistentFofnFile)
+{   // building a DataSet from a missing .fofn must throw std::runtime_error with this exact message
+    bool checkedException = false;
+    try
+    {
+        auto ds = DataSet{"does/not/exist.fofn"};
+    }
+    catch (const std::runtime_error& e)  // const reference: the handler only reads the exception
+    {
+        const std::string msg = "could not open FOFN for reading: does/not/exist.fofn";
+        EXPECT_EQ(msg, e.what());
+        checkedException = true;
+    }
+    EXPECT_TRUE(checkedException);  // fails when the constructor did not throw at all
+}
+
+TEST(DataSetIOTest, ThrowsOnNonexistentXmlFile)
+{   // building a DataSet from a missing .xml must throw std::runtime_error with this exact message
+    bool checkedException = false;
+    try
+    {
+        auto ds = DataSet{"does/not/exist.xml"};
+    }
+    catch (const std::runtime_error& e)  // const reference: the handler only reads the exception
+    {
+        const std::string msg = "could not open XML file for reading: does/not/exist.xml";
+        EXPECT_EQ(msg, e.what());
+        checkedException = true;
+    }
+    EXPECT_TRUE(checkedException);  // fails when the constructor did not throw at all
+}
+
+TEST(DataSetIOTest, ThrowsOnUnsupportedExtension)
+{   // an input filename with an unrecognized extension must throw std::runtime_error with this exact message
+    bool checkedException = false;
+    try
+    {
+        auto ds = DataSet{"bad/extension.foo"};
+    }
+    catch (const std::runtime_error& e)  // const reference: the handler only reads the exception
+    {
+        const std::string msg = "unsupported extension on input file: bad/extension.foo";
+        EXPECT_EQ(msg, e.what());
+        checkedException = true;
+    }
+    EXPECT_TRUE(checkedException);  // fails when the constructor did not throw at all
+}
+
+TEST(DataSetIOTest, ThrowsIfCannotOpenSaveFile)
+{
+    bool checkedException = false;
+    try
+    {
+        auto ds = DataSet{};
+        std::string fn = "fake_directory_that_should_not_exist/out.xml";
+        ds.Save(fn);
+    }
+    catch(std::runtime_error& e)
+    {
+        const std::string msg = "could not open XML file for writing: fake_directory_that_should_not_exist/out.xml";
+        EXPECT_EQ(msg, e.what()) ;
+        checkedException = true;
+    }
+    EXPECT_TRUE(checkedException);
+}
+
+TEST(DataSetIOTest, ToXml)  // builds an AlignmentSet in code and compares SaveToStream() output to a literal XML document
+{
+    // top-level data: attributes of the root AlignmentSet element
+    DataSet dataset(DataSet::ALIGNMENT);
+    dataset.CreatedAt("2015-01-27T09:00:01");
+    dataset.MetaType("PacBio.DataSet.AlignmentSet");
+    dataset.Name("DataSet_AlignmentSet");
+    dataset.Tags("barcode moreTags mapping mytags");
+    dataset.TimeStampedName("my_tsn");
+    dataset.UniqueId("b095d0a3-94b8-4918-b3af-a3f81bbe519c");
+    dataset.Attribute("xmlns",              "http://pacificbiosciences.com/PacBioDatasets.xsd")
+           .Attribute("xmlns:xsi",          "http://www.w3.org/2001/XMLSchema-instance")
+           .Attribute("xsi:schemaLocation", "http://pacificbiosciences.com/PacBioDatasets.xsd");
+
+    // external resources: two BAM resources, each with one PacBio index attached
+    ExternalResource resource1("AlignmentFile.AlignmentBamFile", "file:/mnt/path/to/alignments2.bam");    // constructor arguments appear as MetaType and ResourceId in the expected XML
+    resource1.Name("Third Alignments BAM");
+    resource1.Description("Points to an example Alignments BAM file.");
+    resource1.Tags("Example");
+    resource1.TimeStampedName("my_tsn");  // fixed name/id values keep the serialized text comparable to the literal below
+    resource1.UniqueId("my_uuid");
+    FileIndex pbi1("PacBio.Index.PacBioIndex", "file:/mnt/path/to/alignments2.pbi");
+    pbi1.TimeStampedName("my_tsn");
+    pbi1.UniqueId("my_uuid");
+    resource1.FileIndices().Add(pbi1);
+    dataset.ExternalResources().Add(resource1);
+
+    ExternalResource resource2("AlignmentFile.AlignmentBamFile", "file:./alignments3.bam");
+    resource2.Name("Fourth Alignments BAM");
+    resource2.Description("Points to another example Alignments BAM file, by relative path.");
+    resource2.Tags("Example");
+    resource2.TimeStampedName("my_tsn");
+    resource2.UniqueId("my_uuid");
+    FileIndex pbi2("PacBio.Index.PacBioIndex", "file:/mnt/path/to/alignments3.pbi");
+    pbi2.TimeStampedName("my_tsn");
+    pbi2.UniqueId("my_uuid");
+
+    resource2.FileIndices().Add(pbi2);
+    dataset.ExternalResources().Add(resource2);
+
+    // sub-datasets with filters: each sub-dataset carries a single one-property filter
+    DataSetBase subDataSet1;
+    subDataSet1.Name("HighQuality Read Alignments");
+    subDataSet1.TimeStampedName("my_tsn");
+    subDataSet1.UniqueId("ab95d0a3-94b8-4918-b3af-a3f81bbe519c");
+    Filter filter1;
+    filter1.Properties().Add(Property("rq", "0.85", ">"));
+    subDataSet1.Filters().Add(filter1);
+    dataset.SubDataSets().Add(subDataSet1);
+
+    DataSetBase subDataSet2;
+    subDataSet2.Name("Alignments to chromosome 1");
+    subDataSet2.TimeStampedName("my_tsn");
+    subDataSet2.UniqueId("ac95d0a3-94b8-4918-b3af-a3f81bbe519c");
+    Filter filter2;
+    filter2.Properties().Add(Property("RNAME", "chr1", "=="));
+    subDataSet2.Filters().Add(filter2);
+    dataset.SubDataSets().Add(subDataSet2);
+
+    // expected serialization, compared byte-for-byte below. It contains attributes this test never sets (Version, xmlns:pbbase, xmlns:pbds, sub-dataset MetaType) - presumably library defaults, confirm against the writer.
+    const std::string expectedXml{
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<pbds:AlignmentSet "
+                "CreatedAt=\"2015-01-27T09:00:01\" "
+                "MetaType=\"PacBio.DataSet.AlignmentSet\" "
+                "Name=\"DataSet_AlignmentSet\" "
+                "Tags=\"barcode moreTags mapping mytags\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"b095d0a3-94b8-4918-b3af-a3f81bbe519c\" Version=\"3.0.1\" "
+                "xmlns=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+                "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:pbbase=\"http://pacificbiosciences.com/PacBioBaseDataModel.xsd\" "
+                "xmlns:pbds=\"http://pacificbiosciences.com/PacBioDatasets.xsd\">\n"
+        "\t<pbbase:ExternalResources>\n"
+        "\t\t<pbbase:ExternalResource "
+                "Description=\"Points to an example Alignments BAM file.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Third Alignments BAM\" "
+                "ResourceId=\"file:/mnt/path/to/alignments2.bam\" "
+                "Tags=\"Example\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"my_uuid\" Version=\"3.0.1\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                "MetaType=\"PacBio.Index.PacBioIndex\" "
+                "ResourceId=\"file:/mnt/path/to/alignments2.pbi\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"my_uuid\" Version=\"3.0.1\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t\t<pbbase:ExternalResource "
+                "Description=\"Points to another example Alignments BAM file, by relative path.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Fourth Alignments BAM\" "
+                "ResourceId=\"file:./alignments3.bam\" "
+                "Tags=\"Example\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"my_uuid\" Version=\"3.0.1\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                "MetaType=\"PacBio.Index.PacBioIndex\" "
+                "ResourceId=\"file:/mnt/path/to/alignments3.pbi\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"my_uuid\" Version=\"3.0.1\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t</pbbase:ExternalResources>\n"
+        "\t<pbds:DataSets>\n"
+        "\t\t<pbds:DataSet "
+                "MetaType=\"PacBio.DataSet.DataSet\" "
+                "Name=\"HighQuality Read Alignments\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"ab95d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+                "Version=\"3.0.1\">\n"
+        "\t\t\t<pbds:Filters>\n"
+        "\t\t\t\t<pbds:Filter>\n"
+        "\t\t\t\t\t<pbbase:Properties>\n"
+        "\t\t\t\t\t\t<pbbase:Property Name=\"rq\" Operator=\">\" Value=\"0.85\" />\n"
+        "\t\t\t\t\t</pbbase:Properties>\n"
+        "\t\t\t\t</pbds:Filter>\n"
+        "\t\t\t</pbds:Filters>\n"
+        "\t\t</pbds:DataSet>\n"
+        "\t\t<pbds:DataSet "
+                "MetaType=\"PacBio.DataSet.DataSet\" "
+                "Name=\"Alignments to chromosome 1\" "
+                "TimeStampedName=\"my_tsn\" "
+                "UniqueId=\"ac95d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+                "Version=\"3.0.1\">\n"
+        "\t\t\t<pbds:Filters>\n"
+        "\t\t\t\t<pbds:Filter>\n"
+        "\t\t\t\t\t<pbbase:Properties>\n"
+        "\t\t\t\t\t\t<pbbase:Property Name=\"RNAME\" Operator=\"==\" Value=\"chr1\" />\n"
+        "\t\t\t\t\t</pbbase:Properties>\n"
+        "\t\t\t\t</pbds:Filter>\n"
+        "\t\t\t</pbds:Filters>\n"
+        "\t\t</pbds:DataSet>\n"
+        "\t</pbds:DataSets>\n"
+        "</pbds:AlignmentSet>\n"};
+
+    std::ostringstream s;  // serialize to memory instead of a file
+    dataset.SaveToStream(s);
+    EXPECT_EQ(expectedXml, s.str());  // exact text match, including attribute order and tab indentation
+}
+
+namespace DataSetIOTests {
+
+static void TestFromXmlString()
+{
+    const std::string inputXml{
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<pbds:AlignmentSet "
+            "CreatedAt=\"2015-01-27T09:00:01\" "
+            "MetaType=\"PacBio.DataSet.AlignmentSet\" "
+            "Name=\"DataSet_AlignmentSet\" "
+            "Tags=\"barcode moreTags mapping mytags\" "
+            "UniqueId=\"b095d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+            "Version=\"2.3.0\" "
+            "xmlns=\"http://pacificbiosciences.com/PacBioDataModel.xsd\" "
+            "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+            "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDataModel.xsd\">\n"
+        "\t<pbbase:ExternalResources>\n"
+        "\t\t<pbbase:ExternalResource "
+                "Description=\"Points to an example Alignments BAM file.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Third Alignments BAM\" "
+                "ResourceId=\"file:/mnt/path/to/alignments2.bam\" "
+                "Tags=\"Example\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                    "MetaType=\"PacBio.Index.PacBioIndex\" "
+                    "ResourceId=\"file:/mnt/path/to/alignments2.pbi\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t\t<pbbase:ExternalResource "
+                "Description=\"Points to another example Alignments BAM file, by relative path.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Fourth Alignments BAM\" "
+                "ResourceId=\"file:./alignments3.bam\" "
+                "Tags=\"Example\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                    "MetaType=\"PacBio.Index.PacBioIndex\" "
+                    "ResourceId=\"file:/mnt/path/to/alignments3.pbi\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t</pbbase:ExternalResources>\n"
+        "\t<pbds:DataSets>\n"
+        "\t\t<pbds:DataSet "
+                "Name=\"HighQuality Read Alignments\" "
+                "UniqueId=\"ab95d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+                "Version=\"2.3.0\">\n"
+        "\t\t\t<pbds:Filters>\n"
+        "\t\t\t\t<pbds:Filter>\n"
+        "\t\t\t\t\t<pbbase:Properties>\n"
+        "\t\t\t\t\t\t<pbbase:Property Name=\"rq\" Operator=\">\" Value=\"0.85\" />\n"
+        "\t\t\t\t\t</pbbase:Properties>\n"
+        "\t\t\t\t</pbds:Filter>\n"
+        "\t\t\t</pbds:Filters>\n"
+        "\t\t</pbds:DataSet>\n"
+        "\t\t<pbds:DataSet "
+                "Name=\"Alignments to chromosome 1\" "
+                "UniqueId=\"ac95d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+                "Version=\"2.3.0\">\n"
+        "\t\t\t<pbds:Filters>\n"
+        "\t\t\t\t<pbds:Filter>\n"
+        "\t\t\t\t\t<pbbase:Properties>\n"
+        "\t\t\t\t\t\t<pbbase:Property Name=\"RNAME\" Operator=\"==\" Value=\"chr1\" />\n"
+        "\t\t\t\t\t</pbbase:Properties>\n"
+        "\t\t\t\t</pbds:Filter>\n"
+        "\t\t\t</pbds:Filters>\n"
+        "\t\t</pbds:DataSet>\n"
+        "\t</pbds:DataSets>\n"
+        "</pbds:AlignmentSet>\n"};
+
+    const DataSet dataset = DataSet::FromXml(inputXml);
+
+    EXPECT_EQ(DataSet::ALIGNMENT,                     dataset.Type());
+    EXPECT_EQ("2015-01-27T09:00:01",                  dataset.CreatedAt());
+    EXPECT_EQ("PacBio.DataSet.AlignmentSet",          dataset.MetaType());
+    EXPECT_EQ("DataSet_AlignmentSet",                 dataset.Name());
+    EXPECT_EQ("barcode moreTags mapping mytags",      dataset.Tags());
+    EXPECT_EQ("b095d0a3-94b8-4918-b3af-a3f81bbe519c", dataset.UniqueId());
+    EXPECT_EQ("2.3.0",                                dataset.Version());
+    EXPECT_EQ("http://pacificbiosciences.com/PacBioDataModel.xsd", dataset.Attribute("xmlns"));
+    EXPECT_EQ("http://www.w3.org/2001/XMLSchema-instance",         dataset.Attribute("xmlns:xsi"));
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    EXPECT_EQ(2, resources.Size());
+
+    const ExternalResource& resource1 = resources[0];
+    EXPECT_EQ("Third Alignments BAM",                      resource1.Name());
+    EXPECT_EQ("Points to an example Alignments BAM file.", resource1.Description());
+    EXPECT_EQ("AlignmentFile.AlignmentBamFile",            resource1.MetaType());
+    EXPECT_EQ("file:/mnt/path/to/alignments2.bam",         resource1.ResourceId());
+    EXPECT_EQ("Example",                                   resource1.Tags());
+    const FileIndices& fileIndices1 = resource1.FileIndices();
+    EXPECT_EQ(1, fileIndices1.Size());
+    const FileIndex& pbi1 = fileIndices1[0];
+    EXPECT_EQ("PacBio.Index.PacBioIndex",          pbi1.MetaType());
+    EXPECT_EQ("file:/mnt/path/to/alignments2.pbi", pbi1.ResourceId());
+
+    const ExternalResource& resource2 = resources[1];
+    EXPECT_EQ("Fourth Alignments BAM",                     resource2.Name());
+    EXPECT_EQ("Points to another example Alignments BAM file, by relative path.", resource2.Description());
+    EXPECT_EQ("AlignmentFile.AlignmentBamFile",            resource2.MetaType());
+    EXPECT_EQ("file:./alignments3.bam",                    resource2.ResourceId());
+    EXPECT_EQ("Example",                                   resource2.Tags());
+    const FileIndices& fileIndices2 = resource2.FileIndices();
+    EXPECT_EQ(1, fileIndices2.Size());
+    const FileIndex& pbi2 = fileIndices2[0];
+    EXPECT_EQ("PacBio.Index.PacBioIndex",          pbi2.MetaType());
+    EXPECT_EQ("file:/mnt/path/to/alignments3.pbi", pbi2.ResourceId());
+
+    const SubDataSets& subDatasets = dataset.SubDataSets();
+    EXPECT_EQ(2, subDatasets.Size());
+
+    const DataSetBase& sub1 = subDatasets[0];
+    EXPECT_EQ("HighQuality Read Alignments",          sub1.Name());
+    EXPECT_EQ("ab95d0a3-94b8-4918-b3af-a3f81bbe519c", sub1.UniqueId());
+    EXPECT_EQ("2.3.0",                                sub1.Version());
+    const Filters& sub1Filters = sub1.Filters();
+    EXPECT_EQ(1, sub1Filters.Size());
+    const Filter& sub1Filter = sub1Filters[0];
+    EXPECT_EQ(1, sub1Filter.Properties().Size());
+    const Property& property1 = sub1Filter.Properties()[0];
+    EXPECT_EQ("rq",   property1.Name());
+    EXPECT_EQ(">",    property1.Operator());
+    EXPECT_EQ("0.85", property1.Value());
+
+    const DataSetBase& sub2 = subDatasets[1];
+    EXPECT_EQ("Alignments to chromosome 1",          sub2.Name());
+    EXPECT_EQ("ac95d0a3-94b8-4918-b3af-a3f81bbe519c", sub2.UniqueId());
+    EXPECT_EQ("2.3.0",                                sub2.Version());
+    const Filters& sub2Filters = sub2.Filters();
+    EXPECT_EQ(1, sub2Filters.Size());
+    const Filter& sub2Filter = sub2Filters[0];
+    EXPECT_EQ(1, sub2Filter.Properties().Size());
+    const Property& property2 = sub2Filter.Properties()[0];
+    EXPECT_EQ("RNAME",   property2.Name());
+    EXPECT_EQ("==",    property2.Operator());
+    EXPECT_EQ("chr1", property2.Value());
+}
+
+static void TestAli1Xml()
+{
+    const DataSet dataset(DataSetIOTests::ali1XmlFn);
+    EXPECT_EQ(DataSet::ALIGNMENT, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"),                  dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.AlignmentSet"),          dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_AlignmentSet"),                 dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"),      dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"),                                dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string("First Alignments BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to an example Alignments BAM file."), resource.Description());
+            EXPECT_EQ(std::string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments0.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments0.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string("Second Alignments BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to another example Alignments BAM file, by relative path."), resource.Description());
+            EXPECT_EQ(std::string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:./alignments1.bam"),  resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments1.pbi"), index.ResourceId());
+        }
+    }
+
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    ASSERT_EQ(2, subdatasets.Size());
+    for (size_t i = 0; i < subdatasets.Size(); ++i) {
+        const DataSetBase& subdataset = subdatasets[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string(""), subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string("HighQuality Read Alignments"), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("ab95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("rq"), property.Name());
+            EXPECT_EQ(std::string("0.85"), property.Value());
+            EXPECT_EQ(std::string(">"), property.Operator());
+        }
+        else {
+            EXPECT_EQ(std::string(""), subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string("Alignments to chromosome 1"), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("ac95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("RNAME"), property.Name());
+            EXPECT_EQ(std::string("chr1"), property.Value());
+            EXPECT_EQ(std::string("=="), property.Operator());
+        }
+    }
+}
+
+static void TestAli2Xml()
+{
+    const DataSet dataset(DataSetIOTests::ali2XmlFn);
+    EXPECT_EQ(DataSet::ALIGNMENT, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"),                  dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.AlignmentSet"),          dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_AlignmentSet"),                 dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"),      dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"),                                dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string("First Alignments BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to an example Alignments BAM file."), resource.Description());
+            EXPECT_EQ(std::string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments2.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments2.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string("Second Alignments BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to another example Alignments BAM file, by relative path."), resource.Description());
+            EXPECT_EQ(std::string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:./alignments3.bam"),  resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments3.pbi"), index.ResourceId());
+        }
+    }
+
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    ASSERT_EQ(2, subdatasets.Size());
+    for (size_t i = 0; i < subdatasets.Size(); ++i) {
+        const DataSetBase& subdataset = subdatasets[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string(""), subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string("HighQuality Read Alignments"), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("ab95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("rq"), property.Name());
+            EXPECT_EQ(std::string("0.85"), property.Value());
+            EXPECT_EQ(std::string(">"), property.Operator());
+        }
+        else {
+            EXPECT_EQ(std::string(""), subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string("Alignments to chromosome 1"), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("ac95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("RNAME"), property.Name());
+            EXPECT_EQ(std::string("chr1"), property.Value());
+            EXPECT_EQ(std::string("=="), property.Operator());
+        }
+    }
+}
+
+static void TestAli3Xml()
+{
+    const DataSet dataset(DataSetIOTests::ali3XmlFn);
+    EXPECT_EQ(DataSet::ALIGNMENT, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"),                  dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.AlignmentSet"),          dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_AlignmentSet"),                 dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"),      dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"),                                dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string("First Alignments BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to an example Alignments BAM file."), resource.Description());
+            EXPECT_EQ(std::string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments2.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments2.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string("Second Alignments BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to another example Alignments BAM file, by relative path."), resource.Description());
+            EXPECT_EQ(std::string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:./alignments3.bam"),  resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments3.pbi"), index.ResourceId());
+        }
+    }
+
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    ASSERT_EQ(2, subdatasets.Size());
+    for (size_t i = 0; i < subdatasets.Size(); ++i) {
+        const DataSetBase& subdataset = subdatasets[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string(""), subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string("HighQuality Read Alignments"), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("ab95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("rq"), property.Name());
+            EXPECT_EQ(std::string("0.75"), property.Value());
+            EXPECT_EQ(std::string(">"), property.Operator());
+        }
+        else {
+            EXPECT_EQ(std::string(""), subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string("Alignments to chromosome 1"), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("ac95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("RNAME"), property.Name());
+            EXPECT_EQ(std::string("chr1"), property.Value());
+            EXPECT_EQ(std::string("=="), property.Operator());
+        }
+    }
+}
+
+static void TestAli4Xml()
+{
+    const DataSet dataset(DataSetIOTests::ali4XmlFn);
+    EXPECT_EQ(DataSet::ALIGNMENT, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"),                  dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.AlignmentSet"),          dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_AlignmentSet"),                 dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"),      dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"),                                dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string("First Alignments BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to an example Alignments BAM file."), resource.Description());
+            EXPECT_EQ(std::string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments0.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments0.pbi"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string("Second Alignments BAM"), resource.Name());
+            EXPECT_EQ(std::string("Points to another example Alignments BAM file, by relative path."), resource.Description());
+            EXPECT_EQ(std::string("AlignmentFile.AlignmentBamFile"), resource.MetaType());
+            EXPECT_EQ(std::string("file:./alignments1.bam"),  resource.ResourceId());
+            EXPECT_EQ(std::string("Example"), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:///mnt/path/to/alignments1.pbi"), index.ResourceId());
+        }
+    }
+
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    ASSERT_EQ(2, subdatasets.Size());
+    for (size_t i = 0; i < subdatasets.Size(); ++i) {
+        const DataSetBase& subdataset = subdatasets[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string(""), subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string("HighQuality Read Alignments"), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("ab95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("rq"), property.Name());
+            EXPECT_EQ(std::string("0.85"), property.Value());
+            EXPECT_EQ(std::string(">"), property.Operator());
+        }
+        else {
+            EXPECT_EQ(std::string(""), subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string("Alignments to chromosome 1"), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("ac95d0a3-94b8-4918-b3af-a3f81bbe519c"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+
+            const Filters& filters = subdataset.Filters();
+            ASSERT_EQ(1, filters.Size());
+            const Filter& filter = filters[0];
+            const Properties& properties = filter.Properties();
+            ASSERT_EQ(1, properties.Size());
+            const Property& property = properties[0];
+            EXPECT_EQ(std::string("RNAME"), property.Name());
+            EXPECT_EQ(std::string("chr1"), property.Value());
+            EXPECT_EQ(std::string("=="), property.Operator());
+        }
+    }
+}
+
+static void TestMappingStaggeredXml()
+{
+    const DataSet dataset(DataSetIOTests::mappingStaggeredXmlFn);
+    EXPECT_EQ(DataSet::GENERIC, dataset.Type());
+    EXPECT_EQ(std::string("2015-05-13T10:58:26"),    dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.DataSet"), dataset.MetaType());
+    EXPECT_EQ(std::string(""), dataset.Name());
+    EXPECT_EQ(std::string(""), dataset.Tags());
+    EXPECT_EQ(std::string("30f72098-bc5b-e06b-566c-8b28dda909a8"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string(""), resource.Name());
+            EXPECT_EQ(std::string(""), resource.Description());
+            EXPECT_EQ(std::string(""), resource.MetaType());
+            EXPECT_EQ(std::string("file:tests/data/bam_mapping_1.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string(""), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:tests/data/bam_mapping_1.bam.bai"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string(""), resource.Name());
+            EXPECT_EQ(std::string(""), resource.Description());
+            EXPECT_EQ(std::string(""), resource.MetaType());
+            EXPECT_EQ(std::string("file:tests/data/bam_mapping_2.bam"), resource.ResourceId());
+            EXPECT_EQ(std::string(""), resource.Tags());
+
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:tests/data/bam_mapping_2.bam.bai"), index.ResourceId());
+        }
+    }
+
+    const SubDataSets& subdatasets = dataset.SubDataSets();
+    ASSERT_EQ(2, subdatasets.Size());
+    for (size_t i = 0; i < subdatasets.Size(); ++i) {
+        const DataSetBase& subdataset = subdatasets[i];
+        if (i == 0) {
+            EXPECT_EQ(std::string("2015-05-13T10:58:26"),    subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string(""), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("c5402d06-4643-057c-e300-fe229b4e8909"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+
+            const ExternalResources& subResources = subdataset.ExternalResources();
+            ASSERT_EQ(1, subResources.Size());
+            const ExternalResource& resource = subResources[0];
+            EXPECT_EQ(std::string("file:tests/data/bam_mapping_2.bam"), resource.ResourceId());
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:tests/data/bam_mapping_2.bam.bai"), index.ResourceId());
+        }
+        else {
+            EXPECT_EQ(std::string("2015-05-13T10:58:26"),    subdataset.CreatedAt());
+            EXPECT_EQ(std::string(""), subdataset.MetaType());
+            EXPECT_EQ(std::string(""), subdataset.Name());
+            EXPECT_EQ(std::string(""), subdataset.Tags());
+            EXPECT_EQ(std::string("f8b54a55-5fb7-706f-ab35-39afc9c86924"), subdataset.UniqueId());
+            EXPECT_EQ(std::string("2.3.0"), subdataset.Version());
+
+            const ExternalResources& subResources = subdataset.ExternalResources();
+            ASSERT_EQ(1, subResources.Size());
+            const ExternalResource& resource = subResources[0];
+            EXPECT_EQ(std::string("file:tests/data/bam_mapping_1.bam"), resource.ResourceId());
+            const FileIndices& fileIndices = resource.FileIndices();
+            ASSERT_EQ(1, fileIndices.Size());
+            const FileIndex& index = fileIndices[0];
+            EXPECT_EQ(std::string("file:tests/data/bam_mapping_1.bam.bai"), index.ResourceId());
+        }
+    }
+}
+
+static void TestBarcodeXml()
+{
+    const DataSet dataset(DataSetIOTests::barcodeXmlFn);
+    EXPECT_EQ(DataSet::BARCODE, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"),    dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.BarcodeSet"), dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_BarcodeSet"), dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"),        dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(1, resources.Size());
+    const ExternalResource& resource = resources[0];
+    EXPECT_EQ(std::string("First Barcodes FASTA"), resource.Name());
+    EXPECT_EQ(std::string("Points to an example Barcodes FASTA file."), resource.Description());
+    EXPECT_EQ(std::string("BarcodeFile.BarcodeFastaFile"), resource.MetaType());
+    EXPECT_EQ(std::string("file:///mnt/path/to/barcode.fasta"), resource.ResourceId());
+    EXPECT_EQ(std::string("Example"), resource.Tags());
+
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(std::string("30"),     metadata.NumRecords());
+    EXPECT_EQ(std::string("400"),    metadata.TotalLength());
+
+    // access metadata extensions directly for now
+    EXPECT_EQ(std::string("paired"), metadata.ChildText("BarcodeConstruction"));
+}
+
+// Loads the dataset at DataSetIOTests::ccsReadXmlFn and verifies its top-level
+// attributes, that it has no filters or sub-datasets, and both of its external
+// resources (each with exactly one file index).
+static void TestCcsReadXml()
+{
+    const DataSet dataset(DataSetIOTests::ccsReadXmlFn);
+
+    // dataset-level attributes
+    EXPECT_EQ(DataSet::CONSENSUS_READ, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"), dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.ConsensusReadSet"), dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_ConsensusReadSet"), dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    // the size check is fatal, so resources[0] and resources[1] both exist below
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+
+    {   // first resource
+        const ExternalResource& resource = resources[0];
+        EXPECT_EQ(std::string("First ConsensusRead BAM"), resource.Name());
+        EXPECT_EQ(std::string("Points to an example ConsensusRead BAM file."), resource.Description());
+        EXPECT_EQ(std::string("PacBio.ConsensusReadFile.ConsensusReadBamFile"), resource.MetaType());
+        EXPECT_EQ(std::string("file:///mnt/path/to/ccsreads0.bam"), resource.ResourceId());
+        EXPECT_EQ(std::string("Example"), resource.Tags());
+
+        const FileIndices& fileIndices = resource.FileIndices();
+        ASSERT_EQ(1, fileIndices.Size());
+        const FileIndex& index = fileIndices[0];
+        EXPECT_EQ(std::string("PacBio.Index.PacBioIndex"), index.MetaType());
+        EXPECT_EQ(std::string("file:///mnt/path/to/ccsreads0.pbi"), index.ResourceId());
+    }
+
+    {   // second resource
+        const ExternalResource& resource = resources[1];
+        EXPECT_EQ(std::string("Second ConsensusRead BAM"), resource.Name());
+        EXPECT_EQ(std::string("Points to another example ConsensusRead BAM file."), resource.Description());
+        EXPECT_EQ(std::string("PacBio.ConsensusReadFile.ConsensusReadBamFile"), resource.MetaType());
+        EXPECT_EQ(std::string("file:///mnt/path/to/ccsreads1.bam"), resource.ResourceId());
+        EXPECT_EQ(std::string("Example"), resource.Tags());
+
+        const FileIndices& fileIndices = resource.FileIndices();
+        ASSERT_EQ(1, fileIndices.Size());
+        const FileIndex& index = fileIndices[0];
+        EXPECT_EQ(std::string("PacBio.Index.PacBioIndex"), index.MetaType());
+        // NOTE(review): expects ccsreads0.pbi (not ccsreads1.pbi) for the second
+        // BAM - presumably mirrors the fixture file; confirm against the XML.
+        EXPECT_EQ(std::string("file:///mnt/path/to/ccsreads0.pbi"), index.ResourceId());
+    }
+}
+
+// Loads the dataset at DataSetIOTests::lambdaContigsXmlFn and verifies its
+// top-level attributes (empty name and tags included) and the resource id of
+// its single external resource.
+static void TestLambdaContigsXml()
+{
+    const DataSet dataset(DataSetIOTests::lambdaContigsXmlFn);
+
+    // dataset-level attributes
+    EXPECT_EQ(DataSet::REFERENCE, dataset.Type());
+    EXPECT_EQ(std::string("2015-05-28T10:56:36"), dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.ReferenceSet"), dataset.MetaType());
+    EXPECT_EQ(std::string(""), dataset.Name());
+    EXPECT_EQ(std::string(""), dataset.Tags());
+    EXPECT_EQ(std::string("596e87db-34f9-d2fd-c905-b017543170e1"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    // exactly one resource is expected; the fatal check guards the index below
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(1, resources.Size());
+    EXPECT_EQ(std::string("file:tests/data/lambda_contigs.fasta"), resources[0].ResourceId());
+}
+
+// Loads the dataset at DataSetIOTests::pbalchemyXmlFn and verifies its
+// top-level attributes, its single external resource, and that resource's
+// single file index. Filters are not inspected here (see trailing note).
+static void TestPbalchemyXml()
+{
+    const DataSet dataset(DataSetIOTests::pbalchemyXmlFn);
+
+    // dataset-level attributes
+    EXPECT_EQ(DataSet::GENERIC, dataset.Type());
+    EXPECT_EQ(std::string("2015-05-22T16:56:16"), dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.DataSet"), dataset.MetaType());
+    EXPECT_EQ(std::string(""), dataset.Name());
+    EXPECT_EQ(std::string(""), dataset.Tags());
+    EXPECT_EQ(std::string("58e3f7c5-24c1-b58b-fbd5-37de268cc2f0"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    // one resource carrying one index; both size checks are fatal
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(1, resources.Size());
+    const ExternalResource& resource = resources[0];
+    EXPECT_EQ(std::string("file:tests/data/pbalchemy10kbp.pbalign.sorted.pbver1.bam"), resource.ResourceId());
+
+    const FileIndices& fileIndices = resource.FileIndices();
+    ASSERT_EQ(1, fileIndices.Size());
+    EXPECT_EQ(std::string("file:tests/data/pbalchemy10kbp.pbalign.sorted.pbver1.bam.bai"), fileIndices[0].ResourceId());
+
+    // TYPOs: Should be Filter Properties/Property not Parameter(s)
+
+}
+
+// Loads the dataset at DataSetIOTests::referenceXmlFn and verifies its
+// top-level attributes, its single external resource with two file indices,
+// the dataset metadata, and the single contig listed under "Contigs".
+static void TestReferenceXml()
+{
+    const DataSet dataset(DataSetIOTests::referenceXmlFn);
+
+    // dataset-level attributes
+    EXPECT_EQ(DataSet::REFERENCE, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"), dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.ReferenceSet"), dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_ReferenceSet"), dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    // the single external resource
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(1, resources.Size());
+    const ExternalResource& resource = resources[0];
+    EXPECT_EQ(std::string("First References FASTA"), resource.Name());
+    EXPECT_EQ(std::string("Points to an example references FASTA file."), resource.Description());
+    EXPECT_EQ(std::string("PacBio.ReferenceFile.ReferenceFastaFile"), resource.MetaType());
+    EXPECT_EQ(std::string("file:///mnt/path/to/reference.fasta"), resource.ResourceId());
+    EXPECT_EQ(std::string("Example"), resource.Tags());
+
+    // its two indices; the size check is fatal, so [0] and [1] both exist below
+    const FileIndices& fileIndices = resource.FileIndices();
+    ASSERT_EQ(2, fileIndices.Size());
+    {
+        const FileIndex& index = fileIndices[0];
+        EXPECT_EQ(std::string("PacBio.Index.SaWriterIndex"), index.MetaType());
+        EXPECT_EQ(std::string("file:///mnt/path/to/reference.fasta.sa"), index.ResourceId());
+    }
+    {
+        const FileIndex& index = fileIndices[1];
+        EXPECT_EQ(std::string("PacBio.Index.SamIndex"), index.MetaType());
+        EXPECT_EQ(std::string("file:///mnt/path/to/reference.fasta.fai"), index.ResourceId());
+    }
+
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(std::string("500"), metadata.NumRecords());
+    EXPECT_EQ(std::string("5000000"), metadata.TotalLength());
+
+    // access metadata extensions directly for now
+    EXPECT_EQ(std::string("Tribble"), metadata.ChildText("Organism"));
+    EXPECT_EQ(std::string("Diploid"), metadata.ChildText("Ploidy"));
+
+    // contigs are read through the generic list-element interface
+    const internal::DataSetListElement<internal::DataSetElement>& contigs =
+            metadata.Child<internal::DataSetListElement<internal::DataSetElement> >("Contigs");
+    ASSERT_EQ(1, contigs.NumChildren());
+    const internal::DataSetElement& contig = contigs[0];
+    EXPECT_EQ(std::string("gi|229359445|emb|AM181176.4|"), contig.Attribute("Name"));
+    EXPECT_EQ(std::string("Pseudomonas fluorescens SBW25 complete genome|quiver"), contig.Attribute("Description"));
+    EXPECT_EQ(std::string("6722109"), contig.Attribute("Length"));
+    EXPECT_EQ(std::string("f627c795efad7ce0050ed42b942d408e"), contig.Attribute("Digest"));
+}
+
+// Loads the dataset at DataSetIOTests::subread1XmlFn and verifies its
+// top-level attributes, both external resources (one file index each), both
+// filters (one property each), and the dataset metadata.
+static void TestSubread1Xml()
+{
+    const DataSet dataset(DataSetIOTests::subread1XmlFn);
+
+    // dataset-level attributes
+    EXPECT_EQ(DataSet::SUBREAD, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"), dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.SubreadSet"), dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_SubreadSet"), dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    // the size check is fatal, so resources[0] and resources[1] both exist below
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+
+    {   // first resource
+        const ExternalResource& resource = resources[0];
+        EXPECT_EQ(std::string("First Subreads BAM"), resource.Name());
+        EXPECT_EQ(std::string("Points to an example Subreads BAM file."), resource.Description());
+        EXPECT_EQ(std::string("SubreadFile.SubreadBamFile"), resource.MetaType());
+        EXPECT_EQ(std::string("file:///mnt/path/to/subreads0.bam"), resource.ResourceId());
+        EXPECT_EQ(std::string("Example"), resource.Tags());
+
+        const FileIndices& fileIndices = resource.FileIndices();
+        ASSERT_EQ(1, fileIndices.Size());
+        const FileIndex& index = fileIndices[0];
+        EXPECT_EQ(std::string("file:///mnt/path/to/subreads0.pbi"), index.ResourceId());
+    }
+
+    {   // second resource
+        const ExternalResource& resource = resources[1];
+        EXPECT_EQ(std::string("Second Subreads BAM"), resource.Name());
+        EXPECT_EQ(std::string("Points to another example Subreads BAM file."), resource.Description());
+        EXPECT_EQ(std::string("SubreadFile.SubreadBamFile"), resource.MetaType());
+        EXPECT_EQ(std::string("file:///mnt/path/to/subreads1.bam"), resource.ResourceId());
+        EXPECT_EQ(std::string("Example"), resource.Tags());
+
+        const FileIndices& fileIndices = resource.FileIndices();
+        ASSERT_EQ(1, fileIndices.Size());
+        const FileIndex& index = fileIndices[0];
+        // NOTE(review): expects subreads0.pbi (not subreads1.pbi) for the second
+        // BAM - presumably mirrors the fixture file; confirm against the XML.
+        EXPECT_EQ(std::string("file:///mnt/path/to/subreads0.pbi"), index.ResourceId());
+    }
+
+    // every filter carries exactly one property; only the expected values differ
+    const Filters& filters = dataset.Filters();
+    ASSERT_EQ(2, filters.Size());
+    for (size_t i = 0; i < filters.Size(); ++i) {
+        const Properties& properties = filters[i].Properties();
+        ASSERT_EQ(1, properties.Size());
+        const Property& property = properties[0];
+        if (i == 0) {
+            EXPECT_EQ(std::string("rq"), property.Name());
+            EXPECT_EQ(std::string("0.75"), property.Value());
+            EXPECT_EQ(std::string(">"), property.Operator());
+        } else {
+            EXPECT_EQ(std::string("QNAME"), property.Name());
+            EXPECT_EQ(std::string("100/0/0_100"), property.Value());
+            EXPECT_EQ(std::string("=="), property.Operator());
+        }
+    }
+
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(std::string("500"), metadata.NumRecords());
+    EXPECT_EQ(std::string("500000"), metadata.TotalLength());
+}
+
+// Loads the dataset at DataSetIOTests::subread2XmlFn and verifies its
+// top-level attributes, both external resources (one file index each), both
+// filters (one property each), and the dataset metadata.
+static void TestSubread2Xml()
+{
+    const DataSet dataset(DataSetIOTests::subread2XmlFn);
+
+    // dataset-level attributes
+    EXPECT_EQ(DataSet::SUBREAD, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"), dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.SubreadSet"), dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_SubreadSet"), dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    // the size check is fatal, so resources[0] and resources[1] both exist below
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+
+    {   // first resource
+        const ExternalResource& resource = resources[0];
+        EXPECT_EQ(std::string("First Subreads BAM"), resource.Name());
+        EXPECT_EQ(std::string("Points to an example Subreads BAM file."), resource.Description());
+        EXPECT_EQ(std::string("SubreadFile.SubreadBamFile"), resource.MetaType());
+        EXPECT_EQ(std::string("file:///mnt/path/to/subreads2.bam"), resource.ResourceId());
+        EXPECT_EQ(std::string("Example"), resource.Tags());
+
+        const FileIndices& fileIndices = resource.FileIndices();
+        ASSERT_EQ(1, fileIndices.Size());
+        const FileIndex& index = fileIndices[0];
+        EXPECT_EQ(std::string("file:///mnt/path/to/subreads2.pbi"), index.ResourceId());
+    }
+
+    {   // second resource
+        const ExternalResource& resource = resources[1];
+        EXPECT_EQ(std::string("Second Subreads BAM"), resource.Name());
+        EXPECT_EQ(std::string("Points to another example Subreads BAM file."), resource.Description());
+        EXPECT_EQ(std::string("SubreadFile.SubreadBamFile"), resource.MetaType());
+        EXPECT_EQ(std::string("file:///mnt/path/to/subreads3.bam"), resource.ResourceId());
+        EXPECT_EQ(std::string("Example"), resource.Tags());
+
+        const FileIndices& fileIndices = resource.FileIndices();
+        ASSERT_EQ(1, fileIndices.Size());
+        const FileIndex& index = fileIndices[0];
+        EXPECT_EQ(std::string("file:///mnt/path/to/subreads3.pbi"), index.ResourceId());
+    }
+
+    // every filter carries exactly one property; only the expected values differ
+    const Filters& filters = dataset.Filters();
+    ASSERT_EQ(2, filters.Size());
+    for (size_t i = 0; i < filters.Size(); ++i) {
+        const Properties& properties = filters[i].Properties();
+        ASSERT_EQ(1, properties.Size());
+        const Property& property = properties[0];
+        if (i == 0) {
+            EXPECT_EQ(std::string("rq"), property.Name());
+            EXPECT_EQ(std::string("0.75"), property.Value());
+            EXPECT_EQ(std::string(">"), property.Operator());
+        } else {
+            EXPECT_EQ(std::string("QNAME"), property.Name());
+            EXPECT_EQ(std::string("100/0/0_100"), property.Value());
+            EXPECT_EQ(std::string("=="), property.Operator());
+        }
+    }
+
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(std::string("500"), metadata.NumRecords());
+    EXPECT_EQ(std::string("500000"), metadata.TotalLength());
+}
+
+// Loads the dataset at DataSetIOTests::subread3XmlFn and verifies its
+// top-level attributes, both external resources (one file index each), both
+// filters (one property each), and the dataset metadata.
+static void TestSubread3Xml()
+{
+    const DataSet dataset(DataSetIOTests::subread3XmlFn);
+
+    // dataset-level attributes
+    EXPECT_EQ(DataSet::SUBREAD, dataset.Type());
+    EXPECT_EQ(std::string("2015-01-27T09:00:01"), dataset.CreatedAt());
+    EXPECT_EQ(std::string("PacBio.DataSet.SubreadSet"), dataset.MetaType());
+    EXPECT_EQ(std::string("DataSet_SubreadSet"), dataset.Name());
+    EXPECT_EQ(std::string("barcode moreTags mapping mytags"), dataset.Tags());
+    EXPECT_EQ(std::string("b095d0a3-94b8-4918-b3af-a3f81bbe519c"), dataset.UniqueId());
+    EXPECT_EQ(std::string("2.3.0"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema-instance"), dataset.Attribute("xmlns:xsi"));
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xsi:schemaLocation"));
+
+    // the size check is fatal, so resources[0] and resources[1] both exist below
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(2, resources.Size());
+
+    {   // first resource
+        const ExternalResource& resource = resources[0];
+        EXPECT_EQ(std::string("First Subreads BAM"), resource.Name());
+        EXPECT_EQ(std::string("Points to an example Subreads BAM file."), resource.Description());
+        EXPECT_EQ(std::string("SubreadFile.SubreadBamFile"), resource.MetaType());
+        EXPECT_EQ(std::string("file:///mnt/path/to/subreads2.bam"), resource.ResourceId());
+        EXPECT_EQ(std::string("Example"), resource.Tags());
+
+        const FileIndices& fileIndices = resource.FileIndices();
+        ASSERT_EQ(1, fileIndices.Size());
+        const FileIndex& index = fileIndices[0];
+        EXPECT_EQ(std::string("file:///mnt/path/to/subreads2.pbi"), index.ResourceId());
+    }
+
+    {   // second resource
+        const ExternalResource& resource = resources[1];
+        EXPECT_EQ(std::string("Second Subreads BAM"), resource.Name());
+        EXPECT_EQ(std::string("Points to another example Subreads BAM file."), resource.Description());
+        EXPECT_EQ(std::string("SubreadFile.SubreadBamFile"), resource.MetaType());
+        EXPECT_EQ(std::string("file:///mnt/path/to/subreads3.bam"), resource.ResourceId());
+        EXPECT_EQ(std::string("Example"), resource.Tags());
+
+        const FileIndices& fileIndices = resource.FileIndices();
+        ASSERT_EQ(1, fileIndices.Size());
+        const FileIndex& index = fileIndices[0];
+        EXPECT_EQ(std::string("file:///mnt/path/to/subreads3.pbi"), index.ResourceId());
+    }
+
+    // every filter carries exactly one property; only the expected values differ
+    const Filters& filters = dataset.Filters();
+    ASSERT_EQ(2, filters.Size());
+    for (size_t i = 0; i < filters.Size(); ++i) {
+        const Properties& properties = filters[i].Properties();
+        ASSERT_EQ(1, properties.Size());
+        const Property& property = properties[0];
+        if (i == 0) {
+            EXPECT_EQ(std::string("rq"), property.Name());
+            EXPECT_EQ(std::string("0.85"), property.Value());
+            EXPECT_EQ(std::string(">"), property.Operator());
+        } else {
+            EXPECT_EQ(std::string("QNAME"), property.Name());
+            EXPECT_EQ(std::string("100/0/0_100"), property.Value());
+            EXPECT_EQ(std::string("=="), property.Operator());
+        }
+    }
+
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(std::string("500"), metadata.NumRecords());
+    EXPECT_EQ(std::string("500000"), metadata.TotalLength());
+}
+
+// Loads the dataset at DataSetIOTests::transformedXmlFn and verifies its
+// top-level attributes, its three external resources, and the dataset
+// metadata.
+static void TestTransformedXml()
+{
+    const DataSet dataset(DataSetIOTests::transformedXmlFn);
+
+    // dataset-level attributes
+    EXPECT_EQ(DataSet::HDF_SUBREAD, dataset.Type());
+    EXPECT_EQ(std::string("PacBio.DataSet.SubreadSet"), dataset.MetaType());
+    EXPECT_EQ(std::string("Subreads from run r001173_42129_130607"), dataset.Name());
+    EXPECT_EQ(std::string("pacbio.secondary.instrument=RS"), dataset.Tags());
+    EXPECT_EQ(std::string("abbc9183-b01e-4671-8c12-19efee534647"), dataset.UniqueId());
+    EXPECT_EQ(std::string("0.5"), dataset.Version());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), dataset.Attribute("xmlns"));
+    EXPECT_EQ(std::string("http://www.w3.org/2001/XMLSchema"), dataset.Attribute("xmlns:xs"));
+    EXPECT_EQ(std::string("http://www.w3.org/2005/xpath-functions"), dataset.Attribute("xmlns:fn"));
+    EXPECT_EQ(std::string("java:java.util.UUID"), dataset.Attribute("xmlns:uuid"));
+    EXPECT_EQ(std::string("http://whatever"), dataset.Attribute("xmlns:bax"));
+
+    EXPECT_EQ(0, dataset.Filters().Size());
+    EXPECT_EQ(0, dataset.SubDataSets().Size());
+
+    // expected resource ids, in the order the resources are listed; all three
+    // resources share the same meta type
+    const std::string expectedResourceIds[] = {
+        "file:///pbi/dept/secondary/siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.0.bax.h5",
+        "file:///pbi/dept/secondary/siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.1.bax.h5",
+        "file:///pbi/dept/secondary/siv/testdata/LIMS/2590727/0001/Analysis_Results/m130608_033634_42129_c100515232550000001823076608221351_s1_p0.2.bax.h5"};
+
+    // the fatal size check keeps the table lookup below in range
+    const ExternalResources& resources = dataset.ExternalResources();
+    ASSERT_EQ(3, resources.Size());
+    for (size_t i = 0; i < resources.Size(); ++i) {
+        const ExternalResource& resource = resources[i];
+        EXPECT_EQ(std::string("PacBio.SubreadFile.BaxFile"), resource.MetaType());
+        EXPECT_EQ(expectedResourceIds[i], resource.ResourceId());
+    }
+
+    const DataSetMetadata& metadata = dataset.Metadata();
+    EXPECT_EQ(std::string("150000"), metadata.NumRecords());
+    EXPECT_EQ(std::string("50000000"), metadata.TotalLength());
+}
+
+} // namespace DataSetIOTests
+
+// Loads dataset/malformed.xml from the test data directory, serializes it
+// back to a stream, and compares the output against the exact XML text below.
+TEST(DataSetIOTest, InspectMalformedXml)
+{
+    DataSet dataset(PbbamTestsConfig::Data_Dir + "/dataset/malformed.xml");
+
+    std::ostringstream serialized;
+    dataset.SaveToStream(serialized);
+
+    // expected serialization: tab-indented, one element per line
+    const std::string expectedXml{
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<SubreadSet CreatedAt=\"2015-08-19T15:39:36.331\" Description=\"Merged dataset from 1 files using DatasetMerger 0.1.2\" "
+                    "MetaType=\"PacBio.DataSet.HdfSubreadSet\" Name=\"Subreads from runr000013_42267_150403\" "
+                    "Tags=\"pacbio.secondary.instrument=RS\" TimeStampedName=\"hdfsubreadset_2015-08-19T15:39:36.331-07:00\" "
+                    "UniqueId=\"b4741521-2a4c-42df-8a13-0a755ca9ed1e\" Version=\"0.5\" "
+                    "xmlns=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                    "xmlns:ns0=\"http://pacificbiosciences.com/PacBioBaseDataModel.xsd\" "
+                    "xmlns:ns1=\"http://pacificbiosciences.com/PacBioSampleInfo.xsd\" "
+                    "xmlns:ns2=\"http://pacificbiosciences.com/PacBioCollectionMetadata.xsd\" "
+                    "xmlns:ns3=\"http://pacificbiosciences.com/PacBioReagentKit.xsd\" "
+                    "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+                    "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDatasets.xsd\">\n"
+        "\t<ns0:ExternalResources>\n"
+        "\t\t<ns0:ExternalResource MetaType=\"SubreadFile.SubreadBamFile\" "
+                                  "ResourceId=\"file:///mnt/secondary-siv/jenkins/jenkins-bot01/workspace/Ubuntu1404_Mainline_SA3_Tiny_tests/software/smrtanalysis/siv/testkit-jobs/sa3_pipelines/mapping/tiny/job_output-ubuntu1404/tasks/pbsmrtpipe.tasks.h5_subreads_to_subread-0//mnt/secondary-siv/jenkins/jenkins-bot01/workspace/Ubuntu1404_Mainline_SA3_Tiny_tests/software/smrtanalysis/siv/testkit-jobs/sa3_pipelines/mapping/tiny/job_output-ubuntu1404/tasks/pbsmrtpipe.tasks.h5_subreads_to_subread-0/file.subreads.subreads.bam\" "
+                                  "TimeStampedName=\"SubreadFile.SubreadBamFile_00000000000000\" "
+                                  "UniqueId=\"251acf71-9eb0-489e-9dd1-cdbd11432753\" />\n"
+        "\t</ns0:ExternalResources>\n"
+        "\t<DataSetMetadata>\n"
+        "\t\t<TotalLength>50000000</TotalLength>\n"
+        "\t\t<NumRecords>150000</NumRecords>\n"
+        "\t\t<ns2:Collections>\n"
+        "\t\t\t<ns2:CollectionMetadata Context=\"m150404_101626_42267_c100807920800000001823174110291514_s1_p0\" "
+                                      "InstrumentId=\"1\" InstrumentName=\"42267\" MetaType=\"PacBio.Collection\" "
+                                      "TimeStampedName=\"m150404_101626_42267_c100807920800000001823174110291514_s1_p0\" "
+                                      "UniqueId=\"d66c8372-2b70-4dcf-b64f-9f8b5cc351fd\">\n"
+        "\t\t\t\t<ns2:InstCtrlVer>2.3.0.1.142990</ns2:InstCtrlVer>\n"
+        "\t\t\t\t<ns2:SigProcVer>NRT@172.31.128.10:8082, SwVer=2301.142990, HwVer=1.0</ns2:SigProcVer>\n"
+        "\t\t\t\t<ns2:RunDetails>\n"
+        "\t\t\t\t\t<ns2:RunId>r000013_42267_150403</ns2:RunId>\n"
+        "\t\t\t\t\t<ns2:Name>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:Name>\n"
+        "\t\t\t\t</ns2:RunDetails>\n"
+        "\t\t\t\t<ns2:WellSample Name=\"Inst42267-040315-SAT-100pM-2kb-P6C4\">\n"
+        "\t\t\t\t\t<ns2:PlateId>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:PlateId>\n"
+        "\t\t\t\t\t<ns2:WellName>Inst42267-040315-SAT-100pM-2kb-P6C4</ns2:WellName>\n"
+        "\t\t\t\t\t<ns2:Concentration>0.0</ns2:Concentration>\n"
+        "\t\t\t\t\t<ns2:SampleReuseEnabled>false</ns2:SampleReuseEnabled>\n"
+        "\t\t\t\t\t<ns2:StageHotstartEnabled>false</ns2:StageHotstartEnabled>\n"
+        "\t\t\t\t\t<ns2:SizeSelectionEnabled>false</ns2:SizeSelectionEnabled>\n"
+        "\t\t\t\t\t<ns2:UseCount>1</ns2:UseCount>\n"
+        "\t\t\t\t\t<ns1:BioSamplePointers>\n"
+        "\t\t\t\t\t\t<ns1:BioSamplePointer>251acf71-9eb0-489e-9dd1-cdbd11432752</ns1:BioSamplePointer>\n"
+        "\t\t\t\t\t</ns1:BioSamplePointers>\n"
+        "\t\t\t\t</ns2:WellSample>\n"
+        "\t\t\t\t<ns2:Automation>\n"
+        "\t\t\t\t\t<ns0:AutomationParameters>\n"
+        "\t\t\t\t\t\t<ns0:AutomationParameter />\n"
+        "\t\t\t\t\t</ns0:AutomationParameters>\n"
+        "\t\t\t\t</ns2:Automation>\n"
+        "\t\t\t\t<ns2:CollectionNumber>7</ns2:CollectionNumber>\n"
+        "\t\t\t\t<ns2:CellIndex>4</ns2:CellIndex>\n"
+        "\t\t\t\t<ns2:CellPac Barcode=\"10080792080000000182317411029151\" />\n"
+        "\t\t\t\t<ns2:Primary>\n"
+        "\t\t\t\t\t<ns2:AutomationName>BasecallerV1</ns2:AutomationName>\n"
+        "\t\t\t\t\t<ns2:ConfigFileName>2-3-0_P6-C4.xml</ns2:ConfigFileName>\n"
+        "\t\t\t\t\t<ns2:SequencingCondition />\n"
+        "\t\t\t\t\t<ns2:OutputOptions>\n"
+        "\t\t\t\t\t\t<ns2:ResultsFolder>Analysis_Results</ns2:ResultsFolder>\n"
+        "\t\t\t\t\t\t<ns2:CollectionPathUri>rsy://mp-rsync/vol55//RS_DATA_STAGING/42267/Inst42267-040315-SAT-100pM-2kb-P6C4_13/A04_7/</ns2:CollectionPathUri>\n"
+        "\t\t\t\t\t\t<ns2:CopyFiles>\n"
+        "\t\t\t\t\t\t\t<ns2:CollectionFileCopy>Fasta</ns2:CollectionFileCopy>\n"
+        "\t\t\t\t\t\t</ns2:CopyFiles>\n"
+        "\t\t\t\t\t\t<ns2:Readout>Bases</ns2:Readout>\n"
+        "\t\t\t\t\t\t<ns2:MetricsVerbosity>Minimal</ns2:MetricsVerbosity>\n"
+        "\t\t\t\t\t</ns2:OutputOptions>\n"
+        "\t\t\t\t</ns2:Primary>\n"
+        "\t\t\t</ns2:CollectionMetadata>\n"
+        "\t\t</ns2:Collections>\n"
+        "\t\t<ns1:BioSamples>\n"
+        "\t\t\t<ns1:BioSample Description=\"Inst42267-SAT-100pM-2kbLambda-P6C4-Std120_CPS_040315\" "
+                            "MetaType=\"PacBio.Sample\" Name=\"Inst42267-040315-SAT-100pM-2kb-P6C4\" "
+                            "TimeStampedName=\"biosample_2015-08-19T15:39:36.331-07:00\" UniqueId=\"251acf71-9eb0-489e-9dd1-cdbd11432752\" />\n"
+        "\t\t</ns1:BioSamples>\n"
+        "\t</DataSetMetadata>\n"
+        "</SubreadSet>\n"};
+
+    EXPECT_EQ(expectedXml, serialized.str());
+}
+
+// Parses a dataset from an in-memory XML string whose resources use relative
+// paths ("../path/to/..."), writes it back out, and requires the output to be
+// identical to the input text.
+TEST(DataSetIOTest, RelativePathCarriedThroughOk_FromString)
+{
+    const std::string inputXml{
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<pbds:AlignmentSet "
+            "CreatedAt=\"2015-01-27T09:00:01\" "
+            "MetaType=\"PacBio.DataSet.AlignmentSet\" "
+            "Name=\"DataSet_AlignmentSet\" "
+            "Tags=\"barcode moreTags mapping mytags\" "
+            "TimeStampedName=\"biosample_2015-08-19T15:39:36.331-07:00\" "
+            "UniqueId=\"b095d0a3-94b8-4918-b3af-a3f81bbe519c\" "
+            "Version=\"2.3.0\" "
+            "xmlns=\"http://pacificbiosciences.com/PacBioDataModel.xsd\" "
+            "xmlns:pbds=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+            "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+            "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDataModel.xsd\">\n"
+        "\t<pbbase:ExternalResources>\n"
+        "\t\t<pbbase:ExternalResource "
+                "Description=\"Points to an example Alignments BAM file.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Third Alignments BAM\" "
+                "ResourceId=\"../path/to/resource1.bam\" "
+                "Tags=\"Example\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                    "MetaType=\"PacBio.Index.PacBioIndex\" "
+                    "ResourceId=\"../path/to/resource1.bam.pbi\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t\t<pbbase:ExternalResource "
+                "Description=\"Points to another example Alignments BAM file, by relative path.\" "
+                "MetaType=\"AlignmentFile.AlignmentBamFile\" "
+                "Name=\"Fourth Alignments BAM\" "
+                "ResourceId=\"../path/to/resource2.bam\" "
+                "Tags=\"Example\">\n"
+        "\t\t\t<pbbase:FileIndices>\n"
+        "\t\t\t\t<pbbase:FileIndex "
+                    "MetaType=\"PacBio.Index.PacBioIndex\" "
+                    "ResourceId=\"../path/to/resource2.bam.pbi\" />\n"
+        "\t\t\t</pbbase:FileIndices>\n"
+        "\t\t</pbbase:ExternalResource>\n"
+        "\t</pbbase:ExternalResources>\n"
+        "</pbds:AlignmentSet>\n"};
+
+    // round trip: parse the text, then serialize it again
+    auto parsed = DataSet::FromXml(inputXml);
+    std::ostringstream serialized;
+    parsed.SaveToStream(serialized);
+
+    // the relative ResourceId values must come back unchanged
+    const auto outputXml = serialized.str();
+    EXPECT_EQ(inputXml, outputXml);
+}
+
+// Loads relative/relative.xml from the test data directory and checks that
+// its three resource ids are reported as relative paths, both right after
+// loading and again after a save-to-stream / parse-from-string round trip.
+TEST(DataSetIOTest, RelativePathCarriedThroughOk_FromFile)
+{
+    // as loaded from file
+    DataSet loaded(PbbamTestsConfig::Data_Dir + "/relative/relative.xml");
+    auto loadedResources = loaded.ExternalResources();
+    EXPECT_EQ("./a/test.bam", loadedResources[0].ResourceId());
+    EXPECT_EQ("./b/test1.bam", loadedResources[1].ResourceId());
+    EXPECT_EQ("./b/test2.bam", loadedResources[2].ResourceId());
+
+    // serialize, then parse the serialized text back in
+    std::ostringstream serialized;
+    loaded.SaveToStream(serialized);
+    auto reparsed = DataSet::FromXml(serialized.str());
+
+    // the same relative ids must survive the round trip
+    auto reparsedResources = reparsed.ExternalResources();
+    EXPECT_EQ("./a/test.bam", reparsedResources[0].ResourceId());
+    EXPECT_EQ("./b/test1.bam", reparsedResources[1].ResourceId());
+    EXPECT_EQ("./b/test2.bam", reparsedResources[2].ResourceId());
+}
+
+// Changes into the "dataset" test data directory, builds a DataSet from the
+// relative filename "../phi29.bam", and expects exactly one BAM file with no
+// exception thrown. The original working directory is restored afterwards.
+TEST(DataSetIOTest, DataSetFromRelativeBamFilename)
+{
+    // cache initial directory and move to location so we can test relative filename ok
+    const std::string startingDirectory = internal::FileUtils::CurrentWorkingDirectory();
+
+    const std::string targetDirectory = PbbamTestsConfig::Data_Dir + "/dataset";
+    DataSetIOTests::changeCurrentDirectory(targetDirectory);
+
+    // Non-fatal check on purpose: a fatal ASSERT_* would return from the test
+    // body right here and skip the directory restore at the bottom, leaving
+    // every later test running in the wrong working directory.
+    const std::string currentDirectory = internal::FileUtils::CurrentWorkingDirectory();
+    EXPECT_EQ(targetDirectory, currentDirectory);
+
+    // only exercise the relative path when the directory change succeeded
+    if (currentDirectory == targetDirectory) {
+        EXPECT_NO_THROW(
+        {
+            const std::string relativeBamFn = "../phi29.bam";
+            const DataSet ds(relativeBamFn);
+            const auto files = ds.BamFiles();
+            EXPECT_EQ(1, files.size());
+        });
+    }
+
+    // restore working directory
+    DataSetIOTests::changeCurrentDirectory(startingDirectory);
+}
+
+// Loads chunking/chunking.subreadset.xml from the test data directory and
+// checks the file counts reported by BamFiles() (3) and AllFiles() (6),
+// expecting neither step to throw.
+//
+// The suite name is DataSetIOTest (it was misspelled "DataaSetIOTest"), so
+// this test is now selected by --gtest_filter=DataSetIOTest.* like the others
+// in this file.
+TEST(DataSetIOTest, AllFiles)
+{
+    // check  BamFiles only
+    EXPECT_NO_THROW(
+    {
+        const DataSet dataset(PbbamTestsConfig::Data_Dir + "/chunking/chunking.subreadset.xml");
+        const auto bamFiles = dataset.BamFiles();
+        EXPECT_EQ(3, bamFiles.size());
+    });
+
+    // now fetch all files (original BAMs plus PBI files)
+    EXPECT_NO_THROW(
+    {
+        const DataSet dataset(PbbamTestsConfig::Data_Dir + "/chunking/chunking.subreadset.xml");
+        const auto allFiles = dataset.AllFiles();
+        EXPECT_EQ(6, allFiles.size());
+    });
+}
+
+// Builds an alignment dataset in code (attributes, one external resource, and
+// metadata created from record count and total length), serializes it, and
+// compares against the exact XML below - in which TotalLength is written
+// before NumRecords.
+TEST(DataSetIOTest, MetadataDefaultChildrenProperlyOrderedPerXsd)
+{
+    DataSet dataset(DataSet::ALIGNMENT);
+    dataset.CreatedAt("2015-01-27T09:00:01");
+    dataset.MetaType("PacBio.DataSet.AlignmentSet");
+    dataset.Name("DataSet_AlignmentSet");
+    dataset.Tags("barcode moreTags mapping mytags");
+    dataset.TimeStampedName("my_time_stamped_name");
+    dataset.UniqueId("b095d0a3-94b8-4918-b3af-a3f81bbe519c");
+    dataset.Attribute("xmlns", "http://pacificbiosciences.com/PacBioDatasets.xsd")
+           .Attribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance")
+           .Attribute("xsi:schemaLocation", "http://pacificbiosciences.com/PacBioDatasets.xsd");
+
+    // one external resource with fixed name/id so the output is reproducible
+    ExternalResource resource("Fake.MetaType", "filename");
+    resource.TimeStampedName("custom_tsn")
+            .UniqueId("my_uuid");
+    dataset.ExternalResources().Add(resource);
+
+    // metadata is constructed with (numRecords, totalLength), in that order
+    const auto recordCount = std::to_string(42);
+    const auto lengthTotal = std::to_string(1000);
+    DataSetMetadata metadata(recordCount, lengthTotal);
+    dataset.Metadata(metadata);
+
+    // NOTE(review): Version="3.0.1" is never set in this test - presumably a
+    // library default; confirm before changing the expected text.
+    const std::string expectedXml{
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<pbds:AlignmentSet CreatedAt=\"2015-01-27T09:00:01\" MetaType=\"PacBio.DataSet.AlignmentSet\" "
+                "Name=\"DataSet_AlignmentSet\" Tags=\"barcode moreTags mapping mytags\" "
+                "TimeStampedName=\"my_time_stamped_name\" "
+                "UniqueId=\"b095d0a3-94b8-4918-b3af-a3f81bbe519c\" Version=\"3.0.1\" "
+                "xmlns=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+                "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:pbbase=\"http://pacificbiosciences.com/PacBioBaseDataModel.xsd\" "
+                "xmlns:pbds=\"http://pacificbiosciences.com/PacBioDatasets.xsd\">\n"
+        "\t<pbbase:ExternalResources>\n"
+        "\t\t<pbbase:ExternalResource MetaType=\"Fake.MetaType\" ResourceId=\"filename\" TimeStampedName=\"custom_tsn\" UniqueId=\"my_uuid\" Version=\"3.0.1\" />\n"
+        "\t</pbbase:ExternalResources>\n"
+        "\t<pbds:DataSetMetadata>\n"
+        "\t\t<pbds:TotalLength>1000</pbds:TotalLength>\n"
+        "\t\t<pbds:NumRecords>42</pbds:NumRecords>\n"
+        "\t</pbds:DataSetMetadata>\n"
+        "</pbds:AlignmentSet>\n"};
+
+    std::ostringstream serialized;
+    dataset.SaveToStream(serialized);
+    EXPECT_EQ(expectedXml, serialized.str());
+}
+
+// clang-format on
diff --git a/tests/src/test_DataSetQuery.cpp b/tests/src/test_DataSetQuery.cpp

new file mode 100644 (file)

index 0000000..e0ada2d
--- /dev/null
+++ b/tests/src/test_DataSetQuery.cpp
@@ -0,0 +1,463 @@
+// Author: Derek Barnett
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include <boost/any.hpp>
+
+#include <pbbam/DataSet.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/GenomicIntervalQuery.h>
+#include <pbbam/Unused.h>
+#include <pbbam/ZmwGroupQuery.h>
+#include <pbbam/ZmwQuery.h>
+
+#include "PbbamTestData.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace DataSetQueryTests {
+
+const std::string alignedBamFn = PbbamTestsConfig::Data_Dir + "/aligned.bam";
+const std::string aligned2BamFn = PbbamTestsConfig::Data_Dir + "/aligned2.bam";
+const std::string alignedCopyBamFn = PbbamTestsConfig::GeneratedData_Dir + "/aligned.bam";
+const std::string aligned2CopyBamFn = PbbamTestsConfig::GeneratedData_Dir + "/aligned2.bam";
+
+const std::string group_fofn = PbbamTestsConfig::Generated_Dir + "/group.fofn";
+const std::string group_file1 = PbbamTestsConfig::Data_Dir + "/group/test1.bam";
+const std::string group_file2 = PbbamTestsConfig::Data_Dir + "/group/test2.bam";
+const std::string group_file3 = PbbamTestsConfig::Data_Dir + "/group/test3.bam";
+
+const std::vector<std::string> group_file1_names{
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/24962/0_427"};
+
+const std::vector<std::string> group_file2_names{
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2114_2531",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2579_4055",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/4101_5571",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/5615_6237"};
+
+const std::vector<std::string> group_file3_names{
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/45203/0_893",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/45203/0_893",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/46835/3759_4005",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/46835/4052_4686",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/46835/4732_4869",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/47698/9482_9628",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/47698/9675_10333",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/47698/10378_10609",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49050/48_1132",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49194/0_798",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49194/845_1541",
+    "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/49521/0_134"};
+
+static inline bool InGroup(const std::string& name, const std::vector<std::string>& group)
+{   // linear membership test: true iff some element of 'group' equals 'name'
+    auto candidate = group.cbegin();
+    const auto last = group.cend();
+    while (candidate != last && *candidate != name) ++candidate;
+    return candidate != last;
+}
+
+}  // namespace DataSetQueryTests
+
+TEST(DataSetQueryTest, EntireFileQueryTest)  // record counts & file order seen by EntireFileQuery
+{
+    // single file: the same BAM must yield 4 records via each EntireFileQuery constructor below
+    EXPECT_NO_THROW({
+        BamFile bamFile(DataSetQueryTests::alignedBamFn);
+
+        DataSet dataset;
+        dataset.ExternalResources().Add(bamFile);
+
+        int count = 0;
+        EntireFileQuery query(dataset);  // from DataSet object
+        for (const BamRecord& record : query) {
+            UNUSED(record);
+            ++count;
+        }
+        EXPECT_EQ(4, count);
+
+        count = 0;
+        EntireFileQuery query2(DataSetQueryTests::alignedBamFn);  // from BAM filename
+        for (const BamRecord& record : query2) {
+            UNUSED(record);
+            ++count;
+        }
+        EXPECT_EQ(4, count);
+
+        count = 0;
+        EntireFileQuery query3(bamFile);  // from BamFile object
+        for (const BamRecord& record : query3) {
+            UNUSED(record);
+            ++count;
+        }
+        EXPECT_EQ(4, count);
+    });
+
+    // duplicate file attempt: adding the same BamFile twice must not double the record count
+    EXPECT_NO_THROW({
+        BamFile bamFile(DataSetQueryTests::alignedBamFn);
+
+        DataSet dataset;
+        dataset.ExternalResources().Add(bamFile);
+        dataset.ExternalResources().Add(bamFile);
+
+        int count = 0;
+        EntireFileQuery query(dataset);
+        for (const BamRecord& record : query) {
+            UNUSED(record);
+            ++count;
+        }
+        EXPECT_EQ(4, count);  // same as single
+    });
+
+    // true multi-file dataset: three distinct BAMs, 1 + 4 + 13 = 18 records in total
+    EXPECT_NO_THROW({
+        BamFile file1(DataSetQueryTests::group_file1);  // 1 read
+        BamFile file2(DataSetQueryTests::group_file2);  // 4 reads
+        BamFile file3(DataSetQueryTests::group_file3);  // 13 reads
+
+        DataSet dataset;
+        dataset.ExternalResources().Add(file1);
+        dataset.ExternalResources().Add(file2);
+        dataset.ExternalResources().Add(file3);
+
+        int count = 0;
+        EntireFileQuery query(dataset);
+        for (const BamRecord& record : query) {
+
+            // ensure sequential merge of files: record 0 from file1, records 1-4 from file2, then file3
+            if (count == 0)
+                EXPECT_TRUE(DataSetQueryTests::InGroup(record.FullName(),
+                                                       DataSetQueryTests::group_file1_names));
+            else if (count < 5)
+                EXPECT_TRUE(DataSetQueryTests::InGroup(record.FullName(),
+                                                       DataSetQueryTests::group_file2_names));
+            else
+                EXPECT_TRUE(DataSetQueryTests::InGroup(record.FullName(),
+                                                       DataSetQueryTests::group_file3_names));
+
+            ++count;
+        }
+        EXPECT_EQ(18, count);
+    });
+
+    // same as above, but with the dataset built from the FOFN path (group_fofn)
+    EXPECT_NO_THROW({
+        int count = 0;
+
+        DataSet dataset(DataSetQueryTests::group_fofn);
+        EntireFileQuery query(dataset);
+        for (const BamRecord& record : query) {
+
+            // ensure sequential merge of files (same index ranges as in the block above)
+            if (count == 0)
+                EXPECT_TRUE(DataSetQueryTests::InGroup(record.FullName(),
+                                                       DataSetQueryTests::group_file1_names));
+            else if (count < 5)
+                EXPECT_TRUE(DataSetQueryTests::InGroup(record.FullName(),
+                                                       DataSetQueryTests::group_file2_names));
+            else
+                EXPECT_TRUE(DataSetQueryTests::InGroup(record.FullName(),
+                                                       DataSetQueryTests::group_file3_names));
+
+            ++count;
+        }
+        EXPECT_EQ(18, count);
+    });
+}
+
+TEST(DataSetQueryTest, GenomicIntervalQueryTest)  // interval queries on 1-file & 2-file datasets
+{
+    const std::string rname = "lambda_NEB3011";
+
+    // single file
+    EXPECT_NO_THROW({
+        DataSet dataset(DataSetQueryTests::alignedBamFn);  // from BAM filename
+
+        // count records returned for rname, 5000-6000
+        int count = 0;
+        GenomicInterval interval(rname, 5000, 6000);
+        GenomicIntervalQuery query(interval, dataset);
+        for (const BamRecord& record : query) {
+            UNUSED(record);
+            ++count;
+        }
+        EXPECT_EQ(2, count);
+
+        // adjust interval and pass back in (the same query object is re-used)
+        count = 0;
+        interval.Start(9000);
+        interval.Stop(9500);
+        query.Interval(interval);
+        for (const BamRecord& record : query) {
+            UNUSED(record);
+            ++count;
+        }
+        EXPECT_EQ(2, count);
+
+        // unknown ref: setting the interval is expected to throw
+        count = 0;
+        interval.Name("does not exist");
+        interval.Start(0);
+        interval.Stop(100);
+        EXPECT_THROW(query.Interval(interval), std::exception);
+        for (const BamRecord& record : query) {  // iteration is still safe, just returns no data
+            UNUSED(record);
+            ++count;
+        }
+        EXPECT_EQ(0, count);
+
+        // adjust again - make sure we can read a real region after an invalid one
+        interval.Name(rname);
+        interval.Start(5000);
+        interval.Stop(6000);
+        query.Interval(interval);
+        count = 0;
+        for (const BamRecord& record : query) {
+            UNUSED(record);
+            ++count;
+        }
+        EXPECT_EQ(2, count);
+    });
+
+    // duplicate file: the same BamFile is added twice; expected counts equal the single-file ones
+    EXPECT_NO_THROW({
+        BamFile bamFile(DataSetQueryTests::alignedBamFn);
+
+        DataSet dataset;
+        dataset.ExternalResources().Add(bamFile);
+        dataset.ExternalResources().Add(bamFile);
+
+        // count records & also ensure sorted merge (reference id and start never decrease)
+        int count = 0;
+        int prevId = 0;
+        int prevPos = 0;
+
+        GenomicInterval interval(rname, 5000, 6000);
+        GenomicIntervalQuery query(interval, dataset);
+        for (const BamRecord& record : query) {
+
+            EXPECT_TRUE(record.ReferenceId() >= prevId);
+            EXPECT_TRUE(record.ReferenceStart() >= prevPos);
+
+            prevId = record.ReferenceId();
+            prevPos = record.ReferenceStart();
+            ++count;
+        }
+        EXPECT_EQ(2, count);  // same as single file
+
+        // adjust interval and pass back in
+        count = 0;
+        interval.Start(9000);
+        interval.Stop(10000);
+        query.Interval(interval);
+        for (const BamRecord& record : query) {
+            UNUSED(record);
+            ++count;
+        }
+        EXPECT_EQ(2, count);  // same as single file
+
+        // unknown ref: setting the interval is expected to throw
+        count = 0;
+        interval.Name("does not exist");
+        interval.Start(0);
+        interval.Stop(100);
+        EXPECT_THROW(query.Interval(interval), std::exception);
+        for (const BamRecord& record : query) {  // iteration is still safe, just returns no data
+            UNUSED(record);
+            ++count;
+        }
+        EXPECT_EQ(0, count);  // same as single file
+
+        // adjust again - make sure we can read a real region after an invalid one
+        interval.Name(rname);
+        interval.Start(5000);
+        interval.Stop(5300);
+        query.Interval(interval);
+        count = 0;
+        for (const BamRecord& record : query) {
+            UNUSED(record);
+            ++count;
+        }
+        EXPECT_EQ(2, count);  // same as single file
+    });
+
+    // multi file BAM (same record content for easy testing, but different filename (ResourceId))
+    EXPECT_NO_THROW({
+        BamFile bamFile(DataSetQueryTests::alignedBamFn);
+        BamFile copyFile(DataSetQueryTests::alignedCopyBamFn);
+
+        DataSet dataset;
+        dataset.ExternalResources().Add(bamFile);
+        dataset.ExternalResources().Add(copyFile);
+
+        // count records & also ensure sorted merge (reference id and start never decrease)
+        int count = 0;
+        int prevId = 0;
+        int prevPos = 0;
+
+        GenomicInterval interval(rname, 5000, 6000);
+        GenomicIntervalQuery query(interval, dataset);
+        for (const BamRecord& record : query) {
+
+            EXPECT_TRUE(record.ReferenceId() >= prevId);
+            EXPECT_TRUE(record.ReferenceStart() >= prevPos);
+
+            prevId = record.ReferenceId();
+            prevPos = record.ReferenceStart();
+            ++count;
+        }
+        EXPECT_EQ(4, count);  // single file * 2
+
+        // adjust interval and pass back in
+        count = 0;
+        interval.Start(9000);
+        interval.Stop(10000);
+        query.Interval(interval);
+        for (const BamRecord& record : query) {
+            UNUSED(record);
+            ++count;
+        }
+        EXPECT_EQ(4, count);  // single file * 2
+
+        // unknown ref: setting the interval is expected to throw
+        count = 0;
+        interval.Name("does not exist");
+        interval.Start(0);
+        interval.Stop(100);
+        EXPECT_THROW(query.Interval(interval), std::exception);
+        for (const BamRecord& record : query) {  // iteration is still safe, just returns no data
+            UNUSED(record);
+            ++count;
+        }
+        EXPECT_EQ(0, count);  // single file * 2
+
+        // adjust again - make sure we can read a real region after an invalid one
+        interval.Name(rname);
+        interval.Start(5000);
+        interval.Stop(5300);
+        query.Interval(interval);
+        count = 0;
+        for (const BamRecord& record : query) {
+            UNUSED(record);
+            ++count;
+        }
+        EXPECT_EQ(4, count);  // single file * 2
+    });
+}
+
+// TODO: implement me (placeholder only: passes unconditionally, no QName query is exercised)
+TEST(DataSetQueryTest, QNameQueryTest) { SUCCEED(); }
+
+TEST(DataSetQueryTest, ZmwQueryTest)
+{
+    const std::vector<int32_t> requestedZmws = {13473, 30983};
+
+    // one indexed BAM: only the requested hole numbers may come back, 4 records in total
+    EXPECT_NO_THROW({
+        BamFile bamFile(DataSetQueryTests::aligned2BamFn);
+        ASSERT_TRUE(bamFile.PacBioIndexExists());
+        DataSet singleFileSet(bamFile);
+
+        int count = 0;
+        ZmwQuery zmwQuery(requestedZmws, singleFileSet);
+        for (const BamRecord& rec : zmwQuery) {
+            const int32_t holeNumber = rec.HoleNumber();
+            EXPECT_TRUE(holeNumber == 13473 || holeNumber == 30983);
+            ++count;
+        }
+        EXPECT_EQ(4, count);
+    });
+
+    // the same BAM plus aligned2CopyBamFn: same filter, expected record count doubles to 8
+    EXPECT_NO_THROW({
+        BamFile bamFile(DataSetQueryTests::aligned2BamFn);
+        BamFile bamFile2(DataSetQueryTests::aligned2CopyBamFn);
+        ASSERT_TRUE(bamFile.PacBioIndexExists());
+        ASSERT_TRUE(bamFile2.PacBioIndexExists());
+
+        DataSet twoFileSet;
+        twoFileSet.ExternalResources().Add(ExternalResource(bamFile));
+        twoFileSet.ExternalResources().Add(ExternalResource(bamFile2));
+
+        int count = 0;
+        ZmwQuery zmwQuery(requestedZmws, twoFileSet);
+        for (const BamRecord& rec : zmwQuery) {
+            const auto holeNumber = rec.HoleNumber();
+            EXPECT_TRUE(holeNumber == 13473 || holeNumber == 30983);
+            ++count;
+        }
+        EXPECT_EQ(8, count);
+    });
+}
+
+TEST(DataSetQueryTest, ZmwGroupQueryTest)
+{
+    const std::vector<int32_t> requestedZmws = {13473, 30983};
+
+    // single-file: every group may only hold records of one (requested) hole number
+    EXPECT_NO_THROW({
+        BamFile bamFile(DataSetQueryTests::aligned2BamFn);
+        ASSERT_TRUE(bamFile.PacBioIndexExists());
+        DataSet singleFileSet(bamFile);
+
+        int count = 0;
+        ZmwGroupQuery groupQuery(requestedZmws, singleFileSet);
+        for (const std::vector<BamRecord>& group : groupQuery) {
+            // -1 == "no record of this group seen yet"; taken from the group's first record
+            int32_t groupZmw = -1;
+            for (const BamRecord& rec : group) {
+                const auto holeNumber = rec.HoleNumber();
+                if (groupZmw == -1) groupZmw = holeNumber;
+                EXPECT_TRUE(holeNumber == 13473 || holeNumber == 30983);
+                EXPECT_EQ(groupZmw, holeNumber);
+                ++count;
+            }
+        }
+        EXPECT_EQ(4, count);
+    });
+
+    // multi-file: same checks, plus the number of groups and the size of each group
+    EXPECT_NO_THROW({
+        BamFile bamFile(DataSetQueryTests::aligned2BamFn);
+        BamFile bamFile2(DataSetQueryTests::aligned2CopyBamFn);
+        ASSERT_TRUE(bamFile.PacBioIndexExists());
+        ASSERT_TRUE(bamFile2.PacBioIndexExists());
+
+        DataSet twoFileSet;
+        twoFileSet.ExternalResources().Add(ExternalResource(bamFile));
+        twoFileSet.ExternalResources().Add(ExternalResource(bamFile2));
+
+        int totalCount = 0;
+        int groupCount = 0;
+
+        ZmwGroupQuery groupQuery(requestedZmws, twoFileSet);
+        for (const std::vector<BamRecord>& group : groupQuery) {
+            int numRecordsInGroup = 0;
+            int32_t groupZmw = -1;
+            for (const BamRecord& rec : group) {
+                const auto holeNumber = rec.HoleNumber();
+                ++numRecordsInGroup;
+                if (groupZmw == -1) groupZmw = holeNumber;
+                EXPECT_TRUE(holeNumber == 13473 || holeNumber == 30983);
+                EXPECT_EQ(groupZmw, holeNumber);
+                ++totalCount;
+            }
+
+            // at most two groups are expected, each holding 4 records
+            if (groupCount < 2) {
+                EXPECT_EQ(4, numRecordsInGroup);
+            } else {
+                EXPECT_TRUE(false);  // should not get here
+            }
+            ++groupCount;
+        }
+        EXPECT_EQ(8, totalCount);
+    });
+}
diff --git a/tests/src/test_DataSetXsd.cpp b/tests/src/test_DataSetXsd.cpp

new file mode 100644 (file)

index 0000000..30c9fe8
--- /dev/null
+++ b/tests/src/test_DataSetXsd.cpp
@@ -0,0 +1,154 @@
+// Author: Derek Barnett
+
+#include <sstream>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/DataSet.h>
+#include <pbbam/DataSetXsd.h>
+
+#include "PbbamTestData.h"
+
+// clang-format off
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(DataSetXsdTest, DefaultsOk)
+{
+    // expectations for a freshly constructed registry: DATASETS ("pbds") is the default xsd
+    const std::string baseUri{"http://pacificbiosciences.com/PacBioBaseDataModel.xsd"};
+    const std::string datasetsUri{"http://pacificbiosciences.com/PacBioDatasets.xsd"};
+    NamespaceRegistry registry;
+    const NamespaceInfo& baseInfo = registry.Namespace(XsdType::BASE_DATA_MODEL);
+    const NamespaceInfo& dsInfo = registry.Namespace(XsdType::DATASETS);
+    const NamespaceInfo& defaultInfo = registry.DefaultNamespace();
+
+    EXPECT_EQ(XsdType::DATASETS, registry.DefaultXsd());
+    EXPECT_EQ(std::string("pbds"), dsInfo.Name());
+    EXPECT_EQ(std::string("pbbase"), baseInfo.Name());
+    EXPECT_EQ(std::string("pbds"), defaultInfo.Name());
+    EXPECT_EQ(baseUri, baseInfo.Uri());
+    EXPECT_EQ(datasetsUri, dsInfo.Uri());
+    EXPECT_EQ(datasetsUri, defaultInfo.Uri());
+}
+
+TEST(DataSetXsdTest, EditDefaultOk)
+{
+    // NOTE(review): DefaultsOk already expects DATASETS as the default of a fresh registry, so
+    // this may not exercise an actual change of the default - consider using another XsdType.
+    NamespaceRegistry registry;
+    registry.SetDefaultXsd(XsdType::DATASETS);
+    const NamespaceInfo& info = registry.DefaultNamespace();
+    EXPECT_EQ(XsdType::DATASETS, registry.DefaultXsd());
+    EXPECT_EQ(std::string("pbds"), info.Name());
+    EXPECT_EQ(std::string("http://pacificbiosciences.com/PacBioDatasets.xsd"), info.Uri());
+}
+
+TEST(DataSetXsdTest, EditRegistryOk)
+{
+    // after re-registering DATASETS, both the name and the URI reported for it must be the new ones
+    NamespaceRegistry registry;
+    registry.Register(XsdType::DATASETS, NamespaceInfo("custom", "http://custom/uri.xsd"));
+    const NamespaceInfo& registered = registry.Namespace(XsdType::DATASETS);
+
+    EXPECT_EQ(std::string("custom"), registered.Name());
+    EXPECT_EQ(std::string("http://custom/uri.xsd"), registered.Uri());
+}
+
+TEST(DataSetXsdTest, EditDatasetRegistry)
+{   // a namespace re-registered on a DataSet must show up in the XML written by SaveToStream()
+    DataSet dataset(DataSet::ALIGNMENT);
+    dataset.CreatedAt("2015-01-27T09:00:01");
+    dataset.MetaType("PacBio.DataSet.AlignmentSet");
+    dataset.Name("DataSet_AlignmentSet");
+    dataset.Tags("barcode moreTags mapping mytags");
+    dataset.TimeStampedName("my_time_stamped_name");
+    dataset.UniqueId("b095d0a3-94b8-4918-b3af-a3f81bbe519c");
+    dataset.Attribute("xmlns",              "http://pacificbiosciences.com/PacBioDatasets.xsd")
+           .Attribute("xmlns:xsi",          "http://www.w3.org/2001/XMLSchema-instance")
+           .Attribute("xsi:schemaLocation", "http://pacificbiosciences.com/PacBioDatasets.xsd");
+
+    ExternalResource ext("Fake.MetaType", "filename");
+    ext.TimeStampedName("custom_tsn")
+       .UniqueId("my_uuid");
+    dataset.ExternalResources().Add(ext);
+
+    dataset.Namespaces().Register(XsdType::BASE_DATA_MODEL, NamespaceInfo("custom", "http://custom/uri.xsd"));  // expected below as the "custom:" prefix and xmlns:custom
+
+    const std::string expectedXml{
+        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+        "<pbds:AlignmentSet CreatedAt=\"2015-01-27T09:00:01\" MetaType=\"PacBio.DataSet.AlignmentSet\" "
+                "Name=\"DataSet_AlignmentSet\" Tags=\"barcode moreTags mapping mytags\" "
+                "TimeStampedName=\"my_time_stamped_name\" "
+                "UniqueId=\"b095d0a3-94b8-4918-b3af-a3f81bbe519c\" Version=\"3.0.1\" "
+                "xmlns=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+                "xsi:schemaLocation=\"http://pacificbiosciences.com/PacBioDatasets.xsd\" "
+                "xmlns:custom=\"http://custom/uri.xsd\" "
+                "xmlns:pbds=\"http://pacificbiosciences.com/PacBioDatasets.xsd\">\n"
+        "\t<custom:ExternalResources>\n"
+        "\t\t<custom:ExternalResource MetaType=\"Fake.MetaType\" ResourceId=\"filename\" TimeStampedName=\"custom_tsn\" UniqueId=\"my_uuid\" Version=\"3.0.1\" />\n"
+        "\t</custom:ExternalResources>\n"
+        "</pbds:AlignmentSet>\n"};
+
+    std::ostringstream s;  // serialize, then compare against the exact expected document
+    dataset.SaveToStream(s);
+    EXPECT_EQ(expectedXml, s.str());
+}
+
+TEST(DataSetXsdTest, ElementRegistryOk)
+{
+    { // default namespaces
+
+        DataSet defaultDs;
+
+        // the children are created from a bare label only, i.e. without any namespace prefix
+        DataSetMetadata& meta = defaultDs.Metadata();
+        meta.AddChild(internal::DataSetElement("SummaryStats"));
+        meta.AddChild(internal::DataSetElement("CopyFiles"));
+        meta.AddChild(internal::DataSetElement("BioSamples"));
+        meta.AddChild(internal::DataSetElement("AutomationParameters"));
+
+        std::ostringstream xmlOut;
+        defaultDs.SaveToStream(xmlOut);
+        const std::string output = xmlOut.str();
+
+        // each child must be written with the default prefix expected for it
+        EXPECT_TRUE(output.find("pbds:SummaryStats") != std::string::npos);
+        EXPECT_TRUE(output.find("pbmeta:CopyFiles") != std::string::npos);
+        EXPECT_TRUE(output.find("pbsample:BioSamples") != std::string::npos);
+        EXPECT_TRUE(output.find("pbbase:AutomationParameters") != std::string::npos);
+    }
+
+    { // custom namespaces
+        DataSet customDs;
+
+        // NOTE(review): custom_sample re-uses the base.xsd URI - presumably a copy/paste slip;
+        // only the prefixes are checked below, so it does not affect this test.
+        customDs.Namespaces().Register(XsdType::BASE_DATA_MODEL,     NamespaceInfo("custom_base",   "http://custom/base.xsd"));
+        customDs.Namespaces().Register(XsdType::COLLECTION_METADATA, NamespaceInfo("custom_meta",   "http://custom/meta.xsd"));
+        customDs.Namespaces().Register(XsdType::DATASETS,            NamespaceInfo("custom_ds",     "http://custom/datasets.xsd"));
+        customDs.Namespaces().Register(XsdType::SAMPLE_INFO,         NamespaceInfo("custom_sample", "http://custom/base.xsd"));
+
+        // same prefix-less children as in the default-namespace block
+        DataSetMetadata& meta = customDs.Metadata();
+        meta.AddChild(internal::DataSetElement("SummaryStats"));
+        meta.AddChild(internal::DataSetElement("CopyFiles"));
+        meta.AddChild(internal::DataSetElement("BioSamples"));
+        meta.AddChild(internal::DataSetElement("AutomationParameters"));
+
+        std::ostringstream xmlOut;
+        customDs.SaveToStream(xmlOut);
+        const std::string output = xmlOut.str();
+
+        // each child must now be written with the custom prefix registered above
+        EXPECT_TRUE(output.find("custom_ds:SummaryStats") != std::string::npos);
+        EXPECT_TRUE(output.find("custom_meta:CopyFiles") != std::string::npos);
+        EXPECT_TRUE(output.find("custom_sample:BioSamples") != std::string::npos);
+        EXPECT_TRUE(output.find("custom_base:AutomationParameters") != std::string::npos);
+    }
+}
+
+// clang-format on
diff --git a/tests/src/test_EndToEnd.cpp b/tests/src/test_EndToEnd.cpp

new file mode 100644 (file)

index 0000000..c0df7d7
--- /dev/null
+++ b/tests/src/test_EndToEnd.cpp
@@ -0,0 +1,206 @@
+// Author: Derek Barnett
+
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include <htslib/sam.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamFile.h>
+#include <pbbam/BamWriter.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/Unused.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace EndToEndTests {
+
+struct Bam1Deleter  // std::unique_ptr deleter releasing a bam1_t via bam_destroy1()
+{
+    void operator()(bam1_t* b) const
+    {
+        // 'b' is a by-value copy, so resetting it afterwards would be a dead store: just free it
+        if (b) bam_destroy1(b);
+    }
+};
+
+struct SamFileDeleter  // std::unique_ptr deleter closing a samFile via sam_close()
+{
+    void operator()(samFile* file) const
+    {
+        // 'file' is a by-value copy, so resetting it afterwards would be a dead store: just close
+        if (file) sam_close(file);
+    }
+};
+
+struct BamHdrDeleter  // std::unique_ptr deleter releasing a bam_hdr_t via bam_hdr_destroy()
+{
+    void operator()(bam_hdr_t* hdr) const
+    {
+        // 'hdr' is a by-value copy, so resetting it afterwards would be a dead store: just free it
+        if (hdr) bam_hdr_destroy(hdr);
+    }
+};
+
+const std::string inputBamFn = PbbamTestsConfig::Data_Dir + "/aligned.bam";
+const std::string goldStandardSamFn = PbbamTestsConfig::Data_Dir + "/aligned.sam";
+const std::string generatedBamFn = PbbamTestsConfig::GeneratedData_Dir + "/generated.bam";
+const std::string generatedSamFn = PbbamTestsConfig::GeneratedData_Dir + "/generated.sam";
+const std::vector<std::string> generatedFiles = {generatedBamFn, generatedSamFn};
+
+static inline int RunBam2Sam(const std::string& bamFn, const std::string& samFn,
+                             const std::string& args = std::string())
+{
+    std::ostringstream command;  // "<Bam2Sam> <args> <bamFn> > <samFn>", handed to system() as is
+    command << PbbamTestsConfig::Bam2Sam << ' ' << args << ' ' << bamFn << " > " << samFn;
+    return system(command.str().c_str());
+}
+
+static inline int RunDiff(const std::string& fn1, const std::string& fn2)
+{
+    // runs "diff <fn1> <fn2>" through system() and returns its result unchanged
+    const std::string command = "diff " + fn1 + " " + fn2;
+    return system(command.c_str());
+}
+
+static inline void Remove(const std::vector<std::string>& files)
+{   // calls remove() on every listed path; the result of remove() is not inspected
+    for (auto path = files.cbegin(); path != files.cend(); ++path)
+        remove(path->c_str());
+}
+
+static inline void CheckGeneratedOutput()
+{
+    // Both commands always run (the diff is attempted even if the conversion failed);
+    // their results are only checked afterwards.
+    const int bam2samStatus = RunBam2Sam(generatedBamFn, generatedSamFn);
+    const int diffStatus = RunDiff(goldStandardSamFn, generatedSamFn);
+    EXPECT_EQ(0, bam2samStatus);
+    EXPECT_EQ(0, diffStatus);
+
+    Remove(generatedFiles);  // clean up the generated BAM/SAM whatever the outcome
+}
+
+}  // namespace EndToEndTests
+
+// sanity check for rest of tests below
+TEST(EndToEndTest, ReadAndWrite_PureHtslib)
+{
+    {  // scoped to force flush & close before conversion/diff
+
+        // open files (each handle is owned by a unique_ptr whose deleter is defined above)
+        std::unique_ptr<samFile, EndToEndTests::SamFileDeleter> inWrapper(
+            sam_open(EndToEndTests::inputBamFn.c_str(), "r"));
+        samFile* in = inWrapper.get();
+        ASSERT_TRUE(in);
+
+        std::unique_ptr<samFile, EndToEndTests::SamFileDeleter> outWrapper(
+            sam_open(EndToEndTests::generatedBamFn.c_str(), "wb"));
+        samFile* out = outWrapper.get();
+        ASSERT_TRUE(out);
+
+        // fetch & write header
+        std::unique_ptr<bam_hdr_t, EndToEndTests::BamHdrDeleter> headerWrapper(sam_hdr_read(in));
+        bam_hdr_t* hdr = headerWrapper.get();
+        ASSERT_TRUE(hdr);
+        ASSERT_EQ(0, sam_hdr_write(out, hdr));
+
+        // fetch & write records
+        std::unique_ptr<bam1_t, EndToEndTests::Bam1Deleter> record(bam_init1());
+        bam1_t* b = record.get();
+        ASSERT_TRUE(b);
+
+        // sam_read1(): >= 0 record read, -1 end of stream, < -1 error
+        int readRet = 0;
+        while ((readRet = sam_read1(in, hdr, b)) >= 0) {
+            // report a failed write here instead of leaving it to the SAM diff below
+            EXPECT_GE(sam_write1(out, hdr, b), 0);
+        }
+        EXPECT_EQ(-1, readRet);  // the loop must have stopped at end of stream, not on an error
+    }
+
+    EndToEndTests::CheckGeneratedOutput();
+}
+
+TEST(EndToEndTest, ReadAndWrite_SingleThread)
+{
+    // copy the input BAM; the last BamWriter argument (1) is presumably the thread count
+    EXPECT_NO_THROW({
+        BamFile input(EndToEndTests::inputBamFn);
+
+        // 'output' goes out of scope at the end of this block, before the result is checked
+        BamWriter output(EndToEndTests::generatedBamFn, input.Header(),
+                         BamWriter::DefaultCompression, 1);
+
+        EntireFileQuery allRecords(input);
+        for (const BamRecord& rec : allRecords) {
+            output.Write(rec);
+        }
+    });
+
+    EndToEndTests::CheckGeneratedOutput();
+}
+
+TEST(EndToEndTest, ReadAndWrite_APIDefaultThreadCount)
+{
+    // copy the input BAM, leaving compression level and thread count at BamWriter's defaults
+    EXPECT_NO_THROW({
+        BamFile input(EndToEndTests::inputBamFn);
+
+        // 'output' goes out of scope at the end of this block, before the result is checked
+        BamWriter output(EndToEndTests::generatedBamFn, input.Header());
+
+        EntireFileQuery allRecords(input);
+        for (const BamRecord& rec : allRecords) {
+            output.Write(rec);
+        }
+    });
+
+    EndToEndTests::CheckGeneratedOutput();
+}
+
+TEST(EndToEndTest, ReadAndWrite_SystemDefaultThreadCount)
+{
+    // copy the input BAM; a last argument of 0 presumably selects the system's thread count
+    EXPECT_NO_THROW({
+        BamFile input(EndToEndTests::inputBamFn);
+
+        // 'output' goes out of scope at the end of this block, before the result is checked
+        BamWriter output(EndToEndTests::generatedBamFn, input.Header(),
+                         BamWriter::DefaultCompression, 0);
+
+        EntireFileQuery allRecords(input);
+        for (const BamRecord& rec : allRecords) {
+            output.Write(rec);
+        }
+    });
+
+    EndToEndTests::CheckGeneratedOutput();
+}
+
+TEST(EndToEndTest, ReadAndWrite_UserThreadCount)
+{
+    // copy the input BAM; the last BamWriter argument (3) is presumably the thread count
+    EXPECT_NO_THROW({
+        BamFile input(EndToEndTests::inputBamFn);
+
+        // 'output' goes out of scope at the end of this block, before the result is checked
+        BamWriter output(EndToEndTests::generatedBamFn, input.Header(),
+                         BamWriter::DefaultCompression, 3);
+
+        EntireFileQuery allRecords(input);
+        for (const BamRecord& rec : allRecords) {
+            output.Write(rec);
+        }
+    });
+
+    EndToEndTests::CheckGeneratedOutput();
+}
diff --git a/tests/src/test_EntireFileQuery.cpp b/tests/src/test_EntireFileQuery.cpp

new file mode 100644 (file)

index 0000000..f2f014a
--- /dev/null
+++ b/tests/src/test_EntireFileQuery.cpp
@@ -0,0 +1,106 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamWriter.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/Unused.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace EntireFileQueryTests {
+
+const std::string inputBamFn = PbbamTestsConfig::Data_Dir + "/aligned.bam";
+
+}  // namespace EntireFileQueryTests
+
+TEST(EntireFileQueryTest, CountRecords)
+{
+    // iterating the input BAM through const references is expected to visit 4 records
+    EXPECT_NO_THROW({
+        BamFile input(EntireFileQueryTests::inputBamFn);
+        EntireFileQuery allRecords(input);
+        int count = 0;
+        for (const BamRecord& rec : allRecords) {
+            UNUSED(rec);
+            ++count;
+        }
+        EXPECT_EQ(4, count);
+    });
+}
+
+TEST(EntireFileQueryTest, NonConstBamRecord)
+{
+    // same count as CountRecords, but binding each record to a non-const reference
+    EXPECT_NO_THROW({
+        BamFile input(EntireFileQueryTests::inputBamFn);
+        EntireFileQuery allRecords(input);
+        int count = 0;
+        for (BamRecord& rec : allRecords) {
+            UNUSED(rec);
+            ++count;
+        }
+        EXPECT_EQ(4, count);
+    });
+}
+
+TEST(BamRecordTest, HandlesDeletionOK)
+{
+    // this file raised no error in Debug mode, but segfaulted when
+    // trying to access the aligned qualities in Release mode
+
+    // Expected FASTQ strings for the single record, whose CIGAR is 1=1D98=:
+    //   raw     - 'I' for each of the 99 (1 + 98) sequenced bases
+    //   aligned - the same with '!' filling the deleted position, 100 characters in total
+    //   (spelled out via std::string(n, 'I') instead of long literals)
+    const std::string rawExpected(99, 'I');
+    const std::string alignedExpected = "I!" + std::string(98, 'I');
+
+    const std::string problemBamFn = PbbamTestsConfig::Data_Dir + "/segfault.bam";
+    BamFile bamFile(problemBamFn);
+    EntireFileQuery entireFile(bamFile);
+
+    int count = 0;
+    for (const BamRecord& record : entireFile) {
+
+        // judging by the variable names, the flag selects raw (false) vs aligned (true) output
+        const auto rawQualities = record.Qualities(Orientation::GENOMIC, false);
+        const auto alignedQualities = record.Qualities(Orientation::GENOMIC, true);
+
+        EXPECT_EQ(rawExpected, rawQualities.Fastq());
+        EXPECT_EQ(alignedExpected, alignedQualities.Fastq());
+
+        ++count;
+    }
+
+    // exactly one record is expected in the file
+    EXPECT_EQ(1, count);
+}
+
+TEST(BamRecordTest, ReferenceName)
+{
+    {  // mapped case: the first record of aligned.bam reports its reference name
+        const std::string mappedBamFn = PbbamTestsConfig::Data_Dir + "/aligned.bam";
+        BamFile mappedFile(mappedBamFn);
+        EntireFileQuery mappedQuery(mappedFile);
+        auto cursor = mappedQuery.begin();  // kept as a named object while 'first' is in use
+        auto& first = *cursor;
+        ASSERT_TRUE(first.IsMapped());
+        EXPECT_EQ("lambda_NEB3011", first.ReferenceName());
+    }
+
+    {  // unmapped records have no reference name, should throw
+        const std::string unmappedBamFn = PbbamTestsConfig::Data_Dir + "/unmap1.bam";
+        BamFile unmappedFile(unmappedBamFn);
+        EntireFileQuery unmappedQuery(unmappedFile);
+        auto cursor = unmappedQuery.begin();  // kept as a named object while 'first' is in use
+        auto& first = *cursor;
+        ASSERT_FALSE(first.IsMapped());
+        EXPECT_THROW(first.ReferenceName(), std::runtime_error);
+    }
+}
diff --git a/tests/src/test_Fasta.cpp b/tests/src/test_Fasta.cpp

new file mode 100644 (file)

index 0000000..d355c5a
--- /dev/null
+++ b/tests/src/test_Fasta.cpp
@@ -0,0 +1,128 @@
+// Author: Derek Barnett
+
+#include <cstddef>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/FastaReader.h>
+#include <pbbam/FastaSequence.h>
+#include <pbbam/FastaSequenceQuery.h>
+#include <pbbam/Unused.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace FastaTests {
+// Checks seq against the expected name/bases of record `index` (0-2); other indices fail.
+static void CheckSequence(const size_t index, const FastaSequence& seq)
+{
+    SCOPED_TRACE("checking FASTA seq:" + std::to_string(index));
+    switch (index) {
+        case 0:
+            EXPECT_EQ("1", seq.Name());
+            EXPECT_EQ(
+                "TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACAACGCAGCTCCG"
+                "CCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAACTCCGCCGGCGCAGGCG",
+                seq.Bases());
+            break;
+
+        case 1:
+            EXPECT_EQ("2", seq.Name());
+            EXPECT_EQ(
+                "TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACAACGCAGCTCCG"
+                "CCCTCGCGGTGCTCTCCGGGTCTGTGCTGAGGAGAACGCAAC",
+                seq.Bases());
+            break;
+
+        case 2:
+            EXPECT_EQ("3", seq.Name());
+            EXPECT_EQ(
+                "TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACACCCTAACCCCA"
+                "ACCCCAACCCCAACCCCAACCCCAACCCCAACCCTAACCCCTAACCCTAACCCT",
+                seq.Bases());
+            break;
+
+        default:
+            ASSERT_TRUE(false);  // invalid index: only records 0-2 are known
+    }
+}
+
+}  // namespace FastaTests
+
+TEST(FastaSequenceTest, BasicConstructorOk)
+{
+    const FastaSequence fasta{"1", "GATTACA"};  // name first, then bases
+    EXPECT_EQ("1", fasta.Name());
+    EXPECT_EQ("GATTACA", fasta.Bases());
+}
+
+TEST(FastaReaderTest, IterableOk)
+{
+    // read normal.fa record-by-record and validate each against the expected name/bases
+    const std::string fn = PbbamTestsConfig::GeneratedData_Dir + "/normal.fa";
+    FastaReader reader{fn};
+
+    size_t count = 0;
+    FastaSequence seq;
+    while (reader.GetNext(seq)) {
+        FastaTests::CheckSequence(count++, seq);
+    }
+    EXPECT_EQ(size_t{3}, count);  // unsigned literal: avoids a signed/unsigned comparison
+}
+
+TEST(FastaReaderTest, ReadAllOk)
+{
+    // iterate the collection returned by FastaReader::ReadAll() and validate every record
+    const std::string fn = PbbamTestsConfig::GeneratedData_Dir + "/normal.fa";
+
+    size_t count = 0;
+    for (const auto& seq : FastaReader::ReadAll(fn)) {
+        FastaTests::CheckSequence(count++, seq);
+    }
+    EXPECT_EQ(size_t{3}, count);  // unsigned literal: avoids a signed/unsigned comparison
+}
+
+TEST(FastaSequenceQueryTest, FromFastaFilename)
+{
+    const std::string fn = PbbamTestsConfig::Data_Dir + "/lambdaNEB.fa";
+
+    {  // the query is iterable and yields exactly one sequence for this file
+        size_t count = 0;
+        FastaSequenceQuery query{fn};
+        for (const auto& seq : query) {
+            UNUSED(seq);
+            ++count;
+        }
+        EXPECT_EQ(size_t{1}, count);  // unsigned literal: avoids a signed/unsigned comparison
+    }
+
+    {  // the first sequence carries the expected name
+        FastaSequenceQuery query{fn};
+        const auto first = query.cbegin();
+        const auto& seq = *first;
+        EXPECT_EQ("lambda_NEB3011", seq.Name());
+    }
+}
+
+TEST(FastaSequenceQueryTest, FromDataSet)
+{
+    const std::string fn = PbbamTestsConfig::Data_Dir + "/referenceset.xml";
+
+    {  // a dataset XML is accepted in place of a FASTA filename; count what it yields
+        size_t count = 0;
+        FastaSequenceQuery query{fn};
+        for (const auto& seq : query) {
+            UNUSED(seq);
+            ++count;
+        }
+        EXPECT_EQ(size_t{5}, count);  // 1 from lambda, 4 from chimera (unsigned literal)
+    }
+    {  // the first sequence carries the expected name
+        FastaSequenceQuery query{fn};
+        const auto first = query.cbegin();
+        const auto& seq = *first;
+        EXPECT_EQ("lambda_NEB3011", seq.Name());
+    }
+}
diff --git a/tests/src/test_Fastq.cpp b/tests/src/test_Fastq.cpp

new file mode 100644 (file)

index 0000000..8b58485
--- /dev/null
+++ b/tests/src/test_Fastq.cpp
@@ -0,0 +1,108 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+#include <cstddef>
+#include <cstdint>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/FastqReader.h>
+#include <pbbam/FastqSequence.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace FastqTests {
+// Checks seq against the expected name/bases/qualities of record `index` (0-2).
+static void CheckSequence(const size_t index, const FastqSequence& seq)
+{
+    SCOPED_TRACE("checking Fastq seq:" + std::to_string(index));
+    switch (index) {
+        case 0:
+            EXPECT_EQ("1", seq.Name());
+            EXPECT_EQ(
+                "TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTA"
+                "ACCCTAACCCTAACAACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGA"
+                "GGAGAACGCAACTCCGCCGGCGCAGGCG",
+                seq.Bases());
+            EXPECT_EQ(
+                "[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[["
+                "[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[["
+                "[[[[[[[[[[[[[[[[[[[[[[[[[[[[",
+                seq.Qualities().Fastq());
+            break;
+
+        case 1:
+            EXPECT_EQ("2", seq.Name());
+            EXPECT_EQ(
+                "TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTA"
+                "ACCCTAACCCTAACAACGCAGCTCCGCCCTCGCGGTGCTCTCCGGGTCTGTGCTGA"
+                "GGAGAACGCAAC",
+                seq.Bases());
+            EXPECT_EQ(
+                "[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[["
+                "[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[["
+                "[[[[[[[[[[[[",
+                seq.Qualities().Fastq());
+            break;
+
+        case 2:
+            EXPECT_EQ("3", seq.Name());
+            EXPECT_EQ(
+                "TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTA"
+                "ACCCTAACCCTAACACCCTAACCCCAACCCCAACCCCAACCCCAACCCCAACCCCA"
+                "ACCCTAACCCCTAACCCTAACCCT",
+                seq.Bases());
+            EXPECT_EQ(
+                "]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]"
+                "]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]]"
+                "]]]]]]]]]]]]]]]]]]]]]]]]",
+                seq.Qualities().Fastq());
+            break;
+
+        default:
+            ASSERT_TRUE(false);  // invalid index: only records 0-2 are known
+    }
+}
+
+}  // namespace FastqTests
+
+TEST(FastqSequenceTest, BasicConstructorsOk)
+{
+    const FastqSequence fromText{"1", "GATTACA", "[[[[[[["};  // qualities as FASTQ text
+    EXPECT_EQ("1", fromText.Name());
+    EXPECT_EQ("GATTACA", fromText.Bases());
+    EXPECT_EQ("[[[[[[[", fromText.Qualities().Fastq());
+
+    const std::vector<uint8_t> quals(7, 58);  // seven raw values, expected to print as '['
+    const FastqSequence fromValues{"1", "GATTACA", QualityValues{quals}};
+    EXPECT_EQ("1", fromValues.Name());
+    EXPECT_EQ("GATTACA", fromValues.Bases());
+    EXPECT_EQ("[[[[[[[", fromValues.Qualities().Fastq());
+}
+
+TEST(FastqReaderTest, IterableOk)
+{
+    // read normal.fq record-by-record and validate name, bases and qualities of each
+    const std::string fn = PbbamTestsConfig::GeneratedData_Dir + "/normal.fq";
+    FastqReader reader{fn};
+
+    size_t count = 0;
+    FastqSequence seq;
+    while (reader.GetNext(seq)) {
+        FastqTests::CheckSequence(count++, seq);
+    }
+    EXPECT_EQ(size_t{3}, count);  // unsigned literal: avoids a signed/unsigned comparison
+}
+
+TEST(FastqReaderTest, ReadAllOk)
+{
+    // iterate the collection returned by FastqReader::ReadAll() and validate every record
+    const std::string fn = PbbamTestsConfig::GeneratedData_Dir + "/normal.fq";
+
+    size_t count = 0;
+    for (const auto& seq : FastqReader::ReadAll(fn)) {
+        FastqTests::CheckSequence(count++, seq);
+    }
+    EXPECT_EQ(size_t{3}, count);  // unsigned literal: avoids a signed/unsigned comparison
+}
diff --git a/tests/src/test_FileUtils.cpp b/tests/src/test_FileUtils.cpp

new file mode 100644 (file)

index 0000000..c6ef184
--- /dev/null
+++ b/tests/src/test_FileUtils.cpp
@@ -0,0 +1,286 @@
+// Author: Derek Barnett
+
+#include <cctype>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include <boost/algorithm/string.hpp>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/../../src/FileUtils.h>
+#include <pbbam/../../src/TimeUtils.h>
+#include <pbbam/Unused.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+
+TEST(FileUtilsTest, ExistsOk)
+{
+    EXPECT_FALSE(FileUtils::Exists("does_not_exist.txt"));
+    // create a file via the shell's 'touch', then expect Exists() to report it
+    const std::string tmpFn = PbbamTestsConfig::GeneratedData_Dir + "/pbbam_exists_check.tmp";
+    const std::string touchCmd = "touch " + tmpFn;
+    ASSERT_EQ(0, system(touchCmd.c_str()));
+    EXPECT_TRUE(FileUtils::Exists(tmpFn));
+}
+
+TEST(FileUtilsTest, LastModifiedOk)
+{
+    // Approximate check (short of a full 'mock' filesystem): a file (re)created after the clock
+    // was sampled must not report a modification time earlier than that sample.
+    // Compared in whole seconds; finer resolution was not obtainable when this was written
+    // (on OSX 10.9/clang at least, st_mtimespec.tv_nsec is always zero).
+    using std::chrono::duration_cast;
+    using std::chrono::seconds;
+
+    // sample the clock BEFORE the file is touched
+    const auto nowSeconds = duration_cast<seconds>(CurrentTime().time_since_epoch()).count();
+
+    const std::string tmp = PbbamTestsConfig::GeneratedData_Dir + "/pbbam_lastmod_check.tmp";
+    const std::string rmCmd = "rm " + tmp;
+    const std::string touchCmd = "touch  " + tmp;
+    const auto rmStatus = system(rmCmd.c_str());  // outcome deliberately ignored
+    UNUSED(rmStatus);
+    ASSERT_EQ(0, system(touchCmd.c_str()));
+
+    const auto stampSeconds =
+        duration_cast<seconds>(FileUtils::LastModified(tmp).time_since_epoch()).count();
+
+    EXPECT_LE(nowSeconds, stampSeconds);
+}
+
+TEST(FileUtilsTest, ResolvedFilePathOk)
+{
+    const std::string testFrom = "/path/to/myDir";
+
+    // "raw" filenames - no URI scheme
+    // (each is resolved against testFrom, and again with the default 'from' argument)
+    const std::string absolutePath = "/absolute/path/to/file.txt";
+    const std::string relativePath = "../relative/path/to/file.txt";
+    const std::string noPathFn = "file.txt";
+
+    const std::string resolvedAbsolutePath = FileUtils::ResolvedFilePath(absolutePath, testFrom);
+    const std::string resolvedRelativePath = FileUtils::ResolvedFilePath(relativePath, testFrom);
+    const std::string resolvedNoPath = FileUtils::ResolvedFilePath(noPathFn, testFrom);
+    const std::string resolvedAbsolutePath_defaultFrom = FileUtils::ResolvedFilePath(absolutePath);
+    const std::string resolvedRelativePath_defaultFrom = FileUtils::ResolvedFilePath(relativePath);
+    const std::string resolvedNoPath_defaultFrom = FileUtils::ResolvedFilePath(noPathFn);
+
+    EXPECT_EQ("/absolute/path/to/file.txt", resolvedAbsolutePath);
+    EXPECT_EQ("/path/to/myDir/../relative/path/to/file.txt", resolvedRelativePath);
+    EXPECT_EQ("/path/to/myDir/file.txt", resolvedNoPath);
+    // default 'from': relative results are expected to be prefixed with "./"
+    EXPECT_EQ("/absolute/path/to/file.txt", resolvedAbsolutePath_defaultFrom);
+    EXPECT_EQ("./../relative/path/to/file.txt", resolvedRelativePath_defaultFrom);
+    EXPECT_EQ("./file.txt", resolvedNoPath_defaultFrom);
+
+    // filenames with URI scheme ("file://")
+    // (expected results are the same as for the scheme-less inputs above)
+    const std::string absoluteSchemeFn = "file:///absolute/path/to/file.txt";
+    const std::string relativeSchemeFn = "file://../relative/path/to/file.txt";
+    const std::string noPathSchemeFn = "file://file.txt";
+
+    const std::string resolvedAbsoluteSchemePath =
+        FileUtils::ResolvedFilePath(absoluteSchemeFn, testFrom);
+    const std::string resolvedRelativeSchemePath =
+        FileUtils::ResolvedFilePath(relativeSchemeFn, testFrom);
+    const std::string resolvedNoPathSchemeFn =
+        FileUtils::ResolvedFilePath(noPathSchemeFn, testFrom);
+    const std::string resolvedAbsoluteSchemePath_defaultFrom =
+        FileUtils::ResolvedFilePath(absoluteSchemeFn);
+    const std::string resolvedRelativeSchemePath_defaultFrom =
+        FileUtils::ResolvedFilePath(relativeSchemeFn);
+    const std::string resolvedNoPathSchemeFn_defaultFrom =
+        FileUtils::ResolvedFilePath(noPathSchemeFn);
+
+    EXPECT_EQ("/absolute/path/to/file.txt", resolvedAbsoluteSchemePath);
+    EXPECT_EQ("/path/to/myDir/../relative/path/to/file.txt", resolvedRelativeSchemePath);
+    EXPECT_EQ("/path/to/myDir/file.txt", resolvedNoPathSchemeFn);
+    // default 'from': relative results are expected to be prefixed with "./"
+    EXPECT_EQ("/absolute/path/to/file.txt", resolvedAbsoluteSchemePath_defaultFrom);
+    EXPECT_EQ("./../relative/path/to/file.txt", resolvedRelativeSchemePath_defaultFrom);
+    EXPECT_EQ("./file.txt", resolvedNoPathSchemeFn_defaultFrom);
+}
+
+TEST(FileUtilsTest, SizeOk)
+{
+    const std::string emptyFn = PbbamTestsConfig::GeneratedData_Dir + "/pbbam_empty_file.tmp";
+    const std::string touchCmd = "touch " + emptyFn;
+    ASSERT_EQ(0, system(touchCmd.c_str()));
+    EXPECT_EQ(0, FileUtils::Size(emptyFn));  // the touched file is expected to be empty
+    // a missing file has no size to report: Size() must throw
+    EXPECT_THROW(FileUtils::Size("does_not_exist.txt"), std::runtime_error);
+}
+
+// ####################################################################################################
+// The code below is part of a simple check whether or not a (Windows-only) file path is absolute.
+//
+// NOTE: (and this is admittedly brittle for maintenance, but) the internal methods used are literally
+// copied here for direct driving. There's likely a better way going forward, than the manual copy/paste.
+// But in the absence of a similar runtime environment to build in & test against, while
+// the motivating behavior is blocking other work, this lets me get the fix in their hands ASAP and still
+// have some test code poking it beforehand. -DB
+//
+namespace test_windows {
+
+// Strips a leading "file://" from uri; throws std::runtime_error if the scheme occurs elsewhere.
+static std::string removeFileUriScheme(const std::string& uri)
+{
+    assert(!uri.empty());
+
+    const std::string fileScheme{"file://"};
+    const auto schemePos = uri.find(fileScheme);
+
+    // no scheme anywhere: nothing to strip
+    if (schemePos == std::string::npos) return uri;
+    if (schemePos != 0) throw std::runtime_error("Malformed URI: scheme not at beginning");
+    return uri.substr(fileScheme.size());
+}
+
+// Returns filePath without a leading "<letter>:" drive prefix; other paths are returned as-is.
+static std::string removeDiskName(const std::string& filePath)
+{
+    // cast first: passing a negative plain char to isalpha() is undefined behavior
+    const bool hasDrive = filePath.size() >= 2 && filePath[1] == ':' &&
+                          std::isalpha(static_cast<unsigned char>(filePath[0])) != 0;
+    return hasDrive ? filePath.substr(2) : filePath;
+}
+
+static const char native_pathSeparator = '\\';  // separator used when joining Windows paths
+
+// Windows-style check: true when filePath starts with a backslash, optionally after a
+// "<letter>:" drive prefix; a bare drive such as "C:" is reported as not absolute.
+static bool native_pathIsAbsolute(const std::string& filePath)
+{
+    assert(!filePath.empty());
+
+    // if starts with single slash or double slash [cases 1,3]
+    if (boost::algorithm::starts_with(filePath, "\\")) return true;
+    // if starts with single or double-dots -> not absolute [case 4 + ".\file.txt"]
+    if (boost::algorithm::starts_with(filePath, ".")) return false;
+
+    // drive name and colon ("C:\foo\bar.txt"): decide on whatever follows the drive.
+    // the cast keeps isalpha() well-defined for chars with a negative value
+    if (filePath.size() >= 2 && filePath[1] == ':' &&
+        std::isalpha(static_cast<unsigned char>(filePath[0])) != 0) {
+        const std::string afterDrive = filePath.substr(2);
+        return !afterDrive.empty() && native_pathIsAbsolute(afterDrive);  // "C:" -> relative
+    }
+    return false;  // otherwise, likely relative
+}
+
+// Resolves a Windows-style filePath against the parent directory 'from'.
+//
+// A leading "file://" scheme is stripped first. A result that is empty or already absolute is
+// returned as-is; any other path loses its disk name and a leading ".\" before it is appended
+// to 'from' with a single separator.
+static std::string native_resolvedFilePath(const std::string& filePath, const std::string& from)
+{
+    // strip file:// scheme if present
+    std::string path = removeFileUriScheme(filePath);
+
+    // empty or absolute: hand it back untouched.
+    // testing for empty up front keeps the parsing below simple
+    if (path.empty() || native_pathIsAbsolute(path)) return path;
+
+    // relative: the path gets re-rooted under 'from', so pop the disk name first
+    path = removeDiskName(path);
+
+    // then pop a leading ".\" (a dot directly followed by the separator). 'from' is prepended
+    // anyway, so the prefix is redundant; when from == "." this merely removes it to add it
+    // right back, but it keeps the result consistent with other 'from' parent directories
+    const bool startsWithDot = !path.empty() && path.front() == '.';
+    const bool separatorIsSecond = (path.find(native_pathSeparator) == 1);
+    if (startsWithDot && separatorIsSecond) path.erase(0, 2);
+
+    return from + native_pathSeparator + path;
+}
+
+}  // namespace test_windows
+
+TEST(FileUtilsTest, WindowsPathsOk)
+{
+    {  // remove disk name
+        // only a leading "<letter>:" is stripped; everything else is left as-is
+        // "C:\tmp.txt"
+        std::string f1 = "C:\\tmp.txt";
+        EXPECT_EQ(std::string("\\tmp.txt"), test_windows::removeDiskName(f1));
+
+        // "C:tmp.txt"
+        std::string f2 = "C:tmp.txt";
+        EXPECT_EQ(std::string("tmp.txt"), test_windows::removeDiskName(f2));
+
+        // "\tmp.txt"
+        std::string f3 = "\\tmp.txt";
+        EXPECT_EQ(f3, test_windows::removeDiskName(f3));
+
+        // "tmp.txt"
+        std::string f4 = "tmp.txt";
+        EXPECT_EQ(f4, test_windows::removeDiskName(f4));
+    }
+
+    {  // isAbsolute ?
+        // absolute means: starts with a backslash, optionally after a drive name
+        // "\\server\path\to\tmp.txt"
+        EXPECT_TRUE(test_windows::native_pathIsAbsolute("\\\\server\\path\\to\\tmp.txt"));
+
+        // "..\tmp.txt"
+        EXPECT_FALSE(test_windows::native_pathIsAbsolute("..\\tmp.txt"));
+
+        // ".\tmp.txt"
+        EXPECT_FALSE(test_windows::native_pathIsAbsolute(".\\tmp.txt"));
+
+        // "C:\path\to\tmp.txt"
+        EXPECT_TRUE(test_windows::native_pathIsAbsolute("C:\\path\\to\\tmp.txt"));
+
+        // "C:..\path\to\tmp.txt"
+        EXPECT_FALSE(test_windows::native_pathIsAbsolute("C:..\\path\\to\\tmp.txt"));
+    }
+
+    {  // resolve file path
+        // absolute inputs come back unchanged; relative ones are re-rooted under myRootDir
+        const std::string myRootDir = "C:\\path\\to\\myRootDir";
+
+        // "\\server\path\to\tmp.txt"
+        const std::string fn1 = "\\\\server\\path\\to\\tmp.txt";
+        const std::string fn1_expected = fn1;
+        EXPECT_EQ(fn1_expected, test_windows::native_resolvedFilePath(fn1, myRootDir));
+
+        // "..\tmp.txt"
+        const std::string fn2 = "..\\tmp.txt";
+        const std::string fn2_expected = "C:\\path\\to\\myRootDir\\..\\tmp.txt";
+        EXPECT_EQ(fn2_expected, test_windows::native_resolvedFilePath(fn2, myRootDir));
+
+        // ".\tmp.txt"
+        const std::string fn3 = ".\\tmp.txt";
+        const std::string fn3_expected = "C:\\path\\to\\myRootDir\\tmp.txt";
+        EXPECT_EQ(fn3_expected, test_windows::native_resolvedFilePath(fn3, myRootDir));
+
+        // "C:\path\to\tmp.txt"
+        const std::string fn4 = "C:\\path\\to\\tmp.txt";
+        const std::string fn4_expected = fn4;
+        EXPECT_EQ(fn4_expected, test_windows::native_resolvedFilePath(fn4, myRootDir));
+
+        // "C:..\path\to\tmp.txt"
+        const std::string fn5 = "C:..\\path\\to\\tmp.txt";
+        const std::string fn5_expected = "C:\\path\\to\\myRootDir\\..\\path\\to\\tmp.txt";
+        EXPECT_EQ(fn5_expected, test_windows::native_resolvedFilePath(fn5, myRootDir));
+
+        // "C:tmp.txt"
+        const std::string fn6 = "C:tmp.txt";
+        const std::string fn6_expected = "C:\\path\\to\\myRootDir\\tmp.txt";
+        EXPECT_EQ(fn6_expected, test_windows::native_resolvedFilePath(fn6, myRootDir));
+        // "C:tmp.txt" has no directory part, so it must resolve exactly like fn3's ".\tmp.txt"
+        EXPECT_EQ(fn3_expected,
+                  test_windows::native_resolvedFilePath(fn6, myRootDir));
+    }
+}
+//
+// ####################################################################################################
diff --git a/tests/src/test_Frames.cpp b/tests/src/test_Frames.cpp

new file mode 100644 (file)

index 0000000..82cb3a6
--- /dev/null
+++ b/tests/src/test_Frames.cpp
@@ -0,0 +1,46 @@
+// Author: Derek Barnett
+
+#include <cstdint>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/Frames.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace FramesTests {
+// raw 16-bit frame values fed to the Frames tests
+static const std::vector<uint16_t> testFrames{
+    0,  8,  140, 0,  0,   7,  4,  0,  85, 2,  1,  3,  2,   10, 1,  20, 47,   10,  9,  60, 20,
+    3,  12, 5,   13, 165, 6,  14, 22, 12, 2,  4,  9,  218, 27, 3,  15, 2,    17,  2,  45, 24,
+    89, 10, 7,   1,  11,  15, 0,  7,  0,  28, 17, 12, 6,   10, 37, 0,  12,   52,  0,  7,  1,
+    14, 3,  26,  12, 0,   20, 17, 2,  13, 2,  9,  13, 7,   15, 29, 3,  6,    2,   1,  28, 10,
+    3,  14, 7,   1,  22,  1,  6,  6,  0,  19, 31, 6,  2,   14, 0,  0,  1000, 947, 948};
+// the 8-bit result the Encoded test expects from Frames::Encode() applied to testFrames
+static const std::vector<uint8_t> encodedFrames{
+    0,  8,  102, 0,  0,   7,  4,  0,  75, 2,  1,  3,  2,   10, 1,  20, 47,  10,  9,  60, 20,
+    3,  12, 5,   13, 115, 6,  14, 22, 12, 2,  4,  9,  135, 27, 3,  15, 2,   17,  2,  45, 24,
+    77, 10, 7,   1,  11,  15, 0,  7,  0,  28, 17, 12, 6,   10, 37, 0,  12,  52,  0,  7,  1,
+    14, 3,  26,  12, 0,   20, 17, 2,  13, 2,  9,  13, 7,   15, 29, 3,  6,   2,   1,  28, 10,
+    3,  14, 7,   1,  22,  1,  6,  6,  0,  19, 31, 6,  2,   14, 0,  0,  255, 254, 255};
+
+}  // namespace FramesTests
+
+TEST(FramesTest, Constructors)
+{
+    const Frames emptyFrames;  // default-constructed: expected to hold no data
+    ASSERT_TRUE(emptyFrames.Data().empty());
+
+    // constructing from a vector must hand the same values back via Data()
+    const Frames filled(FramesTests::testFrames);
+    ASSERT_EQ(FramesTests::testFrames, filled.Data());
+}
+
+TEST(FramesTest, Encoded)
+{
+    // Encode() must turn the 16-bit test frames into the expected 8-bit codes
+    const Frames frames(FramesTests::testFrames);
+    ASSERT_EQ(FramesTests::encodedFrames, frames.Encode());
+}
diff --git a/tests/src/test_GenomicIntervalQuery.cpp b/tests/src/test_GenomicIntervalQuery.cpp

new file mode 100644 (file)

index 0000000..91353df
--- /dev/null
+++ b/tests/src/test_GenomicIntervalQuery.cpp
@@ -0,0 +1,124 @@
+// Author: Derek Barnett
+
+#include <iostream>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/GenomicIntervalQuery.h>
+#include <pbbam/Unused.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace GenomicIntervalQueryTests {  // NOTE(review): confirm Data_Dir is initialized first
+const std::string inputBamFn = PbbamTestsConfig::Data_Dir + "/aligned.bam";
+}  // namespace GenomicIntervalQueryTests
+
+TEST(GenomicIntervalQueryTest, ReuseQueryAndCountRecords)
+{
+    const std::string rname = "lambda_NEB3011";
+    // a single query object is reused below for several intervals in turn
+    BamFile bamFile(GenomicIntervalQueryTests::inputBamFn);
+
+    // setup with normal interval
+    int count = 0;
+    GenomicInterval interval(rname, 5000, 6000);
+    GenomicIntervalQuery query(interval, bamFile);
+    for (const BamRecord& record : query) {
+        UNUSED(record);
+        ++count;
+    }
+    EXPECT_EQ(2, count);
+
+    // adjust interval and pass back in
+    count = 0;
+    interval.Start(9300);
+    interval.Stop(9400);
+    query.Interval(interval);
+    for (const BamRecord& record : query) {
+        UNUSED(record);
+        ++count;
+    }
+    EXPECT_EQ(2, count);
+
+    // adjust again (empty region)
+    count = 0;
+    interval.Name(rname);
+    interval.Start(1000);
+    interval.Stop(2000);
+    query.Interval(interval);
+    for (const BamRecord& record : query) {
+        UNUSED(record);
+        ++count;
+    }
+    EXPECT_EQ(0, count);
+
+    // unknown ref: setting the interval is expected to throw
+    count = 0;
+    interval.Name("does not exist");
+    interval.Start(0);
+    interval.Stop(100);
+    EXPECT_THROW(query.Interval(interval), std::runtime_error);
+    for (const BamRecord& record : query) {  // iteration is still safe, just returns no data
+        UNUSED(record);
+        ++count;
+    }
+    EXPECT_EQ(0, count);
+
+    // adjust again - make sure we can read a real region after an invalid one
+    interval.Name(rname);
+    interval.Start(5000);
+    interval.Stop(6000);
+    query.Interval(interval);
+    count = 0;
+    for (const BamRecord& record : query) {
+        UNUSED(record);
+        ++count;
+    }
+    EXPECT_EQ(2, count);  // same count as the first pass over this interval
+}
+
+TEST(GenomicIntervalQueryTest, NonConstBamRecord)
+{
+    // iterating with a mutable record reference must compile and must not throw
+    EXPECT_NO_THROW({
+        BamFile alignedFile(GenomicIntervalQueryTests::inputBamFn);
+        GenomicInterval region("lambda_NEB3011", 8000, 10000);
+        GenomicIntervalQuery regionQuery(region, alignedFile);
+        int numRecords = 0;
+        for (BamRecord& record : regionQuery) {
+            UNUSED(record);
+            ++numRecords;
+        }
+        EXPECT_EQ(2, numRecords);
+    });
+}
+
+TEST(GenomicIntervalQueryTest, MissingBaiShouldThrow)
+{
+    // constructing an interval query is expected to throw when any input BAM lacks a BAI
+    GenomicInterval interval("lambda_NEB3011", 0, 100);
+    const std::string noBaiBam = PbbamTestsConfig::Data_Dir + "/phi29.bam";
+    const std::string hasBaiBam = PbbamTestsConfig::Data_Dir + "/aligned.bam";
+    {  // single file, missing BAI
+        EXPECT_THROW(GenomicIntervalQuery query(interval, noBaiBam), std::runtime_error);
+    }
+
+    {  // from dataset, all missing BAI
+        DataSet ds;
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", noBaiBam));
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", noBaiBam));
+        EXPECT_THROW(GenomicIntervalQuery query(interval, ds), std::runtime_error);
+    }
+
+    {  // from dataset, mixed BAI presence
+        DataSet ds;
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", noBaiBam));
+        ds.ExternalResources().Add(
+            ExternalResource("PacBio.AlignmentFile.AlignmentBamFile", hasBaiBam));
+        EXPECT_THROW(GenomicIntervalQuery query(interval, ds), std::runtime_error);
+    }
+}
diff --git a/tests/src/test_IndexedBamWriter.cpp b/tests/src/test_IndexedBamWriter.cpp

new file mode 100644 (file)

index 0000000..44cb811
--- /dev/null
+++ b/tests/src/test_IndexedBamWriter.cpp
@@ -0,0 +1,61 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamFile.h>
+#include <pbbam/BamReader.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/IndexedBamWriter.h>
+#include <pbbam/PbiBuilder.h>
+#include <pbbam/PbiRawData.h>
+
+TEST(IndexedBamWriter, WritesValidIndex)
+{
+    using namespace PacBio::BAM;
+
+    // copies a BAM through IndexedBamWriter, then uses the PBI written alongside it to seek to
+    // every indexed record of the copy
+    const std::string inBam = PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam";
+    const std::string outBam = PbbamTestsConfig::GeneratedData_Dir + "/ibw.bam";
+    const std::string outPbi = PbbamTestsConfig::GeneratedData_Dir + "/ibw.bam.pbi";
+
+    const BamFile file{inBam};
+    const auto& header = file.Header();
+
+    // copy file & generate index; the writer lives in its own scope because
+    // closing that scope is what finalizes the BAM/PBI output
+    {
+        BamReader source{file};
+        IndexedBamWriter writer{outBam, header};
+
+        BamRecord record;
+        while (source.GetNext(record))
+            writer.Write(record);
+    }
+
+    // reports whether a record could be read, tagging the result with the offset's index
+    const auto canRead = [](BamReader& myReader, BamRecord& record,
+                            size_t loopI) -> ::testing::AssertionResult {
+        if (!myReader.GetNext(record)) return ::testing::AssertionFailure() << "i: " << loopI;
+        return ::testing::AssertionSuccess() << "i: " << loopI;
+    };
+
+    // check random access using PBI
+    {
+        const PbiRawData idx{outPbi};
+        const auto& offsets = idx.BasicData().fileOffset_;
+
+        BamReader reader{outBam};
+        BamRecord record;
+        for (size_t i = 0; i < offsets.size(); ++i) {
+            reader.VirtualSeek(offsets.at(i));
+            EXPECT_TRUE(canRead(reader, record, i));
+        }
+    }
+
+    // temp file cleanup
+    ::remove(outBam.c_str());
+    ::remove(outPbi.c_str());
+}
diff --git a/tests/src/test_IndexedFastaReader.cpp b/tests/src/test_IndexedFastaReader.cpp

new file mode 100644 (file)

index 0000000..c33f8a9
--- /dev/null
+++ b/tests/src/test_IndexedFastaReader.cpp
@@ -0,0 +1,203 @@
+// Author: Derek Barnett
+
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include "pbbam/BamFile.h"
+#include "pbbam/BamRecord.h"
+#include "pbbam/EntireFileQuery.h"
+#include "pbbam/IndexedFastaReader.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace IndexedFastaReaderTests {
+// input files used by the tests in this file
+const std::string lambdaFasta = PbbamTestsConfig::Data_Dir + "/lambdaNEB.fa";
+const std::string singleInsertionBam = PbbamTestsConfig::Data_Dir + "/aligned.bam";
+
+}  // namespace IndexedFastaReaderTests
+
+TEST(IndexedFastaReaderTests, PrintSingleInsertion)
+{
+    IndexedFastaReader r(IndexedFastaReaderTests::lambdaFasta);
+
+    // Open BAM file
+    BamFile bamFile(IndexedFastaReaderTests::singleInsertionBam);
+    EntireFileQuery bamQuery(bamFile);
+
+    auto it = bamQuery.begin();
+    auto record = *it++;
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+              r.ReferenceSubsequence(record, Orientation::NATIVE, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+              r.ReferenceSubsequence(record, Orientation::NATIVE, true, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+              r.ReferenceSubsequence(record, Orientation::GENOMIC, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+              r.ReferenceSubsequence(record, Orientation::GENOMIC, true, true));
+    record = *it++;
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+              r.ReferenceSubsequence(record, Orientation::NATIVE, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+              r.ReferenceSubsequence(record, Orientation::NATIVE, true, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+              r.ReferenceSubsequence(record, Orientation::GENOMIC, true));
+    EXPECT_EQ("GGCTGCAGTGTACAGCGGTCAGGAGGCC-ATTGATGCCGGACTGGCTGAT",
+              r.ReferenceSubsequence(record, Orientation::GENOMIC, true, true));
+    record = *it++;
+    EXPECT_EQ(
+        "----------------------------------------------------"
+        "AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA",
+        r.ReferenceSubsequence(record, Orientation::NATIVE, true));
+    EXPECT_EQ("AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA",
+              r.ReferenceSubsequence(record, Orientation::NATIVE, true, true));
+    EXPECT_EQ(
+        "----------------------------------------------------"
+        "AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA",
+        r.ReferenceSubsequence(record, Orientation::GENOMIC, true));
+    EXPECT_EQ("AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA",
+              r.ReferenceSubsequence(record, Orientation::GENOMIC, true, true));
+    record = *it++;
+    EXPECT_EQ(
+        "AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA-----------------------------"
+        "-----------------------",
+        r.ReferenceSubsequence(record, Orientation::GENOMIC, true));
+    EXPECT_EQ(
+        "----------------------------------------------------TTGCCGCTGTT-"
+        "ACCGTGCTGCGATCTTCTGCCATCGACGGACGTCCCACATTGGTGACTT",
+        r.ReferenceSubsequence(record, Orientation::NATIVE, true));
+    EXPECT_EQ("AAGTCACCAATGTGGGACGTCCGTCGATGGCAGAAGATCGCAGCACGGT-AACAGCGGCAA",
+              r.ReferenceSubsequence(record, Orientation::GENOMIC, true, true));
+    EXPECT_EQ("TTGCCGCTGTT-ACCGTGCTGCGATCTTCTGCCATCGACGGACGTCCCACATTGGTGACTT",
+              r.ReferenceSubsequence(record, Orientation::NATIVE, true, true));
+
+    // {
+    //     std::ostringstream output;
+    //     auto itSS = bamQuery.begin();
+    //     {
+    //         const auto recordSS = *itSS;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::NATIVE, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::NATIVE, true, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::GENOMIC, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::GENOMIC, true, true) << std::endl;
+    //         output << std::endl;
+    //     }
+    //     ++itSS;
+    //     {
+    //         const auto recordSS = *itSS;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::NATIVE, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::NATIVE, true, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::GENOMIC, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::GENOMIC, true, true) << std::endl;
+    //         output << std::endl;
+    //     }
+    //     ++itSS;
+    //     {
+    //         const auto recordSS = *itSS;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::NATIVE, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::NATIVE, true, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::GENOMIC, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::GENOMIC, true, true) << std::endl;
+    //         output << std::endl;
+    //     }
+    //     ++itSS;
+    //     {
+    //         const auto recordSS = *itSS;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::GENOMIC, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::NATIVE, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::GENOMIC, true, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::GENOMIC, true, true) << std::endl;
+    //         output << std::endl;
+    //         output << r.ReferenceSubsequence(recordSS, Orientation::NATIVE, true, true) << std::endl;
+    //         output << recordSS.Sequence(Orientation::NATIVE, true, true) << std::endl;
+    //     }
+    //     std::cerr << output.str();
+    // }
+}
+
+TEST(IndexedFastaReaderTests, ReadLambda)  // basic queries against the lambda reference FASTA
+{
+    IndexedFastaReader r(IndexedFastaReaderTests::lambdaFasta);
+
+    EXPECT_TRUE(r.HasSequence("lambda_NEB3011"));
+    EXPECT_FALSE(r.HasSequence("dog"));
+    EXPECT_EQ(1, r.NumSequences());
+    EXPECT_EQ(48502, r.SequenceLength("lambda_NEB3011"));
+
+    // the region-string and the (name, begin, end) overloads must return the same bases
+    const std::string seq = r.Subsequence("lambda_NEB3011:0-10");
+    EXPECT_EQ("GGGCGGCGAC", seq);
+    const std::string seq2 = r.Subsequence("lambda_NEB3011", 0, 10);
+    EXPECT_EQ("GGGCGGCGAC", seq2);
+
+    // subsequence extending beyond bounds returns clipped (48502 - 48400 == 102 bases)
+    const std::string seq3 = r.Subsequence("lambda_NEB3011", 48400, 48600);
+    EXPECT_EQ(102u, seq3.length());  // unsigned literal: length() is size_t (-Wsign-compare)
+
+    // TODO: bad subsequence requests are not exercised by this test yet
+}
+
+TEST(IndexedFastaReaderTests, Errors)
+{
+    IndexedFastaReader reader(IndexedFastaReaderTests::lambdaFasta);
+
+    //
+    // access without "opening" the reader (these checks are currently commented out)
+    //
+    // EXPECT_THROW(reader.NumSequences(), std::exception);
+    // EXPECT_THROW(reader.HasSequence("lambda_NEB3011"), std::exception);
+    // EXPECT_THROW(reader.SequenceLength("lambda_NEB3011"), std::exception);
+    // EXPECT_THROW(reader.Subsequence("lambda_NEB3011:0-10"), std::exception);
+
+    //
+    // once opened: queries for the sequence name "dog" are expected to throw
+    //
+    EXPECT_THROW(reader.SequenceLength("dog"), std::exception);
+    EXPECT_THROW(reader.Subsequence("dog:0-10"), std::exception);
+}
+
+TEST(IndexedFastaReaderTests, Names)
+{
+    IndexedFastaReader reader(IndexedFastaReaderTests::lambdaFasta);
+    const std::vector<std::string> expectedNames = {"lambda_NEB3011"};
+
+    // request for the full list of sequence names
+    EXPECT_EQ(expectedNames, reader.Names());
+
+    // lookup of a single name by index
+    EXPECT_EQ(expectedNames.front(), reader.Name(0));
+
+    // invalid name access (index out of range)
+    EXPECT_THROW(reader.Name(1), std::exception);
+}
diff --git a/tests/src/test_Intervals.cpp b/tests/src/test_Intervals.cpp

new file mode 100644 (file)

index 0000000..fa10e65
--- /dev/null
+++ b/tests/src/test_Intervals.cpp
@@ -0,0 +1,295 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/GenomicInterval.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(IntervalTest, Constructors)
+{
+    Interval<Position> defaulted;
+    Interval<Position> singlePos(4);
+    Interval<Position> range(5, 8);
+    // default ctor: start == stop == 0
+    EXPECT_EQ(0, defaulted.Start());
+    EXPECT_EQ(0, defaulted.Stop());
+    // one-argument ctor: start 4, stop 5
+    EXPECT_EQ(4, singlePos.Start());
+    EXPECT_EQ(5, singlePos.Stop());
+    // two-argument ctor: start 5, stop 8
+    EXPECT_EQ(5, range.Start());
+    EXPECT_EQ(8, range.Stop());
+
+    // TODO: check out-of-order intervals, etc
+}
+
+TEST(IntervalTest, EqualityTest)
+{
+    Interval<Position> emptyA;
+    Interval<Position> emptyB;
+
+    Interval<Position> point(4);
+    Interval<Position> pointTwin(4, 5);
+
+    Interval<Position> range(5, 8);
+    Interval<Position> rangeTwin(5, 8);
+
+    Interval<Position> farAway(20, 40);
+
+    // every interval equals itself
+    EXPECT_TRUE(emptyA == emptyA);
+    EXPECT_TRUE(point == point);
+    EXPECT_TRUE(range == range);
+    EXPECT_TRUE(farAway == farAway);
+
+    // distinct objects holding the same bounds compare equal
+    EXPECT_TRUE(emptyA == emptyB);
+    EXPECT_TRUE(point == pointTwin);
+    EXPECT_TRUE(range == rangeTwin);
+
+    // differing bounds compare unequal
+    EXPECT_FALSE(emptyA == point);
+    EXPECT_FALSE(emptyA == range);
+    EXPECT_FALSE(emptyA == farAway);
+    EXPECT_FALSE(point == range);
+    EXPECT_FALSE(range == farAway);
+}
+
+TEST(IntervalTest, Copy)  // copy-construction and copy-initialization both equal the source
+{
+    Interval<Position> source(5, 8);
+    Interval<Position> viaCopyCtor(source);
+    Interval<Position> viaCopyInit = source;
+
+    EXPECT_TRUE(source == source);
+    EXPECT_TRUE(source == viaCopyCtor);
+    EXPECT_TRUE(source == viaCopyInit);
+}
+
+TEST(IntervalTest, Modifier)  // Start()/Stop() setters change the copy, not the source
+{
+    Interval<Position> untouched(5, 8);
+    Interval<Position> edited(untouched);
+    edited.Start(2);
+    edited.Stop(10);
+
+    EXPECT_FALSE(untouched == edited);
+    EXPECT_EQ(2, edited.Start());
+    EXPECT_EQ(10, edited.Stop());
+}
+
+TEST(IntervalTest, CoverTest)
+{
+    Interval<Position> iv2to4(2, 4);
+    Interval<Position> iv3to5(3, 5);
+    Interval<Position> iv6to8(6, 8);
+    Interval<Position> iv1to7(1, 7);
+    Interval<Position> iv5to8(5, 8);
+
+    EXPECT_TRUE(iv2to4.Covers(iv2to4));     // an interval covers itself
+    EXPECT_TRUE(iv2to4.CoveredBy(iv2to4));  // ... and is covered by itself
+
+    EXPECT_TRUE(iv3to5.CoveredBy(iv1to7));  // (3,5) lies within (1,7)
+    EXPECT_TRUE(iv1to7.Covers(iv3to5));     // so the outer one covers the inner one
+    EXPECT_FALSE(iv3to5 == iv1to7);         // the two are not equal,
+    EXPECT_FALSE(iv3to5.Covers(iv1to7));    // hence the inner one does not cover the outer one
+
+    EXPECT_FALSE(iv3to5.Covers(iv6to8));  // completely disjoint
+    EXPECT_FALSE(iv6to8.Covers(iv3to5));
+    EXPECT_FALSE(iv3to5.CoveredBy(iv6to8));
+    EXPECT_FALSE(iv6to8.CoveredBy(iv3to5));
+
+    EXPECT_FALSE(iv3to5.Covers(iv5to8));  // touching only: a.stop == b.start
+    EXPECT_FALSE(iv3to5.CoveredBy(iv5to8));
+
+    EXPECT_TRUE(iv5to8.Covers(iv6to8));     // shared stop, start contained: (5,8) covers (6,8)
+    EXPECT_TRUE(iv6to8.CoveredBy(iv5to8));  // ... and (6,8) is covered by (5,8)
+}
+
+TEST(IntervalTest, IntersectTest)
+{
+    Interval<Position> iv2to4(2, 4);
+    Interval<Position> iv3to5(3, 5);
+    Interval<Position> iv6to8(6, 8);
+    Interval<Position> iv1to7(1, 7);
+    Interval<Position> iv5to8(5, 8);
+
+    EXPECT_TRUE(iv2to4.Intersects(iv2to4));  // an interval intersects itself
+
+    EXPECT_TRUE(iv2to4.Intersects(iv3to5));  // overlap is symmetric: a.intersects(b)
+    EXPECT_TRUE(iv3to5.Intersects(iv2to4));  // ... implies b.intersects(a)
+
+    EXPECT_TRUE(iv1to7.Covers(iv2to4));      // when b covers a,
+    EXPECT_TRUE(iv2to4.Intersects(iv1to7));  // a intersects b
+    EXPECT_TRUE(iv1to7.Intersects(iv2to4));  // and b intersects a
+
+    EXPECT_FALSE(iv3to5.Intersects(iv6to8));  // b.start > a.stop (obvious disjoint)
+    // b.start == a.stop: intervals are right open, so merely touching is still disjoint
+    EXPECT_FALSE(iv3to5.Intersects(iv5to8));
+}
+
+TEST(IntervalTest, ValidityTest)  // only start < stop is expected to be valid
+{
+    Interval<Position> defaulted;         // default ctor
+    Interval<Position> emptyAt0(0, 0);    // start == stop, both zero
+    Interval<Position> emptyAt4(4, 4);    // start == stop, nonzero
+    Interval<Position> unitAt0(0, 1);     // start < stop, start is zero
+    Interval<Position> unitAt4(4, 5);     // start < stop, start is nonzero
+    Interval<Position> backwards(5, 4);   // start > stop
+
+    EXPECT_FALSE(defaulted.IsValid());
+    EXPECT_FALSE(emptyAt0.IsValid());
+    EXPECT_FALSE(emptyAt4.IsValid());
+    EXPECT_TRUE(unitAt0.IsValid());
+    EXPECT_TRUE(unitAt4.IsValid());
+    EXPECT_FALSE(backwards.IsValid());
+}
+
+TEST(IntervalTest, LengthTest)  // each expected length below equals stop - start
+{
+    Interval<Position> iv2to4(2, 4);
+    Interval<Position> iv3to5(3, 5);
+    Interval<Position> iv6to8(6, 8);
+    Interval<Position> iv1to7(1, 7);
+    Interval<Position> iv5to8(5, 8);
+
+    EXPECT_EQ(2, iv2to4.Length());
+    EXPECT_EQ(2, iv3to5.Length());
+    EXPECT_EQ(2, iv6to8.Length());
+    EXPECT_EQ(6, iv1to7.Length());
+    EXPECT_EQ(3, iv5to8.Length());
+
+    // TODO: check out-of-order intervals, etc
+}
+
+TEST(GenomicIntervalTest, DefaultConstructor)  // empty name, start and stop both zero
+{
+    GenomicInterval defaulted;
+    EXPECT_EQ("", defaulted.Name());
+    EXPECT_EQ(0, defaulted.Start());
+    EXPECT_EQ(0, defaulted.Stop());
+}
+
+TEST(GenomicIntervalTest, ExplicitConstructor)  // (name, start, stop) are stored as given
+{
+    GenomicInterval explicitGi("foo", 100, 200);
+    EXPECT_EQ("foo", explicitGi.Name());
+    EXPECT_EQ(100, explicitGi.Start());
+    EXPECT_EQ(200, explicitGi.Stop());
+}
+
+TEST(GenomicIntervalTest, RegionStringConstructor)
+{
+    GenomicInterval withRange("foo:100-200");  // "name:start-stop" form
+    EXPECT_EQ("foo", withRange.Name());
+    EXPECT_EQ(100, withRange.Start());
+    EXPECT_EQ(200, withRange.Stop());
+
+    GenomicInterval nameOnly("foo");  // name without coordinates
+    EXPECT_EQ("foo", nameOnly.Name());
+    EXPECT_EQ(0, nameOnly.Start());
+    EXPECT_EQ(1 << 29, nameOnly.Stop());  // htslib's default, "read-to-end" interval stop
+}
+
+TEST(GenomicIntervalTest, Copy)  // copy-construction and copy-initialization equal the source
+{
+    GenomicInterval source("foo", 10, 20);
+    GenomicInterval viaCopyCtor(source);
+    GenomicInterval viaCopyInit = source;
+
+    EXPECT_TRUE(source == source);
+    EXPECT_TRUE(source == viaCopyCtor);
+    EXPECT_TRUE(source == viaCopyInit);
+}
+
+TEST(GenomicIntervalTest, Modifiers)
+{
+    GenomicInterval baseline("foo", 10, 20);
+
+    // change name, start and stop one property at a time
+    GenomicInterval edited(baseline);
+    edited.Name("bar");
+    edited.Start(2);
+    edited.Stop(10);
+
+    // replace only the coordinate interval; the name is not touched
+    GenomicInterval coordsOnly(baseline);
+    coordsOnly.Interval(edited.Interval());
+
+    EXPECT_FALSE(baseline == edited);
+    EXPECT_EQ("bar", edited.Name());
+    EXPECT_EQ(2, edited.Start());
+    EXPECT_EQ(10, edited.Stop());
+
+    EXPECT_EQ(baseline.Name(), coordsOnly.Name());
+    EXPECT_EQ(edited.Interval(), coordsOnly.Interval());
+}
+
+TEST(GenomicIntervalTest, CoverTest)
+{
+    GenomicInterval foo2to4("foo", 2, 4);
+    GenomicInterval foo3to5("foo", 3, 5);
+    GenomicInterval foo6to8("foo", 6, 8);
+    GenomicInterval foo1to7("foo", 1, 7);
+    GenomicInterval foo5to8("foo", 5, 8);
+
+    // same coordinates as foo3to5, but on a different reference name
+    GenomicInterval bar3to5(foo3to5);
+    bar3to5.Name("bar");
+
+    EXPECT_TRUE(foo2to4.Covers(foo2to4));     // an interval covers itself
+    EXPECT_TRUE(foo2to4.CoveredBy(foo2to4));  // ... and is covered by itself
+
+    EXPECT_TRUE(foo3to5.CoveredBy(foo1to7));  // (3,5) lies within (1,7)
+    EXPECT_TRUE(foo1to7.Covers(foo3to5));     // so the outer one covers the inner one
+    EXPECT_FALSE(foo3to5 == foo1to7);         // the two are not equal,
+    EXPECT_FALSE(foo3to5.Covers(foo1to7));    // hence the inner one does not cover the outer one
+
+    // bar3to5 shares start/stop with foo3to5 but not the reference: no cover relation expected
+    EXPECT_FALSE(bar3to5.CoveredBy(foo1to7));
+    EXPECT_FALSE(foo1to7.Covers(bar3to5));
+    EXPECT_FALSE(bar3to5 == foo1to7);
+    EXPECT_FALSE(bar3to5.Covers(foo1to7));
+
+    EXPECT_FALSE(foo3to5.Covers(foo6to8));  // completely disjoint
+    EXPECT_FALSE(foo6to8.Covers(foo3to5));
+    EXPECT_FALSE(foo3to5.CoveredBy(foo6to8));
+    EXPECT_FALSE(foo6to8.CoveredBy(foo3to5));
+
+    EXPECT_FALSE(foo3to5.Covers(foo5to8));  // touching only: a.stop == b.start
+    EXPECT_FALSE(foo3to5.CoveredBy(foo5to8));
+
+    EXPECT_TRUE(foo5to8.Covers(foo6to8));     // shared stop, start contained: (5,8) covers (6,8)
+    EXPECT_TRUE(foo6to8.CoveredBy(foo5to8));  // ... and (6,8) is covered by (5,8)
+}
+
+TEST(GenomicIntervalTest, ValidityTest)  // expected valid only with a name AND start < stop
+{
+    GenomicInterval defaulted;                  // default ctor
+    GenomicInterval namedEmptyAt0("foo", 0, 0);   // named, start == stop (zero)
+    GenomicInterval namedEmptyAt4("foo", 4, 4);   // named, start == stop (nonzero)
+    GenomicInterval namedUnitAt0("foo", 0, 1);    // named, start < stop (start is zero)
+    GenomicInterval namedUnitAt4("foo", 4, 5);    // named, start < stop (start is nonzero)
+    GenomicInterval namedBackwards("foo", 5, 4);  // named, start > stop
+    GenomicInterval anonEmptyAt0("", 0, 0);       // empty name, start == stop (zero)
+    GenomicInterval anonEmptyAt4("", 4, 4);       // empty name, start == stop (nonzero)
+    GenomicInterval anonUnitAt0("", 0, 1);        // empty name, start < stop (start is zero)
+    GenomicInterval anonUnitAt4("", 4, 5);        // empty name, start < stop (start is nonzero)
+    GenomicInterval anonBackwards("", 5, 4);      // empty name, start > stop
+
+    EXPECT_FALSE(defaulted.IsValid());
+    EXPECT_FALSE(namedEmptyAt0.IsValid());
+    EXPECT_FALSE(namedEmptyAt4.IsValid());
+    EXPECT_TRUE(namedUnitAt0.IsValid());
+    EXPECT_TRUE(namedUnitAt4.IsValid());
+    EXPECT_FALSE(namedBackwards.IsValid());
+    EXPECT_FALSE(anonEmptyAt0.IsValid());
+    EXPECT_FALSE(anonEmptyAt4.IsValid());
+    EXPECT_FALSE(anonUnitAt0.IsValid());
+    EXPECT_FALSE(anonUnitAt4.IsValid());
+    EXPECT_FALSE(anonBackwards.IsValid());
+}
diff --git a/tests/src/test_LongCigar.cpp b/tests/src/test_LongCigar.cpp

new file mode 100644 (file)

index 0000000..13a9a96
--- /dev/null
+++ b/tests/src/test_LongCigar.cpp
@@ -0,0 +1,125 @@
+// Author: Derek Barnett
+
+#include <iostream>
+#include <string>
+#include <tuple>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamReader.h>
+#include <pbbam/BamWriter.h>
+
+#include "../../src/MemoryUtils.h"
+#include "../../src/StringUtils.h"
+
+using BamReader = PacBio::BAM::BamReader;
+using BamRecord = PacBio::BAM::BamRecord;
+using BamWriter = PacBio::BAM::BamWriter;
+using Cigar = PacBio::BAM::Cigar;
+using CigarOp = PacBio::BAM::CigarOperation;
+using PacBio::BAM::CigarOperationType;
+using Tag = PacBio::BAM::Tag;
+
+// clang-format off
+
+namespace LongCigarTests {
+
+static bool DoesHtslibSupportLongCigar()
+{
+    // Returns true when the version string reported by hts_version() is 1.7 or newer.
+    const std::string fullVersion = hts_version();
+
+    // drop any "-<blah>" suffix carried by non-release versions
+    const auto releaseTokens = PacBio::BAM::Split(fullVersion, '-');
+    if (releaseTokens.empty())
+        throw std::runtime_error{"invalid htslib version format: " + fullVersion};
+
+    // split the release part on '.'; at least major and minor fields are required
+    const auto fields = PacBio::BAM::Split(releaseTokens[0], '.');
+    if (fields.size() < 2)
+        throw std::runtime_error{"invalid htslib version format: " + fullVersion};
+
+    // ordered (major, minor) comparison against v1.7
+    const int majorNum = std::stoi(fields[0]);
+    const int minorNum = std::stoi(fields[1]);
+    if (majorNum != 1)
+        return majorNum > 1;
+    return minorNum >= 7;
+}
+
+static const bool has_native_long_cigar_support = DoesHtslibSupportLongCigar();  // computed once
+
+// BAM record in this file has its CIGAR data in the new "CG" tag
+static const std::string LongCigarBam = PacBio::BAM::PbbamTestsConfig::Data_Dir + "/long-cigar-1.7.bam";
+
+static const std::string LongCigarOut =  // file written, then re-read, by the WriteLongCigar test
+    PacBio::BAM::PbbamTestsConfig::GeneratedData_Dir + "/long-cigar-generated.bam";
+
+static const size_t numOps = 72091;  // CigarData().size() the tests expect for the record
+
+static BamRecord ReadLongCigarRecord(const std::string& fn)  // first record read from `fn`
+{
+    BamRecord firstRecord;
+    BamReader input{fn};
+    // a failed read is reported through gtest; the record object is returned either way
+    EXPECT_TRUE(input.GetNext(firstRecord));
+    return firstRecord;
+}
+
+}  // namespace LongCigarTests
+
+TEST(LongCigarTest, ReadAndFetchLongCigar)
+{
+    const auto record = LongCigarTests::ReadLongCigarRecord(LongCigarTests::LongCigarBam);
+
+    EXPECT_EQ(LongCigarTests::numOps, record.CigarData().size());
+    if (!LongCigarTests::has_native_long_cigar_support)
+        EXPECT_TRUE(record.Impl().HasTag("CG"));  // no native support: "CG" tag expected
+    else
+        EXPECT_FALSE(record.Impl().HasTag("CG"));  // native support: no "CG" tag expected
+}
+
+TEST(LongCigarTest, EditLongCigar)
+{
+    auto record = LongCigarTests::ReadLongCigarRecord(LongCigarTests::LongCigarBam);
+    record.Impl().CigarData(record.CigarData());  // write the fetched CIGAR back unchanged
+
+    EXPECT_EQ(LongCigarTests::numOps, record.CigarData().size());
+    if (!LongCigarTests::has_native_long_cigar_support)
+        EXPECT_TRUE(record.Impl().HasTag("CG"));  // no native support: "CG" tag expected
+    else
+        EXPECT_FALSE(record.Impl().HasTag("CG"));  // native support: no "CG" tag expected
+}
+
+TEST(LongCigarTest, WriteLongCigar)
+{
+    SCOPED_TRACE("WriteLongCigar");
+
+    {   // step 1: re-set the CIGAR, check it, then write the record out
+        auto edited = LongCigarTests::ReadLongCigarRecord(LongCigarTests::LongCigarBam);
+        edited.Impl().CigarData(edited.CigarData());
+
+        EXPECT_EQ(LongCigarTests::numOps, edited.CigarData().size());
+        if (!LongCigarTests::has_native_long_cigar_support)
+            EXPECT_TRUE(edited.Impl().HasTag("CG"));
+        else
+            EXPECT_FALSE(edited.Impl().HasTag("CG"));
+
+        BamWriter output{LongCigarTests::LongCigarOut, edited.header_};
+        output.Write(edited);
+    }
+
+    {   // step 2: read the written file back and repeat the same checks
+        auto reloaded = LongCigarTests::ReadLongCigarRecord(LongCigarTests::LongCigarOut);
+
+        EXPECT_EQ(LongCigarTests::numOps, reloaded.CigarData().size());
+        if (!LongCigarTests::has_native_long_cigar_support)
+            EXPECT_TRUE(reloaded.Impl().HasTag("CG"));
+        else
+            EXPECT_FALSE(reloaded.Impl().HasTag("CG"));
+    }
+}
+
+// clang-format on
diff --git a/tests/src/test_PacBioIndex.cpp b/tests/src/test_PacBioIndex.cpp

new file mode 100644 (file)

index 0000000..92c126a
--- /dev/null
+++ b/tests/src/test_PacBioIndex.cpp
@@ -0,0 +1,426 @@
+// Author: Derek Barnett
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamFile.h>
+#include <pbbam/BamReader.h>
+#include <pbbam/BamWriter.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/PbiBuilder.h>
+#include <pbbam/PbiRawData.h>
+#include <pbbam/Unused.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace PacBioIndexTests {
+
+const std::string test2BamFn = PbbamTestsConfig::Data_Dir + "/aligned2.bam";
+const std::string phi29BamFn = PbbamTestsConfig::Data_Dir + "/phi29.bam";
+
+static PbiRawData Test2Bam_CoreIndexData()  // hand-written expected PBI values (10 reads)
+{
+    PbiRawData rawData;
+    rawData.Version(PbiFile::Version_3_0_1);
+    rawData.FileSections(PbiFile::BASIC | PbiFile::MAPPED | PbiFile::REFERENCE);
+    rawData.NumReads(10);
+
+    PbiRawBasicData& basicData = rawData.BasicData();  // one value per read in each list below
+    basicData.rgId_ = {-1197849594, -1197849594, -1197849594, -1197849594, -1197849594,
+                       -1197849594, -1197849594, -1197849594, -1197849594, -1197849594};
+    basicData.qStart_ = {48, 387, 0, 9936, 10232, 7468, 5557, 7285, 426, 7064};
+    basicData.qEnd_ = {1132, 1134, 344, 10187, 10394, 8906, 7235, 8657, 1045, 7421};
+    basicData.holeNumber_ = {49050, 32328, 32328, 6469, 6469, 30983, 13473, 13473, 19915, 30983};
+    basicData.readQual_ = {0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6};
+    basicData.ctxtFlag_ = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    basicData.fileOffset_ = {33816576, 33825163, 33831333, 33834264, 33836542,
+                             33838065, 33849818, 33863499, 33874621, 1392836608};
+
+    PbiRawMappedData& mappedData = rawData.MappedData();  // again one value per read
+    mappedData.tId_ = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    mappedData.tStart_ = {0, 302, 675, 2170, 2203, 3572, 4506, 4507, 4592, 4669};
+    mappedData.tEnd_ = {471, 1019, 1026, 2397, 2326, 5015, 6125, 5850, 5203, 5011};
+    mappedData.aStart_ = {653, 395, 1, 9960, 10271, 7468, 5574, 7285, 441, 7075};
+    mappedData.aEnd_ = {1129, 1134, 344, 10185, 10394, 8906, 7235, 8647, 1040, 7418};
+    mappedData.revStrand_ = {0, 1, 0, 1, 0, 1, 1, 0, 1, 0};
+    mappedData.nM_ = {460, 704, 339, 216, 118, 1394, 1581, 1313, 583, 333};
+    mappedData.nMM_ = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    mappedData.mapQV_ = {254, 254, 254, 254, 254, 254, 254, 254, 254, 254};
+
+    PbiRawReferenceData& referenceData = rawData.ReferenceData();  // 4294967295 == 0xFFFFFFFF
+    referenceData.entries_ = {PbiReferenceEntry{0, 0, 10},
+                              PbiReferenceEntry{4294967295, 4294967295, 4294967295}};
+
+    return rawData;
+}
+
+// NOTE: We have 2 different sets of offsets because the copied, new file differs in size from the existing one.
+//
+//       Unsure which combination of write parameters was used on the original. Things like thread count,
+//       compression level, etc. can affect compression ratio, BGZF block sizes, etc. even though the BAM record
+//       content itself is equal. So we'll just track these index values separately, for now at least.
+//
+static PbiRawData Test2Bam_ExistingIndex()  // core data with the existing file's offsets
+{
+    PbiRawData existing = Test2Bam_CoreIndexData();
+    existing.BasicData().fileOffset_ = {33816576, 33825163, 33831333, 33834264, 33836542,
+                                        33838065, 33849818, 33863499, 33874621, 1392836608};
+    return existing;
+}
+
+static PbiRawData Test2Bam_NewIndex()  // core data with the offsets of a freshly written copy
+{
+    PbiRawData rewritten = Test2Bam_CoreIndexData();
+    rewritten.BasicData().fileOffset_ = {33816576,  236126208, 391315456,  469106688,  537067520,
+                                         587792384, 867303424, 1182793728, 1449787392, 1582628864};
+    return rewritten;
+}
+
+static void ExpectRawIndicesEqual(const PbiRawData& expected, const PbiRawData& actual)
+{
+    // header fields
+    EXPECT_EQ(expected.Version(), actual.Version());
+    EXPECT_EQ(expected.FileSections(), actual.FileSections());
+    EXPECT_EQ(expected.NumReads(), actual.NumReads());
+
+    // basic (per-read) section: always compared
+    const PbiRawBasicData& wantBasic = expected.BasicData();
+    const PbiRawBasicData& gotBasic = actual.BasicData();
+    EXPECT_EQ(wantBasic.rgId_, gotBasic.rgId_);
+    EXPECT_EQ(wantBasic.qStart_, gotBasic.qStart_);
+    EXPECT_EQ(wantBasic.qEnd_, gotBasic.qEnd_);
+    EXPECT_EQ(wantBasic.holeNumber_, gotBasic.holeNumber_);
+    EXPECT_EQ(wantBasic.readQual_, gotBasic.readQual_);
+    EXPECT_EQ(wantBasic.ctxtFlag_, gotBasic.ctxtFlag_);
+    EXPECT_EQ(wantBasic.fileOffset_, gotBasic.fileOffset_);
+
+    // mapped section: presence must match; contents compared only when both sides have it
+    EXPECT_EQ(expected.HasMappedData(), actual.HasMappedData());
+    if (expected.HasMappedData() && actual.HasMappedData()) {
+        const PbiRawMappedData& wantMapped = expected.MappedData();
+        const PbiRawMappedData& gotMapped = actual.MappedData();
+        EXPECT_EQ(wantMapped.tId_, gotMapped.tId_);
+        EXPECT_EQ(wantMapped.tStart_, gotMapped.tStart_);
+        EXPECT_EQ(wantMapped.tEnd_, gotMapped.tEnd_);
+        EXPECT_EQ(wantMapped.aStart_, gotMapped.aStart_);
+        EXPECT_EQ(wantMapped.aEnd_, gotMapped.aEnd_);
+        EXPECT_EQ(wantMapped.revStrand_, gotMapped.revStrand_);
+        EXPECT_EQ(wantMapped.nM_, gotMapped.nM_);
+        EXPECT_EQ(wantMapped.nMM_, gotMapped.nMM_);
+        EXPECT_EQ(wantMapped.mapQV_, gotMapped.mapQV_);
+    }
+
+    // reference section: same presence-then-contents pattern
+    EXPECT_EQ(expected.HasReferenceData(), actual.HasReferenceData());
+    if (expected.HasReferenceData() && actual.HasReferenceData()) {
+        const PbiRawReferenceData& wantRefs = expected.ReferenceData();
+        const PbiRawReferenceData& gotRefs = actual.ReferenceData();
+        EXPECT_EQ(wantRefs.entries_, gotRefs.entries_);
+    }
+
+    // barcode section: same presence-then-contents pattern
+    EXPECT_EQ(expected.HasBarcodeData(), actual.HasBarcodeData());
+    if (expected.HasBarcodeData() && actual.HasBarcodeData()) {
+        const PbiRawBarcodeData& wantBarcodes = expected.BarcodeData();
+        const PbiRawBarcodeData& gotBarcodes = actual.BarcodeData();
+        EXPECT_EQ(wantBarcodes.bcForward_, gotBarcodes.bcForward_);
+        EXPECT_EQ(wantBarcodes.bcReverse_, gotBarcodes.bcReverse_);
+        EXPECT_EQ(wantBarcodes.bcQual_, gotBarcodes.bcQual_);
+    }
+}
+
+}  // namespace PacBioIndexTests
+
+TEST(PacBioIndexTest, CreateFromExistingBam)
+{
+    // do this in temp directory, so we can ensure write access
+    const std::string tempDir = PbbamTestsConfig::GeneratedData_Dir + "/";
+    const std::string tempBamFn = tempDir + "aligned_copy.bam";
+    const std::string tempPbiFn = tempBamFn + ".pbi";
+
+    // copy the input BAM into place; if the copy command fails, stop this test right here
+    // instead of running the remaining steps against a copy that may not exist
+    const std::string cmd = "cp " + PacBioIndexTests::test2BamFn + " " + tempBamFn;
+    const auto cmdResult = system(cmd.c_str());
+    ASSERT_EQ(0, cmdResult) << "command failed: " << cmd;
+
+    BamFile bamFile(tempBamFn);
+    PbiFile::CreateFrom(bamFile);  // build the index for the copied BAM
+    EXPECT_EQ(tempPbiFn, bamFile.PacBioIndexFilename());
+
+    PbiRawData index(bamFile.PacBioIndexFilename());
+    EXPECT_EQ(PbiFile::Version_3_0_1, index.Version());
+    EXPECT_EQ(10, index.NumReads());
+    EXPECT_TRUE(index.HasMappedData());
+
+    const PbiRawData expectedIndex = PacBioIndexTests::Test2Bam_ExistingIndex();
+    PacBioIndexTests::ExpectRawIndicesEqual(expectedIndex, index);
+
+    // clean up temp file(s)
+    remove(tempBamFn.c_str());
+    remove(tempPbiFn.c_str());
+}
+
+::testing::AssertionResult CanRead(BamReader& reader, BamRecord& record, int i)
+{
+    // success/failure mirrors GetNext(); the index is attached to the message in both cases
+    ::testing::AssertionResult outcome = reader.GetNext(record) ? ::testing::AssertionSuccess()
+                                                                : ::testing::AssertionFailure();
+    return outcome << "i: " << i;
+}
+
+TEST(PacBioIndexTest, CreateOnTheFly)
+{
+    // do this in temp directory, so we can ensure write access
+    const std::string tempDir = PbbamTestsConfig::GeneratedData_Dir + "/";
+    const std::string tempBamFn = tempDir + "temp.bam";
+    const std::string tempPbiFn = tempBamFn + ".pbi";
+
+    // NOTE: new file differs in size from existing (different write parameters may yield different file sizes, even though content is same)
+    const std::vector<int64_t> expectedNewOffsets = {33816576,   236126208, 391315456, 469106688,
+                                                     537067520,  587792384, 867303424, 1182793728,
+                                                     1449787392, 1582628864};
+    std::vector<int64_t> observedOffsets;
+
+    // create PBI on the fly from input BAM while we write to new file
+    {  // scoped, so writer & builder go out of scope before the new files are read back below
+        BamFile bamFile(PacBioIndexTests::test2BamFn);
+        BamHeader header = bamFile.Header();
+
+        BamWriter writer(tempBamFn, header);  // default compression, default thread count
+        PbiBuilder builder(tempPbiFn, header.Sequences().size());
+
+        int64_t vOffset = 0;
+        EntireFileQuery entireFile(bamFile);
+        for (const BamRecord& record : entireFile) {
+            writer.Write(record, &vOffset);  // vOffset receives this record's offset
+            builder.AddRecord(record, vOffset);
+            observedOffsets.push_back(vOffset);
+        }
+    }
+
+    EXPECT_EQ(expectedNewOffsets, observedOffsets);
+
+    // sanity check on original file
+    {
+        const std::vector<int64_t> originalFileOffsets = {33816576, 33825163,  33831333, 33834264,
+                                                          33836542, 33838065,  33849818, 33863499,
+                                                          33874621, 1392836608};
+        BamRecord r;
+        BamReader reader(PacBioIndexTests::test2BamFn);
+        for (size_t i = 0; i < originalFileOffsets.size(); ++i) {
+            reader.VirtualSeek(originalFileOffsets.at(i));
+            EXPECT_TRUE(CanRead(reader, r, static_cast<int>(i)));  // CanRead takes an int index
+        }
+    }
+
+    // attempt to seek in our new file using both expected & observed offsets
+    {
+        BamRecord r;
+        BamReader reader(tempBamFn);
+        for (size_t i = 0; i < expectedNewOffsets.size(); ++i) {
+            reader.VirtualSeek(expectedNewOffsets.at(i));
+            EXPECT_TRUE(CanRead(reader, r, static_cast<int>(i)));
+        }
+        for (size_t i = 0; i < observedOffsets.size(); ++i) {
+            reader.VirtualSeek(observedOffsets.at(i));
+            EXPECT_TRUE(CanRead(reader, r, static_cast<int>(i)));
+        }
+    }
+
+    // compare data in new PBI file, to expected data
+    const PbiRawData expectedIndex = PacBioIndexTests::Test2Bam_NewIndex();
+    const PbiRawData fromBuilt = PbiRawData(tempPbiFn);
+    PacBioIndexTests::ExpectRawIndicesEqual(expectedIndex, fromBuilt);
+
+    // straight diff of newly-generated PBI file to existing PBI
+    // TODO: Come back to this once pbindexdump is in place.
+    //       We can't exactly do this since file offsets may differ between 2 BAMs of differing compression levels.
+    //       Should add some sort of BAM checksum based on contents, not just size, for this reason.
+    //    const string pbiDiffCmd = string("diff -q ") + PacBioIndexTests::test2BamFn + ".pbi " + tempPbiFn;
+    //    EXPECT_EQ(0, system(pbiDiffCmd.c_str()));
+
+    // clean up temp file(s)
+    remove(tempBamFn.c_str());
+    remove(tempPbiFn.c_str());
+}
+
+TEST(PacBioIndexTest, RawLoadFromPbiFile)  // load the BAM's PBI and compare with expected values
+{
+    const BamFile inputBam(PacBioIndexTests::test2BamFn);
+    const std::string pbiPath = inputBam.PacBioIndexFilename();
+    const PbiRawData loaded(pbiPath);
+
+    const PbiRawData wanted = PacBioIndexTests::Test2Bam_ExistingIndex();
+    PacBioIndexTests::ExpectRawIndicesEqual(wanted, loaded);
+}
+
+TEST(PacBioIndexTest, BasicAndBarodeSectionsOnly)  // (sic) name kept so test filters still match
+{
+    // do this in temp directory, so we can ensure write access
+    const std::string tempDir = PbbamTestsConfig::GeneratedData_Dir + "/";
+    const std::string tempBamFn = tempDir + "phi29.bam";
+    const std::string tempPbiFn = tempBamFn + ".pbi";
+
+    // copy the input BAM into the temp directory; if the copy command fails, stop this test
+    // right here instead of running the remaining steps against a copy that may not exist
+    const std::string cmd = "cp " + PacBioIndexTests::phi29BamFn + " " + tempDir;
+    const auto cmdResult = system(cmd.c_str());
+    ASSERT_EQ(0, cmdResult) << "command failed: " << cmd;
+
+    BamFile bamFile(tempBamFn);
+    PbiFile::CreateFrom(bamFile);  // build the index for the copied BAM
+    EXPECT_EQ(tempPbiFn, bamFile.PacBioIndexFilename());
+
+    PbiRawData index(bamFile.PacBioIndexFilename());
+    EXPECT_EQ(PbiFile::Version_3_0_1, index.Version());
+    EXPECT_EQ(120, index.NumReads());
+    EXPECT_FALSE(index.HasMappedData());
+    EXPECT_TRUE(index.HasBarcodeData());
+
+    const std::vector<int16_t> expectedBcForward{
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
+    const std::vector<int16_t> expectedBcReverse{
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
+    const std::vector<int8_t> expectedBcQuality{
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+    const PbiRawBarcodeData& barcodeData = index.BarcodeData();
+    EXPECT_EQ(expectedBcForward, barcodeData.bcForward_);
+    EXPECT_EQ(expectedBcReverse, barcodeData.bcReverse_);
+    EXPECT_EQ(expectedBcQuality, barcodeData.bcQual_);
+
+    // clean up temp file(s)
+    remove(tempBamFn.c_str());
+    remove(tempPbiFn.c_str());
+}
+
+TEST(PacBioIndexTest, ReferenceDataNotLoadedOnUnsortedBam)
+{   // NOTE(review): name says "not loaded" but the check expects reference data - confirm.
+    const BamFile bam{PacBioIndexTests::test2BamFn};
+    const PbiRawData rawIndex{bam.PacBioIndexFilename()};
+    EXPECT_TRUE(rawIndex.HasReferenceData());
+}
+
+TEST(PacBioIndexTest, LookupLoadFromFileOk)
+{
+    const uint32_t expectedNumReads = 10;
+    const std::vector<int64_t> expectedOffsets{33816576, 33825163, 33831333, 33834264, 33836542,
+                                               33838065, 33849818, 33863499, 33874621, 1392836608};
+
+    EXPECT_NO_THROW({
+        BamFile bamFile(PacBioIndexTests::test2BamFn);
+        PbiRawData index(bamFile.PacBioIndexFilename());
+        EXPECT_EQ(expectedNumReads, index.NumReads());
+        EXPECT_EQ(expectedOffsets, index.BasicData().fileOffset_);
+    });
+}
+
+TEST(PacBioIndexTest, ThrowOnNonExistentPbiFile)
+{   // presumably no file with this name exists where the tests run
+    EXPECT_THROW(PbiRawData{"does_not_exist.pbi"}, std::exception);
+}
+
+TEST(PacBioIndexTest, ThrowOnNonPbiFile)
+{
+    // completely wrong format
+    EXPECT_THROW(
+        {
+            const auto fastaFn = PbbamTestsConfig::Data_Dir + "/lambdaNEB.fa";
+            PbiRawData idx{fastaFn};
+        },
+        std::runtime_error);
+
+    // BGZF file, but not PBI
+    EXPECT_THROW(
+        {
+            const auto bamFn = PbbamTestsConfig::Data_Dir + "/ex2.bam";
+            PbiRawData idx{bamFn};
+        },
+        std::runtime_error);
+}
+
+TEST(PacBioIndexTest, AggregatePBI)
+{
+
+    DataSet ds;
+    ExternalResources& resources = ds.ExternalResources();
+    resources.Add(BamFile{PbbamTestsConfig::Data_Dir +
+                          "/aligned.bam"});  // 4 reads, BASIC | MAPPED | REFERENCE
+    resources.Add(BamFile{PbbamTestsConfig::Data_Dir +
+                          "/polymerase/production.subreads.bam"});  // 8 reads, BASIC | BARCODE
+    resources.Add(BamFile{PbbamTestsConfig::Data_Dir +
+                          "/polymerase/production_hq.hqregion.bam"});  // 1 read,  BASIC only
+
+    const PbiRawData index{ds};
+    const PbiRawBasicData& mergedBasicData = index.BasicData();
+    const PbiRawBarcodeData& mergedBarcodeData = index.BarcodeData();
+    const PbiRawMappedData& mergedMappedData = index.MappedData();
+
+    const uint32_t expectedTotal = 13;  // 4 + 8 + 1
+
+    // 'meta' info
+    EXPECT_EQ(expectedTotal, index.NumReads());
+    EXPECT_EQ(PbiFile::BASIC | PbiFile::MAPPED | PbiFile::BARCODE, index.FileSections());
+    EXPECT_TRUE(index.HasBarcodeData());
+    EXPECT_TRUE(index.HasMappedData());
+    EXPECT_FALSE(index.HasReferenceData());
+
+    // file numbers
+    EXPECT_EQ(0, mergedBasicData.fileNumber_.at(0));
+    EXPECT_EQ(0, mergedBasicData.fileNumber_.at(1));
+    EXPECT_EQ(0, mergedBasicData.fileNumber_.at(2));
+    EXPECT_EQ(0, mergedBasicData.fileNumber_.at(3));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(4));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(5));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(6));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(7));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(8));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(9));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(10));
+    EXPECT_EQ(1, mergedBasicData.fileNumber_.at(11));
+    EXPECT_EQ(2, mergedBasicData.fileNumber_.at(12));
+
+    // basic data
+    EXPECT_EQ(0, mergedBasicData.qStart_.at(0));  // file 1
+    EXPECT_EQ(0, mergedBasicData.qStart_.at(1));
+    EXPECT_EQ(2659, mergedBasicData.qStart_.at(4));  // file 2
+    EXPECT_EQ(3116, mergedBasicData.qStart_.at(5));
+    EXPECT_EQ(2659, mergedBasicData.qStart_.at(12));  // file 3
+
+    EXPECT_EQ(21102592, mergedBasicData.fileOffset_.at(0));  // file 1
+    EXPECT_EQ(21102883, mergedBasicData.fileOffset_.at(1));
+    EXPECT_EQ(19857408, mergedBasicData.fileOffset_.at(4));  // file 2
+    EXPECT_EQ(19860696, mergedBasicData.fileOffset_.at(5));
+    EXPECT_EQ(20054016, mergedBasicData.fileOffset_.at(12));  // file 3
+
+    // mapped data
+    EXPECT_EQ(60, mergedMappedData.mapQV_.at(0));  // file 1
+    EXPECT_EQ(60, mergedMappedData.mapQV_.at(1));
+    EXPECT_EQ(255, mergedMappedData.mapQV_.at(4));  // file 2
+    EXPECT_EQ(255, mergedMappedData.mapQV_.at(5));
+    EXPECT_EQ(255, mergedMappedData.mapQV_.at(12));  // file 3
+
+    // barcode data
+    EXPECT_EQ(-1, mergedBarcodeData.bcForward_.at(0));  // file 1
+    EXPECT_EQ(-1, mergedBarcodeData.bcForward_.at(1));
+    EXPECT_EQ(92, mergedBarcodeData.bcForward_.at(4));  // file 2
+    EXPECT_EQ(92, mergedBarcodeData.bcForward_.at(5));
+    EXPECT_EQ(-1, mergedBarcodeData.bcForward_.at(12));  // file 3
+}
diff --git a/tests/src/test_PbiFilter.cpp b/tests/src/test_PbiFilter.cpp

new file mode 100644 (file)

index 0000000..6912ad7
--- /dev/null
+++ b/tests/src/test_PbiFilter.cpp
@@ -0,0 +1,1323 @@
+// Author: Derek Barnett
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/PbiFilter.h>
+
+// clang-format off
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace PbiFilterTests {
+
+// helper structs & methods
+
+static
+PbiRawData test2Bam_RawIndex()
+{
+    PbiRawData index;
+    index.NumReads(4);
+
+    PbiRawBasicData& subreadData = index.BasicData();
+    subreadData.rgId_       = { -1197849594, -1197849594, -1197849594, -1197849594 };
+    subreadData.qStart_     = { 2114, 2579, 4101, 5615 };
+    subreadData.qEnd_       = { 2531, 4055, 5571, 6237 };
+    subreadData.holeNumber_ = { 14743, 14743, 14743, 14743 };
+    subreadData.readQual_   = { 0.901, 0.601, 0.901, 0.601 };
+    subreadData.ctxtFlag_   = { 0, 1, 2, 3 };
+    subreadData.fileOffset_ = { 35651584, 35655125, 35667128, 35679170 };
+
+    PbiRawMappedData& mappedData = index.MappedData();
+    mappedData.tId_       = { 0, 0, 0, 0 };
+    mappedData.tStart_    = { 9507, 8453, 8455, 9291 };
+    mappedData.tEnd_      = { 9903, 9902, 9893, 9900 };
+    mappedData.aStart_    = { 2130, 2581, 4102, 5619 };
+    mappedData.aEnd_      = { 2531, 4055, 5560, 6237 };
+    mappedData.revStrand_ = { 0, 1, 0, 1 };
+    mappedData.mapQV_     = { 254, 254, 254, 254 };
+    mappedData.nM_        = { 384, 1411, 1393, 598 };
+    mappedData.nMM_       = { 0, 0, 0, 0 };
+
+    PbiRawBarcodeData& barcodeData = index.BarcodeData();
+    barcodeData.bcForward_ = { 0, 17, 256, 17 };
+    barcodeData.bcReverse_ = { 1, 18, 257, 18 };
+    barcodeData.bcQual_    = { 42, 80, 42, 110 };
+
+    PbiRawReferenceData& referenceData = index.ReferenceData();
+    referenceData.entries_.emplace_back( 0, 0, 3 );
+    referenceData.entries_.emplace_back( 1 );
+    referenceData.entries_.emplace_back( PbiReferenceEntry::UNMAPPED_ID );
+
+    return index;
+}
+
+static const PbiRawData shared_index{ test2Bam_RawIndex() };  // fixture read by checkFilterRows()
+
+static
+void checkFilterRows(const PbiFilter& filter, const std::vector<size_t>& expectedRows)
+{   // Expects each listed shared_index row to be accepted; an empty list therefore asserts nothing.
+    for (const size_t row : expectedRows)
+        EXPECT_TRUE(filter.Accepts(shared_index, row));
+}
+
+static
+void checkFilterInternals(const PbiFilter& filter,
+                          const PbiFilter::CompositionType expectedType,
+                          const size_t expectedNumChildren,
+                          const std::vector<size_t> expectedRows)
+{
+    EXPECT_EQ(expectedType,        filter.Type());
+    EXPECT_EQ(expectedNumChildren, filter.NumChildren());
+    checkFilterRows(filter, expectedRows);
+}
+
+struct SimpleFilter
+{   // Minimal custom filter: Accepts() returns true for every (index, row) pair.
+    bool Accepts(const PbiRawData&, const size_t) const
+    { return true; }
+};
+
+struct NoncompliantFilter { };  // has no Accepts(); used only by the commented-out "should not compile" lines
+
+struct SortUniqueTestFilter
+{
+    bool Accepts(const PbiRawData& /* idx */, const size_t row) const
+    {
+//        ()idx;
+        switch(row) {
+            case 0: // fall through
+            case 1: // .
+            case 2: // .
+            case 3: // .
+            case 4: // .
+            case 7: // .
+            case 8: return true;
+            default:
+                return false;
+        }
+    }
+};
+
+struct SortUniqueTestFilter2
+{
+    bool Accepts(const PbiRawData& /* idx */, const size_t row) const
+    {
+//        ()idx;
+        switch(row) {
+            case 3: // fall through
+            case 7: // .
+            case 5: return true;
+            default:
+                return false;
+        }
+    }
+};
+
+static inline
+PbiFilter emptyFilter()
+{ PbiFilter blank; return blank; }  // default-constructed filter, returned by value
+
+static inline
+PbiFilter simpleFilter()
+{ PbiFilter wrapped{ SimpleFilter{ } }; return wrapped; }  // filter built from one SimpleFilter
+
+} // namespace PbiFilterTests
+
+TEST(PbiFilterTest, DefaultCtorOk)
+{   // default-constructed filter: INTERSECT type, zero children, fixture rows 0-3 accepted
+    const PbiFilter filter{ };
+    PbiFilterTests::checkFilterInternals(filter, PbiFilter::INTERSECT, 0, {0,1,2,3});
+}
+
+TEST(PbiFilterTest, CompositionOk)
+{   // adding one default-constructed child: parent stays INTERSECT and reports one child
+    PbiFilter parent;
+    parent.Add(PbiFilter{ });
+    PbiFilterTests::checkFilterInternals(parent, PbiFilter::INTERSECT, 1, {0,1,2,3});
+}
+
+TEST(PbiFilterTest, CustomFilterOk)
+{
+    { // ctor
+        auto filter = PbiFilter{ PbiFilterTests::SimpleFilter{ } };
+        PbiFilterTests::checkFilterInternals(filter, PbiFilter::INTERSECT, 1, std::vector<size_t>{});
+    }
+    { // Add
+        auto filter = PbiFilter{ };
+        filter.Add(PbiFilterTests::SimpleFilter{ });
+        PbiFilterTests::checkFilterInternals(filter, PbiFilter::INTERSECT, 1, std::vector<size_t>{});
+    }
+
+//    PbiFilter shouldNotCompile = PbiFilter{ PbiFilterTests::NoncompliantFilter{ } };                       // <-- when uncommented, should not compile
+//    PbiFilter shouldNotCompileEither; shouldNotCompileEither.Add(PbiFilterTests::NoncompliantFilter{ });   // <-- when uncommented, should not compile
+}
+
+TEST(PbiFilterTest, CopyOk)
+{
+    { // empty
+        const auto original = PbiFilter{ };
+
+        PbiFilter copyCtor(original);
+        PbiFilter copyAssign;
+        copyAssign = original;
+
+        PbiFilterTests::checkFilterInternals(original,   PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});
+        PbiFilterTests::checkFilterInternals(copyCtor,   PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});
+        PbiFilterTests::checkFilterInternals(copyAssign, PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});
+    }
+    { // with children
+        const auto original = PbiFilter{ PbiFilterTests::SimpleFilter{ } };
+
+        PbiFilter copyCtor(original);
+        PbiFilter copyAssign;
+        copyAssign = original;
+
+        PbiFilterTests::checkFilterInternals(original,   PbiFilter::INTERSECT, 1, std::vector<size_t>{});
+        PbiFilterTests::checkFilterInternals(copyCtor,   PbiFilter::INTERSECT, 1, std::vector<size_t>{});
+        PbiFilterTests::checkFilterInternals(copyAssign, PbiFilter::INTERSECT, 1, std::vector<size_t>{});
+    }
+}
+
+TEST(PbiFilterTest, MoveOk)
+{
+    { // empty
+        const auto original = PbiFilterTests::emptyFilter();
+
+        PbiFilter moveCtor(PbiFilterTests::emptyFilter());
+        PbiFilter moveAssign;
+        moveAssign = PbiFilterTests::emptyFilter();
+
+        PbiFilterTests::checkFilterInternals(original,   PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});
+        PbiFilterTests::checkFilterInternals(moveCtor,   PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});
+        PbiFilterTests::checkFilterInternals(moveAssign, PbiFilter::INTERSECT, 0, std::vector<size_t>{0,1,2,3});
+    }
+    { // with children
+        const auto original = PbiFilterTests::simpleFilter();
+
+        PbiFilter moveCtor(PbiFilterTests::simpleFilter());
+        PbiFilter moveAssign;
+        moveAssign = PbiFilterTests::simpleFilter();
+
+        PbiFilterTests::checkFilterInternals(original,   PbiFilter::INTERSECT, 1, std::vector<size_t>{0,1,2,3});
+        PbiFilterTests::checkFilterInternals(moveCtor,   PbiFilter::INTERSECT, 1, std::vector<size_t>{0,1,2,3});
+        PbiFilterTests::checkFilterInternals(moveAssign, PbiFilter::INTERSECT, 1, std::vector<size_t>{0,1,2,3});
+    }
+}
+
+TEST(PbiFilterTest, SortsAndUniquesChildFilterResultsOk)
+{   // the same accepted rows are probed in shuffled order (child) and ascending order (wrapper)
+    const PbiFilterTests::SortUniqueTestFilter child{ };
+    const PbiFilter wrapper{ child };
+    PbiFilterTests::checkFilterRows(child, {2, 7, 0, 3, 4, 1, 8});
+    PbiFilterTests::checkFilterRows(wrapper, {0, 1, 2, 3, 4, 7, 8});
+}
+
+TEST(PbiFilterTest, UnionOk)
+{
+    { // empty
+        { // copy
+            const auto emptyFilter = PbiFilterTests::emptyFilter();
+            const auto emptyFilter2 = PbiFilterTests::emptyFilter();
+            const auto u = PbiFilter::Union({ emptyFilter, emptyFilter2 });
+            PbiFilterTests::checkFilterInternals(u, PbiFilter::UNION, 2, std::vector<size_t>{0,1,2,3});
+        }
+        { // move
+            const auto u = PbiFilter::Union({ PbiFilter{ }, PbiFilter{ } });
+            PbiFilterTests::checkFilterInternals(u, PbiFilter::UNION, 2, std::vector<size_t>{0,1,2,3});
+        }
+    }
+
+    { // with (no-data) children - just checking composition
+        { // copy
+            const auto simpleFilter = PbiFilterTests::SimpleFilter{ };
+            const auto simpleFilter2 = PbiFilterTests::SimpleFilter{ };
+            const auto u = PbiFilter::Union({ simpleFilter, simpleFilter2 });
+            PbiFilterTests::checkFilterInternals(u, PbiFilter::UNION, 2, std::vector<size_t>{});
+        }
+        { // move
+            const auto u = PbiFilter::Union({ PbiFilterTests::SimpleFilter{ }, PbiFilterTests::SimpleFilter{ } });
+            PbiFilterTests::checkFilterInternals(u, PbiFilter::UNION, 2, std::vector<size_t>{});
+        }
+    }
+
+    { // 2-child union, results sorted & unique-d by PbiFilter
+
+        const auto child1 = PbiFilterTests::SortUniqueTestFilter{ };
+        const auto child2 = PbiFilterTests::SortUniqueTestFilter2{ };
+        const auto u = PbiFilter::Union({ child1, child2 });
+
+        PbiFilterTests::checkFilterRows(child1, std::vector<size_t>{2, 7, 0, 3, 4, 1, 8});
+        PbiFilterTests::checkFilterRows(child2, std::vector<size_t>{3, 7, 5});
+        PbiFilterTests::checkFilterRows(u, std::vector<size_t>{0, 1, 2, 3, 4, 5, 7, 8});
+    }
+}
+
+TEST(PbiFilterTest, IntersectOk)
+{
+    { // empty
+        { // copy
+            const auto emptyFilter = PbiFilterTests::emptyFilter();
+            const auto emptyFilter2 = PbiFilterTests::emptyFilter();
+            const auto i = PbiFilter::Intersection({ emptyFilter, emptyFilter2 });
+            PbiFilterTests::checkFilterInternals(i, PbiFilter::INTERSECT, 2, std::vector<size_t>{0,1,2,3});
+        }
+        { // move
+            const auto i = PbiFilter::Intersection({ PbiFilter{ }, PbiFilter{ } });
+            PbiFilterTests::checkFilterInternals(i, PbiFilter::INTERSECT, 2, std::vector<size_t>{0,1,2,3});
+        }
+    }
+
+    { // with (no-data) children - just checking composition
+        { // copy
+            const auto simpleFilter = PbiFilterTests::SimpleFilter{ };
+            const auto simpleFilter2 = PbiFilterTests::SimpleFilter{ };
+            const auto i = PbiFilter::Intersection({ simpleFilter, simpleFilter2 });
+            PbiFilterTests::checkFilterInternals(i, PbiFilter::INTERSECT, 2, std::vector<size_t>{});
+        }
+        { // move
+            const auto i = PbiFilter::Intersection({ PbiFilterTests::SimpleFilter{ }, PbiFilterTests::SimpleFilter{ } });
+            PbiFilterTests::checkFilterInternals(i, PbiFilter::INTERSECT, 2, std::vector<size_t>{});
+        }
+    }
+
+    { // 2-child intersect, sorted & unique-d by PbiFilter
+
+        const auto child1 = PbiFilterTests::SortUniqueTestFilter{ };
+        const auto child2 = PbiFilterTests::SortUniqueTestFilter2{ };
+        const auto i = PbiFilter::Intersection({ child1, child2 });
+
+        PbiFilterTests::checkFilterRows(child1, std::vector<size_t>{2, 7, 0, 3, 4, 1, 8});
+        PbiFilterTests::checkFilterRows(child2, std::vector<size_t>{3, 7, 5 });
+        PbiFilterTests::checkFilterRows(i, std::vector<size_t>{3, 7});
+    }
+}
+
+TEST(PbiFilterTest, AlignedEndFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 4055 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 4055, Compare::NOT_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,2,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 4000, Compare::LESS_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 5560, Compare::GREATER_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 5560, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{2,3});
+    }
+
+    {
+        const auto filter = PbiFilter{ PbiAlignedEndFilter{ 7000, Compare::GREATER_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+}
+
+TEST(PbiFilterTest, AlignedLengthFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiAlignedLengthFilter{ 500, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedLengthFilter{ 1000, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2});
+    }
+}
+
+TEST(PbiFilterTest, AlignedStartFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiAlignedStartFilter{ 2600, Compare::LESS_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedStartFilter{ 4102, Compare::GREATER_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedStartFilter{ 4102, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{2,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedStartFilter{ 6000, Compare::GREATER_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{ });
+    }
+}
+
+TEST(PbiFilterTest, AlignedStrandFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiAlignedStrandFilter{ Strand::FORWARD } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,2});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedStrandFilter{ Strand::REVERSE } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiAlignedStrandFilter{ Strand::FORWARD, Compare::NOT_EQUAL } }; // same as Strand::REVERSE
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+
+    // unsupported compare types throw
+    EXPECT_THROW(PbiAlignedStrandFilter(Strand::FORWARD, Compare::LESS_THAN),          std::runtime_error);
+    EXPECT_THROW(PbiAlignedStrandFilter(Strand::FORWARD, Compare::LESS_THAN_EQUAL),    std::runtime_error);
+    EXPECT_THROW(PbiAlignedStrandFilter(Strand::FORWARD, Compare::GREATER_THAN),       std::runtime_error);
+    EXPECT_THROW(PbiAlignedStrandFilter(Strand::FORWARD, Compare::GREATER_THAN_EQUAL), std::runtime_error);
+}
+
+TEST(PbiFilterTest, BarcodeFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiBarcodeFilter{ 17 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodeFilter{ 18 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodeFilter{ 0 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0});
+    }
+}
+
+TEST(PbiFilterTest, BarcodeForwardFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiBarcodeForwardFilter{ 17 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodeForwardFilter{ 400 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodeForwardFilter{ {0, 256} } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,2});
+    }
+}
+
+TEST(PbiFilterTest, BarcodeQualityFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiBarcodeQualityFilter{ 80, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodeQualityFilter{ 40, Compare::LESS_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+}
+
+TEST(PbiFilterTest, BarcodeReverseFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiBarcodeReverseFilter{ 18 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodeReverseFilter{ 400 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{ });
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodeReverseFilter{ {1, 257} } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,2});
+    }
+}
+
+TEST(PbiFilterTest, BarcodesFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiBarcodesFilter{ 17, 18 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodesFilter{ 17, 19 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{ });
+    }
+    {
+        const auto filter = PbiFilter{ PbiBarcodesFilter{ std::make_pair(17,18) } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+}
+
+TEST(PbiFilterTest, IdentityFilterOk)
+{   // only fixture row 3 is expected to pass the ">= 0.95" identity threshold
+    {
+        const PbiFilter filter{ PbiIdentityFilter{ 0.95, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, {3});
+    }
+}
+
+TEST(PbiFilterTest, LocalContextFilterOk)
+{
+    { // == NO_LOCAL_CONTEXT
+        const auto filter = PbiFilter { PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0});
+    }
+    { // != ADAPTER_BEFORE (exact match)
+        const auto filter = PbiFilter { PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,2,3});
+    }
+    { // contains ADAPTER_BEFORE
+        const auto filter = PbiFilter { PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+    { // does not contain ADAPTER_BEFORE
+        const auto filter = PbiFilter { PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,2});
+    }
+    { // include both ADAPTER_BEFORE and ADAPTER_AFTER
+        const auto filter = PbiFilter::Intersection(
+        {
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS },
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::CONTAINS }
+        });
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{3});
+    }
+    { // exclude both ADAPTER_BEFORE and ADAPTER_AFTER
+        const auto filter = PbiFilter::Intersection(
+        {
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS },
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::NOT_CONTAINS }
+        });
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0});
+    }
+    { // include everything with either ADAPTER_BEFORE or ADAPTER_AFTER
+        const auto filter = PbiFilter::Union(
+        {
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS },
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::CONTAINS }
+        });
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2,3});
+    }
+    { // include everything with either ADAPTER_BEFORE or ADAPTER_AFTER, but not both
+        const auto filter = PbiFilter::Intersection(
+        {
+                PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL },
+                PbiFilter::Union(
+                {
+                    PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS },
+                    PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::NOT_CONTAINS }
+                })
+        });
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2});
+    }
+}
+
+TEST(PbiFilterTest, MapQualityFilterOk)
+{   // fixture mapQV_ values: { 254, 254, 254, 254 }
+    {
+        const PbiFilter filter{ PbiMapQualityFilter{ 254 } };
+        PbiFilterTests::checkFilterRows(filter, {0,1,2,3});
+    }
+    {
+        const PbiFilter filter{ PbiMapQualityFilter{ 254, Compare::NOT_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, { });
+    }
+}
+
+TEST(PbiFilterTest, MovieNameFilterOk)
+{
+    const auto bamFile = BamFile{ PbbamTestsConfig::Data_Dir + std::string{ "/group/test2.bam" } };
+    const auto index = PbiRawData{ bamFile.PacBioIndexFilename() };
+
+    {
+        const auto filter = PbiFilter{ PbiMovieNameFilter{ "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0" } };
+        const auto expectedRows = std::vector<size_t>{0,1,2,3};
+        for (size_t row : expectedRows)
+            EXPECT_TRUE(filter.Accepts(index, row));
+    }
+    {
+        const auto filter = PbiFilter{ PbiMovieNameFilter{ "does_not_exist" } };
+        const auto expectedRows = std::vector<size_t>{};
+        for (size_t row : expectedRows)
+            EXPECT_TRUE(filter.Accepts(index, row));
+    }
+    {
+        const auto names = std::vector<std::string>{"does_not_exist",
+                                          "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0"};
+        const auto filter = PbiFilter{ PbiMovieNameFilter{ names } };
+        const auto expectedRows = std::vector<size_t>{0,1,2,3};
+        for (size_t row : expectedRows)
+            EXPECT_TRUE(filter.Accepts(index, row));
+    }
+}
+
+TEST(PbiFilterTest, NumDeletedBasesFilterOk)
+{
+    // del: { 12, 38, 45, 11} - calculated from raw data, not stored directly in testing object or read from PBI file
+
+    {
+        const auto filter = PbiFilter{ PbiNumDeletedBasesFilter{ 12, Compare::LESS_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiNumDeletedBasesFilter{ 45, Compare::EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{2});
+    }
+}
+
+TEST(PbiFilterTest, NumInsertedBasesFilterOk)
+{
+    // ins: { 17, 63, 65, 20 }  - calculated from raw data, not stored directly testing object or read from PBI file
+
+    {
+        const auto filter = PbiFilter{ PbiNumInsertedBasesFilter{ 63, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2});
+    }
+    {
+        const auto filter = PbiFilter{ PbiNumInsertedBasesFilter{ 17, Compare::NOT_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2,3});
+    }
+}
+
+TEST(PbiFilterTest, NumMatchesFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiNumMatchesFilter{ 1000, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2});
+    }
+    {
+        const auto filter = PbiFilter{ PbiNumMatchesFilter{ 400, Compare::LESS_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0});
+    }
+}
+
+TEST(PbiFilterTest, NumMismatchesFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiNumMismatchesFilter{ 0, Compare::EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiNumMismatchesFilter{ 0, Compare::NOT_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+}
+
+TEST(PbiFilterTest, QueryEndFilterOk)
+{   // fixture qEnd_ values: { 2531, 4055, 5571, 6237 }
+    {
+        const PbiFilter filter{ PbiQueryEndFilter{ 4055 } };
+        PbiFilterTests::checkFilterRows(filter, {1});
+    }
+    {
+        const PbiFilter filter{ PbiQueryEndFilter{ 6200, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, {3});
+    }
+}
+
+TEST(PbiFilterTest, QueryLengthFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiQueryLengthFilter{ 500, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiQueryLengthFilter{ 1000, Compare::GREATER_THAN_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,2});
+    }
+}
+
+TEST(PbiFilterTest, QueryNameFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2579_4055" } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1});
+    }
+    {
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/5615_6237" } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{3});
+    }
+
+    {
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "does_not_exist/0/0_0" } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+    {
+        const auto names = std::vector<std::string>{"m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/2579_4055",
+                                          "m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/5615_6237"};
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ names } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{1,3});
+    }
+
+    // invalid QNAME syntax throws
+    EXPECT_THROW(
+    {
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "" } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    },
+    std::runtime_error);
+    EXPECT_THROW(
+    {
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "foo" } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    },
+    std::runtime_error);
+    EXPECT_THROW(
+    {
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "foo/bar" } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    },
+    std::runtime_error);
+    EXPECT_THROW(
+    {
+        const auto filter = PbiFilter{ PbiQueryNameFilter{ "foo/bar/baz_bam" } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    },
+    std::exception); // come back to see why this is not runtime_error but something else
+}
+
+TEST(PbiFilterTest, QueryStartFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiQueryStartFilter{ 4101 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{2});
+    }
+    {
+        const auto filter = PbiFilter{ PbiQueryStartFilter{ 5000 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+    {
+        const auto filter = PbiFilter{ PbiQueryStartFilter{ 5000, Compare::GREATER_THAN } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{3});
+    }
+}
+
+// PbiReadAccuracyFilter: checks the rows selected for accuracy 0.9 without an explicit
+// comparison and with GREATER_THAN.
+TEST(PbiFilterTest, ReadAccuracyFilterOk)
+{
+    // accuracy 0.9 -> no rows expected
+    PbiFilterTests::checkFilterRows(PbiFilter{ PbiReadAccuracyFilter{ 0.9 } },
+                                    std::vector<size_t>{});
+
+    // accuracy > 0.9 -> rows 0 and 2 expected
+    PbiFilterTests::checkFilterRows(PbiFilter{ PbiReadAccuracyFilter{ 0.9, Compare::GREATER_THAN } },
+                                    std::vector<size_t>{0,2});
+}
+
+TEST(PbiFilterTest, ReadGroupFilterOk)
+{
+    { // numeric ID
+        const auto filter = PbiReadGroupFilter{ -1197849594 };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+
+        const auto filter2 = PbiReadGroupFilter{ 200 };
+        PbiFilterTests::checkFilterRows(filter2, std::vector<size_t>{});
+    }
+    { // string ID
+        const auto filter = PbiReadGroupFilter{ "b89a4406" };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+
+        const auto filter2 = PbiReadGroupFilter{ "b89a4406" };
+        PbiFilterTests::checkFilterRows(filter2, std::vector<size_t>{0,1,2,3});
+    }
+    { // ReadGroupInfo object
+        const auto rg = ReadGroupInfo{ "b89a4406" };
+        const auto filter = PbiReadGroupFilter{ rg };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+    { // multi-ID
+        const auto ids = std::vector<int32_t>({-1197849594, 200});
+        const auto filter = PbiReadGroupFilter{ ids };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+    { // multi-string
+        const auto ids = std::vector<std::string>({"b89a4406", "deadbeef"});
+        const auto filter = PbiReadGroupFilter{ ids };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+    { // multi-ReadGroupInfo
+        const auto ids = std::vector<ReadGroupInfo>({ ReadGroupInfo("b89a4406"), ReadGroupInfo("deadbeef")});
+        const auto filter = PbiReadGroupFilter{ ids };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+}
+
+// PbiReferenceEndFilter: checks the rows selected for reference end 9900 without an
+// explicit comparison and with GREATER_THAN_EQUAL.
+TEST(PbiFilterTest, ReferenceEndFilterOk)
+{
+    // reference end 9900 -> row 3 expected
+    PbiFilterTests::checkFilterRows(PbiFilter{ PbiReferenceEndFilter{ 9900 } },
+                                    std::vector<size_t>{3});
+
+    // reference end >= 9900 -> rows 0, 1 and 3 expected
+    PbiFilterTests::checkFilterRows(PbiFilter{ PbiReferenceEndFilter{ 9900, Compare::GREATER_THAN_EQUAL } },
+                                    std::vector<size_t>{0,1,3});
+}
+
+TEST(PbiFilterTest, ReferenceIdFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiReferenceIdFilter{ 0 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiReferenceIdFilter{ 0, Compare::NOT_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+    {
+        const auto ids = std::vector<int32_t>({0, 42});
+        const auto filter = PbiFilter{ PbiReferenceIdFilter{ ids } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+}
+
+TEST(PbiFilterTest, ReferenceNameFilterOk)
+{
+    const auto bamFile = BamFile{ PbbamTestsConfig::Data_Dir + std::string{ "/group/test2.bam" } };
+    const auto index = PbiRawData{ bamFile.PacBioIndexFilename() };
+
+    {
+        const auto filter = PbiFilter{ PbiReferenceNameFilter{ "lambda_NEB3011" } };
+        const auto expectedRows = std::vector<size_t>{0,1,2,3};
+        for (size_t row : expectedRows)
+            EXPECT_TRUE(filter.Accepts(index, row));
+
+    }
+    {
+        const auto filter = PbiFilter{ PbiReferenceNameFilter{ "lambda_NEB3011", Compare::NOT_EQUAL } };
+        const auto expectedRows = std::vector<size_t>{};
+        for (size_t row : expectedRows)
+            EXPECT_TRUE(filter.Accepts(index, row));
+    }
+    {
+        const auto names = std::vector<std::string>({ "lambda_NEB3011" }); // this file only has 1 :(
+        const auto filter = PbiFilter{ PbiReferenceNameFilter{ names } };
+        const auto expectedRows = std::vector<size_t>{0,1,2,3};
+        for (size_t row : expectedRows)
+            EXPECT_TRUE(filter.Accepts(index, row));
+    }
+
+    // unsupported compare types throw
+    EXPECT_THROW(PbiReferenceNameFilter("foo", Compare::LESS_THAN),          std::runtime_error);
+    EXPECT_THROW(PbiReferenceNameFilter("foo", Compare::LESS_THAN_EQUAL),    std::runtime_error);
+    EXPECT_THROW(PbiReferenceNameFilter("foo", Compare::GREATER_THAN),       std::runtime_error);
+    EXPECT_THROW(PbiReferenceNameFilter("foo", Compare::GREATER_THAN_EQUAL), std::runtime_error);
+}
+
+// PbiReferenceStartFilter: checks the rows selected for a reference start given without
+// an explicit comparison and for a GREATER_THAN_EQUAL bound.
+TEST(PbiFilterTest, ReferenceStartFilterOk)
+{
+    // reference start 8453 -> row 1 expected
+    PbiFilterTests::checkFilterRows(PbiFilter{ PbiReferenceStartFilter{ 8453 } },
+                                    std::vector<size_t>{1});
+
+    // reference start >= 9200 -> rows 0 and 3 expected
+    PbiFilterTests::checkFilterRows(PbiFilter{ PbiReferenceStartFilter{ 9200, Compare::GREATER_THAN_EQUAL } },
+                                    std::vector<size_t>{0,3});
+}
+
+TEST(PbiFilterTest, ZmwFilterOk)
+{
+    {
+        const auto filter = PbiFilter{ PbiZmwFilter{ 14743 } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+    {
+        const auto filter = PbiFilter{ PbiZmwFilter{ 14743, Compare::NOT_EQUAL } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{});
+    }
+    {
+        const auto zmws = std::vector<int32_t>({14743,42,200});
+        const auto filter = PbiFilter{ PbiZmwFilter{ zmws } };
+        PbiFilterTests::checkFilterRows(filter, std::vector<size_t>{0,1,2,3});
+    }
+}
+
+TEST(PbiFilterTest, FromDataSetOk)
+{
+    const auto expectedFilter =
+        PbiFilter::Union(
+        {
+            PbiFilter::Intersection(
+            {
+                PbiZmwFilter{ 14743 },
+                PbiReadAccuracyFilter { 0.9, Compare::GREATER_THAN_EQUAL }
+            }),
+
+            PbiReferenceStartFilter { 9200, Compare::GREATER_THAN_EQUAL }
+        });
+
+
+    auto properties1 = Properties{ };
+    properties1.Add(Property{ "zm", "14743",  "==" });
+    properties1.Add(Property{ "rq", "0.9", ">=" });
+
+    auto datasetFilter1 = Filter{ };
+    datasetFilter1.Properties(properties1);
+
+    auto properties2 = Properties{ };
+    properties2.Add(Property{ "pos", "9200", ">=" });
+
+    auto datasetFilter2 = Filter{ };
+    datasetFilter2.Properties(properties2);
+
+    auto datasetFilters = Filters{ };
+    datasetFilters.Add(datasetFilter1);
+    datasetFilters.Add(datasetFilter2);
+    auto dataset = DataSet{ };
+    dataset.Filters(datasetFilters);
+
+    const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+
+    for (size_t i = 0; i < PbiFilterTests::shared_index.NumReads(); ++i) {
+        EXPECT_EQ(expectedFilter.Accepts(PbiFilterTests::shared_index, i),
+                  generatedFilter.Accepts(PbiFilterTests::shared_index, i));
+    }
+}
+
+TEST(PbiFilterTest, BarcodeListFromDataSetXmlOk)
+{
+    auto runner = [](const Property& property,
+                     const PbiFilter& expectedFilter,
+                     const std::vector<size_t>& expectedResults)
+    {
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  expectedResults);
+        PbiFilterTests::checkFilterRows(generatedFilter, expectedResults);
+    };
+
+    // single barcode
+    runner(Property{ "bc", "18", "==" },
+           PbiBarcodeFilter{ 18, Compare::EQUAL },
+           std::vector<size_t>{1,3});
+
+    // single barcode (bracketed)
+    runner(Property{ "bc", "[18]", "==" },
+           PbiBarcodeFilter{ 18, Compare::EQUAL },
+           std::vector<size_t>{1,3});
+
+    // barcode pair (square brackets)
+    runner(Property{ "bc", "[17,18]", "==" },
+           PbiBarcodesFilter{ {17, 18}, Compare::EQUAL },
+           std::vector<size_t>{1,3});
+
+    // barcode pair (parens)
+    runner(Property{ "bc", "(17,18)", "==" },
+           PbiBarcodesFilter{ {17, 18}, Compare::EQUAL },
+           std::vector<size_t>{1,3});
+
+    // barcode pair (curly brackets)
+    runner(Property{ "bc", "{17,18}", "==" },
+           PbiBarcodesFilter{ {17, 18}, Compare::EQUAL },
+           std::vector<size_t>{1,3});
+
+    // barcode pair (list, but no brackets)
+    runner(Property{ "bc", "17,18", "==" },
+           PbiBarcodesFilter{ {17, 18}, Compare::EQUAL },
+           std::vector<size_t>{1,3});
+
+    // barcode pair - same value
+    runner(Property{ "bc", "[18,18]", "==" },
+           PbiBarcodesFilter{ {18, 18}, Compare::EQUAL },
+           std::vector<size_t>{}); // none share forward & reverse
+
+    auto expectFail = [](const Property& property)
+    {
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        EXPECT_THROW(PbiFilter::FromDataSet(dataset), std::runtime_error);
+    };
+
+    // list-ish, but only one value
+    expectFail(Property{ "bc", "[18,]", "==" });
+
+    // too many barcodes
+    expectFail(Property{ "bc", "[18,18,18]", "==" });
+}
+
+TEST(PbiFilterTest, LocalContextFiltersFromDataSetXmlOk)
+{
+    {   // no adapters or barcodes
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::EQUAL };
+
+        // XML: <Property Name="cx" Value="0" Operator="==" />
+        Property property("cx", "0", "==");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{0});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{0});
+    }
+    {   // any adapters or barcodes
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL };
+
+        // XML: <Property Name="cx" Value="0" Operator="!=" />
+        Property property("cx", "0", "!=");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});
+    }
+    {   // contains adapter_before
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS };
+
+        // XML: <Property Name="cx" Value="1" Operator="&" />
+        Property property("cx", "1", "&");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,3});
+    }
+    {   // contains adapter_before
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS };
+
+        // XML: <Property Name="cx" Value="ADAPTER_BEFORE" Operator="&" />
+        Property property("cx", "ADAPTER_BEFORE", "&");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,3});
+    }
+    {   // contains adapter_after
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER, Compare::CONTAINS };
+
+        // XML: <Property Name="cx" Value="2" Operator="&" />
+        Property property("cx", "2", "&");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{2,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{2,3});
+    }
+    {   // contains adapter_before or adapter_after
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER,
+                                       Compare::CONTAINS };
+
+        // XML: <Property Name="cx" Value="3" Operator="&" />
+        Property property("cx", "3", "&");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});
+    }
+    {   // contains adapter_before or adapter_after
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER,
+                                       Compare::CONTAINS };
+
+        // XML: <Property Name="cx" Value="ADAPTER_BEFORE | ADAPTER_AFTER" Operator="&" />
+        Property property("cx", "ADAPTER_BEFORE | ADAPTER_AFTER", "&");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});
+    }
+    {   // contains adapter_before or adapter_after - no whitespace separation
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER,
+                                       Compare::CONTAINS };
+
+        // XML: <Property Name="cx" Value="ADAPTER_BEFORE|ADAPTER_AFTER" Operator="&" />
+        Property property("cx", "ADAPTER_BEFORE|ADAPTER_AFTER", "&");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});
+    }
+    {   // contains adapter_before or adapter_after - a lot of whitespace separation
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER,
+                                       Compare::CONTAINS };
+
+        // XML: <Property Name="cx" Value="ADAPTER_BEFORE        |           ADAPTER_AFTER" Operator="&" />
+        Property property("cx", "ADAPTER_BEFORE        |           ADAPTER_AFTER", "&");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});
+    }
+    {   // contains adapter_before or adapter_after, but not both
+
+        const auto expectedFilter = PbiFilter::Union(
+        {
+            PbiFilter::Intersection(
+            {
+                PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL },
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS }
+            }),
+            PbiFilter::Intersection(
+            {
+                PbiLocalContextFilter{ LocalContextFlags::NO_LOCAL_CONTEXT, Compare::NOT_EQUAL },
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER, Compare::NOT_CONTAINS }
+            })
+        });
+
+        // XML:
+        // <Filters>
+        //   <Filter>
+        //     <Properties>
+        //       <Property Name="cx" Value="0" Operator="!=" />
+        //       <Property Name="cx" Value="1" Operator="~" />
+        //     </Properties>
+        //   </Filter>
+        //   <Filter>
+        //     <Properties>
+        //       <Property Name="cx" Value="0" Operator="!=" />
+        //       <Property Name="cx" Value="2" Operator="~" />
+        //     </Properties>
+        //   </Filter>
+        // </Filters>
+
+        auto filter1 = Filter{ };
+        filter1.Properties().Add(Property("cx", "0", "!="));
+        filter1.Properties().Add(Property("cx", "1", "~"));
+
+        auto filter2 = Filter{ };
+        filter2.Properties().Add(Property("cx", "0", "!="));
+        filter2.Properties().Add(Property("cx", "2", "~"));
+
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter1);
+        dataset.Filters().Add(filter2);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2});
+
+    }
+    {   // contains adapter_before or adapter_after
+
+        const auto expectedFilter = PbiFilter::Union(
+        {
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS },
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::CONTAINS }
+        });
+
+        // XML:
+        // <Filters>
+        //   <Filter>
+        //     <Properties>
+        //       <Property Name="cx" Value="1" Operator="&" />
+        //     </Properties>
+        //   </Filter>
+        //   <Filter>
+        //     <Properties>
+        //       <Property Name="cx" Value="2" Operator="&" />
+        //     </Properties>
+        //   </Filter>
+        // </Filters>
+
+        auto filter1 = Filter{ };
+        filter1.Properties().Add(Property("cx", "1", "&"));
+
+        auto filter2 = Filter{ };
+        filter2.Properties().Add(Property("cx", "2", "&"));
+
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter1);
+        dataset.Filters().Add(filter2);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1,2,3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1,2,3});
+    }
+    { // adapter_before and adapter_after
+
+        const auto expectedFilter = PbiFilter::Intersection(
+        {
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS },
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::CONTAINS }
+        });
+
+        // XML:
+        // <Property Name="cx" Value="1" Operator="&" />
+        // <Property Name="cx" Value="2" Operator="&" />
+        Property property1("cx", "1", "&");
+        Property property2("cx", "2", "&");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property1);
+        filter.Properties().Add(property2);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{3});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{3});
+    }
+    {   // adapter_before, but no adapter_after
+
+        const auto expectedFilter = PbiFilter::Intersection(
+        {
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::CONTAINS },
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::NOT_CONTAINS }
+        });
+
+        // XML:
+        // <Property Name="cx" Value="1" Operator="&" />
+        // <Property Name="cx" Value="2" Operator="~" />
+        Property property1("cx", "1", "&");
+        Property property2("cx", "2", "~");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property1);
+        filter.Properties().Add(property2);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{1});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{1});
+    }
+    {   // contains no adapter_before
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS };
+
+        // XML: <Property Name="cx" Value="1" Operator="~" />
+        Property property("cx", "1", "~");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{0,2});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{0,2});
+    }
+    {   // contains no adapter_before or adapter_after
+
+        const auto expectedFilter = PbiFilter::Intersection(
+        {
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE, Compare::NOT_CONTAINS },
+            PbiLocalContextFilter{ LocalContextFlags::ADAPTER_AFTER,  Compare::NOT_CONTAINS }
+        });
+
+        // XML:
+        // <Property Name="cx" Value="1" Operator="~" />
+        // <Property Name="cx" Value="2" Operator="~" />
+        Property property1("cx", "1", "~");
+        Property property2("cx", "2", "~");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property1);
+        filter.Properties().Add(property2);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{0});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{0});
+    }
+    {   // contains no adapter_before or adapter_after
+
+        const auto expectedFilter =
+                PbiLocalContextFilter{ LocalContextFlags::ADAPTER_BEFORE | LocalContextFlags::ADAPTER_AFTER,
+                                       Compare::NOT_CONTAINS };
+
+        // XML: <Property Name="cx" Value="3" Operator="~" />
+        Property property("cx", "3", "~");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        const auto generatedFilter = PbiFilter::FromDataSet(dataset);
+        PbiFilterTests::checkFilterRows(expectedFilter,  std::vector<size_t>{0});
+        PbiFilterTests::checkFilterRows(generatedFilter, std::vector<size_t>{0});
+    }
+    {   // throws on invalid enum name
+
+        Property property("cx", "DOES_NOT_EXIST", "~");
+
+        auto filter = Filter{ };
+        filter.Properties().Add(property);
+        DataSet dataset = DataSet{ };
+        dataset.Filters().Add(filter);
+
+        EXPECT_THROW(PbiFilter::FromDataSet(dataset), std::runtime_error);
+    }
+}
+
+// clang-format on
diff --git a/tests/src/test_PbiFilterQuery.cpp b/tests/src/test_PbiFilterQuery.cpp

new file mode 100644 (file)

index 0000000..21bcf5f
--- /dev/null
+++ b/tests/src/test_PbiFilterQuery.cpp
@@ -0,0 +1,693 @@
+// Author: Derek Barnett
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <set>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/PbiFilterQuery.h>
+#include <pbbam/Unused.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(PbiFilterQueryTest, QueryOk)
+{
+    const auto bamFile = BamFile{PbbamTestsConfig::Data_Dir + std::string{"/group/test2.bam"}};
+
+    {
+        PbiFilterQuery query(PbiQueryLengthFilter{500, Compare::GREATER_THAN_EQUAL}, bamFile);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(3, numReads);
+
+        int count = 0;
+        for (const auto& r : query) {
+            ++count;
+            EXPECT_GE((r.QueryEnd() - r.QueryStart()), 500);
+        }
+        EXPECT_EQ(3, count);
+    }
+    {
+        // all records aligned to reverse strand && pos >= 9200
+        const auto filter =
+            PbiFilter::Intersection({PbiAlignedStrandFilter{Strand::REVERSE},
+                                     PbiReferenceStartFilter{9200, Compare::GREATER_THAN_EQUAL}});
+
+        PbiFilterQuery query(filter, bamFile);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(1, numReads);
+
+        int count = 0;
+        for (const auto& r : query) {
+            ++count;
+            EXPECT_EQ(Strand::REVERSE, r.AlignedStrand());
+            EXPECT_GE((r.ReferenceStart()), 9200);
+            EXPECT_EQ(
+                std::string("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/"
+                            "5615_6237"),
+                r.FullName());
+        }
+        EXPECT_EQ(1, count);
+    }
+    {
+        // all records aligned to forward strand && pos >= 9200
+        const auto filter =
+            PbiFilter::Intersection({PbiAlignedStrandFilter{Strand::FORWARD},
+                                     PbiReferenceStartFilter{9200, Compare::GREATER_THAN_EQUAL}});
+
+        PbiFilterQuery query(filter, bamFile);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(1, numReads);
+
+        int count = 0;
+        for (const auto& r : query) {
+            ++count;
+            EXPECT_EQ(Strand::FORWARD, r.AlignedStrand());
+            EXPECT_GE((r.ReferenceStart()), 9200);
+            EXPECT_EQ(
+                std::string("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/14743/"
+                            "2114_2531"),
+                r.FullName());
+        }
+        EXPECT_EQ(1, count);
+    }
+    {
+        // all records from RG ("b89a4406") with numMatches >= 1200
+        const auto filter =
+            PbiFilter::Intersection({PbiReadGroupFilter{"b89a4406"},
+                                     PbiNumMatchesFilter{1200, Compare::GREATER_THAN_EQUAL}});
+
+        PbiFilterQuery query(filter, bamFile);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(2, numReads);
+
+        int count = 0;
+        for (const auto& r : query) {
+            ++count;
+            EXPECT_EQ(std::string("b89a4406"), r.ReadGroupId());
+            EXPECT_GE((r.NumMatches()), 1200);
+            if (count == 1)
+                EXPECT_EQ(
+                    std::string("m140905_042212_sidney_c100564852550000001823085912221377_s1_X0/"
+                                "14743/2579_4055"),
+                    r.FullName());
+            else {
+                if (count == 2) {
+                    EXPECT_EQ(
+                        std::string("m140905_042212_sidney_c100564852550000001823085912221377_s1_"
+                                    "X0/14743/4101_5571"),
+                        r.FullName());
+                }
+            }
+        }
+        EXPECT_EQ(2, count);
+    }
+}
+
+TEST(PbiFilterQueryTest, ZmwRangeFromDatasetOk)
+{
+    const auto expectedMovieName =
+        std::string{"m150404_101626_42267_c100807920800000001823174110291514_s1_p0"};
+
+    const DataSet ds(PbbamTestsConfig::Data_Dir + "/chunking/chunking.subreadset.xml");
+    EXPECT_EQ(3, ds.BamFiles().size());
+
+    {  // movie name
+
+        PbiFilterQuery query{PbiMovieNameFilter{expectedMovieName}, ds};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(1220, numReads);
+
+        int count = 0;
+        for (const BamRecord& r : query) {
+            EXPECT_EQ(expectedMovieName, r.MovieName());
+            ++count;
+        }
+        EXPECT_EQ(1220, count);
+    }
+
+    {  // sequencing chemistries
+        std::set<std::string> chems{ds.SequencingChemistries()};
+        std::set<std::string> expected{"P6-C4"};
+        EXPECT_TRUE(equal(chems.begin(), chems.end(), expected.begin()));
+    }
+
+    {  // min ZMW
+
+        PbiFilterQuery query{PbiZmwFilter{54, Compare::GREATER_THAN}, ds};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(1220, numReads);
+
+        int count = 0;
+        for (const BamRecord& r : query) {
+            EXPECT_GT(r.HoleNumber(), 54);
+            ++count;
+        }
+        EXPECT_EQ(1220, count);
+    }
+
+    {  // max ZMW
+
+        PbiFilterQuery query{PbiZmwFilter{1816, Compare::LESS_THAN}, ds};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(150, numReads);
+
+        int count = 0;
+        for (const BamRecord& r : query) {
+            EXPECT_LT(r.HoleNumber(), 1816);
+            ++count;
+        }
+        EXPECT_EQ(150, count);
+    }
+
+    {  // put all together, from DataSet XML
+
+        const PbiFilter filter = PbiFilter::FromDataSet(ds);
+        PbiFilterQuery query(filter, ds);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(150, numReads);
+
+        int count = 0;
+        for (const BamRecord& r : query) {
+            EXPECT_EQ(expectedMovieName, r.MovieName());
+            const auto zmw = r.HoleNumber();
+            EXPECT_GT(zmw, 54);
+            EXPECT_LT(zmw, 1816);
+            ++count;
+        }
+        EXPECT_EQ(150, count);
+    }
+    {  // empty filter object - should return all records from the same dataset
+
+        PbiFilterQuery query(PbiFilter{}, ds);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(1220, numReads);
+
+        int count = 0;
+        for (const BamRecord& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(1220, count);
+    }
+    {  // no <Filters> element present at all
+
+        const DataSet dsData(PbbamTestsConfig::GeneratedData_Dir +
+                             "/chunking_missingfilters.subreadset.xml");
+        const PbiFilter filter = PbiFilter::FromDataSet(dsData);
+        PbiFilterQuery query(filter, dsData);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(1220, numReads);
+
+        int count = 0;
+        for (const BamRecord& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(1220, count);
+    }
+    {  // <Filters> element contains no child <Filter> elements
+
+        const DataSet dsData(PbbamTestsConfig::GeneratedData_Dir +
+                             "/chunking_emptyfilters.subreadset.xml");
+        const PbiFilter filter = PbiFilter::FromDataSet(dsData);
+        PbiFilterQuery query(filter, dsData);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(1220, numReads);
+
+        int count = 0;
+        for (const BamRecord& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(1220, count);
+    }
+}
+
+TEST(PbiFilterQueryTest, MissingPbiShouldThrow)
+{
+    const PbiFilter filter{PbiZmwFilter{31883}};
+    const std::string phi29Bam = PbbamTestsConfig::GeneratedData_Dir + "/missing_pbi.bam";
+    const std::string hasPbiBam = PbbamTestsConfig::Data_Dir + "/polymerase/production.scraps.bam";
+
+    {  // single file, missing PBI
+
+        EXPECT_THROW(PbiFilterQuery(filter, phi29Bam), std::runtime_error);
+    }
+
+    {  // from dataset, all missing PBI
+
+        DataSet ds;
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam));
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam));
+        EXPECT_THROW(PbiFilterQuery(filter, ds), std::runtime_error);
+    }
+
+    {  // from dataset, mixed PBI presence
+
+        DataSet ds;
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.SubreadBamFile", phi29Bam));
+        ds.ExternalResources().Add(ExternalResource("PacBio.SubreadFile.ScrapsBamFile", hasPbiBam));
+        EXPECT_THROW(PbiFilterQuery(filter, ds), std::runtime_error);
+    }
+}
+
+// The filter read from qnameFiltered.subreads.dataset.xml is expected to select exactly
+// three records, whether counted through NumReads() or by iterating the query.
+TEST(PbiFilterQueryTest, QNameWhitelistFile)
+{
+    const std::string datasetFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/qnameFiltered.subreads.dataset.xml";
+    const DataSet dataset(datasetFn);
+    const PbiFilter datasetFilter = PbiFilter::FromDataSet(dataset);
+
+    PbiFilterQuery query(datasetFilter, dataset);
+    EXPECT_EQ(3, query.NumReads());
+
+    int numIterated = 0;
+    for (const BamRecord& record : query) {
+        UNUSED(record);
+        ++numIterated;
+    }
+    EXPECT_EQ(3, numIterated);
+}
+
+// Querying empty.bam with a default-constructed filter should report zero reads and
+// yield nothing on iteration.
+TEST(PbiFilterQueryTest, EmptyFiles)
+{
+    const BamFile emptyBam{PbbamTestsConfig::Data_Dir + "/empty.bam"};
+
+    PbiFilterQuery query{PbiFilter{}, emptyBam};
+    EXPECT_EQ(0, query.NumReads());
+
+    size_t numIterated = 0;
+    for (const auto& record : query) {
+        UNUSED(record);
+        ++numIterated;
+    }
+    EXPECT_EQ(0, numIterated);
+}
+
+TEST(PbiFilterQueryTest, BarcodeData)
+{
+    const BamFile file{PbbamTestsConfig::Data_Dir + "/phi29.bam"};
+
+    // bc_quality == 1
+    {
+        PbiFilterQuery query{PbiBarcodeQualityFilter{1}, file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(120, numReads);
+
+        size_t count = 0;
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(120, count);
+    }
+
+    // bc_quality != 1
+    {
+        PbiFilterQuery query{PbiBarcodeQualityFilter{1, Compare::NOT_EQUAL}, file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(0, numReads);
+
+        size_t count = 0;
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(0, count);
+    }
+
+    // bc_forward == 0
+    {
+        PbiFilterQuery query{PbiBarcodeForwardFilter{0}, file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(40, numReads);
+
+        size_t count = 0;
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(40, count);
+    }
+
+    // bc_forward == [0,2]
+    {
+        const auto ids = std::vector<int16_t>{0, 2};
+        PbiFilterQuery query{PbiBarcodeForwardFilter{ids}, file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(80, numReads);
+
+        size_t count = 0;
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(80, count);
+    }
+
+    // bc_reverse != 0
+    {
+        PbiFilterQuery query{PbiBarcodeReverseFilter{0, Compare::NOT_EQUAL}, file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(80, numReads);
+
+        size_t count = 0;
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(80, count);
+    }
+}
+
+TEST(PbiFilterQueryTest, BarcodeQualityFromXml)
+{
+
+    const std::string xml_all = R"_XML_(
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet 
+   xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd" 
+   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+   xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+   xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+   xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+   xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+   xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd" 
+   UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c" 
+   TimeStampedName="subreadset_150304_231155" 
+   MetaType="PacBio.DataSet.SubreadSet" 
+   Name="DataSet_SubreadSet" 
+   Tags="" 
+   Version="3.0.0" 
+   CreatedAt="2015-01-27T09:00:01"> 
+<pbbase:ExternalResources>
+   <pbbase:ExternalResource 
+       UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193" 
+       TimeStampedName="subread_bam_150304_231155" 
+       MetaType="PacBio.SubreadFile.SubreadBamFile" 
+       ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam">
+       <pbbase:FileIndices>
+           <pbbase:FileIndex 
+               UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5194" 
+               TimeStampedName="bam_index_150304_231155" 
+               MetaType="PacBio.Index.PacBioIndex" 
+               ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi"/>
+       </pbbase:FileIndices>
+   </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+<pbds:Filters>
+    <pbds:Filter>
+        <pbbase:Properties>
+            <pbbase:Property Name="bq" Operator="=" Value="1"/>
+        </pbbase:Properties>
+    </pbds:Filter>
+</pbds:Filters>
+</pbds:SubreadSet>
+)_XML_";
+
+    const std::string xml_none = R"_XML_(
+<?xml version="1.0" encoding="utf-8"?>
+<pbds:SubreadSet
+   xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd"
+   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+   xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+   xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+   xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+   xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+   xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd"
+   UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c"
+   TimeStampedName="subreadset_150304_231155"
+   MetaType="PacBio.DataSet.SubreadSet"
+   Name="DataSet_SubreadSet"
+   Tags=""
+   Version="3.0.0"
+   CreatedAt="2015-01-27T09:00:01">
+<pbbase:ExternalResources>
+   <pbbase:ExternalResource
+       UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193"
+       TimeStampedName="subread_bam_150304_231155"
+       MetaType="PacBio.SubreadFile.SubreadBamFile"
+       ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam">
+       <pbbase:FileIndices>
+           <pbbase:FileIndex
+               UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5194"
+               TimeStampedName="bam_index_150304_231155"
+               MetaType="PacBio.Index.PacBioIndex"
+               ResourceId="m150404_101626_42267_c100807920800000001823174110291514_s1_p0.1.subreads.bam.pbi"/>
+       </pbbase:FileIndices>
+   </pbbase:ExternalResource>
+</pbbase:ExternalResources>
+<pbds:Filters>
+    <pbds:Filter>
+        <pbbase:Properties>
+            <pbbase:Property Name="bq" Operator="!=" Value="1"/>
+        </pbbase:Properties>
+    </pbds:Filter>
+</pbds:Filters>
+</pbds:SubreadSet>
+)_XML_";
+
+    const BamFile file{PbbamTestsConfig::Data_Dir + "/phi29.bam"};
+
+    {  // filter allows all records
+        const DataSet ds = DataSet::FromXml(xml_all);
+        const PbiFilterQuery query{PbiFilter::FromDataSet(ds), file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(120, numReads);
+
+        size_t count = 0;
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(120, count);
+    }
+    {  // filter allows no records
+        const DataSet ds = DataSet::FromXml(xml_none);
+        const PbiFilterQuery query{PbiFilter::FromDataSet(ds), file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(0, numReads);
+
+        size_t count = 0;
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count;
+        }
+        EXPECT_EQ(0, count);
+    }
+}
+
+TEST(PbiFilterQueryTest, ZmwWhitelistFromXml)
+{
+    const BamFile file{PbbamTestsConfig::Data_Dir + "/phi29.bam"};
+    const std::string xmlHeader = R"_XML_(
+        <?xml version="1.0" encoding="utf-8"?>
+        <pbds:SubreadSet
+           xmlns="http://pacificbiosciences.com/PacBioDatasets.xsd"
+           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+           xmlns:pbbase="http://pacificbiosciences.com/PacBioBaseDataModel.xsd"
+           xmlns:pbsample="http://pacificbiosciences.com/PacBioSampleInfo.xsd"
+           xmlns:pbmeta="http://pacificbiosciences.com/PacBioCollectionMetadata.xsd"
+           xmlns:pbds="http://pacificbiosciences.com/PacBioDatasets.xsd"
+           xsi:schemaLocation="http://pacificbiosciences.com/PacBioDataModel.xsd"
+           UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe519c"
+           TimeStampedName="subreadset_150304_231155"
+           MetaType="PacBio.DataSet.SubreadSet"
+           Name="DataSet_SubreadSet"
+           Tags=""
+           Version="3.0.0"
+           CreatedAt="2015-01-27T09:00:01">
+        <pbbase:ExternalResources>
+           <pbbase:ExternalResource
+               UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5193"
+               TimeStampedName="subread_bam_150304_231155"
+               MetaType="PacBio.SubreadFile.SubreadBamFile"
+               ResourceId="phi29.bam">
+               <pbbase:FileIndices>
+                   <pbbase:FileIndex
+                       UniqueId="b095d0a3-94b8-4918-b3af-a3f81bbe5194"
+                       TimeStampedName="bam_index_150304_231155"
+                       MetaType="PacBio.Index.PacBioIndex"
+                       ResourceId="phi29.bam.pbi"/>
+               </pbbase:FileIndices>
+           </pbbase:ExternalResource>
+        </pbbase:ExternalResources>
+        <pbds:Filters>
+            <pbds:Filter>
+                <pbbase:Properties>)_XML_";
+
+    const std::string xmlFooter = R"_XML_(
+                </pbbase:Properties>
+            </pbds:Filter>
+        </pbds:Filters>
+        </pbds:SubreadSet>
+        )_XML_";
+
+    size_t count_30422 = 0;
+    size_t count_648 = 0;
+    size_t count_17299 = 0;
+    size_t count_whitelist = 0;
+
+    {  // 30422
+        const std::string xmlProperty =
+            R"_XML_(<pbbase:Property Name="zm" Operator="=" Value="30422"/>\n)_XML_";
+        const std::string xml = xmlHeader + xmlProperty + xmlFooter;
+        const DataSet ds = DataSet::FromXml(xml);
+        const PbiFilterQuery query{PbiFilter::FromDataSet(ds), file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(13, numReads);
+
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count_30422;
+        }
+        EXPECT_EQ(13, count_30422);
+    }
+    {  // 648
+        const std::string xmlProperty =
+            R"_XML_(<pbbase:Property Name="zm" Operator="=" Value="648"/>\n)_XML_";
+        const std::string xml = xmlHeader + xmlProperty + xmlFooter;
+        const DataSet ds = DataSet::FromXml(xml);
+        const PbiFilterQuery query{PbiFilter::FromDataSet(ds), file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(11, numReads);
+
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count_648;
+        }
+        EXPECT_EQ(11, count_648);
+    }
+    {  // 17299
+        const std::string xmlProperty =
+            R"_XML_(<pbbase:Property Name="zm" Operator="=" Value="17299"/>\n)_XML_";
+        const std::string xml = xmlHeader + xmlProperty + xmlFooter;
+        const DataSet ds = DataSet::FromXml(xml);
+        const PbiFilterQuery query{PbiFilter::FromDataSet(ds), file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(4, numReads);
+
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count_17299;
+        }
+        EXPECT_EQ(4, count_17299);
+    }
+    {  // now check whitelist
+        const std::string xmlProperty =
+            R"_XML_(<pbbase:Property Name="zm" Operator="=" Value="[30422,648,17299]"/>\n)_XML_";
+        const std::string xml = xmlHeader + xmlProperty + xmlFooter;
+        const DataSet ds = DataSet::FromXml(xml);
+        const PbiFilterQuery query{PbiFilter::FromDataSet(ds), file};
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(28, numReads);
+
+        for (const auto& r : query) {
+            UNUSED(r);
+            ++count_whitelist;
+        }
+        EXPECT_EQ(count_30422 + count_648 + count_17299, count_whitelist);
+    }
+}
+
+TEST(PbiFilterQueryTest, TranscriptRecords)
+{
+    const std::string transcriptFn = PbbamTestsConfig::Data_Dir + "/transcript.subreads.bam";
+
+    PbiFilterQuery query{PbiFilter{}, transcriptFn};
+    for (const auto& b : query)
+        EXPECT_TRUE(b.HasHoleNumber());
+
+    {  // zmw whitelist
+        const std::vector<int32_t> whitelist = {1, 3};
+
+        std::vector<int32_t> observed;
+
+        PbiFilter filter{PbiZmwFilter{whitelist}};
+        PbiFilterQuery queryData{filter, transcriptFn};
+        for (const auto& b : queryData) {
+            observed.push_back(b.HoleNumber());
+        }
+
+        ASSERT_EQ(2, observed.size());
+        EXPECT_EQ(1, observed.at(0));
+        EXPECT_EQ(3, observed.at(1));
+    }
+    {  // zmw bounds
+        const PbiFilter filter{
+            {PbiZmwFilter{2, Compare::GREATER_THAN_EQUAL}, PbiZmwFilter{4, Compare::LESS_THAN}}};
+
+        std::vector<int32_t> observed;
+
+        PbiFilterQuery queryData{filter, transcriptFn};
+        for (const auto& b : queryData) {
+            observed.push_back(b.HoleNumber());
+        }
+
+        ASSERT_EQ(2, observed.size());
+        EXPECT_EQ(2, observed.at(0));
+        EXPECT_EQ(3, observed.at(1));
+    }
+    {  // QNAME
+
+        std::vector<int32_t> observed;
+
+        PbiFilter filter{PbiQueryNameFilter{"transcript/2"}};
+        PbiFilterQuery queryData{filter, transcriptFn};
+        for (const auto& b : queryData) {
+            observed.push_back(b.HoleNumber());
+        }
+
+        ASSERT_EQ(1, observed.size());
+        EXPECT_EQ(2, observed.at(0));
+    }
+    {  // QNAME whitelist
+
+        const std::vector<std::string> whitelist = {"transcript/1", "transcript/4"};
+
+        std::vector<int32_t> observed;
+
+        PbiFilter filter{PbiQueryNameFilter{whitelist}};
+        PbiFilterQuery queryData{filter, transcriptFn};
+        for (const auto& b : queryData) {
+            observed.push_back(b.HoleNumber());
+        }
+
+        ASSERT_EQ(2, observed.size());
+        EXPECT_EQ(1, observed.at(0));
+        EXPECT_EQ(4, observed.at(1));
+    }
+
+    {  // movie name
+        std::vector<int32_t> observed;
+
+        PbiFilter filter{PbiMovieNameFilter{"transcript"}};
+        PbiFilterQuery queryData{filter, transcriptFn};
+        for (const auto& b : queryData) {
+            observed.push_back(b.HoleNumber());
+        }
+
+        EXPECT_EQ(4, observed.size());
+    }
+
+    {  // movie name from DataSet
+
+        const std::string datasetFn = PbbamTestsConfig::Data_Dir + "/transcriptset.xml";
+
+        std::vector<int32_t> observed;
+
+        PacBio::BAM::DataSet ds(datasetFn);
+        PacBio::BAM::PbiFilter filter = PacBio::BAM::PbiFilter::FromDataSet(ds);
+        PbiFilterQuery queryData{filter, ds};
+        for (const auto& b : queryData) {
+            observed.push_back(b.HoleNumber());
+        }
+
+        EXPECT_EQ(4, observed.size());
+    }
+}
diff --git a/tests/src/test_Pulse2BaseCache.cpp b/tests/src/test_Pulse2BaseCache.cpp

new file mode 100644 (file)

index 0000000..187fa58
--- /dev/null
+++ b/tests/src/test_Pulse2BaseCache.cpp
@@ -0,0 +1,49 @@
+// Author: Derek Barnett
+
+#include <cstdint>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/../../src/Pulse2BaseCache.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+
+// A cache built from a pulse-call string should report the full string length as its
+// pulse count and the length without the lowercase calls as its base count.
+TEST(Pulse2BaseCacheTest, CountsDetectedInConstructor)
+{
+    const std::string allPulses = "ACccTTAGtTCAtG";
+    const std::string basesOnly = "ACTTAGTCAG";  // allPulses minus its lowercase calls
+
+    const Pulse2BaseCache cache{allPulses};
+
+    const auto expectedPulses = allPulses.size();
+    const auto expectedBases = basesOnly.size();
+    EXPECT_EQ(expectedPulses, cache.NumPulses());
+    EXPECT_EQ(expectedBases, cache.NumBases());
+}
+
+// RemoveSquashedPulses() on a string should drop the characters at the positions where
+// the cache's pulse calls are lowercase, for the calls themselves and for another
+// per-pulse string of the same length.
+TEST(Pulse2BaseCacheTest, RemovesSquashedPulsesFromString)
+{
+    const std::string allPulses = "ACccTTAGtTCAtG";
+    const std::string basesOnly = "ACTTAGTCAG";
+    const std::string perPulseLabels = "-G--A--T--AC--";
+    const std::string perBaseLabels = "-GA--T-AC-";
+
+    const Pulse2BaseCache cache{allPulses};
+
+    const auto trimmedCalls = cache.RemoveSquashedPulses(allPulses);
+    const auto trimmedLabels = cache.RemoveSquashedPulses(perPulseLabels);
+    EXPECT_EQ(basesOnly, trimmedCalls);
+    EXPECT_EQ(perBaseLabels, trimmedLabels);
+}
+
+TEST(Pulse2BaseCacheTest, RemovesSquashedPulsesFromVector)
+{
+    const std::string pulseCalls = "ACccTTAGtTCAtG";
+    const std::vector<uint16_t> pkMean = {5, 4, 2, 2, 3, 8, 8, 8, 4, 7, 7, 7, 3, 4};
+    const std::vector<uint16_t> trimmedPkmean = {5, 4, 3, 8, 8, 8, 7, 7, 7, 4};
+
+    const Pulse2BaseCache cache{pulseCalls};
+
+    EXPECT_EQ(trimmedPkmean, cache.RemoveSquashedPulses(pkMean));
+}
diff --git a/tests/src/test_QNameQuery.cpp b/tests/src/test_QNameQuery.cpp

new file mode 100644 (file)

index 0000000..b051728
--- /dev/null
+++ b/tests/src/test_QNameQuery.cpp
@@ -0,0 +1,64 @@
+// Author: Yuan Li
+
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/QNameQuery.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace QNameQueryTests {
+
+static const std::string dataDir = PbbamTestsConfig::Data_Dir + "/group/";
+static const std::string test1fn = std::string(dataDir) + "test1.bam";
+static const std::string test2fn = std::string(dataDir) + "test2.bam";
+static const std::string test3fn = std::string(dataDir) + "test3.bam";
+
+static void TestQNameQuery(const std::string& fn, const std::vector<int>& expected)
+{
+    EXPECT_NO_THROW({
+        std::vector<int> counts;
+        QNameQuery qQuery(fn);
+        for (const std::vector<BamRecord>& records : qQuery)
+            counts.push_back(records.size());
+        EXPECT_EQ(expected, counts);
+    });
+}
+
+static void TestNoneConstQNameQuery(const std::string& fn, const std::vector<int>& expected)
+{
+    EXPECT_NO_THROW({
+        std::vector<int> counts;
+        QNameQuery qQuery(fn);
+        for (std::vector<BamRecord>& records : qQuery)
+            counts.push_back(records.size());
+        EXPECT_EQ(expected, counts);
+    });
+}
+
+}  // namespace QNameQueryTests
+
+TEST(QNameQueryTest, CountQSizes)
+{
+    // test case 1 has exactly one bamRecord.
+    std::string fn = QNameQueryTests::test1fn;
+    std::vector<int> expected({1});
+    QNameQueryTests::TestQNameQuery(fn, expected);
+    QNameQueryTests::TestNoneConstQNameQuery(fn, expected);
+
+    // test case 2 has bamRecords of four subreads.
+    fn = QNameQueryTests::test2fn;
+    expected = {1, 1, 1, 1};
+    QNameQueryTests::TestQNameQuery(fn, expected);
+    QNameQueryTests::TestNoneConstQNameQuery(fn, expected);
+
+    fn = QNameQueryTests::test3fn;
+    expected = {2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1};
+    QNameQueryTests::TestQNameQuery(fn, expected);
+    QNameQueryTests::TestNoneConstQNameQuery(fn, expected);
+}
diff --git a/tests/src/test_QualityValues.cpp b/tests/src/test_QualityValues.cpp

new file mode 100644 (file)

index 0000000..c06fd09
--- /dev/null
+++ b/tests/src/test_QualityValues.cpp
@@ -0,0 +1,88 @@
+// Author: Derek Barnett
+
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/QualityValues.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+// A default-constructed QualityValue is expected to equal 0 and encode as FASTQ '!'.
+TEST(QualityValueTest, DefaultsOk)
+{
+    const QualityValue defaultQv;
+
+    EXPECT_EQ('!', defaultQv.Fastq());
+    EXPECT_EQ(0, defaultQv);
+}
+
+TEST(QualityValueTest, FromNumber)
+{
+    const QualityValue zero(0);
+    const QualityValue thirtyThree(33);
+    const QualityValue valid(42);
+    const QualityValue max(93);
+    const QualityValue tooHigh(94);
+    const QualityValue wayTooHigh(std::numeric_limits<int8_t>::max());
+
+    EXPECT_EQ(0, zero);
+    EXPECT_EQ(33, thirtyThree);
+    EXPECT_EQ(42, valid);
+    EXPECT_EQ(93, max);
+    EXPECT_EQ(93, tooHigh);
+    EXPECT_EQ(93, wayTooHigh);
+
+    EXPECT_EQ('!', zero.Fastq());
+    EXPECT_EQ('B', thirtyThree.Fastq());
+    EXPECT_EQ('K', valid.Fastq());
+    EXPECT_EQ('~', max.Fastq());
+    EXPECT_EQ('~', tooHigh.Fastq());
+    EXPECT_EQ('~', wayTooHigh.Fastq());
+}
+
+// Decoding a FASTQ character is expected to give the value 33 below its character code:
+// '!' -> 0, 'B' -> 33, 'K' -> 42, '~' -> 93.
+TEST(QualityValueTest, FromFastq)
+{
+    const QualityValue fromBang = QualityValue::FromFastq('!');
+    EXPECT_EQ(0, fromBang);
+
+    const QualityValue fromB = QualityValue::FromFastq('B');
+    EXPECT_EQ(33, fromB);
+
+    const QualityValue fromK = QualityValue::FromFastq('K');
+    EXPECT_EQ(42, fromK);
+
+    const QualityValue fromTilde = QualityValue::FromFastq('~');
+    EXPECT_EQ(93, fromTilde);
+}
+
+// A default-constructed QualityValues is expected to be empty and encode to an empty
+// FASTQ string.
+TEST(QualityValuesTest, Default)
+{
+    const QualityValues noValues;
+    const std::string emptyFastq;
+
+    EXPECT_TRUE(noValues.empty());
+    EXPECT_EQ(emptyFastq, noValues.Fastq());
+}
+
+// Appending numeric values one at a time should yield the matching FASTQ string.
+TEST(QualityValuesTest, FromNumbers)
+{
+    const std::vector<uint8_t> numbers = {93, 93, 93, 42, 42, 33, 33, 0, 0};
+    const std::string expectedFastq = "~~~KKBB!!";
+
+    QualityValues qualities;
+    for (const uint8_t number : numbers) {
+        qualities.push_back(number);
+    }
+
+    EXPECT_EQ(expectedFastq, qualities.Fastq());
+}
+
+TEST(QualityValuesTest, FromFastq)
+{
+    const std::string fastqString = "~~~KKBB!!";
+    const std::vector<uint8_t> values = {93, 93, 93, 42, 42, 33, 33, 0, 0};
+
+    const QualityValues qvs = QualityValues::FromFastq(fastqString);
+    EXPECT_EQ(fastqString.size(), qvs.size());
+    EXPECT_EQ(values.size(), qvs.size());
+    for (size_t i = 0; i < fastqString.size(); ++i)
+        EXPECT_EQ(values.at(i), qvs.at(i));
+}
diff --git a/tests/src/test_ReadAccuracyQuery.cpp b/tests/src/test_ReadAccuracyQuery.cpp

new file mode 100644 (file)

index 0000000..f7b1f6d
--- /dev/null
+++ b/tests/src/test_ReadAccuracyQuery.cpp
@@ -0,0 +1,42 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/ReadAccuracyQuery.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(ReadAccuracyQueryTest, QueryOk)
+{
+    const auto bamFile = BamFile{PbbamTestsConfig::Data_Dir + std::string{"/group/test2.bam"}};
+
+    {
+        ReadAccuracyQuery query(0.901, Compare::GREATER_THAN_EQUAL, bamFile);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(4, numReads);
+
+        int count = 0;
+        for (const auto& r : query) {
+            ++count;
+            EXPECT_GE(r.ReadAccuracy(), 0.901);
+        }
+        EXPECT_EQ(4, count);
+    }
+    {
+        ReadAccuracyQuery query(0.95, Compare::GREATER_THAN_EQUAL, bamFile);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(0, numReads);
+
+        int count = 0;
+        for (const auto& r : query) {
+            ++count;
+            EXPECT_GE(r.ReadAccuracy(), 0.901);
+        }
+        EXPECT_EQ(0, count);
+    }
+}
diff --git a/tests/src/test_ReadGroupInfo.cpp b/tests/src/test_ReadGroupInfo.cpp

new file mode 100644 (file)

index 0000000..fd0a4f8
--- /dev/null
+++ b/tests/src/test_ReadGroupInfo.cpp
@@ -0,0 +1,252 @@
+// Author: Derek Barnett, Lance Hepler
+
+#include <cstddef>
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/ReadGroupInfo.h>
+#include <pbbam/exception/BundleChemistryMappingException.h>
+#include <pbbam/exception/InvalidSequencingChemistryException.h>
+
+// clang-format off
+
+using namespace PacBio::BAM;
+
+// A read group built from a movie name and read type is expected to report the ID
+// "00082ba1" for this particular pair.
+TEST(ReadGroupInfoTest, IdFromMovieNameAndReadType)
+{
+    const std::string movieName{"m140905_042212_sidney_c100564852550000001823085912221377_s1_X0"};
+    const std::string readType{"HQREGION"};
+
+    ReadGroupInfo readGroup(movieName, readType);
+    EXPECT_EQ("00082ba1", readGroup.Id());
+}
+
+// After setting an IPD codec, the read group should list the IPD base feature under
+// tag "ip" and return the same codec from IpdCodec().
+TEST(ReadGroupInfoTest, FrameCodecSetOk)
+{
+    ReadGroupInfo readGroup("test");
+    readGroup.IpdCodec(FrameCodec::V1);
+
+    const auto ipd = BaseFeature::IPD;
+    EXPECT_TRUE(readGroup.HasBaseFeature(ipd));
+    EXPECT_EQ("ip", readGroup.BaseFeatureTag(ipd));
+    EXPECT_EQ(FrameCodec::V1, readGroup.IpdCodec());
+}
+
+TEST(ReadGroupInfoTest, SequencingChemistryOk)
+{
+    {   // P6-C4
+        const std::string chem{"P6-C4"};
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100356300","100356200","2.1"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100356300","100356200","2.3"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100356300","100612400","2.1"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100356300","100612400","2.3"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100372700","100356200","2.1"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100372700","100356200","2.3"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100372700","100612400","2.1"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100372700","100612400","2.3"));
+
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100356300")
+          .SequencingKit("100356200")
+          .BasecallerVersion("2.1");
+        EXPECT_EQ(chem, rg.SequencingChemistry());
+    }
+
+    {   // S/P1-C1/beta
+        const std::string chem{"S/P1-C1/beta"};
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-620-000","3.0"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-620-000","3.1"));
+
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-620-000")
+          .BasecallerVersion("3.0");
+        EXPECT_EQ(chem, rg.SequencingChemistry());
+    }
+
+    {   // S/P1-C1.1 (Echidna)
+        const std::string chem{"S/P1-C1.1"};
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-867-300","3.1"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-867-300","3.2"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-867-300","3.3"));
+
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-867-300")
+          .BasecallerVersion("3.1");
+        EXPECT_EQ(chem, rg.SequencingChemistry());
+    }
+
+    {   // S/P1-C1.2 (Flea)
+        const std::string chem{"S/P1-C1.2"};
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-902-100","3.1"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-902-100","3.2"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-902-100","3.3"));
+
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-902-100")
+          .BasecallerVersion("3.1");
+        EXPECT_EQ(chem, rg.SequencingChemistry());
+    }
+    {   // S/P1-C1.3 (Goat)
+        const std::string chem{"S/P1-C1.3"};
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-972-200","3.2"));
+        EXPECT_EQ(chem, ReadGroupInfo::SequencingChemistryFromTriple("100-619-300","100-972-200","3.3"));
+
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-972-200")
+          .BasecallerVersion("3.3");
+        EXPECT_EQ(chem, rg.SequencingChemistry());
+    }
+}
+
+#ifdef _WIN32
+int setenv(const char* name, const char* value, int overwrite)
+{
+    int err = 0;
+    if (!overwrite) {
+        size_t sz = 0;
+        err = getenv_s(&sz, NULL, 0, name);
+        if (err || sz) return err;
+    }
+    return _putenv_s(name, value);
+}
+
+int unsetenv(const char* name) {
+    static const char* empty = "";
+    return _putenv_s(name, empty);
+}
+#endif
+
+TEST(ReadGroupInfoTest, SequencingChemistryFromMappingXml)
+{
+    ReadGroupInfo rg("MAYBE");
+    rg.BindingKit("1").SequencingKit("2").BasecallerVersion("3.4");
+    EXPECT_THROW(rg.SequencingChemistry(), InvalidSequencingChemistryException);
+
+    // set the magic environment variable
+    const char* varname = "SMRT_CHEMISTRY_BUNDLE_DIR";
+    EXPECT_EQ(0, setenv(varname, PbbamTestsConfig::Data_Dir.c_str(), 0));
+
+    EXPECT_EQ("FOUND", rg.SequencingChemistry());
+
+    // unset the environment variable
+    EXPECT_EQ(0, unsetenv(varname));
+
+    // test memoization
+    EXPECT_THROW(ReadGroupInfo::SequencingChemistryFromTriple("1", "2", "3.4"),
+                 InvalidSequencingChemistryException);
+    EXPECT_EQ("FOUND", rg.SequencingChemistry());
+
+    EXPECT_EQ(0, setenv(varname, "/dev/null", 0));
+
+    // test that a bogus SMRT_CHEMISTRY_BUNDLE_DIR throws
+    EXPECT_THROW(ReadGroupInfo::SequencingChemistryFromTriple("1", "2", "3.4"),
+                 BundleChemistryMappingException);
+
+    EXPECT_EQ(0, unsetenv(varname));
+}
+
+TEST(ReadGroupInfoTest, SequencingChemistryThrowsOnBadTriple)
+{
+    // check that we actually throw
+    ReadGroupInfo rg("BAD");
+    rg.BindingKit("100372700")
+      .SequencingKit("100-619-400")
+      .BasecallerVersion("2.0");
+    EXPECT_THROW(rg.SequencingChemistry(), InvalidSequencingChemistryException);
+
+    // now check thrown contents
+    try {
+        ReadGroupInfo rg2("BAD");
+        rg2.BindingKit("100372700")
+          .SequencingKit("100-619-400")
+          .BasecallerVersion("2.0");
+    } catch (InvalidSequencingChemistryException& e) {
+        EXPECT_EQ(std::string("100372700"),   e.BindingKit());
+        EXPECT_EQ(std::string("100-619-400"), e.SequencingKit());
+        EXPECT_EQ(std::string("2.0"),         e.BasecallerVersion());
+    }
+}
+
+TEST(ReadGroupInfoTest, BasecallerVersion)
+{
+    // too short
+    try {
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-867-300")
+          .BasecallerVersion("3");
+        const std::string chem = rg.SequencingChemistry();
+//        ()chem;
+
+    } catch (std::runtime_error& e) {
+        EXPECT_EQ(std::string("basecaller version too short: 3"), std::string(e.what()));
+    }
+
+    // initial implementation assumed single digit version numbers:
+    //    const std::string ver{ basecallerVersion.substr(0, 3) };
+    // So '3.299.dummy' would incorrectly be interpreted as (OK) '3.2'.
+    // 3.
+
+    try {
+        ReadGroupInfo rg("dummy");
+        rg.BindingKit("100-619-300")
+          .SequencingKit("100-867-300")
+          .BasecallerVersion("3.199.dummy");   
+        const std::string chem = rg.SequencingChemistry();
+//        ()chem;
+
+    } catch (InvalidSequencingChemistryException& e) {
+        EXPECT_EQ("100-619-300", e.BindingKit());
+        EXPECT_EQ("100-867-300", e.SequencingKit());
+        EXPECT_EQ("3.199.dummy", e.BasecallerVersion());
+    }
+    //EXPECT_THROW(rg.SequencingChemistry(), InvalidSequencingChemistryException);
+}
+
+// After ClearBaseFeatures(), none of the previously tagged base features may remain.
+TEST(ReadGroupInfoTest, ClearBaseFeatures)
+{
+    ReadGroupInfo readGroup("test");
+    readGroup.BaseFeatureTag(BaseFeature::DELETION_QV,     "dq");
+    readGroup.BaseFeatureTag(BaseFeature::DELETION_TAG,    "dt");
+    readGroup.BaseFeatureTag(BaseFeature::INSERTION_QV,    "iq");
+    readGroup.BaseFeatureTag(BaseFeature::MERGE_QV,        "mq");
+    readGroup.BaseFeatureTag(BaseFeature::SUBSTITUTION_QV, "sq");
+
+    // sanity check: a tag set above is visible before clearing
+    EXPECT_TRUE(readGroup.HasBaseFeature(BaseFeature::DELETION_QV));
+    EXPECT_EQ("dq", readGroup.BaseFeatureTag(BaseFeature::DELETION_QV));
+
+    readGroup.ClearBaseFeatures();
+
+    EXPECT_FALSE(readGroup.HasBaseFeature(BaseFeature::DELETION_QV));
+    EXPECT_FALSE(readGroup.HasBaseFeature(BaseFeature::DELETION_TAG));
+    EXPECT_FALSE(readGroup.HasBaseFeature(BaseFeature::INSERTION_QV));
+    EXPECT_FALSE(readGroup.HasBaseFeature(BaseFeature::MERGE_QV));
+    EXPECT_FALSE(readGroup.HasBaseFeature(BaseFeature::SUBSTITUTION_QV));
+}
+
+TEST(ReadGroupInfoTest, RemoveBaseFeature)
+{
+    ReadGroupInfo rg("test");
+    rg.BaseFeatureTag(BaseFeature::DELETION_QV,     "dq");
+    rg.BaseFeatureTag(BaseFeature::DELETION_TAG,    "dt");
+    rg.BaseFeatureTag(BaseFeature::INSERTION_QV,    "iq");
+    rg.BaseFeatureTag(BaseFeature::MERGE_QV,        "mq");
+    rg.BaseFeatureTag(BaseFeature::SUBSTITUTION_QV, "sq");
+    rg.BaseFeatureTag(BaseFeature::PULSE_EXCLUSION, "pe");
+
+    EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::DELETION_QV));
+    EXPECT_EQ("dq", rg.BaseFeatureTag(BaseFeature::DELETION_QV));
+
+    rg.RemoveBaseFeature(BaseFeature::DELETION_QV);
+    EXPECT_FALSE(rg.HasBaseFeature(BaseFeature::DELETION_QV));
+    
+    EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::DELETION_TAG));
+    EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::INSERTION_QV));
+    EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::MERGE_QV));
+    EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::SUBSTITUTION_QV));
+    EXPECT_TRUE(rg.HasBaseFeature(BaseFeature::PULSE_EXCLUSION));
+}
+
+// clang-format on
diff --git a/tests/src/test_SamWriter.cpp b/tests/src/test_SamWriter.cpp

new file mode 100644 (file)

index 0000000..f439ce9
--- /dev/null
+++ b/tests/src/test_SamWriter.cpp
@@ -0,0 +1,115 @@
+// Author: Derek Barnett
+
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/SamWriter.h>
+#include "PbbamTestData.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(SamWriterTest, HeaderOk)
+{
+    // setup header
+    const std::string hdrText{
+        "@HD\tVN:1.1\tSO:unknown\tpb:3.0.3\n"
+        "@RG\tID:6002b307\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;"
+        "SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\t"
+        "PU:test\tPM:SEQUEL\n"};
+
+    EXPECT_NO_THROW({
+        // write header to file
+        const std::string generatedFn =
+            PbbamTestsConfig::GeneratedData_Dir + "/samwriter_hdr_only.sam";
+        {
+            const BamHeader inputHeader(hdrText);
+            SamWriter writer(generatedFn, inputHeader);
+            //            ()writer;
+        };
+
+        // check header
+        {
+            std::ifstream f(generatedFn);
+            const std::string text((std::istreambuf_iterator<char>(f)),
+                                   std::istreambuf_iterator<char>());
+            EXPECT_EQ(hdrText, text);
+        }
+
+        // clean up
+        remove(generatedFn.c_str());
+    });
+}
+
+TEST(SamWriterTest, SingleRecordOk)
+{
+
+    // setup header
+    const std::string hdrLine1{"@HD\tVN:1.1\tSO:unknown\tpb:3.0.3"};
+    const std::string hdrLine2{
+        "@RG\tID:6002b307\tPL:PACBIO\tDS:READTYPE=SUBREAD;BINDINGKIT=100-619-300;"
+        "SEQUENCINGKIT=100-619-400;BASECALLERVERSION=3.0;FRAMERATEHZ=100\t"
+        "PU:test\tPM:SEQUEL"};
+    const std::string hdrText = hdrLine1 + "\n" + hdrLine2 + "\n";
+    const BamHeader inputHeader(hdrText);
+
+    // setup record
+    BamRecord record(inputHeader);
+    record.Impl().Name("test/100/0_5");
+    record.Impl().SetSequenceAndQualities("ACGTC", 5, "@@@@@");
+    record.Impl().CigarData("");
+    record.Impl().Bin(0);
+    record.Impl().Flag(0);
+    record.Impl().InsertSize(0);
+    record.Impl().MapQuality(0);
+    record.Impl().MatePosition(-1);
+    record.Impl().MateReferenceId(-1);
+    record.Impl().Position(-1);
+    record.Impl().ReferenceId(-1);
+    record.Impl().SetMapped(false);
+
+    TagCollection tags;
+    tags["zm"] = int32_t{100};
+    tags["qs"] = int32_t{0};
+    tags["qe"] = int32_t{5};
+    tags["np"] = int32_t{1};
+    tags["rq"] = static_cast<float>(0.6);
+    tags["RG"] = std::string{"6002b307"};
+    tags["sn"] = std::vector<float>{0.2f, 0.2f, 0.2f, 0.2f};
+    record.Impl().Tags(tags);
+
+    const std::string expectedSamRecord{
+        "test/100/0_5\t4\t*\t0\t0\t*\t*\t0\t0\tACGTC\t@@@@@\tRG:Z:6002b307\t"
+        "np:i:1\tqe:i:5\tqs:i:0\trq:f:0.6\tsn:B:f,0.2,0.2,0.2,0.2\tzm:i:100"};
+
+    EXPECT_NO_THROW({
+        // write data to file
+        const std::string generatedFn =
+            PbbamTestsConfig::GeneratedData_Dir + "/samwriter_hdr_and_record.sam";
+        {
+            SamWriter writer(generatedFn, inputHeader);
+            writer.Write(record);
+        };
+
+        // check header & record
+        {
+            std::ifstream f(generatedFn);
+            std::string line1;
+            std::string line2;
+            std::string line3;
+            std::getline(f, line1);
+            std::getline(f, line2);
+            std::getline(f, line3);
+            EXPECT_EQ(hdrLine1, line1);
+            EXPECT_EQ(hdrLine2, line2);
+            EXPECT_EQ(expectedSamRecord, line3);
+        }
+
+        // cleanup
+        remove(generatedFn.c_str());
+    });
+}
diff --git a/tests/src/test_SequenceUtils.cpp b/tests/src/test_SequenceUtils.cpp

new file mode 100644 (file)

index 0000000..a1f88cb
--- /dev/null
+++ b/tests/src/test_SequenceUtils.cpp
@@ -0,0 +1,80 @@
+// Author: Derek Barnett
+
+#include <climits>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/../../src/SequenceUtils.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+
+TEST(SequenceUtilsTest, ComplementChar)
+{
+    // one constant per letter; the trailing comment names its expected complement (null = 0)
+    const char A = 'A';  // T
+    const char B = 'B';  // V
+    const char C = 'C';  // G
+    const char D = 'D';  // H
+    const char E = 'E';  // null
+    const char F = 'F';  // null
+    const char G = 'G';  // C
+    const char H = 'H';  // D
+    const char I = 'I';  // null
+    const char J = 'J';  // null
+    const char K = 'K';  // M
+    const char L = 'L';  // null
+    const char M = 'M';  // K
+    const char N = 'N';  // N
+    const char O = 'O';  // null
+    const char P = 'P';  // null
+    const char Q = 'Q';  // null
+    const char R = 'R';  // Y
+    const char S = 'S';  // S
+    const char T = 'T';  // A
+    const char U = 'U';  // A
+    const char V = 'V';  // B
+    const char W = 'W';  // W
+    const char X = 'X';  // null
+    const char Y = 'Y';  // R
+    const char Z = 'Z';  // null
+    // check Complement() for every uppercase letter A-Z
+    EXPECT_EQ(T, Complement(A));
+    EXPECT_EQ(V, Complement(B));
+    EXPECT_EQ(G, Complement(C));
+    EXPECT_EQ(H, Complement(D));
+    EXPECT_EQ(0, Complement(E));
+    EXPECT_EQ(0, Complement(F));
+    EXPECT_EQ(C, Complement(G));
+    EXPECT_EQ(D, Complement(H));
+    EXPECT_EQ(0, Complement(I));
+    EXPECT_EQ(0, Complement(J));
+    EXPECT_EQ(M, Complement(K));
+    EXPECT_EQ(0, Complement(L));
+    EXPECT_EQ(K, Complement(M));
+    EXPECT_EQ(N, Complement(N));
+    EXPECT_EQ(0, Complement(O));
+    EXPECT_EQ(0, Complement(P));
+    EXPECT_EQ(0, Complement(Q));
+    EXPECT_EQ(Y, Complement(R));
+    EXPECT_EQ(S, Complement(S));
+    EXPECT_EQ(A, Complement(T));
+    EXPECT_EQ(A, Complement(U));
+    EXPECT_EQ(B, Complement(V));
+    EXPECT_EQ(W, Complement(W));
+    EXPECT_EQ(0, Complement(X));
+    EXPECT_EQ(R, Complement(Y));
+    EXPECT_EQ(0, Complement(Z));
+}
+
+TEST(SequenceUtilsTest, ReverseComplement)
+{
+    // ReverseComplement() is called on a mutable string, which is then compared in place
+    const std::string expected{"CGCCGGGATATAT"};
+    std::string seq{"ATATATCCCGGCG"};
+    ReverseComplement(seq);
+    EXPECT_EQ(expected, seq);
+}
diff --git a/tests/src/test_StringUtils.cpp b/tests/src/test_StringUtils.cpp

new file mode 100644 (file)

index 0000000..7ae5af0
--- /dev/null
+++ b/tests/src/test_StringUtils.cpp
@@ -0,0 +1,81 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+
+#include <pbbam/StringUtilities.h>
+
+TEST(StringUtilsTest, BasicSplitWithDefaultDelim)
+{
+    using PacBio::BAM::Split;
+    // no delimiter argument is passed; the tab-separated input should yield three tokens
+    const std::string test{"foo\tbar\tbaz"};
+    const auto tokens = Split(test);
+    ASSERT_EQ(3u, tokens.size());  // fatal: the element checks below index into tokens
+    EXPECT_EQ("foo", tokens.at(0));
+    EXPECT_EQ("bar", tokens.at(1));
+    EXPECT_EQ("baz", tokens.at(2));
+}
+
+TEST(StringUtilsTest, BasicSplitWithProvidedDelim)
+{
+    using PacBio::BAM::Split;
+    // ':' is passed explicitly as the delimiter; the input should yield three tokens
+    const std::string test{"foo:bar:baz"};
+    const auto tokens = Split(test, ':');
+    ASSERT_EQ(3u, tokens.size());  // fatal: the element checks below index into tokens
+    EXPECT_EQ("foo", tokens.at(0));
+    EXPECT_EQ("bar", tokens.at(1));
+    EXPECT_EQ("baz", tokens.at(2));
+}
+
+TEST(StringUtilsTest, SplitEmptyStringReturnsEmptyResult)
+{
+    using PacBio::BAM::Split;
+    // splitting a default-constructed (empty) string must produce no tokens
+    const std::string emptyInput;
+    const auto parts = Split(emptyInput);
+    EXPECT_TRUE(parts.empty());
+}
+
+TEST(StringUtilsTest, SplitKeepsEmptyTokens)
+{
+    using PacBio::BAM::Split;
+    // two adjacent tabs should produce an empty token between "bar" and "baz"
+    const std::string test{"foo\tbar\t\tbaz"};
+    const auto tokens = Split(test);
+    ASSERT_EQ(4u, tokens.size());  // fatal: the element checks below index into tokens
+    EXPECT_EQ("foo", tokens.at(0));
+    EXPECT_EQ("bar", tokens.at(1));
+    EXPECT_EQ("", tokens.at(2));
+    EXPECT_EQ("baz", tokens.at(3));
+}
+
+TEST(StringUtilsTest, RemoveWhitespaceNormal)
+{
+    using PacBio::BAM::RemoveAllWhitespace;
+
+    {  // argument is a named std::string (lvalue)
+        const std::string padded{" \f\r\v  Lorem ipsum     \tdolor sit\n\namet "};
+        const auto actual = RemoveAllWhitespace(padded);
+        EXPECT_EQ("Loremipsumdolorsitamet", actual);
+    }
+    {  // argument is a string literal, i.e. a temporary (rvalue)
+        const auto actual = RemoveAllWhitespace(" \f\r\v  Lorem ipsum     \tdolor sit\n\namet ");
+        EXPECT_EQ("Loremipsumdolorsitamet", actual);
+    }
+}
+
+TEST(StringUtilsTest, RemoveWhitespaceOnEmptyString)
+{
+    using PacBio::BAM::RemoveAllWhitespace;
+
+    {  // argument is a named, empty std::string (lvalue)
+        const std::string nothing;
+        const auto actual = RemoveAllWhitespace(nothing);
+        EXPECT_TRUE(actual.empty());
+    }
+    {  // argument is an empty string literal, i.e. a temporary (rvalue)
+        const auto actual = RemoveAllWhitespace("");
+        EXPECT_TRUE(actual.empty());
+    }
+}
diff --git a/tests/src/test_SubreadLengthQuery.cpp b/tests/src/test_SubreadLengthQuery.cpp

new file mode 100644 (file)

index 0000000..81c6791
--- /dev/null
+++ b/tests/src/test_SubreadLengthQuery.cpp
@@ -0,0 +1,54 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/SubreadLengthQuery.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(SubreadLengthQueryTest, QueryOk)
+{
+    const auto bamFile = BamFile{PbbamTestsConfig::Data_Dir + std::string{"/group/test2.bam"}};
+    // each scope: NumReads() must agree with the number of records seen while iterating
+    {  // length >= 500: expects 3 reads
+        SubreadLengthQuery query(500, Compare::GREATER_THAN_EQUAL, bamFile);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(3, numReads);
+
+        int count = 0;
+        for (const auto& r : query) {
+            ++count;
+            EXPECT_GE((r.QueryEnd() - r.QueryStart()), 500);
+        }
+        EXPECT_EQ(3, count);
+    }
+    {  // length >= 1000: expects 2 reads
+        SubreadLengthQuery query(1000, Compare::GREATER_THAN_EQUAL, bamFile);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(2, numReads);
+
+        int count = 0;
+        for (const auto& r : query) {
+            ++count;
+            EXPECT_GE((r.QueryEnd() - r.QueryStart()), 1000);
+        }
+        EXPECT_EQ(2, count);
+    }
+    {  // length >= 5000: expects no reads, so the loop body should never run
+        SubreadLengthQuery query(5000, Compare::GREATER_THAN_EQUAL, bamFile);
+        const auto numReads = query.NumReads();
+        EXPECT_EQ(0, numReads);
+
+        int count = 0;
+        for (const auto& r : query) {
+            ++count;
+            EXPECT_GE((r.QueryEnd() - r.QueryStart()), 5000);
+        }
+        EXPECT_EQ(0, count);
+    }
+}
diff --git a/tests/src/test_Tags.cpp b/tests/src/test_Tags.cpp

new file mode 100644 (file)

index 0000000..eeba32f
--- /dev/null
+++ b/tests/src/test_Tags.cpp
@@ -0,0 +1,1080 @@
+// Author: Derek Barnett
+
+#include <algorithm>
+#include <cstdint>
+#include <iostream>
+#include <limits>
+#include <map>
+#include <string>
+#include <typeinfo>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include <boost/type_traits/is_convertible.hpp>
+
+#include <pbbam/BamTagCodec.h>
+#include <pbbam/SamTagCodec.h>
+#include <pbbam/TagCollection.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+TEST(TagTest, TagConstruction)
+{
+    int8_t i8 = 0;
+    uint8_t u8 = 0;
+    int16_t i16 = 0;
+    uint16_t u16 = 0;
+    int32_t i32 = 0;
+    uint32_t u32 = 0;
+    float f = 0.0;
+    std::string str = "";
+    std::vector<int8_t> i8_array;
+    std::vector<uint8_t> u8_array;
+    std::vector<int16_t> i16_array;
+    std::vector<uint16_t> u16_array;
+    std::vector<int32_t> i32_array;
+    std::vector<uint32_t> u32_Array;
+    std::vector<float> float_array;
+
+    signed char c = 'A';
+    unsigned char uc = 'A';
+
+    Tag i8Tag(i8);
+    Tag u8Tag(u8);
+    Tag i16Tag(i16);
+    Tag u16Tag(u16);
+    Tag i32Tag(i32);
+    Tag u32Tag(u32);
+    Tag floatTag(f);
+    Tag stringTag(str);
+    Tag i8_array_Tag(i8_array);
+    Tag u8_array_Tag(u8_array);
+    Tag i16_array_Tag(i16_array);
+    Tag u16_array_Tag(u16_array);
+    Tag i32_array_Tag(i32_array);
+    Tag u32_array_Tag(u32_Array);
+    Tag float_array_Tag(float_array);
+
+    Tag charTag(c, TagModifier::ASCII_CHAR);
+    Tag ucharTag(uc, TagModifier::ASCII_CHAR);
+
+    EXPECT_TRUE(i8Tag.Type() == TagDataType::INT8);
+    EXPECT_TRUE(u8Tag.Type() == TagDataType::UINT8);
+    EXPECT_TRUE(i16Tag.Type() == TagDataType::INT16);
+    EXPECT_TRUE(u16Tag.Type() == TagDataType::UINT16);
+    EXPECT_TRUE(i32Tag.Type() == TagDataType::INT32);
+    EXPECT_TRUE(u32Tag.Type() == TagDataType::UINT32);
+    EXPECT_TRUE(floatTag.Type() == TagDataType::FLOAT);
+    EXPECT_TRUE(stringTag.Type() == TagDataType::STRING);
+    EXPECT_TRUE(i8_array_Tag.Type() == TagDataType::INT8_ARRAY);
+    EXPECT_TRUE(u8_array_Tag.Type() == TagDataType::UINT8_ARRAY);
+    EXPECT_TRUE(i16_array_Tag.Type() == TagDataType::INT16_ARRAY);
+    EXPECT_TRUE(u16_array_Tag.Type() == TagDataType::UINT16_ARRAY);
+    EXPECT_TRUE(i32_array_Tag.Type() == TagDataType::INT32_ARRAY);
+    EXPECT_TRUE(u32_array_Tag.Type() == TagDataType::UINT32_ARRAY);
+    EXPECT_TRUE(float_array_Tag.Type() == TagDataType::FLOAT_ARRAY);
+
+    EXPECT_TRUE(charTag.ToAscii() == 'A');
+    EXPECT_TRUE(ucharTag.ToAscii() == 'A');
+}
+
+TEST(TagTest, CopyAndCompare)
+{
+    int8_t i8 = 0;
+    uint8_t u8 = 0;
+    int16_t i16 = 0;
+    uint16_t u16 = 0;
+    int32_t i32 = 0;
+    uint32_t u32 = 0;
+    float f = 0.0;
+    std::string str = "";
+    std::vector<int8_t> i8_array;
+    std::vector<uint8_t> u8_array;
+    std::vector<int16_t> i16_array;
+    std::vector<uint16_t> u16_array;
+    std::vector<int32_t> i32_array;
+    std::vector<uint32_t> u32_Array;
+    std::vector<float> float_array;
+
+    Tag i8Tag(i8);
+    Tag u8Tag(u8);
+    Tag i16Tag(i16);
+    Tag u16Tag(u16);
+    Tag i32Tag(i32);
+    Tag u32Tag(u32);
+    Tag floatTag(f);
+    Tag stringTag(str);
+    Tag i8_array_Tag(i8_array);
+    Tag u8_array_Tag(u8_array);
+    Tag i16_array_Tag(i16_array);
+    Tag u16_array_Tag(u16_array);
+    Tag i32_array_Tag(i32_array);
+    Tag u32_array_Tag(u32_Array);
+    Tag float_array_Tag(float_array);
+
+    Tag i8Tag2 = i8Tag;
+    Tag u8Tag2 = u8Tag;
+    Tag i16Tag2 = i16Tag;
+    Tag u16Tag2 = u16Tag;
+    Tag i32Tag2 = i32Tag;
+    Tag u32Tag2 = u32Tag;
+    Tag floatTag2 = floatTag;
+    Tag stringTag2 = stringTag;
+    Tag i8_array_Tag2 = i8_array_Tag;
+    Tag u8_array_Tag2 = u8_array_Tag;
+    Tag i16_array_Tag2 = i16_array_Tag;
+    Tag u16_array_Tag2 = u16_array_Tag;
+    Tag i32_array_Tag2 = i32_array_Tag;
+    Tag u32_array_Tag2 = u32_array_Tag;
+    Tag float_array_Tag2 = float_array_Tag;
+
+    EXPECT_EQ(i8Tag, i8Tag2);
+    EXPECT_EQ(u8Tag, u8Tag2);
+    EXPECT_EQ(i16Tag, i16Tag2);
+    EXPECT_EQ(u16Tag, u16Tag2);
+    EXPECT_EQ(i32Tag, i32Tag2);
+    EXPECT_EQ(u32Tag, u32Tag2);
+    EXPECT_EQ(floatTag, floatTag2);
+    EXPECT_EQ(stringTag, stringTag2);
+    EXPECT_EQ(i8_array_Tag, i8_array_Tag2);
+    EXPECT_EQ(u8_array_Tag, u8_array_Tag2);
+    EXPECT_EQ(i16_array_Tag, i16_array_Tag2);
+    EXPECT_EQ(u16_array_Tag, u16_array_Tag2);
+    EXPECT_EQ(i32_array_Tag, i32_array_Tag2);
+    EXPECT_EQ(u32_array_Tag, u32_array_Tag2);
+    EXPECT_EQ(float_array_Tag, float_array_Tag2);
+}
+
+TEST(TagTest, Type_None)
+{  // A default-constructed Tag is expected to be null/invalid and in no value category.
+    Tag tag;
+
+    EXPECT_TRUE(tag.Type() == TagDataType::INVALID);
+    EXPECT_TRUE(tag.IsNull());
+    EXPECT_TRUE(tag.Typename() == "none");
+    // none of the value-category predicates may hold
+    EXPECT_FALSE(tag.IsNumeric());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+}
+
+TEST(TagTest, Type_Int8)
+{  // Stores an int8_t in a Tag, reads it back, and checks the type predicates.
+    const int8_t v = -42;
+    const Tag tag(v);
+    // extract the stored value with the matching accessor
+    int8_t v2{};
+    EXPECT_NO_THROW(v2 = tag.ToInt8());
+    // reported type, type name, and exact-type predicate
+    EXPECT_TRUE(tag.Type() == TagDataType::INT8);
+    EXPECT_TRUE(tag.Typename() == "int8_t");
+    EXPECT_TRUE(tag.IsInt8());
+    // broader category predicates that should hold
+    EXPECT_TRUE(tag.IsSignedInt());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsNumeric());
+    // category predicates that must not hold
+    EXPECT_FALSE(tag.IsUnsignedInt());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+    // the extracted value equals the original
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_UInt8)
+{  // Stores a uint8_t in a Tag, reads it back, and checks the type predicates.
+    const uint8_t v = 42;
+    const Tag tag(v);
+    // extract the stored value with the matching accessor
+    uint8_t v2{};
+    EXPECT_NO_THROW(v2 = tag.ToUInt8());
+    // reported type, type name, and exact-type predicate
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT8);
+    EXPECT_TRUE(tag.Typename() == "uint8_t");
+    EXPECT_TRUE(tag.IsUInt8());
+    // broader category predicates that should hold
+    EXPECT_TRUE(tag.IsUnsignedInt());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsNumeric());
+    // category predicates that must not hold
+    EXPECT_FALSE(tag.IsSignedInt());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+    // the extracted value equals the original
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_Ascii)
+{  // Tags built from char-like and 8-bit integer values can carry the ASCII_CHAR modifier.
+    const char c = '$';
+    const signed char sc = '$';
+    const unsigned char uc = '$';
+    const uint8_t u8 = 65;
+    const int8_t i8 = 66;
+    // u8 = 65 and i8 = 66 are the ASCII codes of 'A' and 'B', which ToAscii() should return
+    {  // old style: construct-then-modify
+
+        Tag fromPlainChar = Tag(c);
+        Tag fromSignedChar = Tag(sc);
+        Tag fromUnsignedChar = Tag(uc);
+        Tag fromUint8 = Tag(u8);
+        Tag fromInt8 = Tag(i8);
+        fromPlainChar.Modifier(TagModifier::ASCII_CHAR);
+        fromSignedChar.Modifier(TagModifier::ASCII_CHAR);
+        fromUnsignedChar.Modifier(TagModifier::ASCII_CHAR);
+        fromUint8.Modifier(TagModifier::ASCII_CHAR);
+        fromInt8.Modifier(TagModifier::ASCII_CHAR);
+
+        EXPECT_TRUE(fromPlainChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromPlainChar.IsIntegral());
+        EXPECT_TRUE(fromPlainChar.IsNumeric());
+        EXPECT_EQ('$', fromPlainChar.ToAscii());
+
+        EXPECT_TRUE(fromSignedChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromSignedChar.IsIntegral());
+        EXPECT_TRUE(fromSignedChar.IsNumeric());
+        EXPECT_EQ('$', fromSignedChar.ToAscii());
+
+        EXPECT_TRUE(fromUnsignedChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromUnsignedChar.IsIntegral());
+        EXPECT_TRUE(fromUnsignedChar.IsNumeric());
+        EXPECT_EQ('$', fromUnsignedChar.ToAscii());
+
+        EXPECT_TRUE(fromUint8.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromUint8.IsIntegral());
+        EXPECT_TRUE(fromUint8.IsNumeric());
+        EXPECT_EQ('A', fromUint8.ToAscii());
+
+        EXPECT_TRUE(fromInt8.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromInt8.IsIntegral());
+        EXPECT_TRUE(fromInt8.IsNumeric());
+        EXPECT_EQ('B', fromInt8.ToAscii());
+    }
+
+    {  // new style: construct directly as ASCII
+
+        const Tag fromPlainChar = Tag(c, TagModifier::ASCII_CHAR);
+        const Tag fromSignedChar = Tag(sc, TagModifier::ASCII_CHAR);
+        const Tag fromUnsignedChar = Tag(uc, TagModifier::ASCII_CHAR);
+        const Tag fromUint8 = Tag(u8, TagModifier::ASCII_CHAR);
+        const Tag fromInt8 = Tag(i8, TagModifier::ASCII_CHAR);
+
+        EXPECT_TRUE(fromPlainChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromPlainChar.IsIntegral());
+        EXPECT_TRUE(fromPlainChar.IsNumeric());
+        EXPECT_EQ('$', fromPlainChar.ToAscii());
+
+        EXPECT_TRUE(fromSignedChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromSignedChar.IsIntegral());
+        EXPECT_TRUE(fromSignedChar.IsNumeric());
+        EXPECT_EQ('$', fromSignedChar.ToAscii());
+
+        EXPECT_TRUE(fromUnsignedChar.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromUnsignedChar.IsIntegral());
+        EXPECT_TRUE(fromUnsignedChar.IsNumeric());
+        EXPECT_EQ('$', fromUnsignedChar.ToAscii());
+
+        EXPECT_TRUE(fromUint8.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromUint8.IsIntegral());
+        EXPECT_TRUE(fromUint8.IsNumeric());
+        EXPECT_EQ('A', fromUint8.ToAscii());
+
+        EXPECT_TRUE(fromInt8.HasModifier(TagModifier::ASCII_CHAR));
+        EXPECT_TRUE(fromInt8.IsIntegral());
+        EXPECT_TRUE(fromInt8.IsNumeric());
+        EXPECT_EQ('B', fromInt8.ToAscii());
+    }
+
+    // check invalid constructs: a char value combined with HEX_STRING is expected to throw
+    EXPECT_THROW(Tag('A', TagModifier::HEX_STRING), std::runtime_error);
+}
+
+TEST(TagTest, Type_Int16)
+{  // Stores an int16_t in a Tag, reads it back, and checks the type predicates.
+    const int16_t v = -42;
+    const Tag tag(v);
+    // extract the stored value with the matching accessor
+    int16_t v2{};
+    EXPECT_NO_THROW(v2 = tag.ToInt16());
+    // type, type name, and category predicates that should hold
+    EXPECT_TRUE(tag.Type() == TagDataType::INT16);
+    EXPECT_TRUE(tag.Typename() == "int16_t");
+    EXPECT_TRUE(tag.IsInt16());
+    EXPECT_TRUE(tag.IsSignedInt());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsNumeric());
+    // category predicates that must not hold
+    EXPECT_FALSE(tag.IsUnsignedInt());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+    // the extracted value equals the original
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_UInt16)
+{  // Stores a uint16_t in a Tag, reads it back, and checks the type predicates.
+    const uint16_t v = 42;
+    const Tag tag(v);
+    // value-initialized: EXPECT_NO_THROW is non-fatal, so v2 is still read below on failure
+    uint16_t v2{};
+    EXPECT_NO_THROW(v2 = tag.ToUInt16());
+    // type, type name, and category predicates that should hold
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT16);
+    EXPECT_TRUE(tag.Typename() == "uint16_t");
+    EXPECT_TRUE(tag.IsUInt16());
+    EXPECT_TRUE(tag.IsUnsignedInt());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsNumeric());
+    // category predicates that must not hold
+    EXPECT_FALSE(tag.IsSignedInt());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+    // the extracted value equals the original
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_Int32)
+{  // Stores an int32_t in a Tag, reads it back, and checks the type predicates.
+    const int32_t v = -42;
+    const Tag tag(v);
+    // value-initialized: EXPECT_NO_THROW is non-fatal, so v2 is still read below on failure
+    int32_t v2{};
+    EXPECT_NO_THROW(v2 = tag.ToInt32());
+    // type, type name, and category predicates that should hold
+    EXPECT_TRUE(tag.Type() == TagDataType::INT32);
+    EXPECT_TRUE(tag.Typename() == "int32_t");
+    EXPECT_TRUE(tag.IsInt32());
+    EXPECT_TRUE(tag.IsSignedInt());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsNumeric());
+    // category predicates that must not hold
+    EXPECT_FALSE(tag.IsUnsignedInt());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+    // the extracted value equals the original
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_UInt32)
+{  // Stores a uint32_t in a Tag, reads it back, and checks the type predicates.
+    const uint32_t v = 42;
+    const Tag tag(v);
+    // value-initialized: EXPECT_NO_THROW is non-fatal, so v2 is still read below on failure
+    uint32_t v2{};
+    EXPECT_NO_THROW(v2 = tag.ToUInt32());
+    // type, type name, and category predicates that should hold
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT32);
+    EXPECT_TRUE(tag.Typename() == "uint32_t");
+    EXPECT_TRUE(tag.IsUInt32());
+    EXPECT_TRUE(tag.IsUnsignedInt());
+    EXPECT_TRUE(tag.IsIntegral());
+    EXPECT_TRUE(tag.IsNumeric());
+    // category predicates that must not hold
+    EXPECT_FALSE(tag.IsSignedInt());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+    // the extracted value equals the original
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_Float)
+{  // Stores a float in a Tag, reads it back, and checks the type predicates.
+    const float v = 3.141f;
+    const Tag tag(v);
+    // value-initialized: EXPECT_NO_THROW is non-fatal, so v2 is still read below on failure
+    float v2{};
+    EXPECT_NO_THROW(v2 = tag.ToFloat());
+    // type, type name, and category predicates that should hold
+    EXPECT_TRUE(tag.Type() == TagDataType::FLOAT);
+    EXPECT_TRUE(tag.Typename() == "float");
+    EXPECT_TRUE(tag.IsFloat());
+    EXPECT_TRUE(tag.IsNumeric());
+    // category predicates that must not hold
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsIntegral());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsArray());
+    // exact comparison: the value is expected to come back unchanged
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_String)
+{  // Stores a std::string in a Tag, reads it back, and checks the type predicates.
+    const std::string v = "foo_who";
+    const Tag tag(v);
+    // extract the stored value with the matching accessor
+    std::string v2;
+    EXPECT_NO_THROW(v2 = tag.ToString());
+    // reported type, type name, and exact-type predicate
+    EXPECT_TRUE(tag.Type() == TagDataType::STRING);
+    EXPECT_TRUE(tag.Typename() == "string");
+    EXPECT_TRUE(tag.IsString());
+    // category predicates that must not hold
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+    EXPECT_FALSE(tag.IsArray());
+    // the extracted value equals the original
+    EXPECT_EQ(v, v2);
+
+    // "Hex format" string: still reports the STRING type, but carries the HEX_STRING modifier
+    const Tag hex("DEADBEEF", TagModifier::HEX_STRING);
+    EXPECT_TRUE(hex.Type() == TagDataType::STRING);
+    EXPECT_TRUE(hex.Typename() == "string");
+    EXPECT_TRUE(hex.IsString());
+    EXPECT_TRUE(hex.HasModifier(TagModifier::HEX_STRING));
+    EXPECT_FALSE(hex.IsNull());
+    EXPECT_FALSE(hex.IsNumeric());
+    EXPECT_FALSE(hex.IsArray());
+
+    // check invalid constructs: a string value combined with ASCII_CHAR is expected to throw
+    EXPECT_THROW(Tag("DEADBEEF", TagModifier::ASCII_CHAR), std::runtime_error);
+}
+
+TEST(TagTest, Type_Int8Array)
+{  // Stores a vector<int8_t> in a Tag, reads it back, and checks the type predicates.
+    const std::vector<int8_t> v = {-42, 100, 0};
+    const Tag tag(v);
+    // extract the stored array with the matching accessor
+    std::vector<int8_t> v2;
+    EXPECT_NO_THROW(v2 = tag.ToInt8Array());
+    // type, type name, and array-category predicates that should hold
+    EXPECT_TRUE(tag.Type() == TagDataType::INT8_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<int8_t>");
+    EXPECT_TRUE(tag.IsInt8Array());
+    EXPECT_TRUE(tag.IsSignedArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsArray());
+    // scalar, string, and null predicates that must not hold
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+    // the extracted array equals the original
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_UInt8Array)
+{  // Stores a vector<uint8_t> in a Tag, reads it back, and checks the type predicates.
+    const std::vector<uint8_t> v = {42, 200, 0};
+    const Tag tag(v);
+    // extract the stored array with the matching accessor
+    std::vector<uint8_t> v2;
+    EXPECT_NO_THROW(v2 = tag.ToUInt8Array());
+    // type, type name, and array-category predicates that should hold
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT8_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<uint8_t>");
+    EXPECT_TRUE(tag.IsUInt8Array());
+    EXPECT_TRUE(tag.IsUnsignedArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsArray());
+    // scalar, string, and null predicates that must not hold
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+    // the extracted array equals the original
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_Int16Array)
+{  // Stores a vector<int16_t> in a Tag, reads it back, and checks the type predicates.
+    const std::vector<int16_t> v = {42, -300, 0};
+    const Tag tag(v);
+    // extract the stored array with the matching accessor
+    std::vector<int16_t> v2;
+    EXPECT_NO_THROW(v2 = tag.ToInt16Array());
+    // type, type name, and array-category predicates that should hold
+    EXPECT_TRUE(tag.Type() == TagDataType::INT16_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<int16_t>");
+    EXPECT_TRUE(tag.IsInt16Array());
+    EXPECT_TRUE(tag.IsSignedArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsArray());
+    // scalar, string, and null predicates that must not hold
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+    // the extracted array equals the original
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_UInt16Array)
+{
+    const std::vector<uint16_t> v = {42, 300, 0};
+    const Tag tag(v);
+
+    std::vector<uint16_t> v2;
+    EXPECT_NO_THROW(v2 = tag.ToUInt16Array());
+
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT16_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<uint16_t>");
+    EXPECT_TRUE(tag.IsUInt16Array());
+    EXPECT_TRUE(tag.IsUnsignedArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsArray());
+
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+
+    EXPECT_EQ(v, v2);
+    ;
+}
+
+TEST(TagTest, Type_Int32Array)
+{  // Stores a vector<int32_t> in a Tag, reads it back, and checks the type predicates.
+    const std::vector<int32_t> v = {42, -300, 0};
+    const Tag tag(v);
+    // extract the stored array with the matching accessor
+    std::vector<int32_t> v2;
+    EXPECT_NO_THROW(v2 = tag.ToInt32Array());
+    // type, type name, and array-category predicates that should hold
+    EXPECT_TRUE(tag.Type() == TagDataType::INT32_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<int32_t>");
+    EXPECT_TRUE(tag.IsInt32Array());
+    EXPECT_TRUE(tag.IsSignedArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsArray());
+    // scalar, string, and null predicates that must not hold
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+    // the extracted array equals the original
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_UInt32Array)
+{  // Stores a vector<uint32_t> in a Tag, reads it back, and checks the type predicates.
+    const std::vector<uint32_t> v = {42, 300, 0};
+    const Tag tag(v);
+    // extract the stored array with the matching accessor
+    std::vector<uint32_t> v2;
+    EXPECT_NO_THROW(v2 = tag.ToUInt32Array());
+    // type, type name, and array-category predicates that should hold
+    EXPECT_TRUE(tag.Type() == TagDataType::UINT32_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<uint32_t>");
+    EXPECT_TRUE(tag.IsUInt32Array());
+    EXPECT_TRUE(tag.IsUnsignedArray());
+    EXPECT_TRUE(tag.IsIntegralArray());
+    EXPECT_TRUE(tag.IsArray());
+    // scalar, string, and null predicates that must not hold
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+    // the extracted array equals the original
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, Type_FloatArray)
+{  // Stores a vector<float> in a Tag, reads it back, and checks the type predicates.
+    const std::vector<float> v = {1.1f, 1.2f, 1.3f};
+    const Tag tag(v);
+    // extract the stored array with the matching accessor
+    std::vector<float> v2;
+    EXPECT_NO_THROW(v2 = tag.ToFloatArray());
+    // type, type name, and array predicates that should hold
+    EXPECT_TRUE(tag.Type() == TagDataType::FLOAT_ARRAY);
+    EXPECT_TRUE(tag.Typename() == "vector<float>");
+    EXPECT_TRUE(tag.IsFloatArray());
+    EXPECT_TRUE(tag.IsArray());
+    // integral-array, scalar, string, and null predicates that must not hold
+    EXPECT_FALSE(tag.IsIntegralArray());
+    EXPECT_FALSE(tag.IsFloat());
+    EXPECT_FALSE(tag.IsString());
+    EXPECT_FALSE(tag.IsNull());
+    EXPECT_FALSE(tag.IsNumeric());
+    // the extracted array equals the original
+    EXPECT_EQ(v, v2);
+}
+
+TEST(TagTest, CastBackToOriginalOk)
+{  // Each tag is converted back to the type it was constructed from; none may throw.
+    int8_t i8 = 0;
+    uint8_t u8 = 0;
+    int16_t i16 = 0;
+    uint16_t u16 = 0;
+    int32_t i32 = 0;
+    uint32_t u32 = 0;
+    float f = 0.0;
+    std::string str = "";
+    std::vector<int8_t> i8_array;
+    std::vector<uint8_t> u8_array;
+    std::vector<int16_t> i16_array;
+    std::vector<uint16_t> u16_array;
+    std::vector<int32_t> i32_array;
+    std::vector<uint32_t> u32_array;
+    std::vector<float> float_array;
+    // wrap every value in a Tag
+    Tag i8Tag(i8);
+    Tag u8Tag(u8);
+    Tag i16Tag(i16);
+    Tag u16Tag(u16);
+    Tag i32Tag(i32);
+    Tag u32Tag(u32);
+    Tag floatTag(f);
+    Tag stringTag(str);
+    Tag i8_array_Tag(i8_array);
+    Tag u8_array_Tag(u8_array);
+    Tag i16_array_Tag(i16_array);
+    Tag u16_array_Tag(u16_array);
+    Tag i32_array_Tag(i32_array);
+    Tag u32_array_Tag(u32_array);
+    Tag float_array_Tag(float_array);
+    // the converted values are assigned back to the originals but not otherwise checked
+    EXPECT_NO_THROW({
+        i8 = i8Tag.ToInt8();
+        u8 = u8Tag.ToUInt8();
+        i16 = i16Tag.ToInt16();
+        u16 = u16Tag.ToUInt16();
+        i32 = i32Tag.ToInt32();
+        u32 = u32Tag.ToUInt32();
+        f = floatTag.ToFloat();
+        str = stringTag.ToString();
+        i8_array = i8_array_Tag.ToInt8Array();
+        u8_array = u8_array_Tag.ToUInt8Array();
+        i16_array = i16_array_Tag.ToInt16Array();
+        u16_array = u16_array_Tag.ToUInt16Array();
+        i32_array = i32_array_Tag.ToInt32Array();
+        u32_array = u32_array_Tag.ToUInt32Array();
+        float_array = float_array_Tag.ToFloatArray();
+    });
+}
+
+TEST(TagTest, ConvertToInt8)
+{  // ToInt8() on tags that were not constructed from an int8_t.
+    Tag zero(int32_t{0});
+    Tag min(int32_t{std::numeric_limits<int8_t>::min()});
+    Tag normal(int32_t{42});
+    Tag max(int32_t{std::numeric_limits<int8_t>::max()});
+    Tag floatTag(float{3.14});
+    Tag stringTag(std::string{"foo"});
+    Tag arrayTag(std::vector<int8_t>{{1, 2, 3}});
+
+    // allowed: int32_t tags whose values lie within the int8_t range
+    EXPECT_NO_THROW({
+        zero.ToInt8();
+        min.ToInt8();
+        normal.ToInt8();
+        max.ToInt8();
+    });
+
+    // not allowed: float, string, and array tags are expected to throw
+    EXPECT_THROW(floatTag.ToInt8(), std::exception);
+    EXPECT_THROW(stringTag.ToInt8(), std::exception);
+    EXPECT_THROW(arrayTag.ToInt8(), std::exception);
+}
+
+TEST(TagTest, ConvertToUInt8)
+{  // ToUInt8() on tags that were not constructed from a uint8_t.
+    Tag zero(int32_t{0});
+    Tag neg(int32_t{-1});
+    Tag normal(int32_t{42});
+    Tag max(int32_t{std::numeric_limits<uint8_t>::max()});
+    Tag floatTag(float{3.14});
+    Tag stringTag(std::string{"foo"});
+    Tag arrayTag(std::vector<uint8_t>{{1, 2, 3}});
+
+    // allowed: int32_t tags whose values lie within the uint8_t range
+    EXPECT_NO_THROW({
+        zero.ToUInt8();
+        normal.ToUInt8();
+        max.ToUInt8();
+    });
+
+    // not allowed: a negative value, and float, string, and array tags, are expected to throw
+    EXPECT_THROW(neg.ToUInt8(), std::exception);
+    EXPECT_THROW(floatTag.ToUInt8(), std::exception);
+    EXPECT_THROW(stringTag.ToUInt8(), std::exception);
+    EXPECT_THROW(arrayTag.ToUInt8(), std::exception);
+}
+
+TEST(TagTest, ConvertToInt16)
+{  // ToInt16() on tags that were not constructed from an int16_t.
+    Tag zero(int32_t{0});
+    Tag min(int32_t{std::numeric_limits<int16_t>::min()});
+    Tag normal(int32_t{42});
+    Tag max(int32_t{std::numeric_limits<int16_t>::max()});
+    Tag floatTag(float{3.14});
+    Tag stringTag(std::string{"foo"});
+    Tag arrayTag(std::vector<int16_t>{{1, 2, 3}});
+
+    // allowed: int32_t tags whose values lie within the int16_t range
+    EXPECT_NO_THROW({
+        zero.ToInt16();
+        min.ToInt16();
+        normal.ToInt16();
+        max.ToInt16();
+    });
+
+    // not allowed: float, string, and array tags are expected to throw
+    EXPECT_THROW(floatTag.ToInt16(), std::exception);
+    EXPECT_THROW(stringTag.ToInt16(), std::exception);
+    EXPECT_THROW(arrayTag.ToInt16(), std::exception);
+}
+
+TEST(TagTest, ConvertToUInt16)
+{  // ToUInt16() on tags that were not constructed from a uint16_t.
+    Tag zero(int32_t{0});
+    Tag neg(int32_t{-1});
+    Tag normal(int32_t{42});
+    Tag max(int32_t{std::numeric_limits<uint16_t>::max()});
+    Tag floatTag(float{3.14});
+    Tag stringTag(std::string{"foo"});
+    Tag arrayTag(std::vector<uint16_t>{{1, 2, 3}});
+
+    // allowed: int32_t tags whose values lie within the uint16_t range
+    EXPECT_NO_THROW({
+        zero.ToUInt16();
+        normal.ToUInt16();
+        max.ToUInt16();
+    });
+
+    // not allowed: a negative value, and float, string, and array tags, are expected to throw
+    EXPECT_THROW(neg.ToUInt16(), std::exception);
+    EXPECT_THROW(floatTag.ToUInt16(), std::exception);
+    EXPECT_THROW(stringTag.ToUInt16(), std::exception);
+    EXPECT_THROW(arrayTag.ToUInt16(), std::exception);
+}
+
+TEST(TagTest, ConvertToInt32)
+{
+    Tag zero(int32_t{0});
+    Tag min(int32_t{std::numeric_limits<int32_t>::min()});
+    Tag normal(int32_t{42});
+    Tag max(int32_t{std::numeric_limits<int32_t>::max()});
+    Tag floatTag(float{3.14});
+    Tag stringTag(std::string{"foo"});
+    Tag arrayTag(std::vector<int32_t>{{1, 2, 3}});
+
+    // allowed
+    EXPECT_NO_THROW({
+        zero.ToInt32();
+        min.ToInt32();
+        normal.ToInt32();
+        max.ToInt32();
+    });
+
+    // not allowed
+    EXPECT_THROW(floatTag.ToInt32(), std::exception);
+    EXPECT_THROW(stringTag.ToInt32(), std::exception);
+    EXPECT_THROW(arrayTag.ToInt32(), std::exception);
+}
+
+TEST(TagTest, ConvertToUInt32)
+{
+    Tag zero(int32_t{0});
+    Tag neg(int32_t{-1});
+    Tag normal(int32_t{42});
+    Tag max(uint32_t{std::numeric_limits<uint32_t>::max()});
+    Tag floatTag(float{3.14});
+    Tag stringTag(std::string{"foo"});
+    Tag arrayTag(std::vector<uint32_t>{{1, 2, 3}});
+
+    // allowed
+    EXPECT_NO_THROW({
+        zero.ToUInt32();
+        normal.ToUInt32();
+        max.ToUInt32();
+    });
+
+    // not allowed
+    EXPECT_THROW(neg.ToUInt32(), std::exception);
+    EXPECT_THROW(floatTag.ToUInt32(), std::exception);
+    EXPECT_THROW(stringTag.ToUInt32(), std::exception);
+    EXPECT_THROW(arrayTag.ToUInt32(), std::exception);
+}
+
+// A default-constructed collection holds no tags at all.
+TEST(TagCollectionTest, DefaultConstruction)
+{
+    TagCollection freshCollection;
+
+    EXPECT_FALSE(freshCollection.Contains("XY"));
+    EXPECT_TRUE(freshCollection.empty());
+}
+
+TEST(TagCollectionTest, AddSimpleTags)
+{
+    const int32_t intValue = -42;
+    const std::string strValue = "foo";
+    const std::string hexStrValue = "1abc75";
+
+    TagCollection tags;
+    tags["ST"] = strValue;
+    tags["XY"] = intValue;
+    tags["HX"] = hexStrValue;
+    tags["HX"].Modifier(TagModifier::HEX_STRING);
+
+    EXPECT_EQ(3, tags.size());
+    EXPECT_TRUE(tags.Contains("XY"));
+    EXPECT_TRUE(tags.Contains("ST"));
+    EXPECT_TRUE(tags.Contains("HX"));
+    EXPECT_FALSE(tags.Contains("ZZ"));
+
+    EXPECT_TRUE(tags["XY"].ToInt32() == intValue);
+    EXPECT_TRUE(tags["ST"].ToString() == strValue);
+    EXPECT_TRUE(tags["HX"].ToString() == hexStrValue);
+    EXPECT_TRUE(tags["HX"].HasModifier(TagModifier::HEX_STRING));
+}
+
+TEST(SamTagCodecTest, DecodeTest)
+{
+    std::string tagString;
+    tagString.append("HX:H:1abc75");
+    tagString.append("\t");
+    tagString.append("ST:Z:foo");
+    tagString.append("\t");
+    tagString.append("VC:B:i,42,-100,37,2048");
+    tagString.append("\t");
+    tagString.append("XY:i:-42");
+
+    TagCollection expected;
+    expected["ST"] = std::string("foo");
+    expected["XY"] = int32_t{-42};
+    expected["HX"] = Tag("1abc75", TagModifier::HEX_STRING);
+    expected["VC"] = std::vector<int32_t>({42, -100, 37, 2048});
+
+    TagCollection tags = SamTagCodec::Decode(tagString);
+
+    EXPECT_TRUE(tags.Contains("ST"));
+    EXPECT_TRUE(tags.Contains("HX"));
+    EXPECT_TRUE(tags.Contains("XY"));
+    EXPECT_TRUE(tags.Contains("VC"));
+
+    EXPECT_EQ(std::string("foo"), tags["ST"].ToString());
+    EXPECT_TRUE(tags["HX"].HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), tags["HX"].ToString());
+    EXPECT_EQ(int8_t{-42}, tags["XY"].ToInt8());
+    EXPECT_EQ(std::vector<int32_t>({42, -100, 37, 2048}), tags["VC"].ToInt32Array());
+}
+
+TEST(SamTagCodecTest, EncodeTest)
+{
+    TagCollection tags;
+    tags["ST"] = std::string("foo");
+    tags["XY"] = int32_t{-42};
+    tags["HX"] = Tag("1abc75", TagModifier::HEX_STRING);
+    tags["VC"] = std::vector<int32_t>({42, -100, 37, 2048});
+
+    // "HX:H:1abc75\tST:Z:foo\0\tVC:B:i,42,-100,37,2048\tXY:i:-42"
+    std::string expected;
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("ST:Z:foo");
+    expected.append("\t");
+    expected.append("VC:B:i,42,-100,37,2048");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(tags);
+    EXPECT_EQ(expected, sam);
+}
+
+TEST(BamTagCodecTest, DecodeTest)
+{
+    std::vector<uint8_t> data;
+    data.push_back(uint8_t('H'));
+    data.push_back(uint8_t('X'));
+    data.push_back(uint8_t('H'));
+    data.push_back(uint8_t('1'));
+    data.push_back(uint8_t('a'));
+    data.push_back(uint8_t('b'));
+    data.push_back(uint8_t('c'));
+    data.push_back(uint8_t('7'));
+    data.push_back(uint8_t('5'));
+    data.push_back(uint8_t(0));
+
+    data.push_back(uint8_t('X'));
+    data.push_back(uint8_t('Y'));
+    data.push_back(uint8_t('i'));
+    const int32_t x = -42;
+    char valueBytes[sizeof x];
+    std::copy(static_cast<const char*>(static_cast<const void*>(&x)),
+              static_cast<const char*>(static_cast<const void*>(&x)) + sizeof x, valueBytes);
+    data.push_back(valueBytes[0]);
+    data.push_back(valueBytes[1]);
+    data.push_back(valueBytes[2]);
+    data.push_back(valueBytes[3]);
+
+    data.push_back('C');
+    data.push_back('A');
+    data.push_back('B');
+    data.push_back('C');
+    const uint32_t numChars = 3;
+    char numCharsValueBytes[sizeof numChars];
+    std::copy(static_cast<const char*>(static_cast<const void*>(&numChars)),
+              static_cast<const char*>(static_cast<const void*>(&numChars)) + sizeof numChars,
+              numCharsValueBytes);
+    data.push_back(numCharsValueBytes[0]);
+    data.push_back(numCharsValueBytes[1]);
+    data.push_back(numCharsValueBytes[2]);
+    data.push_back(numCharsValueBytes[3]);
+
+    const std::vector<uint8_t> charArray = std::vector<uint8_t>({34, 5, 125});
+    data.push_back(charArray.at(0));
+    data.push_back(charArray.at(1));
+    data.push_back(charArray.at(2));
+
+    TagCollection tags = BamTagCodec::Decode(data);
+
+    EXPECT_TRUE(tags["HX"].HasModifier(TagModifier::HEX_STRING));
+    EXPECT_EQ(std::string("1abc75"), tags["HX"].ToString());
+    EXPECT_EQ(x, tags["XY"].ToInt32());
+    EXPECT_EQ(charArray, tags["CA"].ToUInt8Array());
+
+    // sanity check - convert tags back to SAM
+    std::string expected;
+    expected.append("CA:B:C,34,5,125");
+    expected.append("\t");
+    expected.append("HX:H:1abc75");
+    expected.append("\t");
+    expected.append("XY:i:-42");
+
+    const std::string sam = SamTagCodec::Encode(tags);
+    EXPECT_EQ(expected, sam);
+}
+
+TEST(BamTagCodecTest, EncodeTest)
+{
+    std::vector<uint8_t> expected;
+
+    expected.push_back('C');
+    expected.push_back('A');
+    expected.push_back('B');
+    expected.push_back('C');
+    const uint32_t numChars = 3;
+    char numCharsValueBytes[sizeof numChars];
+    std::copy(static_cast<const char*>(static_cast<const void*>(&numChars)),
+              static_cast<const char*>(static_cast<const void*>(&numChars)) + sizeof numChars,
+              numCharsValueBytes);
+    expected.push_back(numCharsValueBytes[0]);
+    expected.push_back(numCharsValueBytes[1]);
+    expected.push_back(numCharsValueBytes[2]);
+    expected.push_back(numCharsValueBytes[3]);
+
+    const std::vector<uint8_t> charArray = std::vector<uint8_t>({34, 5, 125});
+    expected.push_back(charArray.at(0));
+    expected.push_back(charArray.at(1));
+    expected.push_back(charArray.at(2));
+
+    expected.push_back(uint8_t('H'));
+    expected.push_back(uint8_t('X'));
+    expected.push_back(uint8_t('H'));
+    expected.push_back(uint8_t('1'));
+    expected.push_back(uint8_t('a'));
+    expected.push_back(uint8_t('b'));
+    expected.push_back(uint8_t('c'));
+    expected.push_back(uint8_t('7'));
+    expected.push_back(uint8_t('5'));
+    expected.push_back(uint8_t(0));
+
+    expected.push_back(uint8_t('X'));
+    expected.push_back(uint8_t('Y'));
+    expected.push_back(uint8_t('i'));
+    const int32_t x = -42;
+    char valueBytes[sizeof x];
+    std::copy(static_cast<const char*>(static_cast<const void*>(&x)),
+              static_cast<const char*>(static_cast<const void*>(&x)) + sizeof x, valueBytes);
+    expected.push_back(valueBytes[0]);
+    expected.push_back(valueBytes[1]);
+    expected.push_back(valueBytes[2]);
+    expected.push_back(valueBytes[3]);
+
+    TagCollection tags;
+    tags["HX"] = Tag("1abc75", TagModifier::HEX_STRING);
+    tags["CA"] = charArray;
+    tags["XY"] = x;
+
+    const std::vector<uint8_t> data = BamTagCodec::Encode(tags);
+    EXPECT_EQ(expected, data);
+}
+
+TEST(BamTagCodecTest, AsciiTagsTest)
+{
+    std::vector<uint8_t> expected;
+    expected.reserve(20);
+    expected.push_back('I');  // I8:A:B
+    expected.push_back('8');
+    expected.push_back('A');
+    expected.push_back('B');
+    expected.push_back('P');  // PC:A:$
+    expected.push_back('C');
+    expected.push_back('A');
+    expected.push_back('$');
+    expected.push_back('S');  // SC:A:$
+    expected.push_back('C');
+    expected.push_back('A');
+    expected.push_back('$');
+    expected.push_back('U');  // U8:A:A
+    expected.push_back('8');
+    expected.push_back('A');
+    expected.push_back('A');
+    expected.push_back('U');  // UC:A:$
+    expected.push_back('C');
+    expected.push_back('A');
+    expected.push_back('$');
+
+    const char c = '$';
+    const signed char sc = '$';
+    const unsigned char uc = '$';
+    const uint8_t u8 = 65;
+    const int8_t i8 = 66;
+
+    {  // old style: construct-then-modify
+
+        Tag fromPlainChar = Tag(c);
+        Tag fromSignedChar = Tag(sc);
+        Tag fromUnsignedChar = Tag(uc);
+        Tag fromUint8 = Tag(u8);
+        Tag fromInt8 = Tag(i8);
+        fromPlainChar.Modifier(TagModifier::ASCII_CHAR);
+        fromSignedChar.Modifier(TagModifier::ASCII_CHAR);
+        fromUnsignedChar.Modifier(TagModifier::ASCII_CHAR);
+        fromUint8.Modifier(TagModifier::ASCII_CHAR);
+        fromInt8.Modifier(TagModifier::ASCII_CHAR);
+
+        TagCollection tags;
+        tags["PC"] = fromPlainChar;
+        tags["SC"] = fromSignedChar;
+        tags["UC"] = fromUnsignedChar;
+        tags["U8"] = fromUint8;
+        tags["I8"] = fromInt8;
+
+        const std::vector<uint8_t> data = BamTagCodec::Encode(tags);
+        EXPECT_EQ(expected, data);
+    }
+
+    {  // new style: construct directly as ASCII
+
+        const Tag fromPlainChar = Tag(c, TagModifier::ASCII_CHAR);
+        const Tag fromSignedChar = Tag(sc, TagModifier::ASCII_CHAR);
+        const Tag fromUnsignedChar = Tag(uc, TagModifier::ASCII_CHAR);
+        const Tag fromUint8 = Tag(u8, TagModifier::ASCII_CHAR);
+        const Tag fromInt8 = Tag(i8, TagModifier::ASCII_CHAR);
+
+        TagCollection tags;
+        tags["PC"] = fromPlainChar;
+        tags["SC"] = fromSignedChar;
+        tags["UC"] = fromUnsignedChar;
+        tags["U8"] = fromUint8;
+        tags["I8"] = fromInt8;
+
+        const std::vector<uint8_t> data = BamTagCodec::Encode(tags);
+        EXPECT_EQ(expected, data);
+    }
+}
diff --git a/tests/src/test_TimeUtils.cpp b/tests/src/test_TimeUtils.cpp

new file mode 100644 (file)

index 0000000..e3b7df8
--- /dev/null
+++ b/tests/src/test_TimeUtils.cpp
@@ -0,0 +1,33 @@
+// Author: Derek Barnett
+
+#include <chrono>
+#include <ctime>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/../../src/TimeUtils.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+
+// 436428750 seconds after the Unix epoch is 1983-10-31 06:12:30 UTC; the value
+// has no sub-second part, so no milliseconds are expected in the output.
+TEST(TimeUtilsTest, ToIso8601)
+{
+    const time_t secondsSinceEpoch = 436428750L;
+    const auto when = std::chrono::system_clock::from_time_t(secondsSinceEpoch);
+
+    const auto formatted = internal::ToIso8601(when);
+    EXPECT_EQ(std::string{"1983-10-31T06:12:30Z"}, formatted);
+}
+
+// The same instant (1983-10-31 06:12:30 UTC) in the compact "YYMMDD_HHMMSS"
+// layout; the value has no sub-second part, so no milliseconds are expected.
+TEST(TimeUtilsTest, ToDataSetFormat)
+{
+    const time_t secondsSinceEpoch = 436428750L;
+    const auto when = std::chrono::system_clock::from_time_t(secondsSinceEpoch);
+
+    const std::string formatted = internal::ToDataSetFormat(when);
+    EXPECT_EQ(std::string{"831031_061230"}, formatted);
+}
diff --git a/tests/src/test_Validator.cpp b/tests/src/test_Validator.cpp

new file mode 100644 (file)

index 0000000..5f540de
--- /dev/null
+++ b/tests/src/test_Validator.cpp
@@ -0,0 +1,574 @@
+// Author: Derek Barnett
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/BamFile.h>
+#include <pbbam/BamHeader.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/Cigar.h>
+#include <pbbam/ReadGroupInfo.h>
+#include <pbbam/Validator.h>
+
+#include "../src/StringUtils.h"
+#include "../src/ValidationErrors.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace ValidatorTests {
+
+static BamRecord makeValidMappedRecord()
+{
+    BamRecordImpl impl;
+    impl.Bin(4680);
+    impl.Flag(2);
+    impl.InsertSize(0);
+    impl.MapQuality(10);
+    impl.MatePosition(-1);
+    impl.MateReferenceId(-1);
+    impl.Name("movie1/54130/0_10");
+    impl.Position(1);
+    impl.ReferenceId(0);
+    impl.SetMapped(true);
+    impl.SetSequenceAndQualities("AATGAGGAGA");
+    impl.CigarData(Cigar{"10="});
+
+    TagCollection tags;
+    tags["RG"] = std::string{"3f58e5b8"};
+    tags["dq"] = std::string{"2222'$22'2"};
+    tags["dt"] = std::string{"NNNNAGNNGN"};
+    tags["iq"] = std::string{"(+#1'$#*1&"};
+    tags["mq"] = std::string{"&1~51*5&~2"};
+    tags["sq"] = std::string{"<32<4<<<<3"};
+    tags["ip"] = std::vector<uint8_t>{2, 0, 10, 22, 34, 0, 2, 3, 0, 16};
+    tags["np"] = int32_t{1};
+    tags["qe"] = int32_t{10};
+    tags["qs"] = int32_t{0};
+    tags["zm"] = int32_t{54130};
+    tags["cx"] = int32_t{2};
+    tags["AS"] = int32_t{-3020};
+    tags["NM"] = int32_t{134};
+    tags["rq"] = static_cast<float>(0.854);
+    tags["sn"] = std::vector<float>{2.0, 2.0, 2.0, 2.0};
+    impl.Tags(tags);
+
+    return BamRecord(impl);
+}
+
+static BamRecord makeValidUnmappedRecord()
+{
+    BamRecordImpl impl;
+    impl.Bin(4680);
+    impl.Flag(4);
+    impl.InsertSize(0);
+    impl.MapQuality(10);
+    impl.MatePosition(-1);
+    impl.MateReferenceId(-1);
+    impl.Name("m140906_231018_42161_c100676332550000001823129611271486_s1_p0/8/0_10");
+    impl.Position(-1);
+    impl.ReferenceId(-1);
+    impl.SetSequenceAndQualities("AATGAGGAGA");
+
+    TagCollection tags;
+    tags["RG"] = std::string{"b5482b33"};
+    tags["dq"] = std::string{"2222222222"};
+    tags["dt"] = std::string{"NNNNNNNNNN"};
+    tags["iq"] = std::string{",*11111001"};
+    tags["mq"] = std::string{"&47088')34"};
+    tags["sq"] = std::string{"8<4<:<6<0<"};
+    tags["ip"] = std::vector<uint8_t>{255, 9, 20, 43, 38, 12, 9, 30, 39, 22};
+    tags["np"] = int32_t{1};
+    tags["qe"] = int32_t{10};
+    tags["qs"] = int32_t{0};
+    tags["zm"] = int32_t{8};
+    tags["cx"] = int32_t{2};
+    tags["AS"] = int32_t{-3020};
+    tags["NM"] = int32_t{134};
+    tags["rq"] = static_cast<float>(0.811);
+    tags["sn"] = std::vector<float>{2.0, 2.0, 2.0, 2.0};
+    impl.Tags(tags);
+
+    return BamRecord(impl);
+}
+
+static ReadGroupInfo makeValidReadGroup()
+{
+    ReadGroupInfo rg("f5b4ffb6");
+    rg.MovieName("movie32");
+    rg.ReadType("CCS");
+    rg.BindingKit("100372700");
+    rg.SequencingKit("100612400");
+    rg.BasecallerVersion("2.3");
+    rg.FrameRateHz("100");
+    rg.Control("TRUE");
+    return rg;
+}
+
+// valid, 'starter' objects
+static const ReadGroupInfo validReadGroup = makeValidReadGroup();
+static const BamRecord validMappedRecord = makeValidMappedRecord();
+static const BamRecord validUnmappedRecord = makeValidUnmappedRecord();
+
+}  // namespace ValidatorTests
+
+TEST(ValidatorErrorsTest, SetMaxNumErrors)
+{
+    {  // default - use "no max"
+        internal::ValidationErrors errors;
+        EXPECT_EQ(internal::ValidationErrors::MAX, errors.MaxNumErrors());
+    }
+    {  // max of zero doesn't make sense... make equivalent to "no max"
+        internal::ValidationErrors errors(0);
+        EXPECT_EQ(internal::ValidationErrors::MAX, errors.MaxNumErrors());
+    }
+    {  // max = 1
+        internal::ValidationErrors errors(1);
+        EXPECT_EQ(1, errors.MaxNumErrors());
+    }
+    {  // max = 10
+        internal::ValidationErrors errors(10);
+        EXPECT_EQ(10, errors.MaxNumErrors());
+    }
+}
+
+TEST(ValidatorErrorsTest, ThrowOnMaxReached)
+{
+    {
+        internal::ValidationErrors errors(1);
+        EXPECT_THROW(errors.AddFileError("foo", "you"), ValidationException);
+    }
+    {
+        internal::ValidationErrors errors(2);
+        errors.AddFileError("foo", "you");
+        EXPECT_THROW(errors.AddFileError("foo", "me"), ValidationException);
+    }
+}
+
+TEST(ValidatorErrorsTest, ExceptionFromResults)
+{
+    const std::string error1 = "error1";
+    const std::string error2 = "error2";
+
+    try {
+
+        internal::ValidationErrors errors(4);
+        errors.AddFileError("path/to/foo.bam", error1);
+        errors.AddFileError("path/to/foo.bam", error2);
+        errors.AddReadGroupError("deadbeef", "invalid sequencing chemistry combination detected");
+        errors.AddRecordError(
+            "m140906_231018_42161_c100676332550000001823129611271486_s1_p0/8/0_10",
+            "MergeQV does not match expected length");
+
+    } catch (ValidationException& e) {
+
+        EXPECT_EQ(1, e.FileErrors().size());                        // only 1 file
+        EXPECT_EQ(2, e.FileErrors().at("path/to/foo.bam").size());  // 2 errors for this file
+        EXPECT_EQ(1, e.ReadGroupErrors().size());
+        EXPECT_EQ(1, e.RecordErrors().size());
+    }
+}
+
+// The unmodified starter read group must validate cleanly.
+TEST(ValidatorTest, ValidReadGroup)
+{
+    const ReadGroupInfo& starter = ValidatorTests::validReadGroup;
+    ASSERT_NO_THROW(Validator::Validate(starter));
+}
+
+TEST(ValidatorTest, ReadGroupRequiredComponents)
+{
+    {  // missing ID
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.Id("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // missing movie name
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.MovieName("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // missing read type
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.ReadType("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // missing binding kit
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.BindingKit("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // missing sequencing kit
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.SequencingKit("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // missing basecaller version
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.BasecallerVersion("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // missing frame rate
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.FrameRateHz("");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+}
+
+TEST(ValidatorTest, ReadGroupValues)
+{
+    {  // mismatch expected ID vs stored ID - change ID
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.Id("deadbeef");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // mismatch expected ID vs stored ID - change read type
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.ReadType("SUBREAD");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // mismatch expected ID vs stored ID - change movie name
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.MovieName("foo");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // unknown read type
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.ReadType("FOO");
+
+        // recompute ID so we're only checking the new read type, not read ID
+        rg.Id(MakeReadGroupId(rg.MovieName(), rg.ReadType()));
+
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // invalid chemistry triple - change binding kit
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.BindingKit("foo");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // invalid chemistry triple - change sequencing kit
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.SequencingKit("foo");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // invalid chemistry triple - change basecaller version
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.BasecallerVersion("0.42");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+    {  // non-numeric frame rate
+        ReadGroupInfo rg = ValidatorTests::validReadGroup;
+        rg.FrameRateHz("foo");
+        EXPECT_THROW(Validator::Validate(rg), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(rg));
+    }
+}
+
+TEST(ValidatorTest, ValidHeader)
+{
+    const BamHeader validMappedHeader{
+        "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1\n"
+        "@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:"
+        "734d5f3b2859595f4bd87a2fe6b7389b\n"
+        "@RG\tID:3f58e5b8\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV="
+        "iq;"
+        "MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BASECALLERVERSION=2.1;"
+        "FRAMERATEHZ=75.000000;BINDINGKIT=100356300;SEQUENCINGKIT=100356200"
+        "\tPU:movie1\n"};
+
+    const BamHeader validUnmappedHeader{
+        "@HD\tVN:1.5\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+        "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+        "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\t"
+        "PU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0\n"};
+
+    ASSERT_NO_THROW(Validator::Validate(validMappedHeader));
+    ASSERT_NO_THROW(Validator::Validate(validUnmappedHeader));
+}
+
+TEST(ValidatorTest, ValidateHeader)
+{
+    const BamHeader validMappedHeader{
+        "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1\n"
+        "@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:"
+        "734d5f3b2859595f4bd87a2fe6b7389b\n"
+        "@RG\tID:3f58e5b8\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV="
+        "iq;"
+        "MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BASECALLERVERSION=2.1;"
+        "FRAMERATEHZ=75.000000;BINDINGKIT=100356300;SEQUENCINGKIT=100356200"
+        "\tPU:movie1\n"};
+
+    {  // invalid SAM version - non-numeric
+        BamHeader header = validMappedHeader.DeepCopy();
+        header.Version("foo");
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+    {  // invalid SAM version - negative version numbers
+        BamHeader header = validMappedHeader.DeepCopy();
+        header.Version("-1.4.0");
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+    {  // invalid sort order
+        BamHeader header = validMappedHeader.DeepCopy();
+        header.SortOrder("not_a_valid_sort_order");
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+
+    // invalid PacBioBamVersion numbers (non-numeric, negative, earlier than min)
+    // already throw when you try to set them... so we have to catch & ignore
+    // initial exception to get to validator
+
+    {  // invalid PacBioBAM version - non-numeric
+        BamHeader header = validMappedHeader.DeepCopy();
+        try {
+            header.PacBioBamVersion("foo");
+        } catch (...) {
+        }
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+    {  // invalid PacBioBAM version - negative version numbers
+        BamHeader header = validMappedHeader.DeepCopy();
+        try {
+            header.PacBioBamVersion("-1.4.0");
+        } catch (...) {
+        }
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+    {  // invalid PacBioBAM version - earlier than minimum allowed
+        BamHeader header = validMappedHeader.DeepCopy();
+        try {
+            header.PacBioBamVersion("3.0.0");
+        } catch (...) {
+        }
+        EXPECT_THROW(Validator::Validate(header), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(header));
+    }
+}
+
+// The starter mapped record, paired with a header that declares its read group
+// (3f58e5b8) and a reference sequence, must validate cleanly.
+TEST(ValidatorTest, ValidRecord)
+{
+    const BamHeader mappedHeader{
+        "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1\n"
+        "@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\t"
+        "M5:734d5f3b2859595f4bd87a2fe6b7389b\n"
+        "@RG\tID:3f58e5b8\tPL:PACBIO\t"
+        "DS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV=iq;"
+        "MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BASECALLERVERSION=2.1;"
+        "FRAMERATEHZ=75.000000;BINDINGKIT=100356300;SEQUENCINGKIT=100356200\t"
+        "PU:movie1\n"};
+
+    BamRecord mapped(ValidatorTests::validMappedRecord);
+    mapped.header_ = mappedHeader;
+
+    ASSERT_NO_THROW(Validator::Validate(mapped));
+}
+
+// Sets tagName on the record to the given tag: edits the tag in place when the
+// record already has it, adds it otherwise.
+static inline void ModifyTag(BamRecord* record, const std::string& tagName, const Tag& tag)
+{
+    auto& impl = record->Impl();
+    if (!impl.HasTag(tagName)) {
+        impl.AddTag(tagName, tag);
+        return;
+    }
+    impl.EditTag(tagName, tag);
+}
+
+static inline void CheckInvalidTagLength(const std::string& tagName, const Tag& tag)
+{
+    static const BamHeader validUnmappedHeader{
+        "@HD\tVN:1.5\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+        "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+        "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\t"
+        "PU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0\n"};
+    BamRecord record(ValidatorTests::validUnmappedRecord);
+    record.header_ = validUnmappedHeader;
+
+    ModifyTag(&record, tagName, tag);
+
+    EXPECT_THROW(Validator::Validate(record), ValidationException);
+    EXPECT_FALSE(Validator::IsValid(record));
+}
+
+TEST(ValidatorTest, TagDataLengths)
+{
+    const BamHeader validUnmappedHeader{
+        "@HD\tVN:1.5\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+        "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+        "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\t"
+        "PU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0\n"};
+
+    // make these "variable-length" SEQ/tags too short for the read's stated
+    // queryStart/queryEnd
+
+    {  // SEQ
+        BamRecord record(ValidatorTests::validUnmappedRecord);
+        record.header_ = validUnmappedHeader;
+        record.Impl().SetSequenceAndQualities("AA");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+
+    CheckInvalidTagLength("dq", QualityValues("@@").Fastq());  // DeletionQV
+    CheckInvalidTagLength("iq", QualityValues("@@").Fastq());  // InsertionQV
+    CheckInvalidTagLength("mq", QualityValues("@@").Fastq());  // MergeQV
+    CheckInvalidTagLength("sq", QualityValues("@@").Fastq());  // SubstitutionQV
+    CheckInvalidTagLength("dt", std::string("AA"));            // DeletionTag
+    CheckInvalidTagLength("st", std::string("AA"));            // SubstitutionTag
+
+    const Frames f{{42, 42, 42}};
+    const auto& frames = f.Data();
+    CheckInvalidTagLength("ip", frames);  // IPD
+
+    // NOTE: disabling "internal" tag checks for now, only checking "standard"
+    //       PacBioBAM tags
+
+    //    const auto& pulses = vector<uint16_t>{42, 42, 42};
+    //    CheckInvalidTagLength("pv", QualityValues("@@").Fastq());  // AltLabelQV
+    //    CheckInvalidTagLength("pq", QualityValues("@@").Fastq());  // LabelQV
+    //    CheckInvalidTagLength("pg", QualityValues("@@").Fastq());  // PulseMergeQv
+    //    CheckInvalidTagLength("pt", string("AA")); // AltLabelTag
+    //    CheckInvalidTagLength("pc", string("AA")); // PulseCall
+    //    CheckInvalidTagLength("pd", frames); // PrePulseFrames
+    //    CheckInvalidTagLength("px", frames); // PulseCallWidth
+    //    CheckInvalidTagLength("pw", frames); // PulseWidth
+    //    CheckInvalidTagLength("pa", pulses); // Pkmean
+    //    CheckInvalidTagLength("ps", pulses); // Pkmean2
+    //    CheckInvalidTagLength("pm", pulses); // Pkmid
+    //    CheckInvalidTagLength("pi", pulses); // Pkmid2
+}
+
+TEST(ValidatorTest, TagDataValues)
+{
+    const BamHeader validMappedHeader{
+        "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1\n"
+        "@SQ\tSN:ecoliK12_pbi_March2013_2955000_to_2980000\tLN:25000\tM5:"
+        "734d5f3b2859595f4bd87a2fe6b7389b\n"
+        "@RG\tID:3f58e5b8\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;InsertionQV="
+        "iq;"
+        "MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BASECALLERVERSION=2.1;"
+        "FRAMERATEHZ=75.000000;BINDINGKIT=100356300;SEQUENCINGKIT=100356200"
+        "\tPU:movie1\n"};
+
+    {  // missing qe
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().RemoveTag("qe");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {  // missing qs
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().RemoveTag("qs");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {  // queryStart should be < queryEnd
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.QueryStart(10);
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {  // missing zm
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().RemoveTag("zm");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {  // missing np
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().RemoveTag("np");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {  // numPasses for SUBREAD type records should be 1
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.NumPasses(42);
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {  // missing sn
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().RemoveTag("sn");
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+}
+
+TEST(ValidatorTest, MappedRecords)
+{
+    const BamHeader validMappedHeader{
+        "@HD\tVN:1.5\tSO:coordinate\tpb:3.0.1\n"
+        "@RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+        "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+        "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\t"
+        "PU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0\n"};
+
+    {  // mapped record should have valid refID
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().ReferenceId(-1);
+
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {  // mapped record should have valid position
+        BamRecord record(ValidatorTests::validMappedRecord);
+        record.header_ = validMappedHeader;
+        record.Impl().Position(-1);
+
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+}
+
+TEST(ValidatorTest, UnmappedRecords)
+{
+    const BamHeader validUnmappedHeader{
+        "@HD\tVN:1.5\tSO:unknown\tpb:3.0.1\n"
+        "@RG\tID:b5482b33\tPL:PACBIO\tDS:READTYPE=SUBREAD;DeletionQV=dq;DeletionTag=dt;"
+        "InsertionQV=iq;MergeQV=mq;SubstitutionQV=sq;Ipd:CodecV1=ip;BINDINGKIT=100356300;"
+        "SEQUENCINGKIT=100356200;BASECALLERVERSION=2.1;FRAMERATEHZ=75.000000\t"
+        "PU:m140906_231018_42161_c100676332550000001823129611271486_s1_p0\n"};
+
+    {  // unmapped should have no refID
+        BamRecord record(ValidatorTests::validUnmappedRecord);
+        record.header_ = validUnmappedHeader;
+        record.Impl().ReferenceId(0);
+
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+    {  // unmapped should have no position
+        BamRecord record(ValidatorTests::validUnmappedRecord);
+        record.header_ = validUnmappedHeader;
+        record.Impl().Position(42);
+
+        EXPECT_THROW(Validator::Validate(record), ValidationException);
+        EXPECT_FALSE(Validator::IsValid(record));
+    }
+}
diff --git a/tests/src/test_VcfFile.cpp b/tests/src/test_VcfFile.cpp

new file mode 100644 (file)

index 0000000..793fb69
--- /dev/null
+++ b/tests/src/test_VcfFile.cpp
@@ -0,0 +1,44 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+
+#include <pbbam/vcf/VcfFile.h>
+#include <pbbam/vcf/VcfFormat.h>
+
+#include "PbbamTestData.h"
+
+using VcfFile = PacBio::VCF::VcfFile;
+using VcfFormat = PacBio::VCF::VcfFormat;
+
+// Shared fixtures for the VcfFile tests.
+namespace VcfFileTests {
+
+// Header text that the VCF_File test below expects FormattedHeader() to
+// produce for the header read from VcfFn. Note that there is no trailing
+// newline after the #CHROM line.
+static const std::string BasicHeaderText{
+    "##fileformat=VCFv4.2\n"
+    "##fileDate=20180509\n"
+    "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variant\">\n"
+    "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n"
+    "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the structural variant "
+    "described in this record\">\n"
+    "##INFO=<ID=SVLEN,Number=.,Type=Integer,Description=\"Difference in length between REF and ALT "
+    "alleles\">\n"
+    "##INFO=<ID=SVANN,Number=.,Type=String,Description=\"Repeat annotation of structural "
+    "variant\">\n"
+    "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
+    "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Per-sample read depth of this structural "
+    "variant\">\n"
+    "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth at this position for this "
+    "sample\">\n"
+    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tUnnamedSample"};
+
+// Path of the structural-variants VCF inside the test data directory.
+static const std::string VcfFn{PacBio::BAM::PbbamTestsConfig::Data_Dir +
+                               "/vcf/structural_variants.vcf"};
+
+}  // namespace VcfFileTests
+
+// Opening the test VCF must yield a header that formats back to
+// VcfFileTests::BasicHeaderText.
+TEST(VCF_File, initializes_header_from_input_file)
+{
+    const auto& expected = VcfFileTests::BasicHeaderText;
+
+    const VcfFile vcf{VcfFileTests::VcfFn};
+    const auto formatted = VcfFormat::FormattedHeader(vcf.Header());
+
+    EXPECT_EQ(expected, formatted);
+}
diff --git a/tests/src/test_VcfFormat.cpp b/tests/src/test_VcfFormat.cpp

new file mode 100644 (file)

index 0000000..632594b
--- /dev/null
+++ b/tests/src/test_VcfFormat.cpp
@@ -0,0 +1,421 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+#include <pbbam/vcf/VcfFormat.h>
+#include <pbbam/vcf/VcfHeader.h>
+#include <pbbam/vcf/VcfVariant.h>
+
+#include "PbbamTestData.h"
+
+using ContigDefinition = PacBio::VCF::ContigDefinition;
+using FilterDefinition = PacBio::VCF::FilterDefinition;
+using FormatDefinition = PacBio::VCF::FormatDefinition;
+using GeneralDefinition = PacBio::VCF::GeneralDefinition;
+using InfoDefinition = PacBio::VCF::InfoDefinition;
+using Sample = PacBio::VCF::Sample;
+using VcfFormat = PacBio::VCF::VcfFormat;
+using VcfHeader = PacBio::VCF::VcfHeader;
+using VcfVariant = PacBio::VCF::VcfVariant;
+
+// Shared fixtures for the VcfFormat tests.
+namespace VcfFormatTests {
+
+// Complete header text used for the format/parse round-trip tests. It
+// contains one ##contig line, five ##INFO lines, three ##FORMAT lines and a
+// single sample column; there is no trailing newline after the #CHROM line.
+static const std::string BasicHeaderText{
+    "##fileformat=VCFv4.2\n"
+    "##fileDate=20180509\n"
+    "##contig=<ID=ctg1,length=4200,assembly=foo,md5=dead123beef>\n"
+    "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variant\">\n"
+    "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n"
+    "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the structural variant "
+    "described in this record\">\n"
+    "##INFO=<ID=SVLEN,Number=.,Type=Integer,Description=\"Difference in length between REF and ALT "
+    "alleles\">\n"
+    "##INFO=<ID=SVANN,Number=.,Type=String,Description=\"Repeat annotation of structural "
+    "variant\">\n"
+    "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
+    "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Per-sample read depth of this structural "
+    "variant\">\n"
+    "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth at this position for this "
+    "sample\">\n"
+    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tUnnamedSample"};
+
+// Header text expected when the header is read from VcfFn: identical to
+// BasicHeaderText except that it
+// does not have ##contig line(s) in file
+static const std::string FileHeaderText{
+    "##fileformat=VCFv4.2\n"
+    "##fileDate=20180509\n"
+    "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variant\">\n"
+    "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n"
+    "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the structural variant "
+    "described in this record\">\n"
+    "##INFO=<ID=SVLEN,Number=.,Type=Integer,Description=\"Difference in length between REF and ALT "
+    "alleles\">\n"
+    "##INFO=<ID=SVANN,Number=.,Type=String,Description=\"Repeat annotation of structural "
+    "variant\">\n"
+    "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
+    "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Per-sample read depth of this structural "
+    "variant\">\n"
+    "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth at this position for this "
+    "sample\">\n"
+    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tUnnamedSample"};
+
+// One tab-separated variant record: missing QUAL ("."), five INFO entries
+// (one flag, one multi-valued) and four FORMAT keys with one sample column.
+static const std::string BasicVariantText{
+    "chrXVI\t660831\tpbsv.INS.21\tC\tCAAAGGAATGGTAAAGATGGGGGGTCAACGGACAAGGGAAAGGATCCATGGGGGCA\t."
+    "\tPASS"
+    "\tIMPRECISE;SVTYPE=INS;END=660831;SVLEN=55;MULTI=1,2,3\tGT:AD:DP:AC\t0/1:2:5:1,2"};
+
+// Path of the structural-variants VCF inside the test data directory.
+static const std::string VcfFn{PacBio::BAM::PbbamTestsConfig::Data_Dir +
+                               "/vcf/structural_variants.vcf"};
+
+}  // namespace VcfFormatTests
+
+// VcfFormat reports "VCFv4.2" as its current version string.
+TEST(VCF_Format, provides_current_version)
+{
+    // convert to std::string first so the comparison is by content
+    const std::string current{VcfFormat::CurrentVersion()};
+
+    EXPECT_EQ("VCFv4.2", current);
+}
+
+//## ----------------------------------------------------------------- ##
+//
+//              HEADER FORMATTING
+//
+//## ----------------------------------------------------------------- ##
+
+// A contig definition is rendered as "##contig=<ID=...,key=value,...>", with
+// the attributes in the order they were supplied.
+TEST(VCF_Format, can_format_contig_definition)
+{
+    const ContigDefinition contig{
+        "ctg1", {{"length", "4200"}, {"assembly", "foo"}, {"md5", "dead123beef"}}};
+
+    const auto formatted = VcfFormat::FormattedContigDefinition(contig);
+
+    EXPECT_EQ("##contig=<ID=ctg1,length=4200,assembly=foo,md5=dead123beef>", formatted);
+}
+
+// A filter definition is rendered as a ##FILTER line with ID and a quoted
+// Description.
+TEST(VCF_Format, can_format_filter_definition)
+{
+    const FilterDefinition filter{"FILTER1", "Filter1"};
+
+    const auto formatted = VcfFormat::FormattedFilterDefinition(filter);
+
+    EXPECT_EQ("##FILTER=<ID=FILTER1,Description=\"Filter1\">", formatted);
+}
+
+// A format definition is rendered as a ##FORMAT line with ID, Number, Type
+// and a quoted Description.
+TEST(VCF_Format, can_format_format_definition)
+{
+    const FormatDefinition genotype{"GT", "1", "String", "Genotype"};
+
+    const auto formatted = VcfFormat::FormattedFormatDefinition(genotype);
+
+    EXPECT_EQ("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">", formatted);
+}
+
+// A general definition is rendered as "##<id>=<text>".
+TEST(VCF_Format, can_format_general_header_definition)
+{
+    const GeneralDefinition phasing{"phasing", "partial"};
+
+    const auto formatted = VcfFormat::FormattedGeneralDefinition(phasing);
+
+    EXPECT_EQ("##phasing=partial", formatted);
+}
+
+// An info definition without optional fields is rendered as an ##INFO line
+// with ID, Number, Type and a quoted Description only.
+TEST(VCF_Format, can_format_info_definition)
+{
+    const InfoDefinition imprecise{"IMPRECISE", "0", "Flag", "Imprecise structural variant"};
+
+    const auto formatted = VcfFormat::FormattedInfoDefinition(imprecise);
+
+    EXPECT_EQ(
+        "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variant\">",
+        formatted);
+}
+
+TEST(VCF_Format, can_format_info_definition_with_optional_fields)
+{
+    {  // with Source
+        const InfoDefinition def{"IMPRECISE", "0", "Flag", "Imprecise structural variant",
+                                 "source1"};
+        const auto text = VcfFormat::FormattedInfoDefinition(def);
+        EXPECT_EQ(
+            "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural "
+            "variant\",Source=\"source1\">",
+            text);
+    }
+
+    {  // with Version
+        const InfoDefinition def{"IMPRECISE", "0",       "Flag", "Imprecise structural variant",
+                                 "",          "version1"};
+        const auto text = VcfFormat::FormattedInfoDefinition(def);
+        EXPECT_EQ(
+            "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural "
+            "variant\",Version=\"version1\">",
+            text);
+    }
+    {  // with Source & Version
+        const InfoDefinition def{"IMPRECISE", "0",       "Flag", "Imprecise structural variant",
+                                 "source1",   "version1"};
+        const auto text = VcfFormat::FormattedInfoDefinition(def);
+        EXPECT_EQ(
+            "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural "
+            "variant\",Source=\"source1\",Version=\"version1\">",
+            text);
+    }
+}
+
+// Round trip: a header built from BasicHeaderText formats back to the same
+// text.
+TEST(VCF_Format, can_format_basic_header)
+{
+    const auto& expected = VcfFormatTests::BasicHeaderText;
+
+    const VcfHeader hdr{expected};
+    const auto formatted = VcfFormat::FormattedHeader(hdr);
+
+    EXPECT_EQ(expected, formatted);
+}
+
+// Formatting a default-constructed header on which only the file date was set
+// must not throw. The formatted text itself is not inspected.
+TEST(VCF_Format, format_basic_header_with_only_filedate)
+{
+    VcfHeader hdr;
+    hdr.FileDate("1770704");
+
+    std::string formatted;
+    EXPECT_NO_THROW(formatted = VcfFormat::FormattedHeader(hdr));
+}
+
+// Formatting a default-constructed header on which only the version was set
+// must not throw. The formatted text itself is not inspected.
+TEST(VCF_Format, format_basic_header_with_only_version)
+{
+    VcfHeader hdr;
+    hdr.Version("3.14");
+
+    std::string formatted;
+    EXPECT_NO_THROW(formatted = VcfFormat::FormattedHeader(hdr));
+}
+
+//## ----------------------------------------------------------------- ##
+//
+//              HEADER PARSING
+//
+//## ----------------------------------------------------------------- ##
+
+// "##key=value" is split into Id() == key and Text() == value.
+TEST(VCF_Format, can_parse_general_header_definition)
+{
+    const auto def = VcfFormat::ParsedGeneralDefinition("##phasing=partial");
+
+    EXPECT_EQ("phasing", def.Id());
+    EXPECT_EQ("partial", def.Text());
+}
+
+// Parsing an empty line as a general definition must throw
+// std::runtime_error.
+TEST(VCF_Format, parsing_general_header_definition_throws_on_empty_string)
+{
+    EXPECT_THROW(VcfFormat::ParsedGeneralDefinition(""), std::runtime_error);
+}
+
+// Lines that are not of the form "##key=value" must be rejected with
+// std::runtime_error.
+TEST(VCF_Format, parsing_general_header_definition_throws_on_non_vcf_input)
+{
+    // no "##" prefix and no '='
+    EXPECT_THROW(VcfFormat::ParsedGeneralDefinition("not_vcf_header_line"), std::runtime_error);
+    // only a single '#'
+    EXPECT_THROW(VcfFormat::ParsedGeneralDefinition("#line=not_vcf_header_line"),
+                 std::runtime_error);
+    // "##" prefix present, but ',' instead of '='
+    EXPECT_THROW(VcfFormat::ParsedGeneralDefinition("##line,not_vcf_header_line"),
+                 std::runtime_error);
+}
+
+// A contig line carrying only an ID parses to that ID and no attributes.
+TEST(VCF_Format, can_parse_contig_definition_with_id_only)
+{
+    const auto parsed = VcfFormat::ParsedContigDefinition("##contig=<ID=ctg1>");
+
+    EXPECT_EQ("ctg1", parsed.Id());
+    EXPECT_TRUE(parsed.Attributes().empty());
+}
+
+// Attributes following the ID are parsed as (key, value) pairs and kept in
+// the order they appear on the line.
+TEST(VCF_Format, can_parse_contig_definition_with_attributes)
+{
+    const auto parsed =
+        VcfFormat::ParsedContigDefinition("##contig=<ID=ctg1,assembly=foo,length=3>");
+
+    EXPECT_EQ("ctg1", parsed.Id());
+    ASSERT_EQ(2, parsed.Attributes().size());
+
+    // first attribute: assembly=foo
+    const auto& assembly = parsed.Attributes().at(0);
+    EXPECT_EQ("assembly", assembly.first);
+    EXPECT_EQ("foo", assembly.second);
+
+    // second attribute: length=3
+    const auto& length = parsed.Attributes().at(1);
+    EXPECT_EQ("length", length.first);
+    EXPECT_EQ("3", length.second);
+}
+
+// Malformed ##contig lines must be rejected with std::runtime_error.
+TEST(VCF_Format, parsing_contig_header_definition_throws_on_malformed_contig_line)
+{
+    // internal code already checks for "##contig=<"
+
+    // unterminated, no key=value content
+    EXPECT_THROW(VcfFormat::ParsedContigDefinition("##contig=<foo"), std::runtime_error);
+    // empty ID value
+    EXPECT_THROW(VcfFormat::ParsedContigDefinition("##contig=<ID=,>"), std::runtime_error);
+}
+
+// A ##FILTER line (here with a trailing newline) parses to its ID and its
+// unquoted description.
+TEST(VCF_Format, can_parse_filter_definition)
+{
+    const auto parsed =
+        VcfFormat::ParsedFilterDefinition("##FILTER=<ID=FILTER1,Description=\"Filter1\">\n");
+
+    EXPECT_EQ("FILTER1", parsed.Id());
+    EXPECT_EQ("Filter1", parsed.Description());
+}
+
+// Malformed ##FILTER lines must be rejected with std::runtime_error.
+TEST(VCF_Format, parsing_filter_definition_throws_on_malformed_filter_line)
+{
+    // internal code already checks for "##FILTER=<"
+
+    // unterminated, no key=value content
+    EXPECT_THROW(VcfFormat::ParsedFilterDefinition("##FILTER=<foo"), std::runtime_error);
+    // empty ID value
+    EXPECT_THROW(VcfFormat::ParsedFilterDefinition("##FILTER=<ID=,>"), std::runtime_error);
+}
+
+// A ##FORMAT line (here with a trailing newline) parses to ID, Number, Type
+// and the unquoted description.
+TEST(VCF_Format, can_parse_format_definition)
+{
+    const auto parsed = VcfFormat::ParsedFormatDefinition(
+        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n");
+
+    EXPECT_EQ("GT", parsed.Id());
+    EXPECT_EQ("1", parsed.Number());
+    EXPECT_EQ("String", parsed.Type());
+    EXPECT_EQ("Genotype", parsed.Description());
+}
+
+// Malformed ##FORMAT lines must be rejected with std::runtime_error.
+// NOTE(review): the test name ends in "malformed_filter_line" although the
+// inputs are ##FORMAT lines - probably a copy/paste slip. The name is left
+// as is because renaming changes the gtest filter name.
+TEST(VCF_Format, parsing_format_definition_throws_on_malformed_filter_line)
+{
+    // internal code already checks for "##FORMAT=<"
+
+    // unterminated, no key=value content
+    EXPECT_THROW(VcfFormat::ParsedFormatDefinition("##FORMAT=<foo"), std::runtime_error);
+    // empty ID value
+    EXPECT_THROW(VcfFormat::ParsedFormatDefinition("##FORMAT=<ID=,>"), std::runtime_error);
+}
+
+// An ##INFO line (here with a trailing newline) parses to ID, Number, Type
+// and the unquoted description; Source and Version stay unset when the line
+// does not contain them.
+TEST(VCF_Format, can_parse_info_definition)
+{
+    const auto parsed = VcfFormat::ParsedInfoDefinition(
+        "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variant\">\n");
+
+    // required fields
+    EXPECT_EQ("IMPRECISE", parsed.Id());
+    EXPECT_EQ("0", parsed.Number());
+    EXPECT_EQ("Flag", parsed.Type());
+    EXPECT_EQ("Imprecise structural variant", parsed.Description());
+
+    // optional fields
+    EXPECT_FALSE(parsed.Source().is_initialized());
+    EXPECT_FALSE(parsed.Version().is_initialized());
+}
+
+// Malformed ##INFO lines must be rejected with std::runtime_error.
+TEST(VCF_Format, parsing_info_definition_throws_on_malformed_info_line)
+{
+    // internal code already checks for "##INFO=<"
+
+    // unterminated, no key=value content
+    EXPECT_THROW(VcfFormat::ParsedInfoDefinition("##INFO=<foo"), std::runtime_error);
+    // empty ID value
+    EXPECT_THROW(VcfFormat::ParsedInfoDefinition("##INFO=<ID=,>"), std::runtime_error);
+}
+
+TEST(VCF_Format, can_create_header_from_text)
+{
+    const VcfHeader hdr{VcfFormatTests::BasicHeaderText};
+
+    EXPECT_EQ("VCFv4.2", hdr.Version());
+    EXPECT_EQ("20180509", hdr.FileDate());
+
+    const auto& infos = hdr.InfoDefinitions();
+    ASSERT_EQ(5, infos.size());
+    EXPECT_EQ("IMPRECISE", infos.at(0).Id());
+    EXPECT_EQ("SVTYPE", infos.at(1).Id());
+    EXPECT_EQ("END", infos.at(2).Id());
+    EXPECT_EQ("SVLEN", infos.at(3).Id());
+    EXPECT_EQ("SVANN", infos.at(4).Id());
+
+    const auto& contigs = hdr.ContigDefinitions();
+    ASSERT_EQ(1, contigs.size());
+    EXPECT_EQ("ctg1", contigs.at(0).Id());
+
+    ASSERT_EQ(3, contigs.at(0).Attributes().size());
+    EXPECT_EQ("length", contigs.at(0).Attributes().at(0).first);
+    EXPECT_EQ("assembly", contigs.at(0).Attributes().at(1).first);
+    EXPECT_EQ("md5", contigs.at(0).Attributes().at(2).first);
+
+    const auto& filters = hdr.FilterDefinitions();
+    ASSERT_EQ(0, filters.size());
+
+    const auto& formats = hdr.FormatDefinitions();
+    ASSERT_EQ(3, formats.size());
+    EXPECT_EQ("GT", formats.at(0).Id());
+    EXPECT_EQ("AD", formats.at(1).Id());
+    EXPECT_EQ("DP", formats.at(2).Id());
+
+    const auto& samples = hdr.Samples();
+    ASSERT_EQ(1, samples.size());
+    EXPECT_EQ("UnnamedSample", samples[0]);
+}
+
+// Header text without a ##fileformat line must be rejected with
+// std::runtime_error.
+TEST(VCF_Format, header_parsing_throws_on_missing_fileformat_line)
+{
+    const std::string headerText{
+        "##fileDate=20180509\n"
+        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tUnnamedSample\n"};
+
+    // braces keep this a construction, not a declaration named after the argument
+    EXPECT_THROW({ VcfHeader parsed(headerText); }, std::runtime_error);
+}
+
+// A line that is not a VCF header line, sitting between valid header lines,
+// must make header construction throw std::runtime_error.
+TEST(VCF_Format, header_parsing_throws_on_non_vcf_header_line)
+{
+    const std::string headerText{
+        "##fileformat=VCFv4.2\n"
+        " --- how did I get in here?? --- \n"
+        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tUnnamedSample\n"};
+
+    // braces keep this a construction, not a declaration named after the argument
+    EXPECT_THROW({ VcfHeader parsed(headerText); }, std::runtime_error);
+}
+
+// Round trip through an input stream: the header read from a stream holding
+// BasicHeaderText formats back to the same text.
+TEST(VCF_Format, can_parse_header_from_stream)
+{
+    const auto& expected = VcfFormatTests::BasicHeaderText;
+
+    std::istringstream input(expected);
+    const auto parsed = VcfFormat::HeaderFromStream(input);
+
+    EXPECT_EQ(expected, VcfFormat::FormattedHeader(parsed));
+}
+
+// The header read from the test VCF formats to FileHeaderText (the variant
+// of the expected text without ##contig lines).
+TEST(VCF_Format, can_parse_header_from_file)
+{
+    const std::string path{VcfFormatTests::VcfFn};
+    const auto parsed = VcfFormat::HeaderFromFile(path);
+
+    EXPECT_EQ(VcfFormatTests::FileHeaderText, VcfFormat::FormattedHeader(parsed));
+}
+
+//## ----------------------------------------------------------------- ##
+//
+//              VARIANT FORMATTING
+//
+//## ----------------------------------------------------------------- ##
+
+// Round trip: a variant parsed from BasicVariantText formats back to the
+// same text.
+TEST(VCF_Format, can_format_basic_variant)
+{
+    const auto& expected = VcfFormatTests::BasicVariantText;
+
+    const VcfVariant parsed = VcfFormat::ParsedVariant(expected);
+    const auto formatted = VcfFormat::FormattedVariant(parsed);
+
+    EXPECT_EQ(expected, formatted);
+}
+
+//## ----------------------------------------------------------------- ##
+//
+//              VARIANT PARSING
+//
+//## ----------------------------------------------------------------- ##
+
+TEST(VCF_Format, can_create_variant_from_text)
+{
+    const VcfVariant var = VcfFormat::ParsedVariant(VcfFormatTests::BasicVariantText);
+
+    // CHROM POS ID REF ALT REF QUAL FILTER
+    EXPECT_EQ("chrXVI", var.Chrom());
+    EXPECT_EQ(660831, var.Position());
+    EXPECT_EQ("pbsv.INS.21", var.Id());
+    EXPECT_EQ("C", var.RefAllele());
+    EXPECT_EQ("CAAAGGAATGGTAAAGATGGGGGGTCAACGGACAAGGGAAAGGATCCATGGGGGCA", var.AltAllele());
+    EXPECT_TRUE(var.IsQualityMissing());
+    EXPECT_EQ("PASS", var.Filter());
+
+    // INFO
+    const auto& infoFields = var.InfoFields();
+    ASSERT_EQ(5, infoFields.size());
+    EXPECT_EQ("IMPRECISE", infoFields.at(0).id);
+    EXPECT_EQ("SVTYPE", infoFields.at(1).id);
+    EXPECT_EQ("END", infoFields.at(2).id);
+    EXPECT_EQ("SVLEN", infoFields.at(3).id);
+    EXPECT_EQ("MULTI", infoFields.at(4).id);
+
+    // GENOTYPES
+    const auto& ids = var.GenotypeIds();
+    ASSERT_EQ(4, ids.size());
+    EXPECT_EQ("GT", ids.at(0));
+    EXPECT_EQ("AD", ids.at(1));
+    EXPECT_EQ("DP", ids.at(2));
+    EXPECT_EQ("AC", ids.at(3));
+
+    const auto& genotypes = var.Genotypes();
+    ASSERT_EQ(1, genotypes.size());
+
+    const auto& sampleGenotype = genotypes.at(0);
+    ASSERT_EQ(4, sampleGenotype.data.size());
+    EXPECT_EQ("0/1", sampleGenotype.data.at(0).value.get());
+    EXPECT_EQ("2", sampleGenotype.data.at(1).value.get());
+    EXPECT_EQ("5", sampleGenotype.data.at(2).value.get());
+    const auto& acData = sampleGenotype.data.at(3);
+    ASSERT_EQ(2, acData.values->size());
+    EXPECT_EQ("1", acData.values->at(0));
+    EXPECT_EQ("2", acData.values->at(1));
+
+    //    ASSERT_TRUE(sampleGenotype.values.is_initialized());
+}
diff --git a/tests/src/test_VcfHeader.cpp b/tests/src/test_VcfHeader.cpp

new file mode 100644 (file)

index 0000000..2f17f67
--- /dev/null
+++ b/tests/src/test_VcfHeader.cpp
@@ -0,0 +1,189 @@
+// Author: Derek Barnett
+
+#include <iostream>
+
+#include <gtest/gtest.h>
+#include <pbbam/vcf/VcfHeader.h>
+
+using ContigDefinition = PacBio::VCF::ContigDefinition;
+using FilterDefinition = PacBio::VCF::FilterDefinition;
+using FormatDefinition = PacBio::VCF::FormatDefinition;
+using GeneralDefinition = PacBio::VCF::GeneralDefinition;
+using InfoDefinition = PacBio::VCF::InfoDefinition;
+using VcfHeader = PacBio::VCF::VcfHeader;
+
+// Shared fixtures for the VcfHeader tests.
+namespace VcfHeaderTests {
+
+// Complete header text used to construct the headers under test: one
+// ##contig line, five ##INFO lines, three ##FORMAT lines and one sample
+// column. This copy ends with a newline after the #CHROM line.
+static const std::string BasicHeaderText{
+    "##fileformat=VCFv4.2\n"
+    "##fileDate=20180509\n"
+    "##contig=<ID=ctg1,length=4200,assembly=foo,md5=dead123beef>\n"
+    "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variant\">\n"
+    "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n"
+    "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the structural variant "
+    "described in this record\">\n"
+    "##INFO=<ID=SVLEN,Number=.,Type=Integer,Description=\"Difference in length between REF and ALT "
+    "alleles\">\n"
+    "##INFO=<ID=SVANN,Number=.,Type=String,Description=\"Repeat annotation of structural "
+    "variant\">\n"
+    "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
+    "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Per-sample read depth of this structural "
+    "variant\">\n"
+    "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth at this position for this "
+    "sample\">\n"
+    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tUnnamedSample\n"};
+
+}  // namespace VcfHeaderTests
+
+// Constructing a GeneralDefinition with an empty ID or an empty text must
+// throw std::runtime_error.
+TEST(VCF_GeneralDefinition, throws_on_missing_required_fields)
+{
+    const std::string validId{"id"};
+    const std::string validText{"desc"};
+
+    // empty ID
+    EXPECT_THROW(GeneralDefinition("", validText), std::runtime_error);
+    // empty text
+    EXPECT_THROW(GeneralDefinition(validId, ""), std::runtime_error);
+}
+
+// Constructing a ContigDefinition with an empty ID must throw
+// std::runtime_error.
+TEST(VCF_ContigDefinition, throws_on_missing_required_fields)
+{
+    EXPECT_THROW(ContigDefinition(""), std::runtime_error);
+}
+
+TEST(VCF_ContigDefinition, can_edit_and_query_attributes)
+{
+    ContigDefinition contig{"id"};
+
+    EXPECT_TRUE(contig.Attributes().empty());
+
+    const std::vector<std::pair<std::string, std::string>> attributes{{"assembly", "foo"},
+                                                                      {"length", "42"}};
+    contig.Attributes(attributes);
+    ASSERT_EQ(2, contig.Attributes().size());
+    EXPECT_EQ("foo", contig.Attributes().at(0).second);
+    EXPECT_EQ("42", contig.Attributes().at(1).second);
+
+    contig.AddAttribute({"md5", "dead123beef"});
+    ASSERT_EQ(3, contig.Attributes().size());
+    EXPECT_EQ("dead123beef", contig.Attributes().at(2).second);
+}
+
+// Constructing a FilterDefinition with an empty ID or an empty description
+// must throw std::runtime_error.
+TEST(VCF_FilterDefinition, throws_on_missing_required_fields)
+{
+    const std::string validId{"id"};
+    const std::string validDesc{"desc"};
+
+    // empty ID
+    EXPECT_THROW(FilterDefinition("", validDesc), std::runtime_error);
+    // empty description
+    EXPECT_THROW(FilterDefinition(validId, ""), std::runtime_error);
+}
+
+// Each of the four required InfoDefinition fields (ID, number, type,
+// description) must be non-empty; leaving any one empty throws
+// std::runtime_error.
+TEST(VCF_InfoDefinition, throws_on_missing_required_fields)
+{
+    const std::string validId{"id"};
+    const std::string validNum{"num"};
+    const std::string validType{"type"};
+    const std::string validDesc{"desc"};
+
+    // blank out one field at a time
+    EXPECT_THROW(InfoDefinition("", validNum, validType, validDesc), std::runtime_error);
+    EXPECT_THROW(InfoDefinition(validId, "", validType, validDesc), std::runtime_error);
+    EXPECT_THROW(InfoDefinition(validId, validNum, "", validDesc), std::runtime_error);
+    EXPECT_THROW(InfoDefinition(validId, validNum, validType, ""), std::runtime_error);
+}
+
+// Source and Version are optional: they are unset after construction from
+// the four required fields and report as initialized once assigned.
+TEST(VCF_InfoDefinition, missing_optional_fields_is_not_error)
+{
+    InfoDefinition def{"id", "num", "type", "description"};
+
+    // before: neither optional field is set
+    EXPECT_FALSE(def.Source().is_initialized());
+    EXPECT_FALSE(def.Version().is_initialized());
+
+    def.Source("source");
+    def.Version("version");
+
+    // after: both optional fields are set
+    EXPECT_TRUE(def.Source().is_initialized());
+    EXPECT_TRUE(def.Version().is_initialized());
+}
+
+// A default-constructed header reports "VCFv4.2" as its version.
+TEST(VCF_Header, defaults_to_current_version)
+{
+    VcfHeader header;
+
+    EXPECT_EQ("VCFv4.2", header.Version());
+}
+
+// Looking up contig "ctg1" returns the definition with its three attributes
+// in header order.
+// NOTE(review): "defnition" in the test name is a typo for "definition". It
+// is left as is because renaming changes the gtest filter name.
+TEST(VCF_Header, can_lookup_contig_defnition_by_id)
+{
+    const VcfHeader header{VcfHeaderTests::BasicHeaderText};
+
+    const auto& ctg = header.ContigDefinition("ctg1");
+
+    ASSERT_EQ(3, ctg.Attributes().size());
+    EXPECT_EQ("length", ctg.Attributes().at(0).first);
+    EXPECT_EQ("assembly", ctg.Attributes().at(1).first);
+    EXPECT_EQ("md5", ctg.Attributes().at(2).first);
+}
+
+// Looking up FORMAT "GT" returns a definition carrying that ID.
+TEST(VCF_Header, can_lookup_format_definition_by_id)
+{
+    const VcfHeader header{VcfHeaderTests::BasicHeaderText};
+
+    const auto& genotype = header.FormatDefinition("GT");
+
+    EXPECT_EQ("GT", genotype.Id());
+}
+
+// Looking up the general definition "fileformat" returns a definition
+// carrying that ID.
+TEST(VCF_Header, can_lookup_general_definition_by_id)
+{
+    const VcfHeader header{VcfHeaderTests::BasicHeaderText};
+
+    const auto& fileformat = header.GeneralDefinition("fileformat");
+
+    EXPECT_EQ("fileformat", fileformat.Id());
+}
+
+// Looking up INFO "IMPRECISE" returns a definition carrying that ID.
+TEST(VCF_Header, can_lookup_info_definition_by_id)
+{
+    const VcfHeader header{VcfHeaderTests::BasicHeaderText};
+
+    const auto& imprecise = header.InfoDefinition("IMPRECISE");
+
+    EXPECT_EQ("IMPRECISE", imprecise.Id());
+}
+
+// IndexOfSample() and SampleAt() are consistent: the index found for
+// "UnnamedSample" maps back to the same name.
+TEST(VCF_Header, can_lookup_sample)
+{
+    const VcfHeader header{VcfHeaderTests::BasicHeaderText};
+
+    const auto sampleIndex = header.IndexOfSample("UnnamedSample");
+    const auto sampleName = header.SampleAt(sampleIndex);
+
+    EXPECT_EQ("UnnamedSample", sampleName);
+}
+
+// Adding a FORMAT definition whose ID already exists replaces the stored
+// definition and leaves the number and order of the others untouched.
+TEST(VCF_Header, add_duplicate_format_replaces_existing_definition)
+{
+    VcfHeader header{VcfHeaderTests::BasicHeaderText};
+
+    // copy (not reference) so the snapshot is unaffected by the update below
+    const auto before = header.FormatDefinition("GT");
+    EXPECT_EQ("Genotype", before.Description());
+
+    const FormatDefinition replacement{"GT", "num", "type", "newDescription"};
+    header.AddFormatDefinition(replacement);
+
+    const auto after = header.FormatDefinition("GT");
+    EXPECT_EQ("newDescription", after.Description());
+
+    // rest of defs unchanged
+    const auto& allFormats = header.FormatDefinitions();
+    ASSERT_EQ(3, allFormats.size());
+    EXPECT_EQ("AD", allFormats.at(1).Id());
+    EXPECT_EQ("DP", allFormats.at(2).Id());
+}
+
+TEST(VCF_Header, add_duplicate_info_replaces_existing_definition)
+{
+    VcfHeader hdr{VcfHeaderTests::BasicHeaderText};
+    const auto initialInfo = hdr.InfoDefinition("IMPRECISE");
+    EXPECT_EQ("Imprecise structural variant", initialInfo.Description());
+
+    const InfoDefinition newInfo{"IMPRECISE", "num", "type", "newInfo"};
+    hdr.AddInfoDefinition(newInfo);
+
+    const auto nowInfo = hdr.InfoDefinition("IMPRECISE");
+    EXPECT_EQ("newInfo", nowInfo.Description());
+
+    // rest of defs unchanged
+    const auto& infoDefs = hdr.InfoDefinitions();
+    ASSERT_EQ(5, infoDefs.size());
+    EXPECT_EQ("SVTYPE", infoDefs.at(1).Id());
+    EXPECT_EQ("END", infoDefs.at(2).Id());
+    EXPECT_EQ("SVLEN", infoDefs.at(3).Id());
+    EXPECT_EQ("SVANN", infoDefs.at(4).Id());
+}
diff --git a/tests/src/test_VcfQuery.cpp b/tests/src/test_VcfQuery.cpp

new file mode 100644 (file)

index 0000000..8898add
--- /dev/null
+++ b/tests/src/test_VcfQuery.cpp
@@ -0,0 +1,49 @@
+// Author: Derek Barnett
+
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include <pbbam/vcf/VcfFile.h>
+#include <pbbam/vcf/VcfQuery.h>
+
+#include "PbbamTestData.h"
+
+using VcfFile = PacBio::VCF::VcfFile;
+using VcfQuery = PacBio::VCF::VcfQuery;
+using VcfVariant = PacBio::VCF::VcfVariant;
+
+// Shared fixtures for the VcfQuery tests.
+namespace VcfQueryTests {
+
+// Variant IDs the tests expect to read from VcfFn, in iteration order
+// (21 entries).
+static const std::vector<std::string> ExpectedIds{
+    "pbsv.INS.1",  "pbsv.DEL.2",  "pbsv.INS.3",  "pbsv.INS.4",  "pbsv.DEL.5",  "pbsv.DEL.6",
+    "pbsv.DEL.7",  "pbsv.INS.8",  "pbsv.INS.9",  "pbsv.INS.10", "pbsv.INS.11", "pbsv.INS.12",
+    "pbsv.INS.13", "pbsv.INS.14", "pbsv.INS.15", "pbsv.INS.16", "pbsv.INS.17", "pbsv.INS.18",
+    "pbsv.INS.19", "pbsv.DEL.20", "pbsv.INS.21"};
+
+// Path of the structural-variants VCF inside the test data directory.
+static const std::string VcfFn{PacBio::BAM::PbbamTestsConfig::Data_Dir +
+                               "/vcf/structural_variants.vcf"};
+
+}  // namespace VcfQueryTests
+
+// Iterates a VcfQuery opened from a filename and checks that the variant IDs
+// match VcfQueryTests::ExpectedIds both in order and in number.
+TEST(VCF_Query, can_use_range_over_input_filename)
+{
+    const auto& expectedIds = VcfQueryTests::ExpectedIds;
+
+    size_t count = 0;
+    VcfQuery query{VcfQueryTests::VcfFn};
+    for (const auto& var : query) {
+        // guard the lookup so surplus records are reported by the count check
+        // below instead of by an out_of_range exception
+        if (count < expectedIds.size()) {
+            EXPECT_EQ(expectedIds.at(count), var.Id());
+        }
+        ++count;
+    }
+
+    // without this check an empty or truncated result would pass silently
+    EXPECT_EQ(expectedIds.size(), count);
+}
+
+// Iterates a VcfQuery opened from a VcfFile object and checks that the
+// variant IDs match VcfQueryTests::ExpectedIds both in order and in number.
+TEST(VCF_Query, can_use_range_over_input_file_object)
+{
+    const auto& expectedIds = VcfQueryTests::ExpectedIds;
+    const VcfFile file{VcfQueryTests::VcfFn};
+
+    size_t count = 0;
+    VcfQuery query{file};
+    for (const auto& var : query) {
+        // guard the lookup so surplus records are reported by the count check
+        // below instead of by an out_of_range exception
+        if (count < expectedIds.size()) {
+            EXPECT_EQ(expectedIds.at(count), var.Id());
+        }
+        ++count;
+    }
+
+    // without this check an empty or truncated result would pass silently
+    EXPECT_EQ(expectedIds.size(), count);
+}
diff --git a/tests/src/test_VcfReader.cpp b/tests/src/test_VcfReader.cpp

new file mode 100644 (file)

index 0000000..5050eee
--- /dev/null
+++ b/tests/src/test_VcfReader.cpp
@@ -0,0 +1,51 @@
+// Author: Derek Barnett
+
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include <pbbam/vcf/VcfFile.h>
+#include <pbbam/vcf/VcfReader.h>
+
+#include "PbbamTestData.h"
+
+using VcfFile = PacBio::VCF::VcfFile;
+using VcfReader = PacBio::VCF::VcfReader;
+using VcfVariant = PacBio::VCF::VcfVariant;
+
+// Shared fixtures for the VcfReader tests.
+namespace VcfReaderTests {
+
+// Variant IDs the tests expect to read from VcfFn, in reading order
+// (21 entries).
+static const std::vector<std::string> ExpectedIds{
+    "pbsv.INS.1",  "pbsv.DEL.2",  "pbsv.INS.3",  "pbsv.INS.4",  "pbsv.DEL.5",  "pbsv.DEL.6",
+    "pbsv.DEL.7",  "pbsv.INS.8",  "pbsv.INS.9",  "pbsv.INS.10", "pbsv.INS.11", "pbsv.INS.12",
+    "pbsv.INS.13", "pbsv.INS.14", "pbsv.INS.15", "pbsv.INS.16", "pbsv.INS.17", "pbsv.INS.18",
+    "pbsv.INS.19", "pbsv.DEL.20", "pbsv.INS.21"};
+
+// Path of the structural-variants VCF inside the test data directory.
+static const std::string VcfFn{PacBio::BAM::PbbamTestsConfig::Data_Dir +
+                               "/vcf/structural_variants.vcf"};
+
+}  // namespace VcfReaderTests
+
+// Reads all variants through a VcfReader opened from a filename and checks
+// that their IDs match VcfReaderTests::ExpectedIds both in order and in
+// number.
+TEST(VCF_Reader, can_fetch_variants_from_vcf_filename)
+{
+    const auto& expectedIds = VcfReaderTests::ExpectedIds;
+
+    size_t count = 0;
+    VcfReader rdr{VcfReaderTests::VcfFn};
+    VcfVariant var;
+    while (rdr.GetNext(var)) {
+        // guard the lookup so surplus records are reported by the count check
+        // below instead of by an out_of_range exception
+        if (count < expectedIds.size()) {
+            EXPECT_EQ(expectedIds.at(count), var.Id());
+        }
+        ++count;
+    }
+
+    // without this check an empty or truncated read would pass silently
+    EXPECT_EQ(expectedIds.size(), count);
+}
+
+// Reads all variants through a VcfReader opened from a VcfFile object and
+// checks that their IDs match VcfReaderTests::ExpectedIds both in order and
+// in number.
+TEST(VCF_Reader, can_fetch_variants_from_vcf_file_object)
+{
+    const auto& expectedIds = VcfReaderTests::ExpectedIds;
+    const VcfFile file{VcfReaderTests::VcfFn};
+
+    size_t count = 0;
+    VcfReader rdr{file};
+    VcfVariant var;
+    while (rdr.GetNext(var)) {
+        // guard the lookup so surplus records are reported by the count check
+        // below instead of by an out_of_range exception
+        if (count < expectedIds.size()) {
+            EXPECT_EQ(expectedIds.at(count), var.Id());
+        }
+        ++count;
+    }
+
+    // without this check an empty or truncated read would pass silently
+    EXPECT_EQ(expectedIds.size(), count);
+}
diff --git a/tests/src/test_VcfSort.cpp b/tests/src/test_VcfSort.cpp

new file mode 100644 (file)

index 0000000..4272991
--- /dev/null
+++ b/tests/src/test_VcfSort.cpp
@@ -0,0 +1,51 @@
+// Author: Derek Barnett
+
+#include <cstdio>
+
+#include <gtest/gtest.h>
+
+#include <pbbam/vcf/VcfQuery.h>
+#include <pbbam/vcf/VcfSort.h>
+
+#include "PbbamTestData.h"
+
+using VcfFile = PacBio::VCF::VcfFile;
+using VcfQuery = PacBio::VCF::VcfQuery;
+
+// clang-format off
+
+// Shared fixtures for the VcfSort tests.
+namespace VcfSortTests {
+
+// unsorted input, read from the test data directory
+static const std::string inputFn{PacBio::BAM::PbbamTestsConfig::Data_Dir + "/vcf/unsorted.vcf"};
+
+// sorted output, written to the generated-data directory
+static const std::string outputFn{PacBio::BAM::PbbamTestsConfig::GeneratedData_Dir + "/sorted.vcf"};
+
+} // namespace VcfSortTests
+
+TEST(VCF_Sort, sorts_input_file)
+{
+    const VcfFile file{VcfSortTests::inputFn};
+    PacBio::VCF::SortFile(file, VcfSortTests::outputFn);
+
+    const std::vector<std::string> expectedIds{
+        "variant0",
+        "variant5",
+        "variant1",
+        "variant3",
+        "variant4",
+        "variant2"
+    };
+
+    size_t i= 0;
+    VcfQuery query{VcfSortTests::outputFn};
+    for (const auto& var : query)
+    {
+        EXPECT_EQ(expectedIds.at(i), var.Id());
+        ++i;
+    }
+
+    // remove temp file
+    remove(VcfSortTests::outputFn.c_str());
+}
+
+// clang-format on
diff --git a/tests/src/test_VcfVariant.cpp b/tests/src/test_VcfVariant.cpp

new file mode 100644 (file)

index 0000000..879940d
--- /dev/null
+++ b/tests/src/test_VcfVariant.cpp
@@ -0,0 +1,234 @@
+// Author: Derek Barnett
+
+#include <gtest/gtest.h>
+#include <pbbam/vcf/VcfVariant.h>
+
+using InfoField = PacBio::VCF::InfoField;
+using VcfVariant = PacBio::VCF::VcfVariant;
+
+namespace VcfVariantTests {
+// One tab-separated record: flag, single- and multi-valued INFO entries plus one GT:AD:DP:AC sample.
+static const std::string BasicVariantText{
+    "chrXVI\t660831\tpbsv.INS.21\tC\tCAAAGGAATGGTAAAGATGGGGGGTCAACGGACAAGGGAAAGGATCCATGGGGGCA\t."
+    "\tPASS"
+    "\tIMPRECISE;SVTYPE=INS;END=660831;SVLEN=55;MULTI=1,2,3\tGT:AD:DP:AC\t0/1:2:5:1,2"};
+
+}  // namespace VcfVariantTests
+
+TEST(VCF_Variant, default_ctor_provides_proper_default_values)
+{
+    VcfVariant variant;
+    // core fields: empty strings, unmapped position, missing quality, "PASS" filter
+    EXPECT_TRUE(variant.Chrom().empty());
+    EXPECT_EQ(PacBio::BAM::UnmappedPosition, variant.Position());
+    EXPECT_TRUE(variant.Id().empty());
+    EXPECT_TRUE(variant.RefAllele().empty());
+    EXPECT_TRUE(variant.AltAllele().empty());
+    EXPECT_TRUE(variant.IsQualityMissing());
+    EXPECT_EQ("PASS", variant.Filter());
+    // an empty record is not classified as any variant type
+    EXPECT_FALSE(variant.IsDeletion());
+    EXPECT_FALSE(variant.IsInsertion());
+    EXPECT_FALSE(variant.IsSnp());
+}
+
+TEST(VCF_Variant, can_create_snp)
+{
+    const VcfVariant snp{"var_snp", "3", 3000, "C", "G"};
+    // constructor arguments read back unchanged; quality is missing and the filter is "PASS"
+    EXPECT_EQ("3", snp.Chrom());
+    EXPECT_EQ(3000, snp.Position());
+    EXPECT_EQ("var_snp", snp.Id());
+    EXPECT_EQ("C", snp.RefAllele());
+    EXPECT_EQ("G", snp.AltAllele());
+    EXPECT_TRUE(snp.IsQualityMissing());
+    EXPECT_EQ("PASS", snp.Filter());
+    // classified as a SNP and nothing else
+    EXPECT_FALSE(snp.IsDeletion());
+    EXPECT_FALSE(snp.IsInsertion());
+    EXPECT_TRUE(snp.IsSnp());
+}
+
+TEST(VCF_Variant, can_create_insertion)
+{
+    const VcfVariant ins{"var_ins", "3", 3000, "C", "CTAG"};
+    // constructor arguments read back unchanged; quality is missing and the filter is "PASS"
+    EXPECT_EQ("3", ins.Chrom());
+    EXPECT_EQ(3000, ins.Position());
+    EXPECT_EQ("var_ins", ins.Id());
+    EXPECT_EQ("C", ins.RefAllele());
+    EXPECT_EQ("CTAG", ins.AltAllele());
+    EXPECT_TRUE(ins.IsQualityMissing());
+    EXPECT_EQ("PASS", ins.Filter());
+    // classified as an insertion and nothing else
+    EXPECT_FALSE(ins.IsDeletion());
+    EXPECT_TRUE(ins.IsInsertion());
+    EXPECT_FALSE(ins.IsSnp());
+}
+
+TEST(VCF_Variant, can_create_deletion)
+{
+    const VcfVariant del{"var_del", "3", 3000, "TCG", "T"};
+    // constructor arguments read back unchanged; quality is missing and the filter is "PASS"
+    EXPECT_EQ("3", del.Chrom());
+    EXPECT_EQ(3000, del.Position());
+    EXPECT_EQ("var_del", del.Id());
+    EXPECT_EQ("TCG", del.RefAllele());
+    EXPECT_EQ("T", del.AltAllele());
+    EXPECT_TRUE(del.IsQualityMissing());
+    EXPECT_EQ("PASS", del.Filter());
+    // classified as a deletion and nothing else
+    EXPECT_TRUE(del.IsDeletion());
+    EXPECT_FALSE(del.IsInsertion());
+    EXPECT_FALSE(del.IsSnp());
+}
+
+TEST(VCF_Variant, can_determine_if_info_field_is_present)
+{
+    const VcfVariant variant{VcfVariantTests::BasicVariantText};
+    EXPECT_TRUE(variant.HasInfoField("SVLEN"));  // listed in the record text
+    EXPECT_FALSE(variant.HasInfoField("nope"));  // absent from the record text
+}
+
+TEST(VCF_Variant, can_fetch_single_value_info_field)
+{
+    const VcfVariant v{VcfVariantTests::BasicVariantText};
+    const auto& value = v.InfoValue("SVTYPE");
+    ASSERT_TRUE(value.is_initialized());  // fatal: get() below must not run on an unset value
+    EXPECT_EQ("INS", value.get());
+}
+
+TEST(VCF_Variant, can_add_single_value_info_field)
+{
+    VcfVariant v{VcfVariantTests::BasicVariantText};
+    InfoField i;
+    i.id = "NEW";
+    i.value = "42";
+    v.AddInfoField(i);
+
+    EXPECT_TRUE(v.HasInfoField("NEW"));
+    ASSERT_TRUE(v.InfoValue("NEW").is_initialized());  // fatal: guards the get() below
+    EXPECT_EQ("42", v.InfoValue("NEW").get());
+}
+
+TEST(VCF_Variant, can_fetch_multi_value_info_field)
+{
+    const VcfVariant v{VcfVariantTests::BasicVariantText};
+    const auto& values = v.InfoValues("MULTI");
+    ASSERT_TRUE(values.is_initialized());  // fatal: values-> below must not run on an unset value
+    ASSERT_EQ(3, values->size());          // fatal: the indexed checks rely on this size
+    EXPECT_EQ("1", values->at(0));
+    EXPECT_EQ("2", values->at(1));
+    EXPECT_EQ("3", values->at(2));
+}
+
+TEST(VCF_Variant, can_edit_single_value_info_field)
+{
+    VcfVariant v{VcfVariantTests::BasicVariantText};
+    auto value = v.InfoValue("SVTYPE");
+    ASSERT_TRUE(value.is_initialized());  // fatal: get() must not run on an unset value
+    EXPECT_EQ("INS", value.get());
+
+    v.InfoValue("SVTYPE", std::string{"FOO"});
+
+    value = v.InfoValue("SVTYPE");
+    ASSERT_TRUE(value.is_initialized());  // the edited field must still hold a value
+    EXPECT_EQ("FOO", value.get());
+}
+
+TEST(VCF_Variant, can_edit_multi_value_info_field)
+{
+    VcfVariant v{VcfVariantTests::BasicVariantText};
+
+    auto values = v.InfoValues("MULTI");
+    EXPECT_TRUE(values.is_initialized());
+    EXPECT_EQ(3, values->size());
+    EXPECT_EQ("1", values->at(0));
+    EXPECT_EQ("2", values->at(1));
+    EXPECT_EQ("3", values->at(2));
+
+    std::vector<std::string> newData{"42", "42", "42"};
+    v.InfoValues("MULTI", newData);
+
+    values = v.InfoValues("MULTI");
+    EXPECT_TRUE(values.is_initialized());
+    EXPECT_EQ(3, values->size());
+    EXPECT_EQ("42", values->at(0));
+    EXPECT_EQ("42", values->at(1));
+    EXPECT_EQ("42", values->at(2));
+}
+
+TEST(VCF_Variant, can_add_multi_value_info_field)
+{
+    VcfVariant v{VcfVariantTests::BasicVariantText};
+    InfoField i;
+    i.id = "NEW";
+    i.values = std::vector<std::string>{"42", "42", "42"};
+    v.AddInfoField(i);
+    EXPECT_TRUE(v.HasInfoField("NEW"));
+    const auto& values = v.InfoValues("NEW");
+    ASSERT_TRUE(values.is_initialized());  // fatal: values-> below must not run on an unset value
+    ASSERT_EQ(3, values->size());
+    EXPECT_EQ("42", values->at(0));
+    EXPECT_EQ("42", values->at(1));
+    EXPECT_EQ("42", values->at(2));
+}
+
+TEST(VCF_Variant, can_remove_info_field)
+{
+    VcfVariant v{VcfVariantTests::BasicVariantText};
+    EXPECT_TRUE(v.HasInfoField("SVLEN"));
+    ASSERT_TRUE(v.InfoValue("SVTYPE").is_initialized());  // fatal: guards the get() below
+    EXPECT_EQ("INS", v.InfoValue("SVTYPE").get());
+
+    v.RemoveInfoField("SVLEN");
+    EXPECT_FALSE(v.HasInfoField("SVLEN"));
+    ASSERT_TRUE(v.InfoValue("SVTYPE").is_initialized());  // sibling field must survive removal
+    EXPECT_EQ("INS", v.InfoValue("SVTYPE").get());
+}
+
+TEST(VCF_Variant, can_fetch_all_genotype_ids)
+{
+    const VcfVariant variant{VcfVariantTests::BasicVariantText};
+    const auto& ids = variant.GenotypeIds();
+    ASSERT_EQ(4, ids.size());  // fatal: the indexed checks below rely on this size
+    EXPECT_EQ("GT", ids.at(0));
+    EXPECT_EQ("AD", ids.at(1));
+    EXPECT_EQ("DP", ids.at(2));
+    EXPECT_EQ("AC", ids.at(3));
+}
+
+TEST(VCF_Variant, can_fetch_all_genotype_fields)
+{
+    const VcfVariant variant{VcfVariantTests::BasicVariantText};
+    const auto& perSample = variant.Genotypes();
+    ASSERT_EQ(1, perSample.size());  // the record text carries exactly one sample column
+}
+
+TEST(VCF_Variant, can_fetch_single_value_genotype_field)
+{
+    const VcfVariant v{VcfVariantTests::BasicVariantText};
+    const auto& value = v.GenotypeValue(0, "AD");
+    ASSERT_TRUE(value.is_initialized());  // fatal: get() below must not run on an unset value
+    EXPECT_EQ("2", value.get());
+}
+
+TEST(VCF_Variant, can_fetch_multi_value_genotype_field)
+{
+    const VcfVariant v{VcfVariantTests::BasicVariantText};
+    const auto& values = v.GenotypeValues(0, "AC");
+    ASSERT_TRUE(values.is_initialized());  // fatal: values-> below must not run on an unset value
+    ASSERT_EQ(2, values->size());
+}
+
+TEST(VCF_Variant, can_determine_if_sample_is_heterozygous)
+{
+    const VcfVariant variant{VcfVariantTests::BasicVariantText};
+    EXPECT_TRUE(variant.IsSampleHeterozygous(0));  // sample 0 has GT "0/1" in the record text
+}
+
+TEST(VCF_Variant, can_determine_if_sample_is_phased)
+{
+    const VcfVariant variant{VcfVariantTests::BasicVariantText};
+    EXPECT_FALSE(variant.IsSamplePhased(0));  // sample 0 has GT "0/1" in the record text
+}
diff --git a/tests/src/test_VcfWriter.cpp b/tests/src/test_VcfWriter.cpp

new file mode 100644 (file)

index 0000000..9dbaa1e
--- /dev/null
+++ b/tests/src/test_VcfWriter.cpp
@@ -0,0 +1,55 @@
+// Author: Derek Barnett
+
+#include <cstdio>
+
+#include <gtest/gtest.h>
+#include <pbbam/vcf/VcfFile.h>
+#include <pbbam/vcf/VcfFormat.h>
+#include <pbbam/vcf/VcfQuery.h>
+#include <pbbam/vcf/VcfWriter.h>
+
+#include "PbbamTestData.h"
+
+using VcfFile = PacBio::VCF::VcfFile;
+using VcfFormat = PacBio::VCF::VcfFormat;
+using VcfQuery = PacBio::VCF::VcfQuery;
+using VcfWriter = PacBio::VCF::VcfWriter;
+
+namespace VcfWriterTests {
+// Path of the input VCF that the writer test below reads and copies.
+static const std::string VcfFn{PacBio::BAM::PbbamTestsConfig::Data_Dir +
+                               "/vcf/structural_variants.vcf"};
+
+}  // namespace VcfWriterTests
+
+TEST(VCF_Writer, correctly_copies_vcf_file)
+{
+    const std::string intitialFn{VcfWriterTests::VcfFn};
+    const std::string newFn{PacBio::BAM::PbbamTestsConfig::GeneratedData_Dir + "/temp.vcf"};
+
+    const VcfFile initialFile{VcfWriterTests::VcfFn};
+
+    const std::string expectedHeaderText = VcfFormat::FormattedHeader(initialFile.Header());
+    std::vector<std::string> expectedVariantsText;
+
+    {  // store contents of intitial file & write to a new file
+        VcfWriter writer{newFn, initialFile.Header()};
+        VcfQuery query{initialFile};
+        for (const auto& var : query) {
+            expectedVariantsText.push_back(VcfFormat::FormattedVariant(var));
+            writer.Write(var);
+        }
+    }
+    {  // read new file & compare against original
+
+        const VcfFile newFile{newFn};
+        EXPECT_EQ(expectedHeaderText, VcfFormat::FormattedHeader(newFile.Header()));
+
+        size_t i = 0;
+        for (const auto& var : VcfQuery{newFile}) {
+            EXPECT_EQ(expectedVariantsText.at(i), VcfFormat::FormattedVariant(var));
+            ++i;
+        }
+    }
+    ::remove(newFn.c_str());
+}
diff --git a/tests/src/test_Version.cpp b/tests/src/test_Version.cpp

new file mode 100644 (file)

index 0000000..6c50556
--- /dev/null
+++ b/tests/src/test_Version.cpp
@@ -0,0 +1,295 @@
+// Author: Derek Barnett
+
+#include <sstream>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "../src/Version.h"
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+using namespace PacBio::BAM::internal;
+
+namespace VersionTests {
+// Builds a Version from three ints and returns it by value (used by the "move ctor" case below).
+static inline Version MakeVersion(int x, int y, int z) { return Version(x, y, z); }
+
+}  // namespace VersionTests
+
+TEST(VersionTest, DefaultOk)
+{
+    Version fresh;  // default-constructed: every component is expected to read as zero
+    EXPECT_EQ(0, fresh.Major());
+    EXPECT_EQ(0, fresh.Minor());
+    EXPECT_EQ(0, fresh.Revision());
+}
+
+TEST(VersionTest, CopyAndMoveOk)
+{
+    {  // copy ctor
+        Version v1(3, 1, 1);
+        EXPECT_EQ(3, v1.Major());
+        EXPECT_EQ(1, v1.Minor());
+        EXPECT_EQ(1, v1.Revision());
+
+        Version v2(v1);
+        EXPECT_EQ(3, v2.Major());
+        EXPECT_EQ(1, v2.Minor());
+        EXPECT_EQ(1, v2.Revision());
+    }
+    {  // copy assign
+        Version v1(3, 1, 1);
+        EXPECT_EQ(3, v1.Major());
+        EXPECT_EQ(1, v1.Minor());
+        EXPECT_EQ(1, v1.Revision());
+
+        Version v2;
+        v2 = v1;
+        EXPECT_EQ(3, v2.Major());
+        EXPECT_EQ(1, v2.Minor());
+        EXPECT_EQ(1, v2.Revision());
+    }
+    {  // move ctor
+        Version v(VersionTests::MakeVersion(3, 1, 1));
+        EXPECT_EQ(3, v.Major());
+        EXPECT_EQ(1, v.Minor());
+        EXPECT_EQ(1, v.Revision());
+    }
+    {  // move assign
+        Version v1(3, 1, 1);
+        EXPECT_EQ(3, v1.Major());
+        EXPECT_EQ(1, v1.Minor());
+        EXPECT_EQ(1, v1.Revision());
+
+        Version v2;
+        v2 = std::move(v1);
+        EXPECT_EQ(3, v2.Major());
+        EXPECT_EQ(1, v2.Minor());
+        EXPECT_EQ(1, v2.Revision());
+    }
+}
+
+TEST(VersionTest, FromIntsOk)
+{
+    {  // three non-negative components are stored as given
+        Version version(3, 1, 1);
+        EXPECT_EQ(3, version.Major());
+        EXPECT_EQ(1, version.Minor());
+        EXPECT_EQ(1, version.Revision());
+    }
+
+    // a negative major component is rejected
+    EXPECT_THROW(Version(-3, 1, 1), std::runtime_error);
+}
+
+TEST(VersionTest, FromStringOk)
+{
+    {  // dotted triple is parsed into its three components
+        Version version("3.1.1");
+        EXPECT_EQ(3, version.Major());
+        EXPECT_EQ(1, version.Minor());
+        EXPECT_EQ(1, version.Revision());
+    }
+
+    // a leading negative component is rejected
+    EXPECT_THROW(Version("-3.1.1"), std::runtime_error);
+
+    // text that is not numeric is rejected
+    EXPECT_THROW(Version("foo.bar.baz"), std::runtime_error);
+
+    // the empty string is rejected
+    EXPECT_THROW(Version(""), std::runtime_error);
+}
+
+TEST(VersionTest, SettersOk)
+{
+    Version v(3, 1, 1);
+
+    v.Major(4);
+
+    EXPECT_EQ(4, v.Major());
+    EXPECT_EQ(1, v.Minor());
+    EXPECT_EQ(1, v.Revision());
+
+    v.Minor(7);
+
+    EXPECT_EQ(4, v.Major());
+    EXPECT_EQ(7, v.Minor());
+    EXPECT_EQ(1, v.Revision());
+
+    v.Revision(23);
+
+    EXPECT_EQ(4, v.Major());
+    EXPECT_EQ(7, v.Minor());
+    EXPECT_EQ(23, v.Revision());
+
+    {  // invalid
+        Version v1(3, 1, 1);
+        Version v2(3, 1, 1);
+        Version v3(3, 1, 1);
+        EXPECT_THROW(v1.Major(-1), std::runtime_error);
+        EXPECT_THROW(v2.Minor(-1), std::runtime_error);
+        EXPECT_THROW(v3.Revision(-1), std::runtime_error);
+    }
+}
+
+TEST(VersionTest, ComparisonsOk)
+{
+    const Version v0_0_0 = Version(0, 0, 0);  // fixtures are named v<major>_<minor>_<revision>
+    const Version v0_0_4 = Version(0, 0, 4);
+    const Version v0_1_0 = Version(0, 1, 0);
+    const Version v0_1_4 = Version(0, 1, 4);
+    const Version v3_0_0 = Version(3, 0, 0);
+    const Version v3_0_4 = Version(3, 0, 4);
+    const Version v3_1_0 = Version(3, 1, 0);
+    const Version v3_1_4 = Version(3, 1, 4);
+    const Version v3_1_5 = Version(3, 1, 5);
+
+    // operator==: true for a fixture against itself, false for 3.1.4 against any other fixture
+    EXPECT_TRUE(v0_0_0 == v0_0_0);
+    EXPECT_TRUE(v3_0_0 == v3_0_0);
+    EXPECT_TRUE(v0_1_0 == v0_1_0);
+    EXPECT_TRUE(v0_0_4 == v0_0_4);
+    EXPECT_TRUE(v3_1_0 == v3_1_0);
+    EXPECT_TRUE(v3_1_4 == v3_1_4);
+
+    EXPECT_FALSE(v3_1_4 == v0_0_0);
+    EXPECT_FALSE(v3_1_4 == v3_0_0);
+    EXPECT_FALSE(v3_1_4 == v0_1_0);
+    EXPECT_FALSE(v3_1_4 == v0_0_4);
+    EXPECT_FALSE(v3_1_4 == v3_1_0);
+    EXPECT_FALSE(v3_1_4 == v3_1_5);
+
+    // operator!=: the same operand pairs as operator==, with every expectation inverted
+    EXPECT_FALSE(v0_0_0 != v0_0_0);
+    EXPECT_FALSE(v3_0_0 != v3_0_0);
+    EXPECT_FALSE(v0_1_0 != v0_1_0);
+    EXPECT_FALSE(v0_0_4 != v0_0_4);
+    EXPECT_FALSE(v3_1_0 != v3_1_0);
+    EXPECT_FALSE(v3_1_4 != v3_1_4);
+
+    EXPECT_TRUE(v3_1_4 != v0_0_0);
+    EXPECT_TRUE(v3_1_4 != v3_0_0);
+    EXPECT_TRUE(v3_1_4 != v0_1_0);
+    EXPECT_TRUE(v3_1_4 != v0_0_4);
+    EXPECT_TRUE(v3_1_4 != v3_1_0);
+    EXPECT_TRUE(v3_1_4 != v3_1_5);
+
+    // operator<: strict, so equal operands compare false
+    EXPECT_FALSE(v0_0_0 < v0_0_0);
+    EXPECT_TRUE(v0_0_0 < v0_0_4);
+    EXPECT_TRUE(v0_0_0 < v0_1_0);
+    EXPECT_TRUE(v0_0_0 < v3_0_0);
+    EXPECT_TRUE(v0_0_0 < v0_1_4);
+    EXPECT_TRUE(v0_0_0 < v3_0_4);
+    EXPECT_TRUE(v0_0_0 < v3_1_0);
+    EXPECT_TRUE(v0_0_0 < v3_1_4);
+
+    EXPECT_TRUE(v0_0_4 < v3_1_4);
+    EXPECT_TRUE(v0_1_0 < v3_1_4);
+    EXPECT_TRUE(v0_1_4 < v3_1_4);
+    EXPECT_TRUE(v3_0_0 < v3_1_4);
+    EXPECT_TRUE(v3_0_4 < v3_1_4);
+    EXPECT_TRUE(v3_1_0 < v3_1_4);
+    EXPECT_FALSE(v3_1_4 < v3_1_4);
+    EXPECT_FALSE(v3_1_5 < v3_1_4);
+
+    EXPECT_FALSE(v3_1_4 < v0_0_0);
+
+    // operator<=: same pairs as operator<, but equal operands compare true
+    EXPECT_TRUE(v0_0_0 <= v0_0_0);
+    EXPECT_TRUE(v0_0_0 <= v0_0_4);
+    EXPECT_TRUE(v0_0_0 <= v0_1_0);
+    EXPECT_TRUE(v0_0_0 <= v3_0_0);
+    EXPECT_TRUE(v0_0_0 <= v0_1_4);
+    EXPECT_TRUE(v0_0_0 <= v3_0_4);
+    EXPECT_TRUE(v0_0_0 <= v3_1_0);
+    EXPECT_TRUE(v0_0_0 <= v3_1_4);
+
+    EXPECT_TRUE(v0_0_4 <= v3_1_4);
+    EXPECT_TRUE(v0_1_0 <= v3_1_4);
+    EXPECT_TRUE(v0_1_4 <= v3_1_4);
+    EXPECT_TRUE(v3_0_0 <= v3_1_4);
+    EXPECT_TRUE(v3_0_4 <= v3_1_4);
+    EXPECT_TRUE(v3_1_0 <= v3_1_4);
+    EXPECT_TRUE(v3_1_4 <= v3_1_4);
+    EXPECT_FALSE(v3_1_5 <= v3_1_4);
+
+    EXPECT_FALSE(v3_1_4 <= v0_0_0);
+
+    // operator>: same pairs as operator<=, with every expectation inverted
+    EXPECT_FALSE(v0_0_0 > v0_0_0);
+    EXPECT_FALSE(v0_0_0 > v0_0_4);
+    EXPECT_FALSE(v0_0_0 > v0_1_0);
+    EXPECT_FALSE(v0_0_0 > v3_0_0);
+    EXPECT_FALSE(v0_0_0 > v0_1_4);
+    EXPECT_FALSE(v0_0_0 > v3_0_4);
+    EXPECT_FALSE(v0_0_0 > v3_1_0);
+    EXPECT_FALSE(v0_0_0 > v3_1_4);
+
+    EXPECT_FALSE(v0_0_4 > v3_1_4);
+    EXPECT_FALSE(v0_1_0 > v3_1_4);
+    EXPECT_FALSE(v0_1_4 > v3_1_4);
+    EXPECT_FALSE(v3_0_0 > v3_1_4);
+    EXPECT_FALSE(v3_0_4 > v3_1_4);
+    EXPECT_FALSE(v3_1_0 > v3_1_4);
+    EXPECT_FALSE(v3_1_4 > v3_1_4);
+    EXPECT_TRUE(v3_1_5 > v3_1_4);
+
+    EXPECT_TRUE(v3_1_4 > v0_0_0);
+
+    // operator>=: same pairs as operator<, with every expectation inverted
+    EXPECT_TRUE(v0_0_0 >= v0_0_0);
+    EXPECT_FALSE(v0_0_0 >= v0_0_4);
+    EXPECT_FALSE(v0_0_0 >= v0_1_0);
+    EXPECT_FALSE(v0_0_0 >= v3_0_0);
+    EXPECT_FALSE(v0_0_0 >= v0_1_4);
+    EXPECT_FALSE(v0_0_0 >= v3_0_4);
+    EXPECT_FALSE(v0_0_0 >= v3_1_0);
+    EXPECT_FALSE(v0_0_0 >= v3_1_4);
+
+    EXPECT_FALSE(v0_0_4 >= v3_1_4);
+    EXPECT_FALSE(v0_1_0 >= v3_1_4);
+    EXPECT_FALSE(v0_1_4 >= v3_1_4);
+    EXPECT_FALSE(v3_0_0 >= v3_1_4);
+    EXPECT_FALSE(v3_0_4 >= v3_1_4);
+    EXPECT_FALSE(v3_1_0 >= v3_1_4);
+    EXPECT_TRUE(v3_1_4 >= v3_1_4);
+    EXPECT_TRUE(v3_1_5 >= v3_1_4);
+
+    EXPECT_TRUE(v3_1_4 >= v0_0_0);
+}
+
+TEST(VersionTest, ToStringOk)
+{
+    {
+        Version v(0, 0, 0);
+        EXPECT_EQ(std::string("0.0.0"), v.ToString());
+    }
+    {
+        Version v(3, 1, 4);
+        EXPECT_EQ(std::string("3.1.4"), v.ToString());
+    }
+    {
+        Version v;
+        v.Major(4);
+        EXPECT_EQ(std::string("4.0.0"), v.ToString());
+    }
+    {
+        const std::string s = "1.2.3";
+        Version v(s);
+        EXPECT_EQ(s, v.ToString());
+    }
+}
+
+TEST(VersionTest, OutputStreamOk)
+{
+    Version first(3, 1, 4);
+    Version second(4, 10, 0);
+
+    std::ostringstream out;  // the same version is streamed twice around a different one
+    out << first << ", " << second << ", " << first << '\n';
+
+    EXPECT_EQ(std::string("3.1.4, 4.10.0, 3.1.4\n"), out.str());
+}
diff --git a/tests/src/test_WhitelistedZmwReadStitcher.cpp b/tests/src/test_WhitelistedZmwReadStitcher.cpp

new file mode 100644 (file)

index 0000000..64359d9
--- /dev/null
+++ b/tests/src/test_WhitelistedZmwReadStitcher.cpp
@@ -0,0 +1,220 @@
+// Author: Derek Barnett
+
+#include <cstdint>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/BamFile.h>
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/PbiRawData.h>
+#include <pbbam/virtual/WhitelistedZmwReadStitcher.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace WhitelistedZmwReadStitcherTests {
+// Expects both records to report every listed tag as present, then expects each accessor to agree.
+static void Compare(const BamRecord& b1, const BamRecord& b2)
+{
+    EXPECT_TRUE(b1.HasDeletionQV());  // b1: every Has*() in this group must be true
+    EXPECT_TRUE(b1.HasDeletionTag());
+    EXPECT_TRUE(b1.HasInsertionQV());
+    EXPECT_TRUE(b1.HasMergeQV());
+    EXPECT_TRUE(b1.HasSubstitutionQV());
+    EXPECT_TRUE(b1.HasSubstitutionTag());
+    EXPECT_TRUE(b1.HasLabelQV());
+    EXPECT_TRUE(b1.HasAltLabelQV());
+    EXPECT_TRUE(b1.HasAltLabelTag());
+    EXPECT_TRUE(b1.HasPkmean());
+    EXPECT_TRUE(b1.HasPkmid());
+    EXPECT_TRUE(b1.HasPulseCall());
+    EXPECT_TRUE(b1.HasIPD());
+    EXPECT_TRUE(b1.HasPulseWidth());
+    EXPECT_TRUE(b1.HasPrePulseFrames());
+    EXPECT_TRUE(b1.HasPulseCallWidth());
+    EXPECT_TRUE(b1.HasPulseMergeQV());
+
+    EXPECT_TRUE(b2.HasDeletionQV());  // b2: the same Has*() checks as for b1
+    EXPECT_TRUE(b2.HasDeletionTag());
+    EXPECT_TRUE(b2.HasInsertionQV());
+    EXPECT_TRUE(b2.HasMergeQV());
+    EXPECT_TRUE(b2.HasSubstitutionQV());
+    EXPECT_TRUE(b2.HasSubstitutionTag());
+    EXPECT_TRUE(b2.HasLabelQV());
+    EXPECT_TRUE(b2.HasAltLabelQV());
+    EXPECT_TRUE(b2.HasAltLabelTag());
+    EXPECT_TRUE(b2.HasPkmean());
+    EXPECT_TRUE(b2.HasPkmid());
+    EXPECT_TRUE(b2.HasPulseCall());
+    EXPECT_TRUE(b2.HasIPD());
+    EXPECT_TRUE(b2.HasPulseWidth());
+    EXPECT_TRUE(b2.HasPrePulseFrames());
+    EXPECT_TRUE(b2.HasPulseCallWidth());
+    EXPECT_TRUE(b2.HasPulseMergeQV());
+
+    EXPECT_EQ(b1.FullName(), b2.FullName());  // accessor-by-accessor equality of the two records
+    EXPECT_EQ(b1.HoleNumber(), b2.HoleNumber());
+    EXPECT_EQ(b1.NumPasses(), b2.NumPasses());
+    EXPECT_EQ(b1.Sequence(), b2.Sequence());
+    EXPECT_EQ(b1.Qualities(), b2.Qualities());
+    EXPECT_EQ(b1.DeletionQV(), b2.DeletionQV());
+    EXPECT_EQ(b1.DeletionTag(), b2.DeletionTag());
+    EXPECT_EQ(b1.InsertionQV(), b2.InsertionQV());
+    EXPECT_EQ(b1.MergeQV(), b2.MergeQV());
+    EXPECT_EQ(b1.SubstitutionQV(), b2.SubstitutionQV());
+    EXPECT_EQ(b1.SubstitutionTag(), b2.SubstitutionTag());
+    EXPECT_EQ(b1.LabelQV(), b2.LabelQV());
+    EXPECT_EQ(b1.AltLabelQV(), b2.AltLabelQV());
+    EXPECT_EQ(b1.AltLabelTag(), b2.AltLabelTag());
+    EXPECT_EQ(b1.Pkmean(), b2.Pkmean());
+    EXPECT_EQ(b1.Pkmid(), b2.Pkmid());
+    EXPECT_EQ(b1.PulseCall(), b2.PulseCall());
+    EXPECT_EQ(b1.IPD(), b2.IPD());
+    EXPECT_EQ(b1.PulseWidth(), b2.PulseWidth());
+    EXPECT_EQ(b1.PrePulseFrames(), b2.PrePulseFrames());
+    EXPECT_EQ(b1.PulseCallWidth(), b2.PulseCallWidth());
+    EXPECT_EQ(b1.ReadGroup(), b2.ReadGroup());
+    EXPECT_EQ(b1.PulseMergeQV(), b2.PulseMergeQV());
+}
+
+}  // namespace WhitelistedZmwReadStitcherTests
+
+TEST(WhitelistedZmwReadStitching, EmptyList)
+{
+    const std::vector<int32_t> noZmws = {};  // empty whitelist: nothing should be offered
+    const std::string dataDir = PbbamTestsConfig::Data_Dir + "/polymerase/";
+    WhitelistedZmwReadStitcher stitcher(noZmws, dataDir + "internal.subreads.bam",
+                                        dataDir + "internal.scraps.bam");
+    EXPECT_FALSE(stitcher.HasNext());
+    EXPECT_TRUE(stitcher.NextRaw().empty());
+}
+
+TEST(WhitelistedZmwReadStitching, SingleValue)
+{
+    const std::vector<int32_t> whitelist = {200000};
+    WhitelistedZmwReadStitcher stitcher(
+        whitelist, PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam",
+        PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam");
+
+    // create virtual record
+    EXPECT_TRUE(stitcher.HasNext());
+    auto virtualRecord = stitcher.Next();
+    EXPECT_FALSE(stitcher.HasNext());
+
+    // fetch original polymerase read (2nd record)
+    BamFile polyBam(PbbamTestsConfig::Data_Dir + "/polymerase/internal.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    EXPECT_TRUE(begin != end);
+    ++begin;
+    EXPECT_TRUE(begin != end);
+    auto polyRecord = *begin++;
+
+    EXPECT_EQ(200000, virtualRecord.HoleNumber());
+
+    WhitelistedZmwReadStitcherTests::Compare(polyRecord, virtualRecord);
+}
+
+TEST(WhitelistedZmwReadStitching, UnknownZmw)
+{
+    const std::vector<int32_t> unknownZmws{42};  // ZMW not in our files
+    const std::string dataDir = PbbamTestsConfig::Data_Dir + "/polymerase/";
+    WhitelistedZmwReadStitcher stitcher(unknownZmws, dataDir + "internal.subreads.bam",
+                                        dataDir + "internal.scraps.bam");
+    // nothing matches the whitelist, so no record is offered
+    EXPECT_FALSE(stitcher.HasNext());
+    EXPECT_TRUE(stitcher.NextRaw().empty());
+}
+
+TEST(WhitelistedZmwReadStitching, MultiValue)
+{
+    const std::vector<int32_t> whitelist = {100000, 300000};
+    WhitelistedZmwReadStitcher stitcher(
+        whitelist, PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam",
+        PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam");
+
+    // create virtual records
+    EXPECT_TRUE(stitcher.HasNext());
+    auto virtualRecord1 = stitcher.Next();
+    EXPECT_TRUE(stitcher.HasNext());
+    auto virtualRecord2 = stitcher.Next();
+    EXPECT_FALSE(stitcher.HasNext());
+
+    // fetch original polymerase reads (2nd record)
+    BamFile polyBam(PbbamTestsConfig::Data_Dir + "/polymerase/internal.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+
+    EXPECT_TRUE(begin != end);
+    auto polyRecord1 = *begin++;
+    EXPECT_TRUE(begin != end);
+    ++begin;
+    EXPECT_TRUE(begin != end);
+    auto polyRecord2 = *begin++;
+    EXPECT_TRUE(begin == end);
+
+    EXPECT_EQ(100000, virtualRecord1.HoleNumber());
+    EXPECT_EQ(300000, virtualRecord2.HoleNumber());
+
+    WhitelistedZmwReadStitcherTests::Compare(polyRecord1, virtualRecord1);
+    WhitelistedZmwReadStitcherTests::Compare(polyRecord2, virtualRecord2);
+}
+
+TEST(WhitelistedZmwReadStitching, MultiValue_MixedKnownAndUnknown)
+{
+    const std::vector<int32_t> whitelist{42, 200000, 24};
+    WhitelistedZmwReadStitcher stitcher(
+        whitelist, PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam",
+        PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam");
+
+    // everything below should behave exactly as 'SingleValueOk' test,
+    // as the unknown ZMWs will have been removed during construction
+
+    // create virtual record
+    EXPECT_TRUE(stitcher.HasNext());
+    auto virtualRecord = stitcher.Next();
+    EXPECT_FALSE(stitcher.HasNext());
+
+    // fetch original polymerase read (2nd record)
+    BamFile polyBam(PbbamTestsConfig::Data_Dir + "/polymerase/internal.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    EXPECT_TRUE(begin != end);
+    ++begin;
+    EXPECT_TRUE(begin != end);
+    auto polyRecord = *begin++;
+
+    EXPECT_EQ(200000, virtualRecord.HoleNumber());
+
+    WhitelistedZmwReadStitcherTests::Compare(polyRecord, virtualRecord);
+}
+
+TEST(WhitelistedZmwReadStitching, EmptyScrapsFileOk)
+{
+    const std::vector<int32_t> whitelist = {10944689, 10944690};
+    const std::string primaryBamFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/scrapless.subreads.bam";
+    const std::string scrapsBamFn = PbbamTestsConfig::Data_Dir + "/polymerase/scrapless.scraps.bam";
+
+    int count = 0;
+    WhitelistedZmwReadStitcher stitcher(whitelist, primaryBamFn, scrapsBamFn);
+    while (stitcher.HasNext()) {
+        auto record = stitcher.Next();
+        //        ()record;
+        ++count;
+    }
+    EXPECT_EQ(2, count);
+
+    const BamFile primaryBam(primaryBamFn);
+    const BamFile scrapsBam(scrapsBamFn);
+    const PbiRawData primaryIdx(primaryBam.PacBioIndexFilename());
+    const PbiRawData scrapsIdx(scrapsBam.PacBioIndexFilename());
+    EXPECT_EQ(3, primaryIdx.NumReads());
+    EXPECT_EQ(0, scrapsIdx.NumReads());
+}
diff --git a/tests/src/test_ZmwQuery.cpp b/tests/src/test_ZmwQuery.cpp

new file mode 100644 (file)

index 0000000..77ee1c2
--- /dev/null
+++ b/tests/src/test_ZmwQuery.cpp
@@ -0,0 +1,31 @@
+// Author: Derek Barnett
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/ZmwQuery.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+//TEST(EntireFileQueryTest, CountRecords)
+//{
+//    EXPECT_NO_THROW(
+//    {
+//        // open input BAM file
+//        BamFile bamFile(inputBamFn);
+
+//        // count records
+//        int count = 0;
+//        EntireFileQuery entireFile(bamFile);
+//        for (const BamRecord& record : entireFile) {
+//            (void)record;
+//            ++count;
+//        }
+
+//        EXPECT_EQ(3307, count);
+//    });
+//}
diff --git a/tests/src/test_ZmwReadStitcher.cpp b/tests/src/test_ZmwReadStitcher.cpp

new file mode 100644 (file)

index 0000000..b2c4298
--- /dev/null
+++ b/tests/src/test_ZmwReadStitcher.cpp
@@ -0,0 +1,475 @@
+// Author: Derek Barnett
+
+#include <cstddef>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "PbbamTestData.h"
+
+#include <pbbam/EntireFileQuery.h>
+#include <pbbam/PbiFilter.h>
+#include <pbbam/virtual/VirtualPolymeraseCompositeReader.h>
+#include <pbbam/virtual/VirtualPolymeraseReader.h>
+#include <pbbam/virtual/ZmwReadStitcher.h>
+
+using namespace PacBio;
+using namespace PacBio::BAM;
+
+namespace ZmwReadStitcherTests {
+
+// Expects that `record` carries every base- and pulse-level tag that Compare()
+// goes on to compare.
+static void ExpectAllTagsPresent(const BamRecord& record)
+{
+    EXPECT_TRUE(record.HasDeletionQV());
+    EXPECT_TRUE(record.HasDeletionTag());
+    EXPECT_TRUE(record.HasInsertionQV());
+    EXPECT_TRUE(record.HasMergeQV());
+    EXPECT_TRUE(record.HasSubstitutionQV());
+    EXPECT_TRUE(record.HasSubstitutionTag());
+    EXPECT_TRUE(record.HasLabelQV());
+    EXPECT_TRUE(record.HasAltLabelQV());
+    EXPECT_TRUE(record.HasAltLabelTag());
+    EXPECT_TRUE(record.HasPkmean());
+    EXPECT_TRUE(record.HasPkmid());
+    EXPECT_TRUE(record.HasPulseCall());
+    EXPECT_TRUE(record.HasIPD());
+    EXPECT_TRUE(record.HasPulseWidth());
+    EXPECT_TRUE(record.HasPrePulseFrames());
+    EXPECT_TRUE(record.HasPulseCallWidth());
+    EXPECT_TRUE(record.HasPulseMergeQV());
+}
+
+// Checks that two records are equivalent: both must carry the full tag set,
+// and every compared field of b1 must equal the same field of b2.
+//
+// All checks are non-fatal (EXPECT_*), so a single call reports every mismatch.
+static void Compare(const BamRecord& b1, const BamRecord& b2)
+{
+    // SCOPED_TRACE labels any failure below with the record it belongs to,
+    // since the shared helper can no longer tell b1 from b2 by expression text.
+    {
+        SCOPED_TRACE("b1");
+        ExpectAllTagsPresent(b1);
+    }
+    {
+        SCOPED_TRACE("b2");
+        ExpectAllTagsPresent(b2);
+    }
+
+    EXPECT_EQ(b1.FullName(), b2.FullName());
+    EXPECT_EQ(b1.HoleNumber(), b2.HoleNumber());
+    EXPECT_EQ(b1.NumPasses(), b2.NumPasses());
+    EXPECT_EQ(b1.Sequence(), b2.Sequence());
+    EXPECT_EQ(b1.Qualities(), b2.Qualities());
+    EXPECT_EQ(b1.DeletionQV(), b2.DeletionQV());
+    EXPECT_EQ(b1.DeletionTag(), b2.DeletionTag());
+    EXPECT_EQ(b1.InsertionQV(), b2.InsertionQV());
+    EXPECT_EQ(b1.MergeQV(), b2.MergeQV());
+    EXPECT_EQ(b1.SubstitutionQV(), b2.SubstitutionQV());
+    EXPECT_EQ(b1.SubstitutionTag(), b2.SubstitutionTag());
+    EXPECT_EQ(b1.LabelQV(), b2.LabelQV());
+    EXPECT_EQ(b1.AltLabelQV(), b2.AltLabelQV());
+    EXPECT_EQ(b1.AltLabelTag(), b2.AltLabelTag());
+    EXPECT_EQ(b1.Pkmean(), b2.Pkmean());
+    EXPECT_EQ(b1.Pkmid(), b2.Pkmid());
+    EXPECT_EQ(b1.PulseCall(), b2.PulseCall());
+    EXPECT_EQ(b1.IPD(), b2.IPD());
+    EXPECT_EQ(b1.PulseWidth(), b2.PulseWidth());
+    EXPECT_EQ(b1.PrePulseFrames(), b2.PrePulseFrames());
+    EXPECT_EQ(b1.PulseCallWidth(), b2.PulseCallWidth());
+    EXPECT_EQ(b1.ReadGroup(), b2.ReadGroup());
+    EXPECT_EQ(b1.PulseMergeQV(), b2.PulseMergeQV());
+}
+
+// Stitches every ZMW read from a primary/scraps BAM pair and returns how many
+// virtual records were produced.
+static size_t NumVirtualRecords(const std::string& primaryBamFn, const std::string& scrapsBamFn)
+{
+    size_t numRecords = 0;
+    ZmwReadStitcher stitcher(primaryBamFn, scrapsBamFn);
+    for (; stitcher.HasNext(); ++numRecords) {
+        (void)stitcher.Next();  // record contents are irrelevant; only the tally matters
+    }
+    return numRecords;
+}
+
+}  // namespace ZmwReadStitcherTests
+
+TEST(ZmwReadStitching, FromBams_NoFilter)
+{
+    // No filter supplied: expect one stitched record per ZMW in the BAM pair.
+    const std::string primaryFn = PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam";
+    const std::string scrapsFn = PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam";
+
+    ZmwReadStitcher stitcher(primaryFn, scrapsFn);
+    size_t numRecords = 0;
+    for (; stitcher.HasNext(); ++numRecords) {
+        (void)stitcher.Next();  // only the tally matters here
+    }
+    EXPECT_EQ(3, numRecords);
+}
+
+TEST(ZmwReadStitching, FromBams_Filtered)
+{
+    // Restrict stitching to hole number 100000 (setup to match DataSet w/ filter).
+    const std::string primaryFn = PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam";
+    const std::string scrapsFn = PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam";
+    PbiFilter zmwFilter{PbiZmwFilter{100000}};
+
+    ZmwReadStitcher stitcher(primaryFn, scrapsFn, zmwFilter);
+    size_t numRecords = 0;
+    for (; stitcher.HasNext(); ++numRecords) {
+        // every record that survives the filter must come from the requested ZMW
+        const auto stitched = stitcher.Next();
+        EXPECT_EQ(100000, stitched.HoleNumber());
+    }
+    EXPECT_EQ(1, numRecords);
+}
+
+TEST(ZmwReadStitching, FromDataSet_NoFilter)
+{
+    // The dataset XML references two BAM pairs (subreads/scraps + hqregion/scraps).
+    // Stitching through the dataset must yield as many records as stitching each
+    // pair on its own.
+    const std::string subreadsFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/production.subreads.bam";
+    const std::string subreadsScrapsFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/production.scraps.bam";
+    const std::string hqRegionFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/production_hq.hqregion.bam";
+    const std::string hqRegionScrapsFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/production_hq.scraps.bam";
+
+    // baseline: per-pair tallies, summed
+    const size_t fromSubreads = ZmwReadStitcherTests::NumVirtualRecords(subreadsFn, subreadsScrapsFn);
+    const size_t fromHqRegion = ZmwReadStitcherTests::NumVirtualRecords(hqRegionFn, hqRegionScrapsFn);
+    const size_t numExpectedRecords = fromSubreads + fromHqRegion;
+
+    // observed: tally via the dataset
+    const std::string datasetFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/multiple_resources.subread.dataset.xml";
+    DataSet ds{datasetFn};
+    ZmwReadStitcher stitcher{ds};
+    size_t numObservedRecords = 0;
+    for (; stitcher.HasNext(); ++numObservedRecords) {
+        (void)stitcher.Next();
+    }
+    EXPECT_EQ(numExpectedRecords, numObservedRecords);
+}
+
+TEST(ZmwReadStitching, FromDataSet_Filtered)
+{
+    // The dataset XML references three BAM pairs:
+    // production subreads, internal subreads, and production hqregion.
+    const std::string productionFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/production.subreads.bam";
+    const std::string productionScrapsFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/production.scraps.bam";
+    const std::string internalFn = PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam";
+    const std::string internalScrapsFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam";
+    const std::string hqRegionFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/production_hq.hqregion.bam";
+    const std::string hqRegionScrapsFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/production_hq.scraps.bam";
+
+    // unfiltered, the three pairs hold 5 virtual records between them
+    size_t totalRecords = 0;
+    totalRecords += ZmwReadStitcherTests::NumVirtualRecords(productionFn, productionScrapsFn);
+    totalRecords += ZmwReadStitcherTests::NumVirtualRecords(internalFn, internalScrapsFn);
+    totalRecords += ZmwReadStitcherTests::NumVirtualRecords(hqRegionFn, hqRegionScrapsFn);
+    EXPECT_EQ(5, totalRecords);
+
+    // our filter will remove the 2 "production" BAM pairs
+    // using a ZMW filter that only the "internal" pair should pass
+    const std::string datasetFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/filtered_resources.subread.dataset.xml";
+    DataSet ds{datasetFn};
+    ZmwReadStitcher stitcher{ds};
+    size_t numObservedRecords = 0;
+    for (; stitcher.HasNext(); ++numObservedRecords) {
+        (void)stitcher.Next();
+    }
+    EXPECT_EQ(1, numObservedRecords);
+}
+
+TEST(ZmwReadStitching, FromDataSet_EmptyDataSet)
+{
+    // A default-constructed dataset gives the stitcher nothing to read.
+    const DataSet emptyDataSet{};
+    ZmwReadStitcher stitcher{emptyDataSet};
+    EXPECT_FALSE(stitcher.HasNext());
+}
+
+TEST(ZmwReadStitching, EmptyScrapsFile)
+{
+    // Stitching must still work when the scraps BAM contains no records.
+    const std::string subreadsFn =
+        PbbamTestsConfig::Data_Dir + "/polymerase/scrapless.subreads.bam";
+    const std::string scrapsFn = PbbamTestsConfig::Data_Dir + "/polymerase/scrapless.scraps.bam";
+
+    // sanity-check the inputs through their PBI indexes: 3 primary reads, 0 scraps
+    const BamFile subreadsBam(subreadsFn);
+    const BamFile scrapsBam(scrapsFn);
+    const PbiRawData subreadsIndex(subreadsBam.PacBioIndexFilename());
+    const PbiRawData scrapsIndex(scrapsBam.PacBioIndexFilename());
+    EXPECT_EQ(3, subreadsIndex.NumReads());
+    EXPECT_EQ(0, scrapsIndex.NumReads());
+
+    // all 3 primary reads should come back as stitched records
+    ZmwReadStitcher stitcher(subreadsFn, scrapsFn);
+    int numStitched = 0;
+    for (; stitcher.HasNext(); ++numStitched) {
+        (void)stitcher.Next();
+    }
+    EXPECT_EQ(3, numStitched);
+}
+
+TEST(ZmwReadStitching, VirtualRegions)
+{
+    // Checks the virtual region annotations of the first stitched record against
+    // known coordinates, via both accessors (full map and per-type table).
+
+    // Create virtual polymerase read
+    ZmwReadStitcher stitcher(PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam",
+                             PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam");
+    ASSERT_TRUE(stitcher.HasNext());  // fatal: Next() below needs a record
+    auto virtualRecord = stitcher.Next();
+
+    auto regionMap = virtualRecord.VirtualRegionsMap();
+    auto adapter = virtualRecord.VirtualRegionsTable(VirtualRegionType::ADAPTER);
+
+    // Compare different accessors to same source
+    EXPECT_EQ(regionMap[VirtualRegionType::ADAPTER], adapter);
+
+    // Compare to truth
+    // (fatal size checks guard the indexed accesses that follow)
+    ASSERT_GE(adapter.size(), 7u);
+    EXPECT_EQ(3047, adapter[0].beginPos);
+    EXPECT_EQ(3095, adapter[0].endPos);
+    EXPECT_EQ(3650, adapter[1].beginPos);
+    EXPECT_EQ(3700, adapter[1].endPos);
+    EXPECT_EQ(4289, adapter[2].beginPos);
+    EXPECT_EQ(4335, adapter[2].endPos);
+    EXPECT_EQ(4888, adapter[3].beginPos);
+    EXPECT_EQ(4939, adapter[3].endPos);
+    EXPECT_EQ(5498, adapter[4].beginPos);
+    EXPECT_EQ(5546, adapter[4].endPos);
+    EXPECT_EQ(6116, adapter[5].beginPos);
+    EXPECT_EQ(6173, adapter[5].endPos);
+    EXPECT_EQ(6740, adapter[6].beginPos);
+    EXPECT_EQ(6790, adapter[6].endPos);
+
+    auto barcode = virtualRecord.VirtualRegionsTable(VirtualRegionType::BARCODE);
+    EXPECT_EQ(regionMap[VirtualRegionType::BARCODE], barcode);
+    ASSERT_GE(barcode.size(), 14u);
+    EXPECT_EQ(3025, barcode[0].beginPos);
+    EXPECT_EQ(3047, barcode[0].endPos);
+    EXPECT_EQ(3095, barcode[1].beginPos);
+    EXPECT_EQ(3116, barcode[1].endPos);
+    EXPECT_EQ(3628, barcode[2].beginPos);
+    EXPECT_EQ(3650, barcode[2].endPos);
+    EXPECT_EQ(3700, barcode[3].beginPos);
+    EXPECT_EQ(3722, barcode[3].endPos);
+    EXPECT_EQ(4267, barcode[4].beginPos);
+    EXPECT_EQ(4289, barcode[4].endPos);
+    EXPECT_EQ(4335, barcode[5].beginPos);
+    EXPECT_EQ(4356, barcode[5].endPos);
+    EXPECT_EQ(4864, barcode[6].beginPos);
+    EXPECT_EQ(4888, barcode[6].endPos);
+    EXPECT_EQ(4939, barcode[7].beginPos);
+    EXPECT_EQ(4960, barcode[7].endPos);
+    EXPECT_EQ(5477, barcode[8].beginPos);
+    EXPECT_EQ(5498, barcode[8].endPos);
+    EXPECT_EQ(5546, barcode[9].beginPos);
+    EXPECT_EQ(5571, barcode[9].endPos);
+    EXPECT_EQ(6087, barcode[10].beginPos);
+    EXPECT_EQ(6116, barcode[10].endPos);
+    EXPECT_EQ(6173, barcode[11].beginPos);
+    EXPECT_EQ(6199, barcode[11].endPos);
+    EXPECT_EQ(6719, barcode[12].beginPos);
+    EXPECT_EQ(6740, barcode[12].endPos);
+    EXPECT_EQ(6790, barcode[13].beginPos);
+    EXPECT_EQ(6812, barcode[13].endPos);
+
+    auto lqregion = virtualRecord.VirtualRegionsTable(VirtualRegionType::LQREGION);
+    EXPECT_EQ(regionMap[VirtualRegionType::LQREGION], lqregion);
+    ASSERT_GE(lqregion.size(), 2u);
+    EXPECT_EQ(0, lqregion[0].beginPos);
+    EXPECT_EQ(2659, lqregion[0].endPos);
+    EXPECT_EQ(7034, lqregion[1].beginPos);
+    EXPECT_EQ(7035, lqregion[1].endPos);
+
+    auto hqregion = virtualRecord.VirtualRegionsTable(VirtualRegionType::HQREGION);
+    EXPECT_EQ(regionMap[VirtualRegionType::HQREGION], hqregion);
+    ASSERT_GE(hqregion.size(), 1u);
+    EXPECT_EQ(2659, hqregion[0].beginPos);
+    EXPECT_EQ(7034, hqregion[0].endPos);
+}
+
+TEST(ZmwReadStitching, InternalSubreadsToOriginal)
+{
+    // The read stitched from subreads + scraps must match the first record of
+    // the original polymerase BAM.
+
+    // Create virtual polymerase read
+    ZmwReadStitcher stitcher(PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam",
+                             PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam");
+    ASSERT_TRUE(stitcher.HasNext());  // fatal: Next() below needs a record
+    auto virtualRecord = stitcher.Next();
+
+    // Read original polymerase read
+    BamFile polyBam(PbbamTestsConfig::Data_Dir + "/polymerase/internal.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    ASSERT_TRUE(begin != end);  // fatal: never dereference the end iterator
+    auto polyRecord = *begin;
+
+    // check
+    ZmwReadStitcherTests::Compare(polyRecord, virtualRecord);
+}
+
+TEST(ZmwReadStitching, InternalHQToOriginal)
+{
+    // The read stitched from hqregions + lqregions must match the first record
+    // of the original polymerase BAM.
+
+    // Create virtual polymerase read
+    ZmwReadStitcher stitcher(PbbamTestsConfig::Data_Dir + "/polymerase/internal.hqregions.bam",
+                             PbbamTestsConfig::Data_Dir + "/polymerase/internal.lqregions.bam");
+    ASSERT_TRUE(stitcher.HasNext());  // fatal: Next() below needs a record
+    auto virtualRecord = stitcher.Next();
+
+    // Read original polymerase read
+    BamFile polyBam(PbbamTestsConfig::Data_Dir + "/polymerase/internal.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    ASSERT_TRUE(begin != end);  // fatal: never dereference the end iterator
+    auto polyRecord = *begin;
+
+    // check
+    ZmwReadStitcherTests::Compare(polyRecord, virtualRecord);
+}
+
+TEST(ZmwReadStitching, ProductionSubreadsToOriginal)
+{
+    // The single read stitched from the production subreads + scraps must match
+    // the first record of the original polymerase BAM, field by field.
+
+    // Create virtual polymerase read
+    ZmwReadStitcher stitcher(PbbamTestsConfig::Data_Dir + "/polymerase/production.subreads.bam",
+                             PbbamTestsConfig::Data_Dir + "/polymerase/production.scraps.bam");
+
+    ASSERT_TRUE(stitcher.HasNext());  // fatal: Next() below needs a record
+    auto virtualRecord = stitcher.Next();
+    EXPECT_FALSE(stitcher.HasNext());
+
+    // Read original polymerase read
+    BamFile polyBam(PbbamTestsConfig::Data_Dir + "/polymerase/production.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    ASSERT_TRUE(begin != end);  // fatal: never dereference the end iterator
+    auto polyRecord = *begin;
+
+    EXPECT_EQ(polyRecord.FullName(), virtualRecord.FullName());
+    EXPECT_EQ(polyRecord.HoleNumber(), virtualRecord.HoleNumber());
+    EXPECT_FLOAT_EQ(polyRecord.ReadAccuracy(), virtualRecord.ReadAccuracy());
+    EXPECT_EQ(polyRecord.NumPasses(), virtualRecord.NumPasses());
+    EXPECT_EQ(polyRecord.Sequence(), virtualRecord.Sequence());
+    EXPECT_EQ(polyRecord.Qualities(), virtualRecord.Qualities());
+    EXPECT_EQ(polyRecord.DeletionQV(), virtualRecord.DeletionQV());
+    EXPECT_EQ(polyRecord.DeletionTag(), virtualRecord.DeletionTag());
+    EXPECT_EQ(polyRecord.InsertionQV(), virtualRecord.InsertionQV());
+    EXPECT_EQ(polyRecord.MergeQV(), virtualRecord.MergeQV());
+    EXPECT_EQ(polyRecord.SubstitutionQV(), virtualRecord.SubstitutionQV());
+    EXPECT_EQ(polyRecord.SubstitutionTag(), virtualRecord.SubstitutionTag());
+    EXPECT_EQ(polyRecord.IPD(), virtualRecord.IPDV1Frames());
+    EXPECT_EQ(polyRecord.ReadGroup(), virtualRecord.ReadGroup());
+}
+
+TEST(ZmwReadStitching, ProductionHQToOriginal)
+{
+    // The single read stitched from the production hqregion + scraps must match
+    // the first record of the original polymerase BAM, and both records must
+    // expose the same set of tags (no pulse-level tags in this data).
+
+    // Create virtual polymerase read
+    ZmwReadStitcher stitcher(PbbamTestsConfig::Data_Dir + "/polymerase/production_hq.hqregion.bam",
+                             PbbamTestsConfig::Data_Dir + "/polymerase/production_hq.scraps.bam");
+    ASSERT_TRUE(stitcher.HasNext());  // fatal: Next() below needs a record
+    auto virtualRecord = stitcher.Next();
+    EXPECT_FALSE(stitcher.HasNext());
+
+    // Read original polymerase read
+    BamFile polyBam(PbbamTestsConfig::Data_Dir + "/polymerase/production.polymerase.bam");
+    EntireFileQuery polyQuery(polyBam);
+
+    auto begin = polyQuery.begin();
+    auto end = polyQuery.end();
+    ASSERT_TRUE(begin != end);  // fatal: never dereference the end iterator
+    auto polyRecord = *begin;
+
+    EXPECT_FALSE(polyRecord.HasPulseCall());
+    EXPECT_FALSE(virtualRecord.HasPulseCall());
+    EXPECT_EQ(polyRecord.FullName(), virtualRecord.FullName());
+    EXPECT_EQ(polyRecord.HoleNumber(), virtualRecord.HoleNumber());
+    EXPECT_EQ(polyRecord.ReadAccuracy(), virtualRecord.ReadAccuracy());
+    EXPECT_EQ(polyRecord.NumPasses(), virtualRecord.NumPasses());
+    EXPECT_EQ(polyRecord.Sequence(), virtualRecord.Sequence());
+    EXPECT_EQ(polyRecord.Qualities(), virtualRecord.Qualities());
+    EXPECT_EQ(polyRecord.DeletionQV(), virtualRecord.DeletionQV());
+    EXPECT_EQ(polyRecord.DeletionTag(), virtualRecord.DeletionTag());
+    EXPECT_EQ(polyRecord.InsertionQV(), virtualRecord.InsertionQV());
+    EXPECT_EQ(polyRecord.MergeQV(), virtualRecord.MergeQV());
+    EXPECT_EQ(polyRecord.SubstitutionQV(), virtualRecord.SubstitutionQV());
+    EXPECT_EQ(polyRecord.SubstitutionTag(), virtualRecord.SubstitutionTag());
+    EXPECT_EQ(polyRecord.IPD(), virtualRecord.IPDV1Frames());
+    EXPECT_EQ(polyRecord.ReadGroup(), virtualRecord.ReadGroup());
+
+    // tag presence on the original record
+    EXPECT_TRUE(polyRecord.HasDeletionQV());
+    EXPECT_TRUE(polyRecord.HasDeletionTag());
+    EXPECT_TRUE(polyRecord.HasInsertionQV());
+    EXPECT_TRUE(polyRecord.HasMergeQV());
+    EXPECT_TRUE(polyRecord.HasSubstitutionQV());
+    EXPECT_TRUE(polyRecord.HasSubstitutionTag());
+    EXPECT_TRUE(polyRecord.HasIPD());
+    EXPECT_FALSE(polyRecord.HasLabelQV());
+    EXPECT_FALSE(polyRecord.HasAltLabelQV());
+    EXPECT_FALSE(polyRecord.HasAltLabelTag());
+    EXPECT_FALSE(polyRecord.HasPkmean());
+    EXPECT_FALSE(polyRecord.HasPkmid());
+    EXPECT_FALSE(polyRecord.HasPulseCall());
+    EXPECT_FALSE(polyRecord.HasPulseWidth());
+    EXPECT_FALSE(polyRecord.HasPrePulseFrames());
+    EXPECT_FALSE(polyRecord.HasPulseCallWidth());
+
+    // tag presence on the stitched record must mirror the original
+    EXPECT_TRUE(virtualRecord.HasDeletionQV());
+    EXPECT_TRUE(virtualRecord.HasDeletionTag());
+    EXPECT_TRUE(virtualRecord.HasInsertionQV());
+    EXPECT_TRUE(virtualRecord.HasMergeQV());
+    EXPECT_TRUE(virtualRecord.HasSubstitutionQV());
+    EXPECT_TRUE(virtualRecord.HasSubstitutionTag());
+    EXPECT_TRUE(virtualRecord.HasIPD());
+    EXPECT_FALSE(virtualRecord.HasLabelQV());
+    EXPECT_FALSE(virtualRecord.HasAltLabelQV());
+    EXPECT_FALSE(virtualRecord.HasAltLabelTag());
+    EXPECT_FALSE(virtualRecord.HasPkmean());
+    EXPECT_FALSE(virtualRecord.HasPkmid());
+    EXPECT_FALSE(virtualRecord.HasPulseCall());
+    EXPECT_FALSE(virtualRecord.HasPulseWidth());
+    EXPECT_FALSE(virtualRecord.HasPrePulseFrames());
+    EXPECT_FALSE(virtualRecord.HasPulseCallWidth());
+}
+
+TEST(ZmwReadStitching, VirtualRecord_VirtualRegionsTable)
+{
+    // Checks which virtual region types are populated on the first stitched
+    // record of the production subreads/scraps pair.
+    ZmwReadStitcher stitcher(PbbamTestsConfig::Data_Dir + "/polymerase/production.subreads.bam",
+                             PbbamTestsConfig::Data_Dir + "/polymerase/production.scraps.bam");
+    ASSERT_TRUE(stitcher.HasNext());  // fatal: Next() below needs a record
+    const auto virtualRecord = stitcher.Next();
+
+    const auto subreads = virtualRecord.VirtualRegionsTable(VirtualRegionType::SUBREAD);
+    const auto adapters = virtualRecord.VirtualRegionsTable(VirtualRegionType::ADAPTER);
+    const auto hqRegions = virtualRecord.VirtualRegionsTable(VirtualRegionType::HQREGION);
+    const auto lqRegions = virtualRecord.VirtualRegionsTable(VirtualRegionType::LQREGION);
+    const auto barcodes = virtualRecord.VirtualRegionsTable(VirtualRegionType::BARCODE);
+    const auto filtered = virtualRecord.VirtualRegionsTable(VirtualRegionType::FILTERED);
+
+    EXPECT_FALSE(subreads.empty());
+    EXPECT_FALSE(adapters.empty());
+    EXPECT_FALSE(hqRegions.empty());
+    EXPECT_FALSE(lqRegions.empty());
+    EXPECT_FALSE(barcodes.empty());
+    EXPECT_TRUE(filtered.empty());  // this type not present in this data
+}
+
+TEST(ZmwReadStitching, LegacyTypedefsOk)
+{
+    // The legacy reader names must remain usable.
+
+    // legacy BAM-pair reader: same inputs and tally as FromBams_NoFilter
+    {
+        const std::string primaryFn =
+            PbbamTestsConfig::Data_Dir + "/polymerase/internal.subreads.bam";
+        const std::string scrapsFn = PbbamTestsConfig::Data_Dir + "/polymerase/internal.scraps.bam";
+
+        VirtualPolymeraseReader reader(primaryFn, scrapsFn);
+        size_t numRecords = 0;
+        for (; reader.HasNext(); ++numRecords) {
+            (void)reader.Next();  // only the tally matters here
+        }
+        EXPECT_EQ(3, numRecords);
+    }
+
+    // legacy composite reader over an empty dataset yields nothing
+    {
+        VirtualPolymeraseCompositeReader reader{DataSet{}};
+        EXPECT_FALSE(reader.HasNext());
+    }
+}
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt

new file mode 100644 (file)

index 0000000..d1aef15
--- /dev/null
+++ b/tools/CMakeLists.txt
@@ -0,0 +1,48 @@
+
+
+# Backward compatibility: if the legacy PacBioBAM_build_pbindex option was given,
+# warn about it and copy its value onto PacBioBAM_build_tools.
+if(DEFINED PacBioBAM_build_pbindex)
+
+    # Deprecating the "PacBioBAM_build_pbindex" command line option in favor of more
+    # general "PacBioBAM_build_tools", as we're starting to add new utilities.
+    #
+    # That said, I don't want to break current auto tests/builds, so I'm providing a
+    # warning message so devs are aware.
+    #
+    # construct warning message
+    set(pbindex_warning "\nDeprecated:\n-DPacBioBAM_build_pbindex\n")
+    if (PacBioBAM_build_pbindex)
+        set(pbindex_warning "${pbindex_warning} Building as requested,")
+    else()
+        set(pbindex_warning "${pbindex_warning} Skipping as requested,")
+    endif()
+    set(pbindex_warning "${pbindex_warning} but support for this option will be removed at some point in the future.\n")
+    message(AUTHOR_WARNING "${pbindex_warning} ** Use -DPacBioBAM_build_tools instead. **\n")
+
+    # force PacBioBAM_build_tools option
+    # (CACHE ... FORCE overwrites any value already stored in the cache)
+    set(PacBioBAM_build_tools
+        ${PacBioBAM_build_pbindex} CACHE BOOL
+        "Build PacBioBAM with add'l utilities (e.g. pbindex, pbindexdump)." FORCE)
+endif()
+
+# Configure shared settings and add every tool subdirectory, but only when
+# PacBioBAM_build_tools is enabled.
+if (PacBioBAM_build_tools)
+
+    # tools directory
+    # (shared sources used by the tools, and the location of the cram test templates)
+    set(ToolsCommonDir ${PacBioBAM_ToolsDir}/common)
+    set(PacBioBAM_CramTestsDir ${PacBioBAM_TestsDir}/src/cram)
+
+    # quash warning with OptionParser
+    # (the flag is added only if the compiler accepts it)
+    include(CheckCXXCompilerFlag)
+    check_cxx_compiler_flag("-Wno-unused-private-field" HAS_NO_UNUSED_PRIVATE_FIELD)
+    if(HAS_NO_UNUSED_PRIVATE_FIELD)
+        set(PacBioBAM_CXX_FLAGS "${PacBioBAM_CXX_FLAGS} -Wno-unused-private-field")
+    endif()
+    # must be set before the add_subdirectory() calls below so the tools pick it up
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PacBioBAM_CXX_FLAGS}")
+
+    # tools
+    add_subdirectory(bam2sam)
+    add_subdirectory(pbindex)
+    add_subdirectory(pbindexdump)
+    add_subdirectory(pbmerge)
+    add_subdirectory(pbbamify)
+
+endif()
diff --git a/tools/bam2sam/CMakeLists.txt b/tools/bam2sam/CMakeLists.txt

new file mode 100644 (file)

index 0000000..5554970
--- /dev/null
+++ b/tools/bam2sam/CMakeLists.txt
@@ -0,0 +1,39 @@
+
+# Build script for the bam2sam tool: generates its version header, builds the
+# executable, and (when tests are enabled) registers its cram test.
+set(Bam2SamSrcDir ${PacBioBAM_ToolsDir}/bam2sam/src)
+
+# create version header
+# (@ONLY: only @VAR@ placeholders are substituted, ${VAR} text is left untouched)
+set(Bam2Sam_VERSION ${PacBioBAM_VERSION})
+configure_file(
+    ${Bam2SamSrcDir}/Bam2SamVersion.h.in ${GeneratedDir}/Bam2SamVersion.h @ONLY
+)
+
+# list source files
+set(BAM2SAM_SOURCES
+    ${ToolsCommonDir}/OptionParser.cpp
+    ${Bam2SamSrcDir}/main.cpp
+    ${Bam2SamSrcDir}/Bam2Sam.cpp
+)
+
+# build bam2sam executable
+# (create_pbbam_tool is presumably defined by the PbbamTool module - not visible here)
+include(PbbamTool)
+create_pbbam_tool(
+    TARGET  bam2sam
+    SOURCES ${BAM2SAM_SOURCES}
+)
+
+# cram tests
+if (PacBioBAM_build_tests)
+
+    # generate the concrete test script from its template
+    configure_file(
+        ${PacBioBAM_CramTestsDir}/bam2sam.t.in
+        ${GeneratedDir}/bam2sam.t
+    )
+
+    # run the generated script through cram.py from the tests' scripts directory
+    add_test(
+        NAME bam2sam_CramTests
+        WORKING_DIRECTORY ${PacBioBAM_TestsDir}/scripts
+        COMMAND "python" cram.py
+            ${GeneratedDir}/bam2sam.t
+    )
+
+endif()
diff --git a/tools/bam2sam/src/Bam2Sam.cpp b/tools/bam2sam/src/Bam2Sam.cpp

new file mode 100644 (file)

index 0000000..ee5ad79
--- /dev/null
+++ b/tools/bam2sam/src/Bam2Sam.cpp
@@ -0,0 +1,80 @@
+// Author: Derek Barnett
+
+#include <cassert>
+#include <memory>
+#include <stdexcept>
+
+#include <htslib/sam.h>
+
+#include "Bam2Sam.h"
+
+using namespace bam2sam;
+
+namespace bam2sam {
+
+// Deleters that let std::unique_ptr own raw htslib handles, so each handle is
+// released through the matching htslib call when its owner goes out of scope.
+//
+// Each operator() receives the pointer by value, so there is nothing useful to
+// reset afterwards; a null pointer is ignored.
+
+// Closes a samFile handle via sam_close().
+struct HtslibFileDeleter
+{
+    void operator()(samFile* file)
+    {
+        if (file) sam_close(file);
+    }
+};
+
+// Destroys a BAM header via bam_hdr_destroy().
+struct HtslibHeaderDeleter
+{
+    void operator()(bam_hdr_t* hdr)
+    {
+        if (hdr) bam_hdr_destroy(hdr);
+    }
+};
+
+// Destroys a BAM record via bam_destroy1().
+struct HtslibRecordDeleter
+{
+    void operator()(bam1_t* b)
+    {
+        if (b) bam_destroy1(b);
+    }
+};
+
+}  // namespace bam2sam
+
+// Reads the BAM named by settings.inputFilename_ and writes it to stdout as SAM.
+//
+// The header is written unless settings.noHeader_ is set; when
+// settings.printHeaderOnly_ is also requested, the function returns right after
+// the header. Otherwise every record is copied from input to output.
+//
+// \throws std::runtime_error if a file cannot be opened, the header cannot be
+//         read or written, or a record cannot be allocated, read or written.
+//
+// NOTE(review): the open-failure message always says "stdin", even when a
+// filename was given; left unchanged in case callers or tests match on the text.
+void PbBam2Sam::Run(const Settings& settings)
+{
+    int htslibResult = 0;
+
+    // open files
+
+    std::unique_ptr<samFile, HtslibFileDeleter> inFileWrapper(
+        sam_open(settings.inputFilename_.c_str(), "rb"));
+    samFile* in = inFileWrapper.get();
+    if (!in || !in->fp.bgzf) throw std::runtime_error("could not read from stdin");
+
+    std::unique_ptr<samFile, HtslibFileDeleter> outFileWrapper(sam_open("-", "w"));
+    samFile* out = outFileWrapper.get();
+    if (!out) throw std::runtime_error("could not write to stdout");
+
+    // fetch & write header
+
+    std::unique_ptr<bam_hdr_t, HtslibHeaderDeleter> headerWrapper(bam_hdr_read(in->fp.bgzf));
+    bam_hdr_t* hdr = headerWrapper.get();
+    if (!hdr) throw std::runtime_error("could not read header");
+
+    if (!settings.noHeader_) {
+        htslibResult = sam_hdr_write(out, hdr);
+        if (htslibResult != 0) throw std::runtime_error("could not write header");
+        if (settings.printHeaderOnly_) return;
+    }
+
+    // fetch & write records
+
+    std::unique_ptr<bam1_t, HtslibRecordDeleter> recordWrapper(bam_init1());
+    bam1_t* b = recordWrapper.get();
+    if (!b) throw std::runtime_error("could not allocate record");
+
+    // keep the read status separate from the write status, so the reason the
+    // loop stopped is still available afterwards
+    int readResult = 0;
+    while ((readResult = sam_read1(in, hdr, b)) >= 0) {
+        htslibResult = sam_write1(out, hdr, b);
+        if (htslibResult < 0) throw std::runtime_error("error writing record to stdout");
+    }
+
+    // sam_read1 returns -1 at end of stream and a value below -1 on error;
+    // without this check a truncated or corrupt input would look like success
+    if (readResult < -1) throw std::runtime_error("error reading record from input");
+}
diff --git a/tools/bam2sam/src/Bam2Sam.h b/tools/bam2sam/src/Bam2Sam.h

new file mode 100644 (file)

index 0000000..690baf6
--- /dev/null
+++ b/tools/bam2sam/src/Bam2Sam.h
@@ -0,0 +1,18 @@
+// Author: Derek Barnett
+
+#ifndef BAM2SAM_H
+#define BAM2SAM_H
+
+#include "Settings.h"
+
+namespace bam2sam {
+
+/// \brief Entry point of the bam2sam tool.
+///
+/// Holds a single static function; the implementation lives in Bam2Sam.cpp.
+class PbBam2Sam
+{
+public:
+    /// \brief Runs the tool with the supplied settings.
+    ///
+    /// \param[in] settings   input filename and header options for this run
+    ///
+    /// \throws std::runtime_error on failure (see Bam2Sam.cpp for the cases)
+    ///
+    static void Run(const Settings& settings);
+};
+
+}  // namespace bam2sam
+
+#endif  // BAM2SAM_H
diff --git a/tools/bam2sam/src/Bam2SamVersion.h.in b/tools/bam2sam/src/Bam2SamVersion.h.in

new file mode 100644 (file)

index 0000000..f009ad4
--- /dev/null
+++ b/tools/bam2sam/src/Bam2SamVersion.h.in
@@ -0,0 +1,14 @@
+// Author: Derek Barnett
+
+#ifndef BAM2SAMVERSION_H
+#define BAM2SAMVERSION_H
+
+#include <string>
+
+namespace bam2sam {
+
+// Tool version string. This file is a template: the @Bam2Sam_VERSION@
+// placeholder is filled in by CMake's configure_file().
+const std::string Version{"@Bam2Sam_VERSION@"};
+
+} // namespace bam2sam
+
+#endif // BAM2SAMVERSION_H
diff --git a/tools/bam2sam/src/Settings.h b/tools/bam2sam/src/Settings.h

new file mode 100644 (file)

index 0000000..967144a
--- /dev/null
+++ b/tools/bam2sam/src/Settings.h
@@ -0,0 +1,25 @@
+// Author: Derek Barnett
+
+#ifndef SETTINGS_H
+#define SETTINGS_H
+
+#include <string>
+#include <vector>
+
+namespace bam2sam {
+
+// Options for a single bam2sam run, plus any errors found while parsing them.
+class Settings
+{
+public:
+    // Both header flags start off; the filename and the error list start empty.
+    Settings() : noHeader_{false}, printHeaderOnly_{false} {}
+
+    std::string inputFilename_;        // input to read
+    bool noHeader_;                    // omit the header from the output
+    bool printHeaderOnly_;             // write the header and stop
+    std::vector<std::string> errors_;  // problems collected during parsing
+};
+
+}  // namespace bam2sam
+
+#endif  // SETTINGS_H
diff --git a/tools/bam2sam/src/main.cpp b/tools/bam2sam/src/main.cpp

new file mode 100644 (file)

index 0000000..07f0e7e
--- /dev/null
+++ b/tools/bam2sam/src/main.cpp
@@ -0,0 +1,87 @@
+// Author: Derek Barnett
+
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <vector>
+#include "../common/OptionParser.h"
+#include "Bam2Sam.h"
+#include "Bam2SamVersion.h"
+
+// Parses the command line into a Settings object.
+//
+// Problems are not reported here; they are appended to settings.errors_ for
+// the caller to inspect.
+static bam2sam::Settings fromCommandLine(optparse::OptionParser& parser, int argc, char* argv[])
+{
+    bam2sam::Settings settings;
+
+    const optparse::Values options = parser.parse_args(argc, argv);
+
+    // input: zero or one positional argument is accepted
+    const std::vector<std::string> positionalArgs = parser.args();
+    switch (positionalArgs.size()) {
+        case 0:
+            settings.inputFilename_ = "-";  // stdin
+            break;
+        case 1:
+            settings.inputFilename_ = positionalArgs.front();
+            break;
+        default:
+            settings.errors_.emplace_back(
+                "bam2sam does not support more than one input file per run");
+            break;
+    }
+
+    // header options
+    if (options.is_set("no_header")) {
+        settings.noHeader_ = options.get("no_header");
+    }
+    if (options.is_set("header_only")) {
+        settings.printHeaderOnly_ = options.get("header_only");
+    }
+
+    // the two header options contradict each other
+    const bool conflictingHeaderOptions = settings.noHeader_ && settings.printHeaderOnly_;
+    if (conflictingHeaderOptions) {
+        settings.errors_.emplace_back(
+            "conflicting arguments requested: --no-header and --header-only");
+    }
+
+    return settings;
+}
+
+// bam2sam entry point: declares the command-line interface, parses it, and runs
+// the conversion.
+//
+// Returns EXIT_SUCCESS on success. Returns EXIT_FAILURE if the command line is
+// invalid (errors and help text are printed) or if the conversion throws.
+int main(int argc, char* argv[])
+{
+    // setup help & options
+    optparse::OptionParser parser;
+    parser.description(
+        "bam2sam converts a BAM file to SAM. It is essentially a stripped-down "
+        "'samtools view', mostly useful for testing/debugging without requiring samtools. "
+        "Input BAM file is read from a file or stdin, and SAM output is written to stdout.");
+    parser.prog("bam2sam");
+    parser.usage("bam2sam [options] [input]");
+    parser.version(bam2sam::Version);
+    parser.add_version_option(true);
+    parser.add_help_option(true);
+
+    auto optionGroup = optparse::OptionGroup(parser, "Options");
+    optionGroup.add_option("").dest("input").metavar("input").help(
+        "Input BAM file. If not provided, stdin will be used as input.");
+    optionGroup.add_option("--no-header")
+        .dest("no_header")
+        .action("store_true")
+        .help("Omit header from output.");
+    optionGroup.add_option("--header-only")
+        .dest("header_only")
+        .action("store_true")
+        .help("Print only the header (no records).");
+    parser.add_option_group(optionGroup);
+
+    // parse command line for settings
+    const bam2sam::Settings settings = fromCommandLine(parser, argc, argv);
+    if (!settings.errors_.empty()) {
+        std::cerr << std::endl;
+        // iterate by reference: no need to copy each message just to print it
+        for (const auto& e : settings.errors_)
+            std::cerr << "ERROR: " << e << std::endl;
+        std::cerr << std::endl;
+        parser.print_help();
+        return EXIT_FAILURE;
+    }
+
+    // run tool
+    try {
+        bam2sam::PbBam2Sam::Run(settings);
+        return EXIT_SUCCESS;
+    } catch (const std::exception& e) {
+        std::cerr << "ERROR: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/tools/check-formatting b/tools/check-formatting

new file mode 100755 (executable)

index 0000000..95a52e8
--- /dev/null
+++ b/tools/check-formatting
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+#
+# Checks whether C++ sources need reformatting according to clang-format
+# (-style=file, i.e. the repository's .clang-format).
+#
+# Usage: check-formatting --all | --staged
+#   --all      check every *.cpp / *.h under include, src, tests/src and tools
+#              (paths are relative to the current directory)
+#   --staged   check only the *.cpp / *.h files staged in git
+#
+# Exit status: 0 if nothing needs formatting, 1 otherwise (or on bad usage).
+
+PLATFORM=$(uname)
+TOOLSPATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"
+CLANGFORMAT="${TOOLSPATH}/${PLATFORM}/clang-format -style=file"
+
+if [ "$1" == "--all" ]
+then
+    # patterns are quoted so that find, not the shell, expands them
+    find include src tests/src tools \( -name '*.cpp' -or -name '*.h' \) -not -name 'pugi*' -print0 \
+    | xargs -n1 -0 ${CLANGFORMAT} -output-replacements-xml \
+    | grep -c "<replacement " > /dev/null
+    grepCode=$?
+elif [ "$1" == "--staged" ]
+then
+    # keep staged C++ files, then drop anything under third-party/ in a separate
+    # grep (a lone -v would invert the extension match as well)
+    git diff --cached --name-only --diff-filter=ACMRT \
+    | grep -e '\.h$' -e '\.cpp$' \
+    | grep -v 'third-party/' \
+    | xargs -n1 ${CLANGFORMAT} -output-replacements-xml \
+    | grep -c "<replacement " >/dev/null
+    grepCode=$?
+else
+    echo "Please specify --all or --staged"
+    exit 1
+fi
+
+# grep exits 0 => found needed formatting changes
+if [ $grepCode -ne 0 ]
+then
+    echo "Formatting looks good!"
+    exit 0
+else
+    echo "****************************************************"
+    echo "Code needs formatting!  Please use 'tools/format-all'"
+    echo "****************************************************"
+    exit 1
+fi
diff --git a/tools/common/BamFileMerger.h b/tools/common/BamFileMerger.h

new file mode 100644 (file)

index 0000000..a3107af
--- /dev/null
+++ b/tools/common/BamFileMerger.h
@@ -0,0 +1,42 @@
+// Author: Derek Barnett
+
+#ifndef BAMFILEMERGER_H
+#define BAMFILEMERGER_H
+
+#include <pbbam/DataSet.h>
+#include <pbbam/PbiFilter.h>
+#include <pbbam/ProgramInfo.h>
+#include <string>
+#include <vector>
+
+namespace PacBio {
+namespace BAM {
+namespace common {
+
+/// \brief Merges the BAM files of a dataset into a single output BAM.
+class BamFileMerger
+{
+public:
+    /// \brief Runs merger on a dataset, applying any supplied filters.
+    ///
+    /// When this function exits, a merged BAM (and optional PBI) will have been
+    /// written and closed.
+    ///
+    /// \param[in] dataset          provides input filenames & filters
+    /// \param[in] outputFilename   resulting BAM output
+    /// \param[in] mergeProgram     info about the calling program. Adds a @PG entry to merged header.
+    /// \param[in] createPbi        if true, creates a PBI alongside output BAM
+    ///
+    /// \throws std::runtime_error if any errors are encountered while reading or writing
+    ///
+    static void Merge(const PacBio::BAM::DataSet& dataset, const std::string& outputFilename,
+                      const PacBio::BAM::ProgramInfo& mergeProgram = PacBio::BAM::ProgramInfo(),
+                      bool createPbi = true);
+};
+
+}  // namespace common
+}  // namespace BAM
+}  // namespace PacBio
+
+#include "BamFileMerger.inl"
+
+#endif  // BAMFILEMERGER_H
diff --git a/tools/common/BamFileMerger.inl b/tools/common/BamFileMerger.inl

new file mode 100644 (file)

index 0000000..f9bde4b
--- /dev/null
+++ b/tools/common/BamFileMerger.inl
@@ -0,0 +1,226 @@
+// Author: Derek Barnett
+
+#include "BamFileMerger.h"
+
+#include <pbbam/BamHeader.h>
+#include <pbbam/BamReader.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/BamWriter.h>
+#include <pbbam/CompositeBamReader.h>
+#include <pbbam/PbiBuilder.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <deque>
+#include <memory>
+#include <stdexcept>
+#include <utility>
+
+namespace PacBio {
+namespace BAM {
+namespace common {
+
+// ICollator
+
+/// Base class for merging records from several BAM readers in sorted order.
+///
+/// Keeps one CompositeMergeItem (a reader plus its current record) for every
+/// reader that still has data. Subclasses define the ordering by implementing
+/// UpdateSort().
+class ICollator
+{
+public:
+    virtual ~ICollator(void) = default;
+
+    /// Fetches the record at the front of the merge queue.
+    ///
+    /// \param[out] record  receives the front item's record (via swap)
+    /// \returns false if the queue is empty, true otherwise
+    bool GetNext(BamRecord& record)
+    {
+        // nothing left to read
+        if (mergeItems_.empty())
+            return false;
+
+        // non-destructive 'pop' of first item from queue
+        // (reader and record are moved out before the emptied slot is removed)
+        auto firstIter = mergeItems_.begin();
+        auto firstItem = PacBio::BAM::internal::CompositeMergeItem{ std::move(firstIter->reader),
+                                                                    std::move(firstIter->record)
+                                                                  };
+        mergeItems_.pop_front();
+
+        // store its record in our output record
+        std::swap(record, firstItem.record);
+
+        // try fetch 'next' from first item's reader
+        // if successful, re-insert it into container & re-sort on our new values
+        // otherwise, this item will go out of scope & reader destroyed
+        if (firstItem.reader->GetNext(firstItem.record)) {
+            mergeItems_.push_front(std::move(firstItem));
+            UpdateSort();
+        }
+
+        // return success
+        return true;
+    }
+
+protected:
+    // one entry per reader that still has a pending record
+    std::deque<PacBio::BAM::internal::CompositeMergeItem> mergeItems_;
+
+protected:
+    /// Takes ownership of \p readers and primes the queue with each reader's
+    /// first record. Readers that yield no record are not queued.
+    ///
+    /// The queue is not sorted here; the derived constructors in this file
+    /// call UpdateSort() themselves.
+    ICollator(std::vector<std::unique_ptr<PacBio::BAM::BamReader> >&& readers)
+    {
+        for (auto&& reader : readers) {
+            auto item = internal::CompositeMergeItem{std::move(reader)};
+            if (item.reader->GetNext(item.record))
+                mergeItems_.push_back(std::move(item));
+        }
+    }
+
+    /// Re-establishes the subclass-specific ordering of mergeItems_.
+    virtual void UpdateSort(void) =0;
+};
+
+// QNameCollator
+
+/// Orders merge items by query name components: movie name, then hole number,
+/// then (for non-CCS/transcript reads) qStart and finally qEnd.
+///
+/// CCS/transcript reads are placed after all other reads of the same ZMW.
+///
+/// No longer derives from std::binary_function (deprecated in C++11, removed
+/// in C++17); std::sort does not need its typedefs.
+struct QNameSorter
+{
+    bool operator()(const internal::CompositeMergeItem& lhs,
+                    const internal::CompositeMergeItem& rhs) const
+    {
+        const BamRecord& l = lhs.record;
+        const BamRecord& r = rhs.record;
+
+        // movie name
+        const int cmp = l.MovieName().compare(r.MovieName());
+        if (cmp != 0)
+            return cmp < 0;
+
+        // hole number
+        const auto lhsZmw = l.HoleNumber();
+        const auto rhsZmw = r.HoleNumber();
+        if (lhsZmw != rhsZmw)
+            return lhsZmw < rhsZmw;
+
+        // shuffle CCS/transcript reads after all others
+        if (IsCcsOrTranscript(l.Type()))
+            return false;
+        if (IsCcsOrTranscript(r.Type()))
+            return true;
+
+        // sort on qStart, then finally qEnd
+        const auto lhsQStart = l.QueryStart();
+        const auto rhsQStart = r.QueryStart();
+        if (lhsQStart != rhsQStart)
+            return lhsQStart < rhsQStart;
+
+        // equal qStart: qEnd breaks the tie so the order is fully determined
+        return l.QueryEnd() < r.QueryEnd();
+    }
+};
+
+/// Collator for inputs that are not coordinate-sorted: keeps the merge queue
+/// ordered with QNameSorter.
+class QNameCollator : public ICollator
+{
+public:
+    /// Takes ownership of \p readers and sorts the primed queue.
+    QNameCollator(std::vector<std::unique_ptr<PacBio::BAM::BamReader>>&& readers)
+        : ICollator(std::move(readers))
+    { UpdateSort(); }
+
+    // 'override' lets the compiler catch any drift from ICollator's signature
+    void UpdateSort(void) override
+    { std::sort(mergeItems_.begin(), mergeItems_.end(), QNameSorter{ }); }
+};
+
+// AlignedCollator
+
+/// Collator for coordinate-sorted inputs: keeps the merge queue ordered with
+/// PacBio::BAM::PositionSorter.
+class AlignedCollator : public ICollator
+{
+public:
+    /// Takes ownership of \p readers and sorts the primed queue.
+    AlignedCollator(std::vector<std::unique_ptr<PacBio::BAM::BamReader>>&& readers)
+        : ICollator(std::move(readers))
+    { UpdateSort(); }
+
+    // 'override' lets the compiler catch any drift from ICollator's signature
+    void UpdateSort(void) override
+    { std::sort(mergeItems_.begin(), mergeItems_.end(), PacBio::BAM::PositionSorter{ }); }
+};
+
+// BamFileMerger
+
+/// Merges all BAM files referenced by \p dataset into one BAM at \p outputFilename.
+///
+/// Steps: collect the input filenames, open one reader per input (a
+/// PBI-indexed reader when the dataset yields a non-empty filter), combine the
+/// headers, choose a collator from the first header's sort order ("coordinate"
+/// selects AlignedCollator, anything else QNameCollator), then stream records
+/// to the writer. A PBI is built on the fly only if \p createPbi is set and
+/// the output is not "-".
+///
+/// \throws std::runtime_error if there are no inputs, no output filename, or
+///         the inputs' sort orders differ
+///
+inline
+void BamFileMerger::Merge(const DataSet& dataset,
+                          const std::string& outputFilename,
+                          const ProgramInfo& mergeProgram,
+                          bool createPbi)
+{
+    const PbiFilter filter = PbiFilter::FromDataSet(dataset);
+
+    std::vector<std::string> inputFilenames_;
+    const auto& bamFiles = dataset.BamFiles();
+    inputFilenames_.reserve(bamFiles.size());
+    for (const auto& file : bamFiles)
+        inputFilenames_.push_back(file.Filename());
+
+    if (inputFilenames_.empty())
+        throw std::runtime_error("no input filenames provided to BamFileMerger");
+
+    if (outputFilename.empty())
+        throw std::runtime_error("no output filename provided to BamFileMerger");
+
+
+    // attempt open input files
+    // (the filter does not change between inputs, so test it once)
+    const bool useFilter = !filter.IsEmpty();
+    std::vector<std::unique_ptr<BamReader> > readers;
+    readers.reserve(inputFilenames_.size());
+    for (const auto& fn : inputFilenames_) {
+        if (useFilter)
+            readers.emplace_back(new PbiIndexedBamReader(filter, fn));
+        else
+            readers.emplace_back(new BamReader(fn));
+    }
+
+    // read headers
+    std::vector<BamHeader> headers;
+    headers.reserve(readers.size());
+    for (auto&& reader : readers)
+        headers.push_back(reader->Header());
+
+    assert(!readers.empty());
+    assert(!headers.empty());
+
+    // merge headers
+    BamHeader mergedHeader = headers.front();
+    // held by value: mergedHeader is modified below, so the reference sort
+    // order must not alias it
+    const std::string usingSortOrder = mergedHeader.SortOrder();
+    const bool isCoordinateSorted = (usingSortOrder == "coordinate");
+    for (size_t i = 1; i < headers.size(); ++i) {
+        const BamHeader& header = headers.at(i);
+        if (header.SortOrder() != usingSortOrder)
+            throw std::runtime_error("BAM file sort orders do not match, aborting merge");
+        mergedHeader += header;
+    }
+    if (mergeProgram.IsValid())
+        mergedHeader.AddProgram(mergeProgram);
+
+    // setup collator, based on sort order
+    std::unique_ptr<ICollator> collator;
+    if (isCoordinateSorted)
+        collator.reset(new AlignedCollator(std::move(readers)));
+    else
+        collator.reset(new QNameCollator(std::move(readers)));
+    // NOTE: readers *moved*, so no longer accessible here
+
+    // do merge, creating PBI on-the-fly
+    if (createPbi && (outputFilename != "-")) {
+
+        // TODO: this implementation recalculates all PBI values, when we really
+        //       only need to collate entries and update offsets
+
+        BamWriter writer(outputFilename, mergedHeader);
+        PbiBuilder builder{ (outputFilename + ".pbi"),
+                            mergedHeader.NumSequences(),
+                            isCoordinateSorted
+                          };
+        BamRecord record;
+        int64_t vOffset = 0;
+        while (collator->GetNext(record)) {
+            // the writer reports where the record landed; the index needs that offset
+            writer.Write(record, &vOffset);
+            builder.AddRecord(record, vOffset);
+        }
+    }
+
+    // otherwise just merge BAM
+    else {
+        BamWriter writer(outputFilename, mergedHeader);
+        BamRecord record;
+        while (collator->GetNext(record))
+            writer.Write(record);
+    }
+}
+
+} // namespace common
+} // namespace BAM
+} // namespace PacBio
diff --git a/tools/common/OptionParser.cpp b/tools/common/OptionParser.cpp

new file mode 100644 (file)

index 0000000..41e3d58
--- /dev/null
+++ b/tools/common/OptionParser.cpp
@@ -0,0 +1,555 @@
+/**
+ * Copyright (C) 2010 Johannes Weißl <jargon@molb.org>
+ * License: your favourite BSD-style license
+ *
+ * See OptionParser.h for help.
+ */
+
+#include "OptionParser.h"
+
+#include <algorithm>
+#include <ciso646>
+#include <complex>
+#include <cstddef>
+#include <cstdlib>
+#include <list>
+#include <set>
+#include <string>
+
+#if defined(ENABLE_NLS) && ENABLE_NLS
+#include <libintl.h>
+#define _(s) gettext(s)
+#else
+#define _(s) (static_cast<const char*>(s))
+#endif
+
+namespace optparse {
+
+////////// auxiliary (string) functions { //////////
+// Functor that surrounds a string with a fixed prefix and suffix.
+class str_wrap
+{
+public:
+    // Distinct decorations on the left and on the right.
+    str_wrap(const std::string& left, const std::string& right) : lwrap(left), rwrap(right) {}
+    // The same decoration on both sides.
+    str_wrap(const std::string& both) : lwrap(both), rwrap(both) {}
+    // Returns lwrap, then s, then rwrap.
+    std::string operator()(const std::string& s)
+    {
+        std::string wrapped(lwrap);
+        wrapped += s;
+        wrapped += rwrap;
+        return wrapped;
+    }
+    const std::string lwrap, rwrap;
+};
+// Applies `op` to every element of [begin, end) and concatenates the results,
+// inserting `sep` between consecutive results. An empty range yields "".
+template <typename InputIterator, typename UnaryOperator>
+static std::string str_join_trans(const std::string& sep, InputIterator begin, InputIterator end,
+                                  UnaryOperator op)
+{
+    std::string joined;
+    bool is_first = true;
+    for (InputIterator cur = begin; cur != end; ++cur) {
+        if (!is_first) joined += sep;
+        is_first = false;
+        joined += op(*cur);
+    }
+    return joined;
+}
+// Concatenates the strings in [begin, end), separated by `sep`.
+template <class InputIterator>
+static std::string str_join(const std::string& sep, InputIterator begin, InputIterator end)
+{
+    std::string joined;
+    for (InputIterator cur = begin; cur != end; ++cur) {
+        if (cur != begin) joined += sep;
+        joined += *cur;
+    }
+    return joined;
+}
+// Replaces every occurrence of `patt` in `s` with `repl`, in place, and returns `s`.
+//
+// Scanning resumes just past each inserted replacement, so text introduced by
+// `repl` is never matched again. An empty `patt` leaves `s` untouched: it
+// would otherwise match at every position and the loop would never end.
+static std::string& str_replace(std::string& s, const std::string& patt, const std::string& repl)
+{
+    const size_t n = patt.length();
+    if (n == 0) return s;
+
+    size_t pos = 0;
+    while ((pos = s.find(patt, pos)) != std::string::npos) {
+        s.replace(pos, n, repl);
+        pos += repl.size();
+    }
+    return s;
+}
+// Copying variant: returns a new string and leaves `s` unmodified.
+static std::string str_replace(const std::string& s, const std::string& patt,
+                               const std::string& repl)
+{
+    std::string tmp = s;
+    str_replace(tmp, patt, repl);  // tmp is non-const, so this is the in-place overload
+    return tmp;
+}
+// Word-wraps `s` for help output and returns the wrapped text.
+//
+//   pre           number of spaces used to indent lines
+//   len           line width the wrapped text is measured against
+//   indent_first  when false, the first output line gets no indent (the
+//                 caller has already written something on that line)
+//
+// Lines are broken at whitespace (' ', '\n', '\t'); an embedded '\n' always
+// forces a break. Every emitted line, including the last, ends with std::endl.
+static std::string str_format(const std::string& s, size_t pre, size_t len,
+                              bool indent_first = true)
+{
+    std::ostringstream ss;
+    std::string p;  // indent written in front of the line being emitted
+    if (indent_first) p = std::string(pre, ' ');
+
+    // pos: index just past the previously seen whitespace
+    // linestart: index where the current output line begins
+    size_t pos = 0, linestart = 0;
+    size_t line = 0;  // number of lines emitted so far
+    while (true) {
+        bool wrap = false;
+
+        size_t new_pos = s.find_first_of(" \n\t", pos);
+        if (new_pos == std::string::npos) break;
+        if (s[new_pos] == '\n') {
+            pos = new_pos + 1;
+            wrap = true;
+        }
+        // from the second line on, the indent is always applied
+        if (line == 1) p = std::string(pre, ' ');
+        // break when forced, or when the next whitespace lies past the width
+        if (wrap || new_pos + pre > linestart + len) {
+            // emits [linestart, pos - 1): drops the whitespace in front of pos
+            // NOTE(review): if pos == linestart here (e.g. a first word longer
+            // than the line) the length wraps around and substr returns the
+            // rest of the string - confirm whether that is intended.
+            ss << p << s.substr(linestart, pos - linestart - 1) << std::endl;
+            linestart = pos;
+            line++;
+        }
+        pos = new_pos + 1;
+    }
+    // whatever remains after the last break
+    ss << p << s.substr(linestart) << std::endl;
+    return ss.str();
+}
+// Interprets `s` as a decimal integer and returns that value plus one, as text.
+//
+// An empty string, or any text that does not parse as a long (including
+// whitespace-only or out-of-range input), counts as 0, so the result is "1".
+static std::string str_inc(const std::string& s)
+{
+    long count = 0;
+    std::istringstream in(s);
+    // Reset on failure: extraction may leave the target untouched (nothing to
+    // read) or clamped to the type's limit (overflow).
+    if (!(in >> count)) count = 0;
+
+    std::ostringstream ss;
+    ss << count + 1;
+    return ss.str();
+}
+// Returns the terminal width used for wrapping help text.
+//
+// On non-Windows builds the COLUMNS environment variable is honoured when it
+// holds a positive integer. In every other case (unset, empty, not a number,
+// zero) the width defaults to 80, because callers derive column sizes from
+// this value and cannot work with a width of 0.
+static unsigned int cols()
+{
+    const unsigned int fallback = 80;
+#ifndef _WIN32
+    const char* s = std::getenv("COLUMNS");
+    if (s) {
+        unsigned int n = 0;
+        std::istringstream in(s);
+        if ((in >> n) && n > 0) return n;
+    }
+#endif
+    return fallback;
+}
+// Returns the final path component of `s`, ignoring trailing slashes.
+// A path made only of slashes yields "/"; an empty path yields "".
+static std::string basename(const std::string& s)
+{
+    // index of the last character that belongs to the final component
+    const size_t last = s.find_last_not_of('/');
+    if (last == std::string::npos) {
+        // nothing but slashes (or nothing at all)
+        return s.empty() ? std::string() : std::string("/");
+    }
+
+    // the component starts right after the closest slash in front of `last`
+    const size_t slash = s.find_last_of('/', last);
+    const size_t first = (slash == std::string::npos) ? 0 : slash + 1;
+    return s.substr(first, last - first + 1);
+}
+////////// } auxiliary (string) functions //////////
+
+////////// class OptionParser { //////////
+// Creates a parser with the default usage text "%prog [options]", with the
+// automatic help and version options enabled, and with interspersed
+// (options mixed with positional) arguments allowed.
+OptionParser::OptionParser()
+    : _usage(_("%prog [options]"))
+    , _add_help_option(true)
+    , _add_version_option(true)
+    , _interspersed_args(true)
+{
+}
+
+// Convenience overloads: register one option reachable under one, two or
+// three names. Each packs its names into a vector and defers to the
+// vector overload.
+Option& OptionParser::add_option(const std::string& opt)
+{
+    const std::vector<std::string> names(1, opt);
+    return add_option(names);
+}
+Option& OptionParser::add_option(const std::string& opt1, const std::string& opt2)
+{
+    std::vector<std::string> names;
+    names.reserve(2);
+    names.push_back(opt1);
+    names.push_back(opt2);
+    return add_option(names);
+}
+Option& OptionParser::add_option(const std::string& opt1, const std::string& opt2,
+                                 const std::string& opt3)
+{
+    std::vector<std::string> names;
+    names.reserve(3);
+    names.push_back(opt1);
+    names.push_back(opt2);
+    names.push_back(opt3);
+    return add_option(names);
+}
+// Appends a new Option and registers it under every name in `v`.
+//
+// Names starting with "--" are long options; any other non-empty name is a
+// short option, identified by its second character. Empty names are skipped.
+// The destination defaults to the first long name with '-' replaced by '_',
+// or, when there is no long name, to the first short name.
+Option& OptionParser::add_option(const std::vector<std::string>& v)
+{
+    _opts.resize(_opts.size() + 1);
+    Option& added = _opts.back();
+
+    std::string short_dest;
+    for (size_t idx = 0; idx < v.size(); ++idx) {
+        const std::string& flag = v[idx];
+        if (flag.empty()) continue;
+
+        if (flag.compare(0, 2, "--") == 0) {
+            const std::string long_name = flag.substr(2);
+            if (added.dest() == "") added.dest(str_replace(long_name, "-", "_"));
+            added._long_opts.insert(long_name);
+            _optmap_l[long_name] = &added;
+            continue;
+        }
+
+        const std::string short_name = flag.substr(1, 1);
+        if (short_dest == "") short_dest = short_name;
+        added._short_opts.insert(short_name);
+        _optmap_s[short_name] = &added;
+    }
+
+    if (added.dest() == "") added.dest(short_dest);
+    return added;
+}
+
+// Makes every option of `group` reachable through this parser's short and
+// long lookup tables and records the group for help output.
+//
+// Only pointers are stored (to the group and to its options), so `group`
+// must stay alive for as long as this parser is used.
+OptionParser& OptionParser::add_option_group(const OptionGroup& group)
+{
+    typedef std::list<Option>::const_iterator OptIter;
+    typedef std::set<std::string>::const_iterator NameIter;
+
+    for (OptIter opt = group._opts.begin(); opt != group._opts.end(); ++opt) {
+        const Option* const registered = &*opt;
+        for (NameIter name = opt->_short_opts.begin(); name != opt->_short_opts.end(); ++name)
+            _optmap_s[*name] = registered;
+        for (NameIter name = opt->_long_opts.begin(); name != opt->_long_opts.end(); ++name)
+            _optmap_l[*name] = registered;
+    }
+
+    _groups.push_back(&group);
+    return *this;
+}
+
+// Returns the Option registered for the single-character name `opt`.
+// An unknown name is reported through error().
+const Option& OptionParser::lookup_short_opt(const std::string& opt) const
+{
+    optMap::const_iterator it = _optmap_s.find(opt);
+    if (it == _optmap_s.end()) error(_("no such option") + std::string(": -") + opt);
+    // relies on error() not returning (it calls exit(), which in this file
+    // ends the process); otherwise `it` would be dereferenced at end()
+    return *it->second;
+}
+
+// Consumes one short option from the front of _remaining.
+//
+//   opt  the option character (without the dash)
+//   arg  the complete command-line token, e.g. "-f", "-fVALUE" or "-abc"
+void OptionParser::handle_short_opt(const std::string& opt, const std::string& arg)
+{
+
+    // drop the token being handled (the caller passes _remaining.front())
+    _remaining.pop_front();
+    std::string value;
+
+    const Option& option = lookup_short_opt(opt);
+    if (option._nargs == 1) {
+        // value is either attached ("-fVALUE") or the next token ("-f VALUE")
+        value = arg.substr(2);
+        if (value == "") {
+            if (_remaining.empty()) error("-" + opt + " " + _("option requires an argument"));
+            value = _remaining.front();
+            _remaining.pop_front();
+        }
+    } else {
+        // clustered flags: "-abc" -> handle 'a' now, queue "-bc" for the next round
+        if (arg.length() > 2) _remaining.push_front(std::string("-") + arg.substr(2));
+    }
+
+    process_opt(option, std::string("-") + opt, value);
+}
+
+// Resolves a long option name (given without the leading "--") to its Option.
+//
+// An exact name always wins. Otherwise `opt` is treated as an abbreviation
+// and must be a prefix of exactly one registered long option; no match or
+// several matches are reported through error().
+const Option& OptionParser::lookup_long_opt(const std::string& opt) const
+{
+    // Exact name first: "--foo" must stay usable when "--foobar" is also
+    // registered, instead of being rejected as an ambiguous abbreviation.
+    const optMap::const_iterator exact = _optmap_l.find(opt);
+    if (exact != _optmap_l.end()) return *exact->second;
+
+    std::list<std::string> matching;
+    for (optMap::const_iterator it = _optmap_l.begin(); it != _optmap_l.end(); ++it) {
+        if (it->first.compare(0, opt.length(), opt) == 0) matching.push_back(it->first);
+    }
+    if (matching.size() > 1) {
+        std::string x = str_join(", ", matching.begin(), matching.end());
+        error(_("ambiguous option") + std::string(": --") + opt + " (" + x + "?)");
+    }
+    if (matching.size() == 0) error(_("no such option") + std::string(": --") + opt);
+
+    return *_optmap_l.find(matching.front())->second;
+}
+
+// Consumes one long option from the front of _remaining.
+//
+//   optstr  the token without its leading "--", e.g. "file" or "file=out.txt"
+void OptionParser::handle_long_opt(const std::string& optstr)
+{
+
+    // drop the token being handled (the caller passes _remaining.front())
+    _remaining.pop_front();
+    std::string opt, value;
+
+    // split "name=value"; without '=' the whole token is the name
+    size_t delim = optstr.find("=");
+    if (delim != std::string::npos) {
+        opt = optstr.substr(0, delim);
+        value = optstr.substr(delim + 1);
+    } else
+        opt = optstr;
+
+    const Option& option = lookup_long_opt(opt);
+    // "--name VALUE": take the value from the next token, if there is one
+    if (option._nargs == 1 and delim == std::string::npos) {
+        if (not _remaining.empty()) {
+            value = _remaining.front();
+            _remaining.pop_front();
+        }
+    }
+
+    // an empty value counts as missing, which also rejects "--name="
+    if (option._nargs == 1 and value == "")
+        error("--" + opt + " " + _("option requires an argument"));
+
+    process_opt(option, std::string("--") + opt, value);
+}
+
+// Parses a main()-style argument vector.
+//
+// argv[0] supplies the program name (its basename) unless prog() was set
+// explicitly; argv[1..argc) are the arguments to parse. An empty vector
+// (argc < 1, or a null argv / argv[0]) is parsed as "no arguments" rather
+// than being dereferenced.
+Values& OptionParser::parse_args(const int argc, char const* const* const argv)
+{
+    if (argc < 1 || argv == NULL || argv[0] == NULL)
+        return parse_args(std::vector<std::string>());
+    if (prog() == "") prog(basename(argv[0]));
+    return parse_args(&argv[1], &argv[argc]);
+}
+// Parses the arguments in `v` (program name not included) and returns the
+// resulting option values. Non-option arguments are collected in _leftover.
+//
+// NOTE(review): every call registers the automatic --version / --help
+// options again and appends to _leftover - presumably meant to be called
+// once per parser; confirm.
+Values& OptionParser::parse_args(const std::vector<std::string>& v)
+{
+
+    _remaining.assign(v.begin(), v.end());
+
+    // automatic options are added last, then moved to the front of the list
+    if (add_version_option() and version() != "") {
+        add_option("--version").action("version").help(_("show program's version number and exit"));
+        _opts.splice(_opts.begin(), _opts, --(_opts.end()));
+    }
+    if (add_help_option()) {
+        add_option("-h", "--help").action("help").help(_("show this help message and exit"));
+        _opts.splice(_opts.begin(), _opts, --(_opts.end()));
+    }
+
+    // main loop: the handlers pop what they consume from _remaining
+    while (not _remaining.empty()) {
+        const std::string arg = _remaining.front();
+
+        // "--" ends option processing; everything after it is positional
+        if (arg == "--") {
+            _remaining.pop_front();
+            break;
+        }
+
+        if (arg.substr(0, 2) == "--") {
+            handle_long_opt(arg.substr(2));
+        } else if (arg.substr(0, 1) == "-" and arg.length() > 1) {
+            handle_short_opt(arg.substr(1, 1), arg);
+        } else {
+            // positional argument (a lone "-" ends up here too)
+            _remaining.pop_front();
+            _leftover.push_back(arg);
+            if (not interspersed_args()) break;
+        }
+    }
+    // whatever is left after "--" or an early stop is positional
+    while (not _remaining.empty()) {
+        const std::string arg = _remaining.front();
+        _remaining.pop_front();
+        _leftover.push_back(arg);
+    }
+
+    // fill in defaults for destinations not set so far: parser-level
+    // defaults first, then per-option defaults
+    for (strMap::const_iterator it = _defaults.begin(); it != _defaults.end(); ++it) {
+        if (not _values.is_set(it->first)) _values[it->first] = it->second;
+    }
+
+    for (std::list<Option>::const_iterator it = _opts.begin(); it != _opts.end(); ++it) {
+        if (it->get_default() != "" and not _values.is_set(it->dest()))
+            _values[it->dest()] = it->get_default();
+    }
+
+    return _values;
+}
+
+// Carries out the action configured for option `o`.
+//
+//   opt    the option as written on the command line ("-f" / "--file")
+//   value  its argument, or "" for options that take none
+//
+// "help" and "version" print and exit the process; "callback" invokes the
+// callback when one is set. Every storing action updates _values and marks
+// the destination as set by the user. Unknown actions are ignored.
+void OptionParser::process_opt(const Option& o, const std::string& opt, const std::string& value)
+{
+    const std::string act = o.action();
+
+    // actions that never store anything
+    if (act == "help") {
+        print_help();
+        std::exit(0);
+    }
+    if (act == "version") {
+        print_version();
+        std::exit(0);
+    }
+    if (act == "callback") {
+        if (o.callback()) (*o.callback())(o, opt, value, *this);
+        return;
+    }
+
+    const bool takes_value = (act == "store" || act == "append");
+    const bool takes_const = (act == "store_const" || act == "append_const");
+
+    if (takes_value) {
+        const std::string err = o.check_type(opt, value);
+        if (err != "") error(err);
+        _values[o.dest()] = value;
+        if (act == "append") _values.all(o.dest()).push_back(value);
+    } else if (takes_const) {
+        _values[o.dest()] = o.get_const();
+        if (act == "append_const") _values.all(o.dest()).push_back(o.get_const());
+    } else if (act == "store_true") {
+        _values[o.dest()] = "1";
+    } else if (act == "store_false") {
+        _values[o.dest()] = "0";
+    } else if (act == "count") {
+        _values[o.dest()] = str_inc(_values[o.dest()]);
+    } else {
+        return;
+    }
+
+    _values.is_set_by_user(o.dest(), true);
+}
+
+// Returns the concatenated help entries of this parser's own options, each
+// indented by `indent`. Options whose help text is SUPPRESS_HELP are left out.
+std::string OptionParser::format_option_help(unsigned int indent /* = 2 */) const
+{
+    std::ostringstream out;
+    for (std::list<Option>::const_iterator opt = _opts.begin(); opt != _opts.end(); ++opt) {
+        if (opt->help() == SUPPRESS_HELP) continue;
+        out << opt->format_help(indent);
+    }
+    return out.str();
+}
+
+// Builds the complete help text: usage line, description, the parser's own
+// options, one section per option group, and finally the epilog.
+std::string OptionParser::format_help() const
+{
+    std::ostringstream ss;
+
+    if (usage() != SUPPRESS_USAGE) ss << get_usage() << std::endl;
+
+    // description and epilog are wrapped to the terminal width, unindented
+    if (description() != "") ss << str_format(description(), 0, cols()) << std::endl;
+
+    ss << _("Options") << ":" << std::endl;
+    ss << format_option_help();
+
+    for (std::list<OptionGroup const*>::const_iterator it = _groups.begin(); it != _groups.end();
+         ++it) {
+        const OptionGroup& group = **it;
+        // group title at indent 2, its description and options at indent 4
+        ss << std::endl << "  " << group.title() << ":" << std::endl;
+        if (group.group_description() != "")
+            ss << str_format(group.group_description(), 4, cols()) << std::endl;
+        ss << group.format_option_help(4);
+    }
+
+    if (epilog() != "") ss << std::endl << str_format(epilog(), 0, cols());
+
+    return ss.str();
+}
+// Writes the help text to standard output.
+void OptionParser::print_help() const { std::cout << format_help(); }
+
+// Stores the usage string, dropping a leading "usage: " (matched
+// case-insensitively); format_usage() adds its own "Usage: " prefix.
+void OptionParser::set_usage(const std::string& u)
+{
+    static const char prefix[] = "usage: ";
+    const size_t prefix_len = sizeof(prefix) - 1;
+
+    // Only the first few characters matter, so compare them directly
+    // instead of lower-casing a copy of the whole string.
+    bool has_prefix = u.length() >= prefix_len;
+    for (size_t i = 0; has_prefix && i < prefix_len; ++i) {
+        // cast first: passing a negative char to tolower() is undefined behaviour
+        has_prefix = (::tolower(static_cast<unsigned char>(u[i])) == prefix[i]);
+    }
+
+    _usage = has_prefix ? u.substr(prefix_len) : u;
+}
+// Returns "Usage: <u>" followed by a newline.
+std::string OptionParser::format_usage(const std::string& u) const
+{
+    std::ostringstream ss;
+    ss << _("Usage") << ": " << u << std::endl;
+    return ss.str();
+}
+// Returns the formatted usage line with "%prog" replaced by the program
+// name, or "" when usage output is suppressed.
+std::string OptionParser::get_usage() const
+{
+    if (usage() == SUPPRESS_USAGE) return std::string("");
+    return format_usage(str_replace(usage(), "%prog", prog()));
+}
+// Writes the usage line to `out`, followed by an empty line (get_usage()
+// already ends with a newline). Writes nothing when usage is suppressed.
+void OptionParser::print_usage(std::ostream& out) const
+{
+    std::string u = get_usage();
+    if (u != "") out << u << std::endl;
+}
+// Writes the usage line to standard output.
+void OptionParser::print_usage() const { print_usage(std::cout); }
+
+// Returns the version string with "%prog" replaced by the program name.
+std::string OptionParser::get_version() const { return str_replace(_version, "%prog", prog()); }
+// Writes the version string and a newline to `out`.
+void OptionParser::print_version(std::ostream& out) const { out << get_version() << std::endl; }
+// Writes the version string to standard output.
+void OptionParser::print_version() const { print_version(std::cout); }
+
+// Terminates the process with exit status 2.
+void OptionParser::exit() const { std::exit(2); }
+// Reports a command-line error: prints the usage line and
+// "<prog>: error: <msg>" to standard error, then calls exit().
+void OptionParser::error(const std::string& msg) const
+{
+    print_usage(std::cerr);
+    std::cerr << prog() << ": " << _("error") << ": " << msg << std::endl;
+    exit();
+}
+////////// } class OptionParser //////////
+
+////////// class Values { //////////
+// Read-only lookup: returns the value stored for `d`, or a reference to a
+// shared empty string when `d` has no value. Never inserts.
+const std::string& Values::operator[](const std::string& d) const
+{
+    static const std::string empty = "";
+    const strMap::const_iterator found = _map.find(d);
+    if (found == _map.end()) return empty;
+    return found->second;
+}
+// Records (yes == true) or clears (yes == false) the mark saying that `d`
+// was given explicitly on the command line.
+void Values::is_set_by_user(const std::string& d, bool yes)
+{
+    if (not yes) {
+        _userSet.erase(d);
+        return;
+    }
+    _userSet.insert(d);
+}
+////////// } class Values //////////
+
+////////// class Option { //////////
+// Validates `val` against this option's declared type.
+//
+//   opt  the option as written on the command line, used in the message
+//   val  the argument to validate
+//
+// Returns "" when the value is acceptable (types not listed below are not
+// checked), otherwise a complete error message.
+std::string Option::check_type(const std::string& opt, const std::string& val) const
+{
+    std::istringstream ss(val);
+    std::ostringstream err;
+
+    // numeric types: valid when a value can be extracted from the text
+    // (trailing characters after the number are not examined)
+    if (type() == "int" || type() == "long") {
+        long t;
+        if (not(ss >> t))
+            err << _("option") << " " << opt << ": " << _("invalid integer value") << ": '" << val
+                << "'";
+    } else if (type() == "float" || type() == "double") {
+        double t;
+        if (not(ss >> t))
+            err << _("option") << " " << opt << ": " << _("invalid floating-point value") << ": '"
+                << val << "'";
+    } else if (type() == "choice") {
+        // NOTE(review): assumes choices() returns a reference to one container;
+        // a by-value return would pair iterators of different temporaries - confirm
+        if (find(choices().begin(), choices().end(), val) == choices().end()) {
+            // list the accepted values, each in single quotes
+            std::list<std::string> tmp = choices();
+            transform(tmp.begin(), tmp.end(), tmp.begin(), str_wrap("'"));
+            err << _("option") << " " << opt << ": " << _("invalid choice") << ": '" << val << "'"
+                << " (" << _("choose from") << " " << str_join(", ", tmp.begin(), tmp.end()) << ")";
+        }
+    } else if (type() == "complex") {
+        std::complex<double> t;
+        if (not(ss >> t))
+            err << _("option") << " " << opt << ": " << _("invalid complex value") << ": '" << val
+                << "'";
+    }
+
+    return err.str();
+}
+
+// Returns the left-hand column of this option's help entry: `indent` spaces
+// followed by all short names and then all long names, comma-separated.
+//
+// For options that take an argument each name carries the metavar
+// ("-f FILE", "--file=FILE"); without an explicit metavar the upper-cased
+// type name is used. An option without any name shows just its metavar.
+std::string Option::format_option_help(unsigned int indent /* = 2 */) const
+{
+
+    std::string mvar_short, mvar_long;
+    if (nargs() == 1) {
+        std::string mvar = metavar();
+        if (mvar == "") {
+            mvar = type();
+            // cast first: passing a negative char to toupper() is undefined behaviour
+            for (std::string::iterator c = mvar.begin(); c != mvar.end(); ++c)
+                *c = static_cast<char>(::toupper(static_cast<unsigned char>(*c)));
+        }
+        mvar_short = " " + mvar;
+        mvar_long = "=" + mvar;
+    }
+
+    std::ostringstream ss;
+    ss << std::string(indent, ' ');
+
+    if (not _short_opts.empty()) {
+        ss << str_join_trans(", ", _short_opts.begin(), _short_opts.end(),
+                             str_wrap("-", mvar_short));
+        if (not _long_opts.empty()) ss << ", ";
+    }
+    if (not _long_opts.empty())
+        ss << str_join_trans(", ", _long_opts.begin(), _long_opts.end(), str_wrap("--", mvar_long));
+
+    if (_short_opts.empty() && _long_opts.empty()) ss << metavar();
+
+    return ss.str();
+}
+
+// Returns this option's complete help entry: the option names in a left
+// column and the wrapped help text in a right column.
+//
+// The left column is 30% of the terminal width, capped at 36 characters.
+// Names that do not fit are followed by a line break and the help text
+// starts on its own indented line. "%default" in the help text is replaced
+// by the option's default value when one is set.
+std::string Option::format_help(unsigned int indent /* = 2 */) const
+{
+    std::ostringstream ss;
+    std::string h = format_option_help(indent);
+    unsigned int width = cols();
+    unsigned int opt_width = std::min(width * 3 / 10, 36u);
+    bool indent_first = false;
+    ss << h;
+    // if the option list is too long, start a new paragraph
+    // (written as length + 1 >= width so that a column width of 0, as on a
+    // very narrow terminal, cannot underflow and produce a huge padding)
+    if (h.length() + 1 >= opt_width) {
+        ss << std::endl;
+        indent_first = true;
+    } else {
+        ss << std::string(opt_width - h.length(), ' ');
+        if (help() == "") ss << std::endl;
+    }
+    if (help() != "") {
+        std::string help_str =
+            (get_default() != "") ? str_replace(help(), "%default", get_default()) : help();
+        ss << str_format(help_str, opt_width, width, indent_first);
+    }
+    return ss.str();
+}
+
+// Sets the action performed when the option is seen. Actions that never
+// consume an argument also set the argument count to 0.
+Option& Option::action(const std::string& a)
+{
+    static const char* const no_arg_actions[] = {"store_const", "store_true", "store_false",
+                                                 "append_const", "count", "help", "version"};
+    const size_t n_actions = sizeof(no_arg_actions) / sizeof(no_arg_actions[0]);
+
+    _action = a;
+    for (size_t i = 0; i < n_actions; ++i) {
+        if (a == no_arg_actions[i]) {
+            nargs(0);
+            break;
+        }
+    }
+    return *this;
+}
+////////// } class Option //////////
+}
diff --git a/tools/common/OptionParser.h b/tools/common/OptionParser.h

new file mode 100644 (file)

index 0000000..ac5b788
--- /dev/null
+++ b/tools/common/OptionParser.h
@@ -0,0 +1,454 @@
+/**
+ * Copyright (C) 2010 Johannes Weißl <jargon@molb.org>
+ * License: your favourite BSD-style license
+ *
+ * git clone http://github.com/weisslj/cpp-optparse.git
+ *
+ * This is yet another option parser for C++. It is modelled after the
+ * excellent Python optparse API. Although incomplete, anyone familiar with
+ * optparse should feel at home:
+ * http://docs.python.org/library/optparse.html
+ *
+ * Design decisions:
+ * - elegant and easy usage more important than speed / flexibility
+ * - shortness more important than feature completeness
+ *   * no unicode
+ *   * no checking for user programming errors
+ *
+ * Why not use getopt/getopt_long?
+ * - not C++ / not completely POSIX
+ * - too cumbersome to use, would need lot of additional code
+ *
+ * Why not use Boost.Program_options?
+ * - boost not installed on all target platforms (esp. cluster, HPC, ...)
+ * - too big to include just for option handling:
+ *   322 *.h (44750 lines) + 7 *.cpp (2078 lines)
+ *
+ * Why not use tclap/Opag/Options/CmdLine/Anyoption/Argument_helper/...?
+ * - no reason, writing one is faster than code inspection :-)
+ * - similarity to Python desired for faster learning curve
+ *
+ * Future work:
+ * - nargs > 1?
+ * - comments?
+ *
+ * Python only features:
+ * - conflict handlers
+ * - adding new actions
+ *
+ *
+ * Example:
+ *
+ * using optparse::OptionParser;
+ *
+ * OptionParser parser = OptionParser() .description("just an example");
+ *
+ * parser.add_option("-f", "--file") .dest("filename")
+ *                   .help("write report to FILE") .metavar("FILE");
+ * parser.add_option("-q", "--quiet")
+ *                   .action("store_false") .dest("verbose") .set_default("1")
+ *                   .help("don't print status messages to stdout");
+ * 
+ * optparse::Values options = parser.parse_args(argc, argv);
+ * vector<string> args = parser.args();
+ *
+ * if (options.get("verbose"))
+ *     cout << options["filename"] << endl;
+ *
+ */
+
+#ifndef OPTIONPARSER_H_
+#define OPTIONPARSER_H_
+
+#include <cstddef>
+#include <iostream>
+#include <list>
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace optparse {
+
+class OptionParser;
+class OptionGroup;
+class Option;
+class Values;
+class Value;
+class Callback;
+
+typedef std::map<std::string, std::string> strMap;
+typedef std::map<std::string, std::list<std::string> > lstMap;
+typedef std::map<std::string, Option const*> optMap;
+
+const char* const SUPPRESS_HELP =
+    "SUPPRESS"
+    "HELP";
+const char* const SUPPRESS_USAGE =
+    "SUPPRESS"
+    "USAGE";
+
+//! Class for automatic conversion from string -> anytype
+//!
+//! Wraps the text of one option value and converts it on demand through
+//! stream extraction. A default-constructed (unset) Value, or text that
+//! cannot be parsed as the requested type, converts to 0 / false.
+//!
+//! All conversions are const, so they also work on a const Value.
+class Value
+{
+public:
+    Value() : str(), valid(false) {}
+    Value(const std::string& v) : str(v), valid(true) {}
+    //! The raw text; the pointer is valid only while this Value is alive.
+    operator const char*() const { return str.c_str(); }
+    //! Stream-style bool parsing: "1" is true, "0" is false.
+    operator bool() const { return convert<bool>(); }
+    operator short() const { return convert<short>(); }
+    operator unsigned short() const { return convert<unsigned short>(); }
+    operator int() const { return convert<int>(); }
+    operator unsigned int() const { return convert<unsigned int>(); }
+    operator long() const { return convert<long>(); }
+    operator unsigned long() const { return convert<unsigned long>(); }
+    operator float() const { return convert<float>(); }
+    operator double() const { return convert<double>(); }
+    operator long double() const { return convert<long double>(); }
+
+private:
+    //! Shared implementation of the numeric/bool conversions.
+    template <typename T>
+    T convert() const
+    {
+        T parsed = T();
+        if (!valid) return T();
+        std::istringstream in(str);
+        if (in >> parsed) return parsed;
+        return T();
+    }
+
+    const std::string str;
+    bool valid;
+};
+
+//! Parsed option values, keyed by destination name.
+//!
+//! Keeps three things per destination: the current value, the list of all
+//! values collected by the "append" actions, and whether the value came
+//! from the command line rather than from a default.
+class Values
+{
+public:
+    Values() : _map() {}
+    //! Read-only lookup; defined in OptionParser.cpp.
+    const std::string& operator[](const std::string& d) const;
+    //! Read/write access; creates an empty entry when `d` is not present.
+    std::string& operator[](const std::string& d) { return _map[d]; }
+    //! True if `d` has a value, whether given by the user or by a default.
+    bool is_set(const std::string& d) const { return _map.find(d) != _map.end(); }
+    //! True if `d` was marked as given explicitly on the command line.
+    bool is_set_by_user(const std::string& d) const { return _userSet.find(d) != _userSet.end(); }
+    void is_set_by_user(const std::string& d, bool yes);
+    //! The value of `d` wrapped for conversion; an unset Value if `d` has none.
+    Value get(const std::string& d) const { return (is_set(d)) ? Value((*this)[d]) : Value(); }
+
+    typedef std::list<std::string>::iterator iterator;
+    typedef std::list<std::string>::const_iterator const_iterator;
+    //! All values appended to `d`; creates an empty list when there is none.
+    std::list<std::string>& all(const std::string& d) { return _appendMap[d]; }
+    //! All values appended to `d`; a shared empty list when there is none
+    //! (an unknown key must not be dereferenced).
+    const std::list<std::string>& all(const std::string& d) const
+    {
+        static const std::list<std::string> empty;
+        const lstMap::const_iterator it = _appendMap.find(d);
+        return (it != _appendMap.end()) ? it->second : empty;
+    }
+
+private:
+    strMap _map;
+    lstMap _appendMap;
+    std::set<std::string> _userSet;
+};
+
+class OptionParser  // Command-line option parser configured through chainable setters.
+{
+public:
+    OptionParser();
+    virtual ~OptionParser() {}  // virtual: OptionGroup derives from this class
+
+    OptionParser& usage(const std::string& u)  // forwards to set_usage(); chainable
+    {
+        set_usage(u);
+        return *this;
+    }
+    OptionParser& version(const std::string& v)  // stores the version string; chainable
+    {
+        _version = v;
+        return *this;
+    }
+    OptionParser& description(const std::string& d)  // stores the description; chainable
+    {
+        _description = d;
+        return *this;
+    }
+    OptionParser& add_help_option(bool h)  // stores the flag in _add_help_option; chainable
+    {
+        _add_help_option = h;
+        return *this;
+    }
+    OptionParser& add_version_option(bool v)  // stores the flag in _add_version_option
+    {
+        _add_version_option = v;
+        return *this;
+    }
+    OptionParser& prog(const std::string& p)  // stores the program name; chainable
+    {
+        _prog = p;
+        return *this;
+    }
+    OptionParser& epilog(const std::string& e)  // stores the epilog text; chainable
+    {
+        _epilog = e;
+        return *this;
+    }
+    OptionParser& set_defaults(const std::string& dest, const std::string& val)  // _defaults[dest]
+    {
+        _defaults[dest] = val;
+        return *this;
+    }
+    OptionParser& enable_interspersed_args()  // sets _interspersed_args to true
+    {
+        _interspersed_args = true;
+        return *this;
+    }
+    OptionParser& disable_interspersed_args()  // sets _interspersed_args to false
+    {
+        _interspersed_args = false;
+        return *this;
+    }
+    OptionParser& add_option_group(const OptionGroup& group);  // defined out of line
+
+    const std::string& usage() const { return _usage; }  // read accessors for the setters above
+    const std::string& version() const { return _version; }
+    const std::string& description() const { return _description; }
+    bool add_help_option() const { return _add_help_option; }
+    bool add_version_option() const { return _add_version_option; }
+    const std::string& prog() const { return _prog; }
+    const std::string& epilog() const { return _epilog; }
+    bool interspersed_args() const { return _interspersed_args; }
+
+    Option& add_option(const std::string& opt);  // overloads for one, two, three or many names
+    Option& add_option(const std::string& opt1, const std::string& opt2);
+    Option& add_option(const std::string& opt1, const std::string& opt2, const std::string& opt3);
+    Option& add_option(const std::vector<std::string>& opt);
+
+    Values& parse_args(int argc, char const* const* argv);  // defined out of line
+    Values& parse_args(const std::vector<std::string>& args);  // defined out of line
+    template <typename InputIterator>
+    Values& parse_args(InputIterator begin, InputIterator end)  // copies [begin, end) and forwards
+    {
+        return parse_args(std::vector<std::string>(begin, end));
+    }
+
+    const std::list<std::string>& args() const { return _leftover; }  // const: reference to list
+    std::vector<std::string> args()  // non-const: returns a vector copy of _leftover
+    {
+        return std::vector<std::string>(_leftover.begin(), _leftover.end());
+    }
+
+    std::string format_help() const;
+    std::string format_option_help(unsigned int indent = 2) const;
+    void print_help() const;
+
+    void set_usage(const std::string& u);  // called by usage(u)
+    std::string get_usage() const;
+    void print_usage(std::ostream& out) const;
+    void print_usage() const;
+
+    std::string get_version() const;
+    void print_version(std::ostream& out) const;
+    void print_version() const;
+
+    void error(const std::string& msg) const;
+    void exit() const;
+
+private:
+    const Option& lookup_short_opt(const std::string& opt) const;
+    const Option& lookup_long_opt(const std::string& opt) const;
+
+    void handle_short_opt(const std::string& opt, const std::string& arg);
+    void handle_long_opt(const std::string& optstr);
+
+    void process_opt(const Option& option, const std::string& opt, const std::string& value);
+
+    std::string format_usage(const std::string& u) const;
+
+    std::string _usage;
+    std::string _version;
+    std::string _description;
+    bool _add_help_option;
+    bool _add_version_option;
+    std::string _prog;
+    std::string _epilog;
+    bool _interspersed_args;
+
+    Values _values;  // presumably the object returned by parse_args() -- confirm in the .cpp
+
+    std::list<Option> _opts;
+    optMap _optmap_s;  // presumably short-option lookup table (cf. lookup_short_opt) -- confirm
+    optMap _optmap_l;  // presumably long-option lookup table (cf. lookup_long_opt) -- confirm
+    strMap _defaults;  // written by set_defaults()
+    std::list<OptionGroup const*> _groups;  // NOTE(review): raw pointers; lifetime not shown here
+
+    std::list<std::string> _remaining;
+    std::list<std::string> _leftover;  // exposed through both args() overloads
+};
+
+class OptionGroup : public OptionParser
+{
+public:
+    OptionGroup(const OptionParser& /* p */, const std::string& t, const std::string& d = "")
+        : /* _parser(p), */ _title(t), _group_description(d)
+    {
+    }
+    virtual ~OptionGroup() {}
+
+    OptionGroup& title(const std::string& t)
+    {
+        _title = t;
+        return *this;
+    }
+    OptionGroup& group_description(const std::string& d)
+    {
+        _group_description = d;
+        return *this;
+    }
+    const std::string& title() const { return _title; }
+    const std::string& group_description() const { return _group_description; }
+
+private:
+    /* const OptionParser& _parser; */
+    std::string _title;
+    std::string _group_description;
+};
+
+class Option  // Description of one command-line option, configured through chainable setters.
+{
+public:
+    Option() : _action("store"), _type("string"), _nargs(1), _callback(0) {}  // defaults
+    virtual ~Option() {}
+
+    Option& action(const std::string& a);  // defined out of line
+    Option& type(const std::string& t)  // stores the type name; chainable
+    {
+        _type = t;
+        return *this;
+    }
+    Option& dest(const std::string& d)  // stores the destination name; chainable
+    {
+        _dest = d;
+        return *this;
+    }
+    Option& set_default(const std::string& d)  // stores the default as given
+    {
+        _default = d;
+        return *this;
+    }
+    template <typename T>
+    Option& set_default(T t)  // any streamable T: stored as its operator<< text
+    {
+        std::ostringstream ss;
+        ss << t;
+        _default = ss.str();
+        return *this;
+    }
+    Option& nargs(size_t n)  // stores the argument count; chainable
+    {
+        _nargs = n;
+        return *this;
+    }
+    Option& set_const(const std::string& c)  // stores the constant value; chainable
+    {
+        _const = c;
+        return *this;
+    }
+    template <typename InputIterator>
+    Option& choices(InputIterator begin, InputIterator end)  // also switches type to "choice"
+    {
+        _choices.assign(begin, end);
+        type("choice");
+        return *this;
+    }
+    template <typename InputEnumerable>
+    Option& choices(InputEnumerable enumerable)  // container form; needs begin()/end()
+    {
+        _choices.assign(enumerable.begin(), enumerable.end());
+        type("choice");
+        return *this;
+    }
+    Option& help(const std::string& h)  // stores the help text; chainable
+    {
+        _help = h;
+        return *this;
+    }
+    Option& metavar(const std::string& m)  // stores the metavar; chainable
+    {
+        _metavar = m;
+        return *this;
+    }
+    Option& callback(Callback& c)  // stores the address only; NOTE(review): lifetime is caller's
+    {
+        _callback = &c;
+        return *this;
+    }
+
+    const std::string& action() const { return _action; }  // read accessors for the setters above
+    const std::string& type() const { return _type; }
+    const std::string& dest() const { return _dest; }
+    const std::string& get_default() const { return _default; }
+    size_t nargs() const { return _nargs; }
+    const std::string& get_const() const { return _const; }
+    const std::list<std::string>& choices() const { return _choices; }
+    const std::string& help() const { return _help; }
+    const std::string& metavar() const { return _metavar; }
+    Callback* callback() const { return _callback; }  // null unless callback(c) was called
+
+private:
+    std::string check_type(const std::string& opt, const std::string& val) const;
+    std::string format_option_help(unsigned int indent = 2) const;
+    std::string format_help(unsigned int indent = 2) const;
+
+    std::set<std::string> _short_opts;  // no public setter here; reachable via the friend below
+    std::set<std::string> _long_opts;
+
+    std::string _action;
+    std::string _type;
+    std::string _dest;
+    std::string _default;
+    size_t _nargs;
+    std::string _const;
+    std::list<std::string> _choices;
+    std::string _help;
+    std::string _metavar;
+    Callback* _callback;  // non-owning; never deleted in this class
+
+    friend class OptionParser;  // grants OptionParser access to the private members
+};
+
+class Callback  // Abstract functor interface; an Option can hold a pointer to one.
+{
+public:
+    virtual void operator()(const Option& option, const std::string& opt, const std::string& val,
+                            const OptionParser& parser) = 0;  // pure virtual: implementers override
+    virtual ~Callback() {}  // virtual so derived callbacks can be destroyed through the base
+};
+}
+
+#endif
diff --git a/tools/format-all b/tools/format-all

new file mode 100755 (executable)

index 0000000..27a11b4
--- /dev/null
+++ b/tools/format-all
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+# This command can be run by the user to clang-format everything.
+
+PLATFORM=$(uname)
+TOOLSPATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"
+CLANGFORMAT="${TOOLSPATH}/${PLATFORM}/clang-format -style=file"
+
+find include src tests/src tools \( -name *.cpp -or -name *.h \) -not -name pugi* -print0 | xargs -n1 -0 ${CLANGFORMAT} -i
diff --git a/tools/git-clang-format b/tools/git-clang-format

new file mode 100755 (executable)

index 0000000..0c45762
--- /dev/null
+++ b/tools/git-clang-format
@@ -0,0 +1,485 @@
+#!/usr/bin/env python
+#
+#===- git-clang-format - ClangFormat Git Integration ---------*- python -*--===#
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===------------------------------------------------------------------------===#
+
+r"""                                                                             
+clang-format git integration                                                     
+============================                                                     
+                                                                                 
+This file provides a clang-format integration for git. Put it somewhere in your  
+path and ensure that it is executable. Then, "git clang-format" will invoke      
+clang-format on the changes in current files or a specific commit.               
+                                                                                 
+For further details, run:                                                        
+git clang-format -h                                                              
+                                                                                 
+Requires Python 2.7                                                              
+"""               
+
+import argparse
+import collections
+import contextlib
+import errno
+import os
+import re
+import subprocess
+import sys
+
+usage = 'git clang-format [OPTIONS] [<commit>] [--] [<file>...]'
+
+desc = '''
+Run clang-format on all lines that differ between the working directory
+and <commit>, which defaults to HEAD.  Changes are only applied to the working
+directory.
+
+The following git-config settings set the default of the corresponding option:
+  clangFormat.binary
+  clangFormat.commit
+  clangFormat.extension
+  clangFormat.style
+'''
+
+# Name of the temporary index file in which save the output of clang-format.
+# This file is created within the .git directory.
+temp_index_basename = 'clang-format-index'
+
+
+Range = collections.namedtuple('Range', 'start, count')
+
+
+def main():
+  config = load_git_config()
+
+  # In order to keep '--' yet allow options after positionals, we need to
+  # check for '--' ourselves.  (Setting nargs='*' throws away the '--', while
+  # nargs=argparse.REMAINDER disallows options after positionals.)
+  argv = sys.argv[1:]
+  try:
+    idx = argv.index('--')
+  except ValueError:
+    dash_dash = []
+  else:
+    dash_dash = argv[idx:]
+    argv = argv[:idx]
+
+  default_extensions = ','.join([
+      # From clang/lib/Frontend/FrontendOptions.cpp, all lower case
+      'c', 'h',  # C
+      'm',  # ObjC
+      'mm',  # ObjC++
+      'cc', 'cp', 'cpp', 'c++', 'cxx', 'hpp',  # C++
+      # Other languages that clang-format supports
+      'proto', 'protodevel',  # Protocol Buffers
+      'js',  # JavaScript
+      'ts',  # TypeScript
+      ])
+
+  p = argparse.ArgumentParser(
+    usage=usage, formatter_class=argparse.RawDescriptionHelpFormatter,
+    description=desc)
+  p.add_argument('--binary',
+                 default=config.get('clangformat.binary', 'clang-format'),
+                 help='path to clang-format'),
+  p.add_argument('--commit',
+                 default=config.get('clangformat.commit', 'HEAD'),
+                 help='default commit to use if none is specified'),
+  p.add_argument('--diff', action='store_true',
+                 help='print a diff instead of applying the changes')
+  p.add_argument('--extensions',
+                 default=config.get('clangformat.extensions',
+                                    default_extensions),
+                 help=('comma-separated list of file extensions to format, '
+                       'excluding the period and case-insensitive')),
+  p.add_argument('-f', '--force', action='store_true',
+                 help='allow changes to unstaged files')
+  p.add_argument('-p', '--patch', action='store_true',
+                 help='select hunks interactively')
+  p.add_argument('-q', '--quiet', action='count', default=0,
+                 help='print less information')
+  p.add_argument('--style',
+                 default=config.get('clangformat.style', None),
+                 help='passed to clang-format'),
+  p.add_argument('-v', '--verbose', action='count', default=0,
+                 help='print extra information')
+  # We gather all the remaining positional arguments into 'args' since we need
+  # to use some heuristics to determine whether or not <commit> was present.
+  # However, to print pretty messages, we make use of metavar and help.
+  p.add_argument('args', nargs='*', metavar='<commit>',
+                 help='revision from which to compute the diff')
+  p.add_argument('ignored', nargs='*', metavar='<file>...',
+                 help='if specified, only consider differences in these files')
+  opts = p.parse_args(argv)
+
+  opts.verbose -= opts.quiet
+  del opts.quiet
+
+  commit, files = interpret_args(opts.args, dash_dash, opts.commit)
+  changed_lines = compute_diff_and_extract_lines(commit, files)
+  if opts.verbose >= 1:
+    ignored_files = set(changed_lines)
+  filter_by_extension(changed_lines, opts.extensions.lower().split(','))
+  if opts.verbose >= 1:
+    ignored_files.difference_update(changed_lines)
+    if ignored_files:
+      print 'Ignoring changes in the following files (wrong extension):'
+      for filename in ignored_files:
+        print '   ', filename
+    if changed_lines:
+      print 'Running clang-format on the following files:'
+      for filename in changed_lines:
+        print '   ', filename
+  if not changed_lines:
+    print 'no modified files to format'
+    return
+  # The computed diff outputs absolute paths, so we must cd before accessing
+  # those files.
+  cd_to_toplevel()
+  old_tree = create_tree_from_workdir(changed_lines)
+  new_tree = run_clang_format_and_save_to_tree(changed_lines,
+                                               binary=opts.binary,
+                                               style=opts.style)
+  if opts.verbose >= 1:
+    print 'old tree:', old_tree
+    print 'new tree:', new_tree
+  if old_tree == new_tree:
+    if opts.verbose >= 0:
+      print 'clang-format did not modify any files'
+  elif opts.diff:
+    print_diff(old_tree, new_tree)
+  else:
+    changed_files = apply_changes(old_tree, new_tree, force=opts.force,
+                                  patch_mode=opts.patch)
+    if (opts.verbose >= 0 and not opts.patch) or opts.verbose >= 1:
+      print 'changed files:'
+      for filename in changed_files:
+        print '   ', filename
+
+
+def load_git_config(non_string_options=None):
+  """Return the git configuration as a dictionary.
+
+  All options are assumed to be strings unless in `non_string_options`, in which
+  is a dictionary mapping option name (in lower case) to either "--bool" or
+  "--int"."""
+  if non_string_options is None:
+    non_string_options = {}
+  out = {}
+  for entry in run('git', 'config', '--list', '--null').split('\0'):
+    if entry:
+      name, value = entry.split('\n', 1)
+      if name in non_string_options:
+        value = run('git', 'config', non_string_options[name], name)
+      out[name] = value
+  return out
+
+
+def interpret_args(args, dash_dash, default_commit):
+  """Interpret `args` as "[commit] [--] [files...]" and return (commit, files).
+
+  It is assumed that "--" and everything that follows has been removed from
+  args and placed in `dash_dash`.
+
+  If "--" is present (i.e., `dash_dash` is non-empty), the argument to its
+  left (if present) is taken as commit.  Otherwise, the first argument is
+  checked if it is a commit or a file.  If commit is not given,
+  `default_commit` is used."""
+  if dash_dash:
+    if len(args) == 0:
+      commit = default_commit
+    elif len(args) > 1:
+      die('at most one commit allowed; %d given' % len(args))
+    else:
+      commit = args[0]
+    object_type = get_object_type(commit)
+    if object_type not in ('commit', 'tag'):
+      if object_type is None:
+        die("'%s' is not a commit" % commit)
+      else:
+        die("'%s' is a %s, but a commit was expected" % (commit, object_type))
+    files = dash_dash[1:]
+  elif args:
+    if disambiguate_revision(args[0]):
+      commit = args[0]
+      files = args[1:]
+    else:
+      commit = default_commit
+      files = args
+  else:
+    commit = default_commit
+    files = []
+  return commit, files
+
+
+def disambiguate_revision(value):
+  """Returns True if `value` is a revision, False if it is a file, or dies."""
+  # If `value` is ambiguous (neither a commit nor a file), the following
+  # command will die with an appropriate error message.
+  run('git', 'rev-parse', value, verbose=False)
+  object_type = get_object_type(value)
+  if object_type is None:
+    return False
+  if object_type in ('commit', 'tag'):
+    return True
+  die('`%s` is a %s, but a commit or filename was expected' %
+      (value, object_type))
+
+
+def get_object_type(value):
+  """Returns a string description of an object's type, or None if it is not
+  a valid git object."""
+  cmd = ['git', 'cat-file', '-t', value]
+  p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+  stdout, stderr = p.communicate()
+  if p.returncode != 0:
+    return None
+  return stdout.strip()
+
+
+def compute_diff_and_extract_lines(commit, files):
+  """Calls compute_diff() followed by extract_lines()."""
+  diff_process = compute_diff(commit, files)
+  changed_lines = extract_lines(diff_process.stdout)
+  diff_process.stdout.close()
+  diff_process.wait()
+  if diff_process.returncode != 0:
+    # Assume error was already printed to stderr.
+    sys.exit(2)
+  return changed_lines
+
+
+def compute_diff(commit, files):
+  """Return a subprocess object producing the diff from `commit`.
+
+  The return value's `stdin` file object will produce a patch with the
+  differences between the working directory and `commit`, filtered on `files`
+  (if non-empty).  Zero context lines are used in the patch."""
+  cmd = ['git', 'diff-index', '-p', '-U0', commit, '--']
+  cmd.extend(files)
+  p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+  p.stdin.close()
+  return p
+
+
+def extract_lines(patch_file):
+  """Extract the changed lines in `patch_file`.
+
+  The return value is a dictionary mapping filename to a list of (start_line,
+  line_count) pairs.
+
+  The input must have been produced with ``-U0``, meaning unidiff format with
+  zero lines of context.  The return value is a dict mapping filename to a
+  list of line `Range`s."""
+  matches = {}
+  for line in patch_file:
+    match = re.search(r'^\+\+\+\ [^/]+/(.*)', line)
+    if match:
+      filename = match.group(1).rstrip('\r\n')
+    match = re.search(r'^@@ -[0-9,]+ \+(\d+)(,(\d+))?', line)
+    if match:
+      start_line = int(match.group(1))
+      line_count = 1
+      if match.group(3):
+        line_count = int(match.group(3))
+      if line_count > 0:
+        matches.setdefault(filename, []).append(Range(start_line, line_count))
+  return matches
+
+
+def filter_by_extension(dictionary, allowed_extensions):
+  """Delete every key in `dictionary` that doesn't have an allowed extension.
+
+  `allowed_extensions` must be a collection of lowercase file extensions,
+  excluding the period."""
+  allowed_extensions = frozenset(allowed_extensions)
+  for filename in dictionary.keys():
+    base_ext = filename.rsplit('.', 1)
+    if len(base_ext) == 1 or base_ext[1].lower() not in allowed_extensions:
+      del dictionary[filename]
+
+
+def cd_to_toplevel():
+  """Change to the top level of the git repository."""
+  toplevel = run('git', 'rev-parse', '--show-toplevel')
+  os.chdir(toplevel)
+
+
+def create_tree_from_workdir(filenames):
+  """Create a new git tree with the given files from the working directory.
+
+  Returns the object ID (SHA-1) of the created tree."""
+  return create_tree(filenames, '--stdin')
+
+
+def run_clang_format_and_save_to_tree(changed_lines, binary='clang-format',
+                                      style=None):
+  """Run clang-format on each file and save the result to a git tree.
+
+  Returns the object ID (SHA-1) of the created tree."""
+  def index_info_generator():
+    for filename, line_ranges in changed_lines.iteritems():
+      mode = oct(os.stat(filename).st_mode)
+      blob_id = clang_format_to_blob(filename, line_ranges, binary=binary,
+                                     style=style)
+      yield '%s %s\t%s' % (mode, blob_id, filename)
+  return create_tree(index_info_generator(), '--index-info')
+
+
+def create_tree(input_lines, mode):
+  """Create a tree object from the given input.
+
+  If mode is '--stdin', it must be a list of filenames.  If mode is
+  '--index-info' is must be a list of values suitable for "git update-index
+  --index-info", such as "<mode> <SP> <sha1> <TAB> <filename>".  Any other mode
+  is invalid."""
+  assert mode in ('--stdin', '--index-info')
+  cmd = ['git', 'update-index', '--add', '-z', mode]
+  with temporary_index_file():
+    p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
+    for line in input_lines:
+      p.stdin.write('%s\0' % line)
+    p.stdin.close()
+    if p.wait() != 0:
+      die('`%s` failed' % ' '.join(cmd))
+    tree_id = run('git', 'write-tree')
+    return tree_id
+
+
+def clang_format_to_blob(filename, line_ranges, binary='clang-format',
+                         style=None):
+  """Run clang-format on the given file and save the result to a git blob.
+
+  Returns the object ID (SHA-1) of the created blob."""
+  clang_format_cmd = [binary, filename]
+  if style:  # the -style flag is only passed when a style was configured
+    clang_format_cmd.extend(['-style='+style])
+  clang_format_cmd.extend([
+      '-lines=%s:%s' % (start_line, start_line+line_count-1)  # start:end, end inclusive
+      for start_line, line_count in line_ranges])
+  try:
+    clang_format = subprocess.Popen(clang_format_cmd, stdin=subprocess.PIPE,
+                                    stdout=subprocess.PIPE)
+  except OSError as e:
+    if e.errno == errno.ENOENT:  # executable missing: report it by name
+      die('cannot find executable "%s"' % binary)
+    else:
+      raise
+  clang_format.stdin.close()  # nothing is written to clang-format's stdin
+  hash_object_cmd = ['git', 'hash-object', '-w', '--path='+filename, '--stdin']
+  hash_object = subprocess.Popen(hash_object_cmd, stdin=clang_format.stdout,  # piped in
+                                 stdout=subprocess.PIPE)
+  clang_format.stdout.close()  # hash-object now holds the read end of the pipe
+  stdout = hash_object.communicate()[0]
+  if hash_object.returncode != 0:
+    die('`%s` failed' % ' '.join(hash_object_cmd))
+  if clang_format.wait() != 0:  # checked only after hash-object has finished
+    die('`%s` failed' % ' '.join(clang_format_cmd))
+  return stdout.rstrip('\r\n')
+
+
+@contextlib.contextmanager
+def temporary_index_file(tree=None):
+  """Context manager for setting GIT_INDEX_FILE to a temporary file and deleting
+  the file afterward."""
+  index_path = create_temporary_index(tree)
+  old_index_path = os.environ.get('GIT_INDEX_FILE')
+  os.environ['GIT_INDEX_FILE'] = index_path
+  try:
+    yield
+  finally:
+    if old_index_path is None:
+      del os.environ['GIT_INDEX_FILE']
+    else:
+      os.environ['GIT_INDEX_FILE'] = old_index_path
+    os.remove(index_path)
+
+
+def create_temporary_index(tree=None):
+  """Create a temporary index file and return the created file's path.
+
+  If `tree` is not None, use that as the tree to read in.  Otherwise, an
+  empty index is created."""
+  gitdir = run('git', 'rev-parse', '--git-dir')
+  path = os.path.join(gitdir, temp_index_basename)
+  if tree is None:
+    tree = '--empty'
+  run('git', 'read-tree', '--index-output='+path, tree)
+  return path
+
+
+def print_diff(old_tree, new_tree):
+  """Print the diff between the two trees to stdout."""
+  # We use the porcelain 'diff' and not plumbing 'diff-tree' because the output
+  # is expected to be viewed by the user, and only the former does nice things
+  # like color and pagination.
+  subprocess.check_call(['git', 'diff', old_tree, new_tree, '--'])
+
+
+def apply_changes(old_tree, new_tree, force=False, patch_mode=False):
+  """Apply the changes in `new_tree` to the working directory.
+
+  Bails if there are local changes in those files and not `force`.  If
+  `patch_mode`, runs `git checkout --patch` to select hunks interactively."""
+  changed_files = run('git', 'diff-tree', '-r', '-z', '--name-only', old_tree,
+                      new_tree).rstrip('\0').split('\0')
+  if not force:
+    unstaged_files = run('git', 'diff-files', '--name-status', *changed_files)
+    if unstaged_files:
+      print >>sys.stderr, ('The following files would be modified but '
+                           'have unstaged changes:')
+      print >>sys.stderr, unstaged_files
+      print >>sys.stderr, 'Please commit, stage, or stash them first.'
+      sys.exit(2)
+  if patch_mode:
+    # In patch mode, we could just as well create an index from the new tree
+    # and checkout from that, but then the user will be presented with a
+    # message saying "Discard ... from worktree".  Instead, we use the old
+    # tree as the index and checkout from new_tree, which gives the slightly
+    # better message, "Apply ... to index and worktree".  This is not quite
+    # right, since it won't be applied to the user's index, but oh well.
+    with temporary_index_file(old_tree):
+      subprocess.check_call(['git', 'checkout', '--patch', new_tree])
+    index_tree = old_tree
+  else:
+    with temporary_index_file(new_tree):
+      run('git', 'checkout-index', '-a', '-f')
+  return changed_files
+
+
+def run(*args, **kwargs):
+  stdin = kwargs.pop('stdin', '')
+  verbose = kwargs.pop('verbose', True)
+  strip = kwargs.pop('strip', True)
+  for name in kwargs:
+    raise TypeError("run() got an unexpected keyword argument '%s'" % name)
+  p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                       stdin=subprocess.PIPE)
+  stdout, stderr = p.communicate(input=stdin)
+  if p.returncode == 0:
+    if stderr:
+      if verbose:
+        print >>sys.stderr, '`%s` printed to stderr:' % ' '.join(args)
+      print >>sys.stderr, stderr.rstrip()
+    if strip:
+      stdout = stdout.rstrip('\r\n')
+    return stdout
+  if verbose:
+    print >>sys.stderr, '`%s` returned %s' % (' '.join(args), p.returncode)
+  if stderr:
+    print >>sys.stderr, stderr.rstrip()
+  sys.exit(2)
+
+
+def die(message):
+  print >>sys.stderr, 'error:', message
+  sys.exit(2)
+
+
+if __name__ == '__main__':  # run only when executed as a script, not on import
+  main()
diff --git a/tools/meson.build b/tools/meson.build

new file mode 100644 (file)

index 0000000..2eded18
--- /dev/null
+++ b/tools/meson.build
@@ -0,0 +1,239 @@
+###########
+# bam2sam #
+###########
+
+pbbam_Bam2SamVersion_h_config = configuration_data()  # values for the header and the cram test
+pbbam_Bam2SamVersion_h_config.set('Bam2Sam_VERSION', meson.project_version())
+pbbam_Bam2SamVersion_h_config.set('PacBioBAM_BinDir', meson.current_build_dir())
+pbbam_Bam2SamVersion_h_config.set('PacBioBAM_TestsDir', join_paths([meson.current_source_dir(), '../tests']))
+
+pbbam_Bam2SamVersion_h = configure_file(  # generates Bam2SamVersion.h from its .in template
+  input : files('bam2sam/src/Bam2SamVersion.h.in'),
+  output : 'Bam2SamVersion.h',
+  configuration : pbbam_Bam2SamVersion_h_config)
+
+pbbam_bam2sam_cpp_sources = [pbbam_Bam2SamVersion_h]  # generated header first, then sources
+pbbam_bam2sam_cpp_sources += files([
+  'common/OptionParser.cpp',
+  'bam2sam/src/main.cpp',
+  'bam2sam/src/Bam2Sam.cpp'])
+
+pbbam_bam2sam = executable(
+  'bam2sam',
+  pbbam_bam2sam_cpp_sources,
+  dependencies : [pbbam_htslib_dep, pbbam_thread_dep, pbbam_zlib_dep],
+  include_directories : [pbbam_include_directories, include_directories('bam2sam')],
+  install : get_option('build-tools'),  # installed only when the 'build-tools' option is on
+  cpp_args : pbbam_warning_flags)
+
+# tests
+if get_option('tests')  # cram test is registered only when the 'tests' option is on
+  bam2sam_t = configure_file(  # reuses the same configuration data as the version header
+    input : pbbam_cram_bam2sam_t_in,
+    output : 'bam2sam.t',
+    configuration : pbbam_Bam2SamVersion_h_config)
+
+  test(
+    'bam2sam_CramTests',
+    pbbam_cram_script,
+    args : [
+      '--xunit-file=' + join_paths(meson.build_root(), 'pbbam-cram-bam2sam.xml'),
+      '--verbose'] + [
+        bam2sam_t],
+    timeout : 1800)
+endif
+
+###########
+# pbindex #
+###########
+
+pbbam_PbIndexVersion_h_config = configuration_data()  # only the version is substituted here
+pbbam_PbIndexVersion_h_config.set('PbIndex_VERSION', meson.project_version())
+pbbam_PbIndexVersion_h = configure_file(  # generates PbIndexVersion.h from its .in template
+  input : files('pbindex/src/PbIndexVersion.h.in'),
+  output : 'PbIndexVersion.h',
+  configuration : pbbam_PbIndexVersion_h_config)
+
+pbbam_pbindex_cpp_sources = [pbbam_PbIndexVersion_h]  # generated header first, then sources
+pbbam_pbindex_cpp_sources += files([
+  'common/OptionParser.cpp',
+  'pbindex/src/main.cpp',
+  'pbindex/src/PbIndex.cpp'])
+
+pbbam_pbindex = executable(  # no cram test is registered for this tool in this file
+  'pbindex',
+  pbbam_pbindex_cpp_sources,
+  dependencies : [pbbam_htslib_dep, pbbam_zlib_dep],
+  include_directories : [pbbam_include_directories, include_directories('pbindex')],
+  link_with : pbbam_lib,
+  install : get_option('build-tools'),  # installed only when the 'build-tools' option is on
+  cpp_args : pbbam_warning_flags)
+
+###############
+# pbindexdump #
+###############
+
+pbbam_PbIndexDumpVersion_h_config = configuration_data()  # values for header and cram tests
+pbbam_PbIndexDumpVersion_h_config.set('PbIndexDump_VERSION', meson.project_version())
+pbbam_PbIndexDumpVersion_h_config.set('PacBioBAM_BinDir', meson.current_build_dir())
+pbbam_PbIndexDumpVersion_h_config.set('PacBioBAM_TestsDir', join_paths([meson.current_source_dir(), '../tests']))
+pbbam_PbIndexDumpVersion_h = configure_file(  # generates PbIndexDumpVersion.h from its template
+  input : files('pbindexdump/src/PbIndexDumpVersion.h.in'),
+  output : 'PbIndexDumpVersion.h',
+  configuration : pbbam_PbIndexDumpVersion_h_config)
+
+pbbam_pbindexdump_cpp_sources = [pbbam_PbIndexDumpVersion_h]  # generated header first
+pbbam_pbindexdump_cpp_sources += files([
+  'common/OptionParser.cpp',
+  'pbindexdump/src/CppFormatter.cpp',
+  'pbindexdump/src/JsonFormatter.cpp',
+  'pbindexdump/src/PbIndexDump.cpp',
+  'pbindexdump/src/main.cpp'])
+
+pbbam_pbindexdump = executable(
+  'pbindexdump',
+  pbbam_pbindexdump_cpp_sources,
+  dependencies : [pbbam_htslib_dep, pbbam_zlib_dep],
+  include_directories : [pbbam_include_directories, include_directories('pbindexdump')],
+  link_with : pbbam_lib,
+  install : get_option('build-tools'),  # installed only when the 'build-tools' option is on
+  cpp_args : pbbam_warning_flags)
+
+# tests
+if get_option('tests')  # cram tests are registered only when the 'tests' option is on
+  pbindexdump_json_t = configure_file(  # both test scripts share the header's config data
+    input : pbbam_cram_pbindexdump_json_t_in,
+    output : 'pbindexdump_json.t',
+    configuration : pbbam_PbIndexDumpVersion_h_config)
+  pbindexdump_cpp_t = configure_file(
+    input : pbbam_cram_pbindexdump_cpp_t_in,
+    output : 'pbindexdump_cpp.t',
+    configuration : pbbam_PbIndexDumpVersion_h_config)
+
+  test(
+    'pbindexdump_CramTests',
+    pbbam_cram_script,
+    args : [
+      '--xunit-file=' + join_paths(meson.build_root(), 'pbbam-cram-pbindexdump.xml'),
+      '--verbose'] + [
+        pbindexdump_json_t,
+        pbindexdump_cpp_t],
+    timeout : 1800)
+endif
+
+###########
+# pbmerge #
+###########
+
+pbbam_PbMergeVersion_h_config = configuration_data()  # values for header and cram tests
+pbbam_PbMergeVersion_h_config.set('PbMerge_VERSION', meson.project_version())
+pbbam_PbMergeVersion_h_config.set('PacBioBAM_VERSION', meson.project_version())
+pbbam_PbMergeVersion_h_config.set('PacBioBAM_BinDir', meson.current_build_dir())
+pbbam_PbMergeVersion_h_config.set('PacBioBAM_TestsDir', join_paths([meson.current_source_dir(), '../tests']))
+pbbam_PbMergeVersion_h_config.set('GeneratedTestDataDir', join_paths(meson.current_build_dir()))
+pbbam_PbMergeVersion_h = configure_file(  # generates PbMergeVersion.h from its .in template
+  input : files('pbmerge/src/PbMergeVersion.h.in'),
+  output : 'PbMergeVersion.h',
+  configuration : pbbam_PbMergeVersion_h_config)
+
+pbbam_pbmerge_cpp_sources = [pbbam_PbMergeVersion_h]  # generated header first, then sources
+pbbam_pbmerge_cpp_sources += files([
+  'common/BamFileMerger.h',
+  'common/OptionParser.cpp',
+  'pbmerge/src/main.cpp'])
+
+pbbam_pbmerge = executable(
+  'pbmerge',
+  pbbam_pbmerge_cpp_sources,
+  dependencies : [pbbam_boost_dep, pbbam_htslib_dep, pbbam_zlib_dep],
+  include_directories : [pbbam_include_directories, include_directories('pbmerge')],
+  link_with : pbbam_lib,
+  install : get_option('build-tools'),  # installed only when the 'build-tools' option is on
+  cpp_args : pbbam_warning_flags)
+
+# tests
+if get_option('tests') and not get_option('auto-validate')  # skipped when 'auto-validate' is on
+  pbmerge_pacbio_ordering_t = configure_file(  # all five scripts share the header's config data
+    input : pbbam_cram_pbmerge_pacbio_ordering_t_in,
+    output : 'pbmerge_pacbio_ordering.t', configuration : pbbam_PbMergeVersion_h_config)
+  pbmerge_aligned_ordering_t = configure_file(
+    input : pbbam_cram_pbmerge_aligned_ordering_t_in,
+    output : 'pbmerge_aligned_ordering.t', configuration : pbbam_PbMergeVersion_h_config)
+  pbmerge_mixed_ordering_t = configure_file(
+    input : pbbam_cram_pbmerge_mixed_ordering_t_in,
+    output : 'pbmerge_mixed_ordering.t', configuration : pbbam_PbMergeVersion_h_config)
+  pbmerge_dataset_t = configure_file(
+    input : pbbam_cram_pbmerge_dataset_t_in,
+    output : 'pbmerge_dataset.t', configuration : pbbam_PbMergeVersion_h_config)
+  pbmerge_fofn_t = configure_file(
+    input : pbbam_cram_pbmerge_fofn_t_in,
+    output : 'pbmerge_fofn.t', configuration : pbbam_PbMergeVersion_h_config)
+
+  test(
+    'pbmerge_CramTests',
+    pbbam_cram_script,
+    args : [
+      '--xunit-file=' + join_paths(meson.build_root(), 'pbbam-cram-pbmerge.xml'),
+      '--verbose'] + [
+        pbmerge_pacbio_ordering_t,
+        pbmerge_aligned_ordering_t,
+        pbmerge_mixed_ordering_t,
+        pbmerge_dataset_t,
+        pbmerge_fofn_t],
+    timeout : 1800)
+endif
+
+############
+# pbbamify #
+############
+
+pbbam_PbBamifyVersion_h_config = configuration_data()  # values for header and cram test
+pbbam_PbBamifyVersion_h_config.set('PbBamify_VERSION', meson.project_version())
+pbbam_PbBamifyVersion_h_config.set('PacBioBAM_VERSION', meson.project_version())
+pbbam_PbBamifyVersion_h_config.set('PacBioBAM_BinDir', meson.current_build_dir())
+pbbam_PbBamifyVersion_h_config.set('PacBioBAM_TestsDir', join_paths([meson.current_source_dir(), '../tests']))
+pbbam_PbBamifyVersion_h_config.set('GeneratedTestDataDir', join_paths(meson.current_build_dir()))
+pbbam_PbBamifyVersion_h_config.set('GeneratedDir', join_paths(meson.current_build_dir(), '../tests'))
+pbbam_PbBamifyVersion_h = configure_file(  # generates PbBamifyVersion.h from its .in template
+  input : files('pbbamify/src/PbBamifyVersion.h.in'),
+  output : 'PbBamifyVersion.h',
+  configuration : pbbam_PbBamifyVersion_h_config)
+
+pbbam_pbbamify_cpp_sources = [pbbam_PbBamifyVersion_h]  # generated header first, then sources
+pbbam_pbbamify_cpp_sources += files([
+  'common/OptionParser.cpp',
+  'pbbamify/src/main.cpp',
+  'pbbamify/src/PbBamify.cpp',
+  'pbbamify/src/QueryLookup.cpp'
+  ])
+
+pbbam_pbbamify = executable(
+  'pbbamify',
+  pbbam_pbbamify_cpp_sources,
+  dependencies : [pbbam_boost_dep, pbbam_htslib_dep, pbbam_zlib_dep],
+  include_directories : [pbbam_include_directories, include_directories('pbbamify')],
+  link_with : pbbam_lib,
+  install : get_option('build-tools'),  # installed only when the 'build-tools' option is on
+  cpp_args : pbbam_warning_flags)
+
+
+# tests
+if get_option('tests')  # cram test is registered only when the 'tests' option is on
+  pbbam_test_samtools = find_program('samtools', required : true)  # configure fails if missing
+
+  pbbamify_t = configure_file(
+    input : pbbam_cram_pbbamify_t_in,
+    output : 'pbbamify.t',
+    configuration : pbbam_PbBamifyVersion_h_config)
+
+  test(
+    'pbbamify_CramTests',
+    pbbam_cram_script,
+    args : [
+      '--xunit-file=' + join_paths(meson.build_root(), 'pbbam-cram-pbbamify.xml'),
+      '--verbose'] + [
+        pbbamify_t],
+    env : [
+      'SAMTOOLS=' + pbbam_test_samtools.path()],  # located samtools path passed to the test
+    timeout : 1800)
+endif
diff --git a/tools/pbbamify/CMakeLists.txt b/tools/pbbamify/CMakeLists.txt

new file mode 100644 (file)

index 0000000..cc5e0b8
--- /dev/null
+++ b/tools/pbbamify/CMakeLists.txt
@@ -0,0 +1,40 @@
+
+# Location of the pbbamify sources, relative to the shared tools directory.
+set(PbbamifySrcDir ${PacBioBAM_ToolsDir}/pbbamify/src)
+
+# create version header
+# The tool reports the library version as its own. @ONLY limits substitution
+# to @VAR@ references, leaving any ${...} text in the template untouched.
+set(PbBamify_VERSION ${PacBioBAM_VERSION})
+configure_file(
+    ${PbbamifySrcDir}/PbBamifyVersion.h.in ${GeneratedDir}/PbBamifyVersion.h @ONLY
+)
+
+# list source files
+set(PBBAMIFY_SOURCES
+    ${ToolsCommonDir}/OptionParser.cpp
+    ${PbbamifySrcDir}/main.cpp
+    ${PbbamifySrcDir}/PbBamify.cpp
+    ${PbbamifySrcDir}/QueryLookup.cpp
+)
+
+# build pbbamify executable
+# NOTE(review): create_pbbam_tool is presumably provided by the PbbamTool
+# module included on the next line -- confirm in the project's cmake directory.
+include(PbbamTool)
+create_pbbam_tool(
+    TARGET  pbbamify
+    SOURCES ${PBBAMIFY_SOURCES}
+)
+
+# cram tests
+# Registered only when tests are being built AND permissive CIGAR handling is
+# enabled.
+if (PacBioBAM_build_tests AND PacBioBAM_permissive_cigar)
+
+    # NOTE(review): unlike the version header above, this configure_file call
+    # does not pass @ONLY, so ${VAR} references in the template are expanded by
+    # CMake as well -- confirm the template contains no shell-style ${...}
+    # text that must survive verbatim.
+    configure_file(
+        ${PacBioBAM_CramTestsDir}/pbbamify.t.in
+        ${GeneratedDir}/pbbamify.t
+    )
+
+    # Runs cram.py from the tests' scripts directory on the generated test
+    # file, using whichever "python" is found on the PATH.
+    add_test(
+        NAME pbbamify_CramTests
+        WORKING_DIRECTORY ${PacBioBAM_TestsDir}/scripts
+        COMMAND "python" cram.py
+            ${GeneratedDir}/pbbamify.t
+    )
+
+endif()
diff --git a/tools/pbbamify/src/PbBamify.cpp b/tools/pbbamify/src/PbBamify.cpp

new file mode 100644 (file)

index 0000000..4ee6f38
--- /dev/null
+++ b/tools/pbbamify/src/PbBamify.cpp
@@ -0,0 +1,399 @@
+// Author: Ivan Sovic
+
+#include "PbBamify.h"
+#include <pbbam/../../src/SequenceUtils.h>
+#include <pbbam/BamRecord.h>
+#include <pbbam/Cigar.h>
+#include <pbbam/MD5.h>
+#include <pbbam/PbiFilter.h>
+#include <pbbam/PbiFilterQuery.h>
+#include <pbbam/PbiFilterTypes.h>
+#include <algorithm>
+#include <ctime>
+#include <iostream>
+#include <istream>
+#include <ostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+namespace pbbamify {
+
+// Local copy of the helper in BamRecord.cpp, which is not publicly
+// accessible. Reports whether the lowest bit of htslib's bam_cigar_type
+// value is set for this operation type, i.e. whether the operation
+// advances the position in the query sequence.
+static inline bool ConsumesQuery(const CigarOperationType type)
+{
+    const auto opTraits = bam_cigar_type(static_cast<int>(type));
+    return (opTraits & 0x1) != 0;
+}
+
+// Local copy of the helper in BamRecord.cpp, which is not publicly
+// accessible. Reports whether the second-lowest bit of htslib's
+// bam_cigar_type value is set for this operation type, i.e. whether the
+// operation advances the position in the reference sequence.
+static inline bool ConsumesReference(const CigarOperationType type)
+{
+    const auto opTraits = bam_cigar_type(static_cast<int>(type));
+    return (opTraits & 0x2) != 0;
+}
+
+PacBio::BAM::BamHeader Pbbamify::ComposeHeader(const PacBio::BAM::DataSet& dataset,
+                                               PacBio::BAM::FastaReader& refReader,
+                                               const PacBio::BAM::BamReader& input)
+{
+
+    PacBio::BAM::BamHeader retHeader;
+    bool headerInitialized = false;
+
+    // Merge all the read groups and additional PacBio info.
+    const auto& bamFiles = dataset.BamFiles();
+    for (auto& bamFile : bamFiles) {
+        auto header = bamFile.Header();
+        if (!headerInitialized) {
+            retHeader = header.DeepCopy();
+            headerInitialized = true;
+        } else {
+            retHeader += header;
+        }
+    }
+
+    // Merge the alignment PG to the header.
+    auto inputHeader = input.Header();
+    for (auto& program : inputHeader.Programs()) {
+        retHeader.AddProgram(program);
+    }
+
+    // Add the sequence info to the header.
+    PacBio::BAM::FastaSequence record;
+    while (refReader.GetNext(record)) {
+        // Convert the sequence length to string,
+        // as required by SequenceInfo.
+        std::ostringstream ossLength;
+        ossLength << record.Bases().size();
+
+        // Clip on whitespace.
+        std::istringstream issHeader(record.Name());
+        std::string header;
+        issHeader >> header;
+
+        // Calculate the MD5 and append to retHeader.
+        PacBio::BAM::SequenceInfo seq(header, ossLength.str());
+        auto hash = PacBio::BAM::MD5Hash(record.Bases());
+        seq.Checksum(hash);
+        retHeader.AddSequence(seq);
+    }
+
+    return retHeader;
+}
+
+// Returns true when the first or the last CIGAR operation is a hard clip.
+// An empty CIGAR is never considered hard clipped.
+bool Pbbamify::IsHardClipped(const Cigar& cigarData)
+{
+    // front()/back() must not be called on an empty container.
+    if (cigarData.size() == 0) {
+        return false;
+    }
+
+    const bool clippedAtFront = (cigarData.front().Type() == CigarOperationType::HARD_CLIP);
+    return clippedAtFront || (cigarData.back().Type() == CigarOperationType::HARD_CLIP);
+}
+
+Cigar Pbbamify::ConvertHardToSoftClipping(const Cigar& cigarData)
+{
+    Cigar softCigar;
+
+    // If it's empty, just return.
+    if (cigarData.size() == 0) {
+        return softCigar;
+    }
+
+    CigarOperationType prevOp = CigarOperationType::UNKNOWN_OP;
+
+    for (const auto& cigar : cigarData) {
+        // Change H to S.
+        CigarOperationType op = (cigar.Type() == CigarOperationType::HARD_CLIP)
+                                    ? CigarOperationType::SOFT_CLIP
+                                    : cigar.Type();
+        auto len = cigar.Length();
+
+        // Merge or add.
+        if (softCigar.size() > 0 && op == prevOp) {
+            auto prevLen = softCigar.back().Length();
+            softCigar.back() = CigarOperation(op, len + prevLen);
+        } else {
+            softCigar.emplace_back(CigarOperation(op, len));
+        }
+
+        prevOp = op;
+    }
+
+    return softCigar;
+}
+
+size_t Pbbamify::SequenceLengthFromCigar(const Cigar& cigarData)
+{
+    size_t len = 0;
+
+    if (cigarData.size() == 0) {
+        return len;
+    }
+
+    for (const auto& cigar : cigarData) {
+        if (ConsumesQuery(cigar.Type()) || cigar.Type() == CigarOperationType::HARD_CLIP) {
+            len += cigar.Length();
+        }
+    }
+
+    return len;
+}
+
+// A CIGAR is "basic" when it contains at least one 'M' (ALIGNMENT_MATCH)
+// operation. The scan stops at the first one found.
+bool Pbbamify::CheckIsCigarBasic(const Cigar& cigarData)
+{
+    bool hasAlignmentMatch = false;
+    for (auto it = cigarData.begin(); it != cigarData.end() && !hasAlignmentMatch; ++it) {
+        hasAlignmentMatch = (it->Type() == CigarOperationType::ALIGNMENT_MATCH);
+    }
+    return hasAlignmentMatch;
+}
+
+/*
+ * Takes the pre-calculated cigarData object so that it's
+ * more efficient (it could always be obtained from the record
+ * at any time).
+*/
+Cigar Pbbamify::BasicToExtendedCigar(const PacBio::BAM::IndexedFastaReader& indexedRefReader,
+                                     const BamRecord& record, const Cigar& cigarData)
+{
+    Cigar extCigar;
+
+    std::string qseq = record.Impl().Sequence();
+    std::string rseq =
+        indexedRefReader.ReferenceSubsequence(record, Orientation::GENOMIC, false, false);
+
+    size_t qpos = 0, rpos = 0;  // The rpos should be 0 because the reference portion is yanked out.
+    for (const auto& cigar : cigarData) {
+        // This shouldn't happen, but let's keep it safe.
+        if (cigar.Length() == 0) {
+            continue;
+        }
+
+        if (cigar.Type() == CigarOperationType::ALIGNMENT_MATCH) {
+            // Decode the prev op.
+            CigarOperationType prevOp = (qseq[qpos] == rseq[rpos])
+                                            ? CigarOperationType::SEQUENCE_MATCH
+                                            : CigarOperationType::SEQUENCE_MISMATCH;
+            size_t prevCount = 0;
+
+            for (size_t i = 0; i < cigar.Length(); ++i) {
+                // Decode the new op.
+                CigarOperationType op = (qseq[qpos + i] == rseq[rpos + i])
+                                            ? CigarOperationType::SEQUENCE_MATCH
+                                            : CigarOperationType::SEQUENCE_MISMATCH;
+
+                if (op == prevOp) {
+                    ++prevCount;
+
+                } else {
+                    extCigar.emplace_back(CigarOperation(prevOp, prevCount));
+                    prevOp = op;
+                    prevCount = 1;
+                }
+            }
+
+            // Add the last operation.
+            extCigar.emplace_back(CigarOperation(prevOp, prevCount));
+
+        } else {
+            extCigar.emplace_back(cigar);
+        }
+
+        if (ConsumesQuery(cigar.Type())) {
+            qpos += cigar.Length();
+        }
+        if (ConsumesReference(cigar.Type())) {
+            rpos += cigar.Length();
+        }
+    }
+
+    return extCigar;
+}
+
+bool Pbbamify::AugmentAlignments(const std::shared_ptr<QueryLookup> queryLookup,
+                                 const PacBio::BAM::IndexedFastaReader& indexedRefReader,
+                                 PacBio::BAM::BamReader& input, PacBio::BAM::BamWriter& writer,
+                                 int32_t verboseLevel)
+{
+
+    // Clock is just for the verbose functionality.
+    clock_t timerStart = clock();
+
+    // Sets the frequency of the proof of life when
+    // processing larger input BAMs.
+    int32_t verboseFrequency =
+        (verboseLevel <= 2)
+            ? 1000000
+            : (verboseLevel == 3)
+                  ? 100000
+                  : (verboseLevel == 4)
+                        ? 10000
+                        : (verboseLevel == 5)
+                              ? 1000
+                              : (verboseLevel == 6) ? 100 : (verboseLevel == 7) ? 10 : 1;
+
+    // Counters for verbose output.
+    size_t numRecords = 0, numWithoutSeq = 0;
+
+    // Holder for the current record.
+    BamRecord record;
+
+    while (input.GetNext(record)) {
+        ++numRecords;
+
+        // Proof of life.
+        if (verboseLevel > 1 && (numRecords % verboseFrequency) == 0) {
+            double elapsedTime =
+                static_cast<double>(clock() - timerStart) / (60.0 * CLOCKS_PER_SEC);
+            elapsedTime = static_cast<int64_t>(elapsedTime * 100.0) / 100.0;
+            std::cerr << "[INFO] Processed " << numRecords << " alignments in " << elapsedTime
+                      << " min." << std::endl;
+        }
+
+        // Some mappers do not output sequences for secondary alignments.
+        if (record.Impl().SequenceLength() == 0) {
+            ++numWithoutSeq;
+            continue;
+        }
+
+        // Update the BAM record with additional data from the PacBio dataset.
+        int rv = AugmentAlignment(queryLookup, indexedRefReader, record, verboseLevel);
+
+        // In case of failure, skip the alignment. Failures should be reported by AugmentAlignment.
+        if (rv == false) {
+            continue;
+        }
+
+        // Finally, write the output.
+        writer.Write(record);
+    }
+
+    if (verboseLevel > 0 && numWithoutSeq) {
+        std::cerr << "[Warning] Found " << numWithoutSeq
+                  << " alignments without a seq field which were not converted (most likely "
+                     "secondary alignments)."
+                  << std::endl;
+    }
+
+    if (verboseLevel > 1) {
+        double elapsedTime = static_cast<double>(clock() - timerStart) / (60.0 * CLOCKS_PER_SEC);
+        elapsedTime = static_cast<int64_t>(elapsedTime * 100.0) / 100.0;
+        std::cerr << "[INFO] Done processing " << numRecords << " alignments in " << elapsedTime
+                  << " min." << std::endl;
+    }
+
+    return true;
+}
+
+bool Pbbamify::AugmentAlignment(const std::shared_ptr<QueryLookup> queryLookup,
+                                const PacBio::BAM::IndexedFastaReader& indexedRefReader,
+                                BamRecord& record, int32_t verboseLevel)
+{
+
+    // Find the BAM record in the original PacBio dataset.
+    BamRecord datasetRecord;
+    bool isFound = queryLookup->Find(record.FullName(), datasetRecord);
+    if (isFound == 0) {
+        if (verboseLevel > 0) {
+            std::cerr << "[Warning] No records found for query '" << record.FullName()
+                      << "'. Skipping." << std::endl;
+        }
+        return false;
+    }
+
+    // If it's not mapped, just output the original.
+    if (record.IsMapped() == false) {
+        record = datasetRecord;
+        return true;
+    }
+
+    // Keep the cigar object since we'll reuse it. More efficient.
+    auto cigar = record.Impl().CigarData();
+
+    // Sanity check that the mapper did not produce something funky.
+    size_t recordSeqLen = SequenceLengthFromCigar(cigar);
+    if (recordSeqLen != datasetRecord.Impl().SequenceLength()) {
+        if (verboseLevel > 0) {
+            std::cerr << "[Warning] Sequence '" << record.FullName() << "' (length " << recordSeqLen
+                      << ") is not of the same length as the PacBio BAM sequence (length "
+                      << datasetRecord.Impl().SequenceLength() << ")! Skipping." << std::endl;
+        }
+        return false;
+    }
+
+    // Update the CIGAR only if necessary.
+    if (CheckIsCigarBasic(cigar)) {
+        cigar = BasicToExtendedCigar(indexedRefReader, record, cigar);
+        record.Impl().CigarData(cigar);
+    }
+
+    // Stomp over any existing tags with matching IDs and add those
+    // which do not yet exist in the aligned BAM. We consider the PacBio
+    // dataset to be the correct answer to any of these. The rest are
+    // produced by a mapper.
+    // For example, BLASR will generate a RG tag even if the input was FASTA.
+    for (auto& tag : datasetRecord.Impl().Tags()) {
+        if (record.Impl().Tags().Contains(tag.first)) {
+            record.Impl().EditTag(tag.first, tag.second);
+        } else {
+            record.Impl().AddTag(tag.first, tag.second);
+        }
+    }
+
+    // Some downstream tools might not work well with the
+    // "undefined" mapping quality value of 255. Here
+    // we set it to a valid arbitrary value.
+    if (record.Impl().MapQuality() == 255) {
+        record.Impl().MapQuality(254);
+    }
+
+    // If the alignment has hard clipping, simply take both the seq and
+    // qual fields from the dataset. This will stomp over any custom
+    // qual values in the input BAM file.
+    if (IsHardClipped(cigar)) {
+        // Take the seq and qual fields from the dataset to override
+        // any hard clippings induced by the mapper.
+        std::string qseq = datasetRecord.Impl().Sequence();
+        std::string quals = datasetRecord.Impl().Qualities().Fastq();
+
+        // Reverse if needed.
+        if (record.Impl().IsReverseStrand()) {
+            PacBio::BAM::internal::ReverseComplement(qseq);
+            std::reverse(quals.begin(), quals.end());
+        }
+
+        // PacBio datasets, when converted to SAM, contain '!' ASCII QVs.
+        // In case QVs aren't provided otherwise, this block adds the '!' values.
+        if (quals.size() == 0) {
+            quals = std::string(qseq.size(), '!');
+        }
+
+        // Replace the seq and qual fields.
+        record.Impl().SetSequenceAndQualities(qseq, quals);
+
+        cigar = ConvertHardToSoftClipping(cigar);
+        record.Impl().CigarData(cigar);
+
+    } else {
+        // PacBio datasets, when converted to SAM, contain '!' ASCII QVs.
+        // In case QVs aren't provided otherwise, this block adds the '!' values.
+        if (record.Impl().Qualities().size() == 0) {
+            std::string qseq = record.Impl().Sequence();
+            std::string quals = std::string(qseq.size(), '!');
+            record.Impl().SetSequenceAndQualities(qseq, quals);
+        }
+    }
+
+    return true;
+}
+
+}  // namespace pbbamify
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/tools/pbbamify/src/PbBamify.h b/tools/pbbamify/src/PbBamify.h

new file mode 100644 (file)

index 0000000..f4e835a
--- /dev/null
+++ b/tools/pbbamify/src/PbBamify.h
@@ -0,0 +1,94 @@
+// Author: Ivan Sovic
+
+#ifndef SRC_PBBAMIFY_H_
+#define SRC_PBBAMIFY_H_
+
+#include <pbbam/BamReader.h>
+#include <pbbam/BamWriter.h>
+#include <pbbam/DataSet.h>
+#include <pbbam/FastaReader.h>
+#include <pbbam/IndexedFastaReader.h>
+#include <cstdint>
+#include "QueryLookup.h"
+
+namespace PacBio {
+namespace BAM {
+namespace pbbamify {
+
+/// \brief Takes a PacBio dataset, a reference file and an arbitrary aligned input BAM. Produces a new
+///        PacBio-compatible aligned BAM. All members are static; the class only groups the functions.
+///
+/// \throws std::runtime_error if any errors are encountered while reading or writing
+///
+class Pbbamify
+{
+public:
+    /// \brief Merges all the headers from the dataset and the input, adds the SQ fields with lengths and MD5 checksums.
+    ///
+    /// \param[in] dataset    PacBio dataset whose BAM headers are merged.
+    /// \param[in] refReader  sequential reference reader; it is read to its end by this call.
+    /// \param[in] input      aligned input BAM whose program entries are copied.
+    ///
+    /// \returns A BAM header which is composed of: merged headers from BAMs in the dataset, ProgramInfo from the input
+    ///          BAM, and SQ lines formed from the refReader object (together with their length and MD5 checksum).
+    static PacBio::BAM::BamHeader ComposeHeader(const PacBio::BAM::DataSet& dataset,
+                                                PacBio::BAM::FastaReader& refReader,
+                                                const PacBio::BAM::BamReader& input);
+
+    /// \brief Converts a set of generic BAM records into a PacBio compatible BAM by calling AugmentAlignment for
+    ///        each BAM record in the input BAM file and writing the successfully augmented records to `writer`.
+    ///        Records without a sequence, and records for which AugmentAlignment fails, are skipped.
+    ///
+    /// \param[in] verboseLevel  0 silences all messages, values above 0 enable warnings, values above 1
+    ///                          additionally enable progress messages on stderr.
+    ///
+    /// \returns true; the current implementation has no failure path of its own.
+    ///
+    static bool AugmentAlignments(const std::shared_ptr<QueryLookup> queryLookup,
+                                  const PacBio::BAM::IndexedFastaReader& indexedRefReader,
+                                  PacBio::BAM::BamReader& input, PacBio::BAM::BamWriter& writer,
+                                  int32_t verboseLevel);
+
+    /// \brief Converts a generic BAM record into a PacBio compatible BAM by: adding tags from the PacBio
+    ///        dataset (overwriting same-named tags, including the read group), restoring hard-clipped bases
+    ///        from the dataset, converting the CIGAR from basic to extended format if needed, changing the
+    ///        mapq from 255 to another value to avoid potential downstream issues, etc. If the record is not
+    ///        mapped, the original record from the dataset is assigned to `record`.
+    ///
+    /// \returns true if the record was successfully augmented, false otherwise.
+    ///
+    static bool AugmentAlignment(const std::shared_ptr<QueryLookup> queryLookup,
+                                 const PacBio::BAM::IndexedFastaReader& indexedRefReader,
+                                 BamRecord& record, int32_t verboseLevel);
+
+    /// \brief Checks whether the alignment was hard clipped.
+    ///
+    /// \returns true if the front or back CIGAR op is 'H', false otherwise (including for an empty CIGAR).
+    ///
+    static bool IsHardClipped(const Cigar& cigarData);
+
+    /// \brief Turns the hard clipping operations of the CIGAR into soft clips and merges them
+    ///        with any neighbouring operation of the same type, such as an existing soft clip.
+    ///
+    /// \returns a new CIGAR string with only soft clipped bases.
+    ///
+    static Cigar ConvertHardToSoftClipping(const Cigar& cigarData);
+
+    /// \brief Calculates the total sequence length from CIGAR (including clipping), and not just the aligned length.
+    ///        This is used for sanity checking the input BAM records.
+    ///
+    /// \returns The length of the query sequence calculated from the CIGAR string.
+    ///
+    static size_t SequenceLengthFromCigar(const Cigar& cigarData);
+
+    /// \brief Linear pass over the Cigar operations to see if there are any 'M' ops.
+    ///
+    /// \returns true if there are 'M' operations in the CIGAR object.
+    ///
+    static bool CheckIsCigarBasic(const Cigar& cigarData);
+
+    /// \brief Takes the indexed reference reader and a BAM record, and creates a new Cigar object with
+    ///        extended CIGAR operations ('=' and 'X' instead of 'M').
+    ///
+    /// \param[in] cigarData  the CIGAR of `record`, passed in so that it does not have to be fetched again.
+    ///
+    /// \returns A new Cigar object with '=' and 'X' operations instead of 'M's.
+    static Cigar BasicToExtendedCigar(const PacBio::BAM::IndexedFastaReader& indexedRefReader,
+                                      const BamRecord& record, const Cigar& cigarData);
+};
+
+}  // namespace pbbamify
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif
diff --git a/tools/pbbamify/src/PbBamifyVersion.h.in b/tools/pbbamify/src/PbBamifyVersion.h.in

new file mode 100644 (file)

index 0000000..985641d
--- /dev/null
+++ b/tools/pbbamify/src/PbBamifyVersion.h.in
@@ -0,0 +1,18 @@
+// Author: Ivan Sovic
+
+#ifndef PBBAMIFYVERSION_H
+#define PBBAMIFYVERSION_H
+
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+namespace pbbamify {
+
+// Version string of the tool; the placeholder is replaced when the build
+// system generates the header from this template.
+const std::string Version("@PbBamify_VERSION@");
+
+} // namespace pbbamify
+} // namespace BAM
+} // namespace PacBio
+
+#endif // PBBAMIFYVERSION_H
diff --git a/tools/pbbamify/src/QueryLookup.cpp b/tools/pbbamify/src/QueryLookup.cpp

new file mode 100644 (file)

index 0000000..894c68e
--- /dev/null
+++ b/tools/pbbamify/src/QueryLookup.cpp
@@ -0,0 +1,118 @@
+// Author: Ivan Sovic
+
+#include "QueryLookup.h"
+
+#include <pbbam/PbiRawData.h>
+#include <iostream>
+#include <ostream>
+#include <string>
+
+namespace PacBio {
+namespace BAM {
+namespace pbbamify {
+
+// Factory for QueryLookup. The constructor is private and this function is a
+// friend of the class, which is why the object is created with a plain `new`
+// here rather than through a generic helper.
+std::unique_ptr<QueryLookup> CreateQueryLookup(const PacBio::BAM::DataSet& dataset)
+{
+    std::unique_ptr<QueryLookup> lookup(new QueryLookup(dataset));
+    return lookup;
+}
+
+// Only binds the reference to the dataset; readers and the lookup table are
+// set up later by Load().
+QueryLookup::QueryLookup(const PacBio::BAM::DataSet& dataset)
+    : dataset_(dataset)
+{
+}
+
+void QueryLookup::Load()
+{
+    std::vector<BamFile> bamFiles(dataset_.BamFiles());
+
+    // Merge all the read groups for a unified read group lookup.
+    PacBio::BAM::BamHeader jointHeader;
+    bool headerInitialized = false;
+    for (auto& bamFile : bamFiles) {
+        auto header = bamFile.Header();
+        if (!headerInitialized) {
+            jointHeader = header.DeepCopy();
+            headerInitialized = true;
+        } else {
+            jointHeader += header;
+        }
+    }
+
+    // Set-up a vector of readers for each BAM in the PacBio dataset
+    // to allow for random access.
+    readers_.clear();
+    for (auto& file : bamFiles) {
+        auto new_reader = std::make_shared<BamReader>(file);
+        readers_.push_back(new_reader);
+    }
+
+    // Get the PacBio index.
+    PacBio::BAM::PbiRawData pbi(dataset_);
+    const auto& basicData = pbi.BasicData();
+
+    // Clear everything just in case the user called Load() twice.
+    lookup_.clear();
+
+    // Process each read in the dataset and reconstruct it's original
+    // qname. Place the read in the lookup, together with the ID
+    // of the source BAM file and the virtual file offset where
+    // the read is located.
+    for (size_t i = 0; i < pbi.NumReads(); ++i) {
+        const auto zmw = basicData.holeNumber_.at(i);
+        const auto qStart = basicData.qStart_.at(i);
+        const auto qEnd = basicData.qEnd_.at(i);
+        const auto& rgId = basicData.rgId_.at(i);
+        auto fileNumber = basicData.fileNumber_.at(i);
+        auto fileOffset = basicData.fileOffset_.at(i);
+
+        auto rgString = PacBio::BAM::ReadGroupInfo::IntToId(rgId);
+        auto rgInfo = jointHeader.ReadGroup(rgString);
+        auto type = rgInfo.ReadType();
+        std::string movieName = rgInfo.MovieName();
+
+        std::transform(type.begin(), type.end(), type.begin(), ::tolower);
+
+        std::string qName;
+        if (type == std::string("subread")) {
+            std::ostringstream oss;
+            oss << movieName << '/' << zmw << '/' << qStart << '_' << qEnd;
+            qName = oss.str();
+        } else if (type == std::string("ccs")) {
+            std::ostringstream oss;
+            oss << movieName << '/' << zmw << '/' << "ccs";
+            qName = oss.str();
+        } else {
+            std::string message =
+                std::string("Unknown read group type '") + type + std::string("'.");
+            throw std::runtime_error(message);
+        }
+
+        // Sanity check.
+        auto it = lookup_.find(qName);
+        if (it != lookup_.end()) {
+            std::string message = std::string("More than 1 occurrence of qname '") + qName +
+                                  std::string("'. Duplicate reads in the dataset?");
+            throw std::runtime_error(message);
+        }
+
+        lookup_[qName] = QueryLocation(fileNumber, fileOffset);
+    }
+}
+
+// Looks up qName and, when it is known, seeks the reader of the owning file
+// to the stored offset and reads the record found there into `record`.
+// Returns false when the qname is unknown or when no record could be read.
+bool QueryLookup::Find(const std::string& qName, BamRecord& record) const
+{
+    const auto entry = lookup_.find(qName);
+    if (entry == lookup_.end()) {
+        return false;
+    }
+
+    const QueryLocation& location = entry->second;
+    const auto& reader = readers_.at(location.fileNumber);
+
+    reader->VirtualSeek(location.fileOffset);
+    if (reader->GetNext(record)) {
+        return true;
+    }
+    return false;
+}
+
+}  // namespace pbbamify
+}  // namespace BAM
+}  // namespace PacBio
diff --git a/tools/pbbamify/src/QueryLookup.h b/tools/pbbamify/src/QueryLookup.h

new file mode 100644 (file)

index 0000000..a37eb7f
--- /dev/null
+++ b/tools/pbbamify/src/QueryLookup.h
@@ -0,0 +1,81 @@
+// Author: Ivan Sovic
+
+#ifndef PBBAMIFY_SRC_QUERY_LOOKUP_H_
+#define PBBAMIFY_SRC_QUERY_LOOKUP_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <pbbam/BamReader.h>
+#include <pbbam/DataSet.h>
+
+namespace PacBio {
+namespace BAM {
+namespace pbbamify {
+
+class QueryLocation;
+class QueryLookup;
+
+/// \brief A factory function for the QueryLookup objects.
+///
+/// \returns A new QueryLookup object wrapped in a unique_ptr.
+///
+std::unique_ptr<QueryLookup> CreateQueryLookup(const PacBio::BAM::DataSet& dataset);
+
+/// \brief Plain value type that records where a read is stored: the number of
+///        the file holding it and the offset of the read within that file.
+///        A default-constructed location has both fields set to zero.
+class QueryLocation
+{
+public:
+    QueryLocation() = default;
+    QueryLocation(uint16_t _fileNumber, int64_t _fileOffset)
+        : fileNumber(_fileNumber), fileOffset(_fileOffset)
+    {
+    }
+
+    uint16_t fileNumber = 0;
+    int64_t fileOffset = 0;
+};
+
+/// \brief QueryLookup parses all reads from PacBio indexes and creates a
+///        hash lookup where the key is the read's qname, and the value is a
+///        QueryLocation object pointing to the exact location of the read. The BAM
+///        record can then be loaded by setting the virtual offset and calling GetNext().
+///
+///        Instances are created through CreateQueryLookup() only (the constructor
+///        is private) and cannot be copied. The object keeps a reference to the
+///        dataset it was created from, so that dataset must outlive the lookup.
+class QueryLookup
+{
+public:
+    // The factory needs access to the private constructor.
+    friend std::unique_ptr<QueryLookup> CreateQueryLookup(const PacBio::BAM::DataSet& dataset);
+
+    ~QueryLookup() = default;
+
+    /// \brief  Load() performs the work of setting up the BamReaders and constructing
+    ///         the hash table lookup. It must be called before Find() can return anything.
+    ///
+    /// \throws std::runtime_error if there is more than 1 record for a given qname, or
+    ///         if a read group has a type other than "subread" or "ccs".
+    void Load();
+
+    /// \brief Find(...) attempts to find a given qName in the lookup and return
+    ///        the related BAM record. If it cannot be found, the function returns false.
+    ///        Although declared const, the call repositions the reader of the file
+    ///        that holds the record.
+    ///
+    /// \returns true if the record was found and loaded, false otherwise.
+    bool Find(const std::string& qName, BamRecord& record) const;
+
+private:
+    // Non-copyable.
+    QueryLookup(const QueryLookup&) = delete;
+    QueryLookup& operator=(const QueryLookup&) = delete;
+
+    /// \brief The constructor simply initializes a private reference to the dataset. No work is performed here.
+    QueryLookup(const PacBio::BAM::DataSet& dataset);
+
+    // Dataset to index; held by reference, not owned.
+    const PacBio::BAM::DataSet& dataset_;
+    // One reader per BAM file of the dataset, indexed by QueryLocation::fileNumber.
+    std::vector<std::shared_ptr<PacBio::BAM::BamReader>> readers_;
+    // qname -> location of the corresponding record.
+    std::unordered_map<std::string, QueryLocation> lookup_;
+};
+
+}  // namespace pbbamify
+}  // namespace BAM
+}  // namespace PacBio
+
+#endif
diff --git a/tools/pbbamify/src/main.cpp b/tools/pbbamify/src/main.cpp

new file mode 100644 (file)

index 0000000..23c5eac
--- /dev/null
+++ b/tools/pbbamify/src/main.cpp
@@ -0,0 +1,176 @@
+// Author: Ivan Sovic (based on code from Derek Barnett)
+
+#include <cassert>
+#include <iostream>
+#include <istream>
+
+#include <pbbam/BamReader.h>
+#include <pbbam/BamWriter.h>
+#include <pbbam/DataSet.h>
+#include <pbbam/FastaReader.h>
+#include "../common/OptionParser.h"
+#include "PbBamify.h"
+#include "PbBamifyVersion.h"
+#include "QueryLookup.h"
+
+namespace PacBio {
+namespace BAM {
+namespace pbbamify {
+
+class Settings
+{
+public:
+    static Settings FromCommandLine(optparse::OptionParser& parser, int argc, char* argv[])
+    {
+        pbbamify::Settings settings;
+        const optparse::Values options = parser.parse_args(argc, argv);
+
+        // Pbbam file for BAM tags.
+        const std::vector<std::string> positionalArgs = parser.args();
+        if (positionalArgs.size() != 2) {
+            settings.errors_.push_back("Exactly two positional arguments must be specified.");
+        } else {
+            settings.referenceFilename_ = positionalArgs[0];
+            settings.pbbamFilename_ = positionalArgs[1];
+        }
+
+        // Input generic BAM file to turn into Pbbam. Optional, so
+        // that BAM records can be piped in and conversion made
+        // on the fly.
+        if (options.is_set("input")) {
+            settings.inputFilename_ = options["input"];
+        } else {
+            settings.inputFilename_ = "-";
+        }
+
+        // If not specified, output is to stdout.
+        if (options.is_set("output")) {
+            settings.outputFilename_ = options["output"];
+        } else {
+            settings.outputFilename_ = "-";
+        }
+
+        // Info messages can be written to stderr if verbose_level > 0.
+        std::istringstream iss(std::string{options["verbose_level"]});
+        iss >> settings.verboseLevel_;
+        if (settings.verboseLevel_ < 0) {
+            settings.verboseLevel_ = 0;
+        }
+
+        // Disable validation of CIGARs that might contain 'M'
+        CigarOperation::validate_ = false;
+
+        return settings;
+    }
+
+public:
+    std::string inputFilename_;
+    std::string outputFilename_;
+    std::string referenceFilename_;
+    std::string pbbamFilename_;
+    std::vector<std::string> errors_;
+    int32_t verboseLevel_;
+
+private:
+    Settings() {}
+};
+
+}  // namespace pbbamify
+}  // namespace BAM
+}  // namespace PacBio
+
+int main(int argc, char* argv[])
+{
+    // Setup help & options
+    optparse::OptionParser parser;
+    parser.description(
+        "pbbamify converts an arbitray aligned BAM file to a PacBio-compatible BAM file."
+        "Input BAM file is read from a file or stdin, the raw-reads PacBio BAM is given"
+        "as a parameter, and BAM output is written to stdout.");
+    parser.prog("pbbamify");
+    parser.usage("pbbamify [options] <ref.fa> <pb.bam>|<pb.fofn>|<pb.xml>");
+    parser.version(PacBio::BAM::pbbamify::Version);
+    parser.add_version_option(true);
+    parser.add_help_option(true);
+
+    parser.set_defaults("verbose_level", "3");
+
+    auto optionGroup = optparse::OptionGroup(parser, "Options");
+    optionGroup.add_option("").dest("ref").help("Reference used to align the input.");
+    optionGroup.add_option("--input").dest("input").metavar("STR").help(
+        "The aligned non-PacBio BAM file. If not provided, stdin will be used as input.");
+    optionGroup.add_option("--output")
+        .dest("output")
+        .metavar("STR")
+        .help("Path to the output BAM file. If not specified, output will be to the stdout.");
+    optionGroup.add_option("--verbose-level")
+        .dest("verbose_level")
+        .type("int")
+        .metavar("INT")
+        .set_default("3")
+        .help(
+            "Specifies the level of info which will be output produced on"
+            "stderr. 0 turns all output off, 1 outputs only warnings, "
+            "while levels 2 and above outputs a status message every "
+            "1000000 (2), 100000 (3), 1000 (4), 100 (5), 10 (6) and 1 (7) reads.");
+    optionGroup.add_option("").dest("pbbam").help("A PacBio BAM file containing raw reads.");
+    // A Pbbam can be one of the following:
+    // - DataSetXML
+    // - FOFN
+    // - BAM
+    parser.add_option_group(optionGroup);
+
+    // Parse command line for settingas.
+    const PacBio::BAM::pbbamify::Settings settings =
+        PacBio::BAM::pbbamify::Settings::FromCommandLine(parser, argc, argv);
+    if (!settings.errors_.empty()) {
+        std::cerr << std::endl;
+        for (const auto& e : settings.errors_) {
+            std::cerr << "ERROR: " << e << std::endl;
+        }
+        std::cerr << std::endl;
+        parser.print_help();
+        return EXIT_FAILURE;
+    }
+
+    // Run the tool.
+    try {
+        // setup our @PG entry to add to header
+        PacBio::BAM::ProgramInfo pbbamifyProgram;
+        pbbamifyProgram.Id(std::string("pbbamify-") + PacBio::BAM::pbbamify::Version)
+            .Name("pbbamify")
+            .Version(PacBio::BAM::pbbamify::Version);
+
+        PacBio::BAM::DataSet dataset = PacBio::BAM::DataSet(settings.pbbamFilename_);
+        PacBio::BAM::BamReader inputBamReader(settings.inputFilename_);
+        PacBio::BAM::BamHeader newHeader;
+
+        {  // A separate block to close the reference file after the header is formed.
+            // Using a sequential reader to construct the header SN lines in order, fast.
+            PacBio::BAM::FastaReader ref_reader(settings.referenceFilename_);
+            newHeader =
+                PacBio::BAM::pbbamify::Pbbamify::ComposeHeader(dataset, ref_reader, inputBamReader);
+        }
+
+        std::shared_ptr<PacBio::BAM::pbbamify::QueryLookup> queryLookup =
+            PacBio::BAM::pbbamify::CreateQueryLookup(dataset);
+        queryLookup->Load();
+
+        {  // A block is used here to close the bamWriter and the reference reader.
+            // (Even though this will be done as soon as the 'try' block ends, this safeguards if any
+            // code should be added in between at some point.)
+            PacBio::BAM::IndexedFastaReader indexedRefReader(settings.referenceFilename_);
+            PacBio::BAM::BamWriter bamWriter(settings.outputFilename_, newHeader);
+            bool augment_rv = PacBio::BAM::pbbamify::Pbbamify::AugmentAlignments(
+                queryLookup, indexedRefReader, inputBamReader, bamWriter, settings.verboseLevel_);
+            if (augment_rv == false) {
+                return EXIT_FAILURE;
+            }
+        }
+
+        return EXIT_SUCCESS;
+    } catch (std::exception& e) {
+        std::cerr << "ERROR: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/tools/pbindex/CMakeLists.txt b/tools/pbindex/CMakeLists.txt

new file mode 100644 (file)

index 0000000..0bfcf33
--- /dev/null
+++ b/tools/pbindex/CMakeLists.txt
@@ -0,0 +1,22 @@
+
+set(PbindexSrcDir ${PacBioBAM_ToolsDir}/pbindex/src)
+
+# create version header: expands @PbIndex_VERSION@ in the template (@ONLY = only @VAR@ syntax)
+set(PbIndex_VERSION ${PacBioBAM_VERSION})
+configure_file(
+    ${PbindexSrcDir}/PbIndexVersion.h.in ${GeneratedDir}/PbIndexVersion.h @ONLY
+)
+
+# list source files (OptionParser.cpp comes from ToolsCommonDir, not from the pbindex sources)
+set(PBINDEX_SOURCES
+    ${ToolsCommonDir}/OptionParser.cpp
+    ${PbindexSrcDir}/main.cpp
+    ${PbindexSrcDir}/PbIndex.cpp
+)
+
+# build pbindex executable through the create_pbbam_tool() helper
+include(PbbamTool)
+create_pbbam_tool(
+    TARGET  pbindex
+    SOURCES ${PBINDEX_SOURCES}
+)
diff --git a/tools/pbindex/src/PbIndex.cpp b/tools/pbindex/src/PbIndex.cpp

new file mode 100644 (file)

index 0000000..821bf7f
--- /dev/null
+++ b/tools/pbindex/src/PbIndex.cpp
@@ -0,0 +1,40 @@
+// Author: Derek Barnett
+
+#include "PbIndex.h"
+
+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+#include <stdexcept>
+
+#include <pbbam/BamFile.h>
+#include <pbbam/PbiRawData.h>
+
+using namespace pbindex;
+
+Settings::Settings() : printPbiContents_(false) {}
+
+int PbIndex::Create(const Settings& settings)  // creates the PacBio index for the input BAM
+{
+    try {
+        PacBio::BAM::BamFile bamFile(settings.inputBamFilename_);
+        bamFile.CreatePacBioIndex();
+        return EXIT_SUCCESS;
+    } catch (const std::exception& e) {  // any std::exception is reported, not only runtime_error
+        std::cerr << "pbindex ERROR: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
+
+//int PbIndex::Print(const Settings& settings)
+//{
+
+//}
+
+int PbIndex::Run(const Settings& settings)
+{
+    //    if (settings.printPbiContents_)
+    //        return Print(settings);
+    //    else
+    return Create(settings);
+}
diff --git a/tools/pbindex/src/PbIndex.h b/tools/pbindex/src/PbIndex.h

new file mode 100644 (file)

index 0000000..429fad8
--- /dev/null
+++ b/tools/pbindex/src/PbIndex.h
@@ -0,0 +1,35 @@
+// Author: Derek Barnett
+
+#ifndef PBINDEX_H
+#define PBINDEX_H
+
+#include <string>
+#include <vector>
+
+namespace pbindex {
+
+class Settings
+{
+public:
+    Settings();
+
+public:
+    // tool state; inputBamFilename_ and errors_ are filled from the command line in main.cpp
+    std::string inputBamFilename_;
+    bool printPbiContents_;
+    std::vector<std::string> errors_;
+};
+
+class PbIndex
+{
+public:
+    static int Run(const Settings& settings);
+
+private:
+    static int Create(const Settings& settings);
+    //    static int Print(const Settings& settings);
+};
+
+}  // namespace pbindex
+
+#endif  // PBINDEX_H
diff --git a/tools/pbindex/src/PbIndexVersion.h.in b/tools/pbindex/src/PbIndexVersion.h.in

new file mode 100644 (file)

index 0000000..4e20df4
--- /dev/null
+++ b/tools/pbindex/src/PbIndexVersion.h.in
@@ -0,0 +1,14 @@
+// Author: Derek Barnett
+
+#ifndef PBINDEXVERSION_H
+#define PBINDEXVERSION_H
+
+#include <string>
+
+namespace pbindex {
+
+const std::string Version = std::string("@PbIndex_VERSION@");
+
+} // namespace pbindex
+
+#endif // PBINDEXVERSION_H
diff --git a/tools/pbindex/src/main.cpp b/tools/pbindex/src/main.cpp

new file mode 100644 (file)

index 0000000..ac16d00
--- /dev/null
+++ b/tools/pbindex/src/main.cpp
@@ -0,0 +1,66 @@
+// Author: Derek Barnett
+
+#include <cassert>
+#include <cstddef>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "../common/OptionParser.h"
+#include "PbIndex.h"
+#include "PbIndexVersion.h"
+
+static pbindex::Settings fromCommandLine(optparse::OptionParser& parser, int argc, char* argv[])
+{
+    const optparse::Values options = parser.parse_args(argc, argv);
+    //    ()options;
+
+    pbindex::Settings settings;
+
+    // get input filename
+    const std::vector<std::string> positionalArgs = parser.args();
+    const size_t numPositionalArgs = positionalArgs.size();
+    if (numPositionalArgs == 0)
+        settings.errors_.push_back("pbindex requires an input BAM filename");
+    else if (numPositionalArgs == 1)
+        settings.inputBamFilename_ = parser.args().front();
+    else {
+        assert(numPositionalArgs > 1);
+        settings.errors_.push_back("pbindex does not support more than one input file per run");
+    }
+
+    return settings;
+}
+
+int main(int argc, char* argv[])
+{
+    // setup help & options
+    optparse::OptionParser parser;
+    parser.description(
+        "pbindex creates a index file that enables random-access to PacBio-specific data in BAM "
+        "files. "
+        "Generated index filename will be the same as input BAM plus .pbi suffix.");
+    parser.prog("pbindex");
+    parser.usage("pbindex <input>");
+    parser.version(pbindex::Version);
+    parser.add_version_option(true);
+    parser.add_help_option(true);
+
+    auto ioGroup = optparse::OptionGroup(parser, "Input/Output");
+    ioGroup.add_option("").dest("input").metavar("input").help("Input BAM file");
+    parser.add_option_group(ioGroup);
+
+    // parse command line for settings
+    const pbindex::Settings settings = fromCommandLine(parser, argc, argv);
+    if (!settings.errors_.empty()) {
+        std::cerr << std::endl;
+        for (const auto e : settings.errors_)
+            std::cerr << "ERROR: " << e << std::endl;
+        std::cerr << std::endl;
+        parser.print_help();
+        return EXIT_FAILURE;
+    }
+
+    // run tool
+    return pbindex::PbIndex::Run(settings);
+}
diff --git a/tools/pbindexdump/CMakeLists.txt b/tools/pbindexdump/CMakeLists.txt

new file mode 100644 (file)

index 0000000..88c07b9
--- /dev/null
+++ b/tools/pbindexdump/CMakeLists.txt
@@ -0,0 +1,47 @@
+
+set(PbindexdumpSrcDir ${PacBioBAM_ToolsDir}/pbindexdump/src)
+
+# create version header: expands @PbIndexDump_VERSION@ in the template (@ONLY = only @VAR@ syntax)
+set(PbIndexDump_VERSION ${PacBioBAM_VERSION})
+configure_file(
+    ${PbindexdumpSrcDir}/PbIndexDumpVersion.h.in ${GeneratedDir}/PbIndexDumpVersion.h @ONLY
+)
+
+# list source files
+set(PBINDEXDUMP_SOURCES
+    ${ToolsCommonDir}/OptionParser.cpp
+    ${PbindexdumpSrcDir}/CppFormatter.cpp
+    ${PbindexdumpSrcDir}/JsonFormatter.cpp
+    ${PbindexdumpSrcDir}/PbIndexDump.cpp
+    ${PbindexdumpSrcDir}/main.cpp
+)
+
+# build pbindexdump executable through the create_pbbam_tool() helper
+include(PbbamTool)
+create_pbbam_tool(
+    TARGET  pbindexdump
+    SOURCES ${PBINDEXDUMP_SOURCES}
+)
+
+# cram tests: registered only when PacBioBAM_build_tests is enabled
+if (PacBioBAM_build_tests)
+    # generate the .t scripts from their templates, then pass both to cram.py in one test
+    configure_file(
+        ${PacBioBAM_CramTestsDir}/pbindexdump_json.t.in
+        ${GeneratedDir}/pbindexdump_json.t
+    )
+
+    configure_file(
+        ${PacBioBAM_CramTestsDir}/pbindexdump_cpp.t.in
+        ${GeneratedDir}/pbindexdump_cpp.t
+    )
+
+    add_test(
+        NAME pbindexdump_CramTests
+        WORKING_DIRECTORY ${PacBioBAM_TestsDir}/scripts
+        COMMAND "python" cram.py
+            ${GeneratedDir}/pbindexdump_json.t
+            ${GeneratedDir}/pbindexdump_cpp.t
+    )
+
+endif()
diff --git a/tools/pbindexdump/src/CppFormatter.cpp b/tools/pbindexdump/src/CppFormatter.cpp

new file mode 100644 (file)

index 0000000..d4525dc
--- /dev/null
+++ b/tools/pbindexdump/src/CppFormatter.cpp
@@ -0,0 +1,156 @@
+// Author: Derek Barnett
+
+#include "CppFormatter.h"
+#include <pbbam/PbiRawData.h>
+
+#include <cstdint>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+using namespace pbindexdump;
+
+namespace pbindexdump {
+
+static std::string printCppReferenceData(const PacBio::BAM::PbiRawReferenceData& referenceData)
+{
+    auto result = std::string{""};
+    for (const PacBio::BAM::PbiReferenceEntry& entry : referenceData.entries_) {
+        if (!result.empty()) result.append(",\n");
+        result.append(std::string{"    PbiReferenceEntry{"} + std::to_string(entry.tId_) + "," +
+                      std::to_string(entry.beginRow_) + "," + std::to_string(entry.endRow_) +
+                      std::string{"}"});
+    }
+    if (!result.empty()) result.append("\n");
+    return result;
+}
+
+template <typename T>
+std::string printVectorElements(const std::vector<T>& c)
+{
+    std::ostringstream joined;  // collects "e0,e1,...,eN," before the last comma is dropped
+    for (auto it = c.begin(); it != c.end(); ++it)
+        joined << *it << ",";
+    std::string text = joined.str();
+    if (!text.empty()) text.erase(text.size() - 1);
+    return text;
+}
+
+template <>
+std::string printVectorElements(const std::vector<uint8_t>& c)
+{
+    // widened to uint16_t so each value streams as a number rather than as a character
+    std::ostringstream joined;
+    for (const uint8_t value : c)
+        joined << static_cast<uint16_t>(value) << ",";
+    std::string text = joined.str();
+    if (!text.empty()) text.erase(text.size() - 1);  // drop the trailing comma
+    return text;
+}
+
+template <>
+std::string printVectorElements(const std::vector<int8_t>& c)
+{
+    // widened to int16_t so each value streams as a number rather than as a character
+    std::ostringstream joined;
+    for (const int8_t value : c)
+        joined << static_cast<int16_t>(value) << ",";
+    std::string text = joined.str();
+    if (!text.empty()) text.erase(text.size() - 1);  // drop the trailing comma
+    return text;
+}
+
+}  // namespace pbindexdump
+
+CppFormatter::CppFormatter(const Settings& settings) : IFormatter(settings) {}
+
+void CppFormatter::Run()  // writes C++ statements describing the PBI file's contents to stdout
+{
+    using namespace PacBio::BAM;
+
+    const PbiRawData rawData{settings_.inputPbiFilename_};
+    const PbiRawBarcodeData& barcodeData = rawData.BarcodeData();
+    const PbiRawBasicData& basicData = rawData.BasicData();
+    const PbiRawMappedData& mappedData = rawData.MappedData();
+    const PbiRawReferenceData& referenceData = rawData.ReferenceData();
+
+    auto version = std::string{};  // spelled as the enumerator name, since it is emitted as code
+    switch (rawData.Version()) {
+        case PbiFile::Version_3_0_0:
+            version = "PbiFile::Version_3_0_0";
+            break;
+        case PbiFile::Version_3_0_1:
+            version = "PbiFile::Version_3_0_1";
+            break;
+        default:  // any other version value is rejected
+            throw std::runtime_error("unsupported PBI version encountered");
+    }
+
+    auto fileSections = std::string{"PbiFile::BASIC"};  // BASIC is always listed
+    if (rawData.HasBarcodeData()) fileSections += std::string{" | PbiFile::BARCODE"};
+    if (rawData.HasMappedData()) fileSections += std::string{" | PbiFile::MAPPED"};
+    if (rawData.HasReferenceData()) fileSections += std::string{" | PbiFile::REFERENCE"};
+
+    std::ostringstream s;  // the whole snippet is assembled here and printed once at the end
+    s << "PbiRawData rawData;" << std::endl
+      << "rawData.Version(" << version << ");" << std::endl
+      << "rawData.FileSections(" << fileSections << ");" << std::endl
+      << "rawData.NumReads(" << rawData.NumReads() << ");" << std::endl
+      << std::endl
+      << "PbiRawBasicData& basicData = rawData.BasicData();" << std::endl
+      << "basicData.rgId_       = {" << printVectorElements(basicData.rgId_) << "};" << std::endl
+      << "basicData.qStart_     = {" << printVectorElements(basicData.qStart_) << "};" << std::endl
+      << "basicData.qEnd_       = {" << printVectorElements(basicData.qEnd_) << "};" << std::endl
+      << "basicData.holeNumber_ = {" << printVectorElements(basicData.holeNumber_) << "};"
+      << std::endl
+      << "basicData.readQual_   = {" << printVectorElements(basicData.readQual_) << "};"
+      << std::endl
+      << "basicData.ctxtFlag_   = {" << printVectorElements(basicData.ctxtFlag_) << "};"
+      << std::endl
+      << "basicData.fileOffset_ = {" << printVectorElements(basicData.fileOffset_) << "};"
+      << std::endl
+      << std::endl;
+
+    if (rawData.HasBarcodeData()) {  // optional section: emitted only when present
+        s << "PbiRawBarcodeData& barcodeData = rawData.BarcodeData();" << std::endl
+          << "barcodeData.bcForward_ = {" << printVectorElements(barcodeData.bcForward_) << "};"
+          << std::endl
+          << "barcodeData.bcReverse_ = {" << printVectorElements(barcodeData.bcReverse_) << "};"
+          << std::endl
+          << "barcodeData.bcQual_    = {" << printVectorElements(barcodeData.bcQual_) << "};"
+          << std::endl
+          << std::endl;
+    }
+
+    if (rawData.HasMappedData()) {  // optional section: emitted only when present
+        s << "PbiRawMappedData& mappedData = rawData.MappedData();" << std::endl
+          << "mappedData.tId_       = {" << printVectorElements(mappedData.tId_) << "};"
+          << std::endl
+          << "mappedData.tStart_    = {" << printVectorElements(mappedData.tStart_) << "};"
+          << std::endl
+          << "mappedData.tEnd_      = {" << printVectorElements(mappedData.tEnd_) << "};"
+          << std::endl
+          << "mappedData.aStart_    = {" << printVectorElements(mappedData.aStart_) << "};"
+          << std::endl
+          << "mappedData.aEnd_      = {" << printVectorElements(mappedData.aEnd_) << "};"
+          << std::endl
+          << "mappedData.revStrand_ = {" << printVectorElements(mappedData.revStrand_) << "};"
+          << std::endl
+          << "mappedData.nM_        = {" << printVectorElements(mappedData.nM_) << "};" << std::endl
+          << "mappedData.nMM_       = {" << printVectorElements(mappedData.nMM_) << "};"
+          << std::endl
+          << "mappedData.mapQV_     = {" << printVectorElements(mappedData.mapQV_) << "};"
+          << std::endl
+          << std::endl;
+    }
+
+    if (rawData.HasReferenceData()) {  // optional section: emitted only when present
+        s << "PbiRawReferenceData& referenceData = rawData.ReferenceData();" << std::endl
+          << "referenceData.entries_ = { " << std::endl
+          << printCppReferenceData(referenceData) << "};" << std::endl
+          << std::endl;
+    }
+
+    std::cout << s.str() << std::endl;
+}
diff --git a/tools/pbindexdump/src/CppFormatter.h b/tools/pbindexdump/src/CppFormatter.h

new file mode 100644 (file)

index 0000000..d03244f
--- /dev/null
+++ b/tools/pbindexdump/src/CppFormatter.h
@@ -0,0 +1,19 @@
+// Author: Derek Barnett
+
+#ifndef CPPFORMATTER_H
+#define CPPFORMATTER_H
+
+#include "IFormatter.h"
+
+namespace pbindexdump {
+
+class CppFormatter : public IFormatter  // formatter whose Run() is defined in CppFormatter.cpp
+{
+public:
+    CppFormatter(const Settings& settings);
+    void Run() override;  // implements IFormatter::Run; the compiler now checks the signature
+};
+
+}  // namespace pbindexdump
+
+#endif  // CPPFORMATTER_H
diff --git a/tools/pbindexdump/src/IFormatter.h b/tools/pbindexdump/src/IFormatter.h

new file mode 100644 (file)

index 0000000..8f97966
--- /dev/null
+++ b/tools/pbindexdump/src/IFormatter.h
@@ -0,0 +1,27 @@
+// Author: Derek Barnett
+
+#ifndef IFORMATTER_H
+#define IFORMATTER_H
+
+#include "Settings.h"
+
+namespace pbindexdump {
+
+class IFormatter  // abstract base of the pbindexdump output formatters
+{
+public:
+    virtual ~IFormatter(void) = default;
+
+public:
+    virtual void Run(void) = 0;  // implemented by each concrete formatter
+
+protected:
+    const Settings& settings_;  // held by reference, not copied: must outlive the formatter
+
+protected:
+    IFormatter(const Settings& settings) : settings_(settings) {}
+};
+
+}  // namespace pbindexdump
+
+#endif  // IFORMATTER_H
diff --git a/tools/pbindexdump/src/JsonFormatter.cpp b/tools/pbindexdump/src/JsonFormatter.cpp

new file mode 100644 (file)

index 0000000..757aa0f
--- /dev/null
+++ b/tools/pbindexdump/src/JsonFormatter.cpp
@@ -0,0 +1,165 @@
+// Author: Derek Barnett
+
+#include "JsonFormatter.h"
+
+#include <cstdint>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#include <pbbam/PbiFile.h>
+#include "json.hpp"
+
+using namespace pbindexdump;
+using namespace PacBio::BAM;
+
+namespace pbindexdump {
+
+}  // namespace pbindexdump
+
+JsonFormatter::JsonFormatter(const Settings& settings)
+    : IFormatter(settings), index_(settings.inputPbiFilename_)
+{
+}
+
+void JsonFormatter::FormatMetadata()
+{
+    auto version = std::string{};
+    switch (index_.Version()) {
+        case PbiFile::Version_3_0_0:
+            version = "3.0.0";
+            break;
+        case PbiFile::Version_3_0_1:
+            version = "3.0.1";
+            break;
+        default:
+            throw std::runtime_error("unsupported PBI version encountered");
+    }
+
+    nlohmann::json fileSections;
+    fileSections.push_back("BasicData");
+    if (index_.HasBarcodeData()) fileSections.push_back("BarcodeData");
+    if (index_.HasMappedData()) fileSections.push_back("MappedData");
+    if (index_.HasReferenceData()) fileSections.push_back("ReferenceData");
+
+    json_["version"] = version;
+    json_["fileSections"] = fileSections;
+    json_["numReads"] = index_.NumReads();
+}
+
+void JsonFormatter::FormatRaw()  // "raw" layout: each PBI field is stored whole under its section
+{
+    const PbiRawBasicData& basicData = index_.BasicData();
+    json_["basicData"]["rgId"] = basicData.rgId_;
+    json_["basicData"]["qStart"] = basicData.qStart_;
+    json_["basicData"]["qEnd"] = basicData.qEnd_;
+    json_["basicData"]["holeNumber"] = basicData.holeNumber_;
+    json_["basicData"]["readQual"] = basicData.readQual_;
+    json_["basicData"]["ctxtFlag"] = basicData.ctxtFlag_;
+    json_["basicData"]["fileOffset"] = basicData.fileOffset_;
+
+    if (index_.HasBarcodeData()) {
+        const PbiRawBarcodeData& barcodeData = index_.BarcodeData();
+        json_["barcodeData"]["bcForward"] = barcodeData.bcForward_;
+        json_["barcodeData"]["bcReverse"] = barcodeData.bcReverse_;
+        json_["barcodeData"]["bcQuality"] = barcodeData.bcQual_;
+    }
+
+    if (index_.HasMappedData()) {
+        const PbiRawMappedData& mappedData = index_.MappedData();
+
+        // NOTE(review): FormatRecords() casts these three to int32_t; no cast is applied here
+        json_["mappedData"]["tId"] = mappedData.tId_;
+        json_["mappedData"]["tStart"] = mappedData.tStart_;
+        json_["mappedData"]["tEnd"] = mappedData.tEnd_;
+
+        json_["mappedData"]["aStart"] = mappedData.aStart_;
+        json_["mappedData"]["aEnd"] = mappedData.aEnd_;
+        json_["mappedData"]["revStrand"] = mappedData.revStrand_;
+        json_["mappedData"]["nM"] = mappedData.nM_;
+        json_["mappedData"]["nMM"] = mappedData.nMM_;
+        json_["mappedData"]["mapQV"] = mappedData.mapQV_;
+    }
+}
+
+void JsonFormatter::FormatRecords()
+{
+    nlohmann::json reads;
+    const uint32_t numReads = index_.NumReads();
+    const bool hasBarcodeData = index_.HasBarcodeData();
+    const bool hasMappedData = index_.HasMappedData();
+    for (uint32_t i = 0; i < numReads; ++i) {
+
+        nlohmann::json read;
+
+        // common data
+        const PbiRawBasicData& basicData = index_.BasicData();
+        read["rgId"] = basicData.rgId_[i];
+        read["qStart"] = basicData.qStart_[i];
+        read["qEnd"] = basicData.qEnd_[i];
+        read["holeNumber"] = basicData.holeNumber_[i];
+        read["readQuality"] = basicData.readQual_[i];
+        read["contextFlag"] = basicData.ctxtFlag_[i];
+        read["fileOffset"] = basicData.fileOffset_[i];
+
+        // barcode data, if present
+        if (hasBarcodeData) {
+            const PbiRawBarcodeData& barcodeData = index_.BarcodeData();
+            read["bcForward"] = barcodeData.bcForward_[i];
+            read["bcReverse"] = barcodeData.bcReverse_[i];
+            read["bcQuality"] = barcodeData.bcQual_[i];
+        }
+
+        // mapping data, if present
+        if (hasMappedData) {
+            const PbiRawMappedData& mappedData = index_.MappedData();
+
+            // casts to force -1 if unmapped
+            read["tId"] = static_cast<int32_t>(mappedData.tId_[i]);
+            read["tStart"] = static_cast<int32_t>(mappedData.tStart_[i]);
+            read["tEnd"] = static_cast<int32_t>(mappedData.tEnd_[i]);
+
+            read["aStart"] = mappedData.aStart_[i];
+            read["aEnd"] = mappedData.aEnd_[i];
+            read["nM"] = mappedData.nM_[i];
+            read["nMM"] = mappedData.nMM_[i];
+            read["mapQuality"] = mappedData.mapQV_[i];
+            read["reverseStrand"] = mappedData.revStrand_[i];
+        }
+
+        reads.push_back(std::move(read));
+    }
+    json_["reads"] = reads;
+}
+
+void JsonFormatter::FormatReferences()
+{
+    if (index_.HasReferenceData()) {
+        const PbiRawReferenceData& referenceData = index_.ReferenceData();
+        nlohmann::json references;
+        for (const PbiReferenceEntry& entry : referenceData.entries_) {
+            nlohmann::json element;
+            element["tId"] = static_cast<int32_t>(entry.tId_);
+            element["beginRow"] = static_cast<int32_t>(entry.beginRow_);
+            element["endRow"] = static_cast<int32_t>(entry.endRow_);
+            references.push_back(std::move(element));
+        }
+        json_["references"] = references;
+    }
+}
+
+void JsonFormatter::Print() { std::cout << json_.dump(settings_.jsonIndentLevel_) << std::endl; }
+
+void JsonFormatter::Run()
+{
+    FormatMetadata();
+    FormatReferences();
+
+    if (settings_.jsonRaw_)
+        FormatRaw();
+    else
+        FormatRecords();
+
+    Print();
+}
diff --git a/tools/pbindexdump/src/JsonFormatter.h b/tools/pbindexdump/src/JsonFormatter.h

new file mode 100644 (file)

index 0000000..ce4bed8
--- /dev/null
+++ b/tools/pbindexdump/src/JsonFormatter.h
@@ -0,0 +1,34 @@
+// Author: Derek Barnett
+
+#ifndef JSONFORMATTER_H
+#define JSONFORMATTER_H
+
+#include <pbbam/PbiRawData.h>
+#include "IFormatter.h"
+#include "json.hpp"
+
+namespace pbindexdump {
+
+class JsonFormatter : public IFormatter  // builds a JSON document from a PBI file and prints it
+{
+public:
+    JsonFormatter(const Settings& settings);
+    void Run() override;  // implements IFormatter::Run; the compiler now checks the signature
+
+private:
+    void FormatMetadata();
+    void FormatReferences();
+
+    void FormatRaw();      // used by Run() when settings_.jsonRaw_ is set
+    void FormatRecords();  // used by Run() otherwise
+
+    void Print();
+
+private:
+    PacBio::BAM::PbiRawData index_;  // loaded from settings.inputPbiFilename_ at construction
+    nlohmann::json json_;            // document filled by the Format*() steps, dumped by Print()
+};
+
+}  // namespace pbindexdump
+
+#endif  // JSONFORMATTER_H
diff --git a/tools/pbindexdump/src/PbIndexDump.cpp b/tools/pbindexdump/src/PbIndexDump.cpp

new file mode 100644 (file)

index 0000000..258e383
--- /dev/null
+++ b/tools/pbindexdump/src/PbIndexDump.cpp
@@ -0,0 +1,30 @@
+// Author: Derek Barnett
+
+#include "PbIndexDump.h"
+
+#include <cassert>
+#include <memory>
+#include <stdexcept>
+#include <string>
+
+#include <pbbam/MakeUnique.h>
+
+#include "CppFormatter.h"
+#include "JsonFormatter.h"
+using namespace pbindexdump;
+
+void PbIndexDump::Run(const Settings& settings)
+{
+    std::unique_ptr<IFormatter> formatter(nullptr);
+    if (settings.format_ == "json")
+        formatter = std::make_unique<JsonFormatter>(settings);
+    else if (settings.format_ == "cpp")
+        formatter = std::make_unique<CppFormatter>(settings);
+    else {
+        std::string msg = {"unsupported output format requested: "};
+        msg += settings.format_;
+        throw std::runtime_error(msg);
+    }
+    assert(formatter);
+    formatter->Run();
+}
diff --git a/tools/pbindexdump/src/PbIndexDump.h b/tools/pbindexdump/src/PbIndexDump.h

new file mode 100644 (file)

index 0000000..c81ab19
--- /dev/null
+++ b/tools/pbindexdump/src/PbIndexDump.h
@@ -0,0 +1,18 @@
+// Author: Derek Barnett
+
+#ifndef PBINDEXDUMP_H
+#define PBINDEXDUMP_H
+
+namespace pbindexdump {
+
+class Settings;
+
+class PbIndexDump
+{
+public:
+    static void Run(const Settings& settings);
+};
+
+}  // namespace pbindexdump
+
+#endif  // PBINDEXDUMP_H
diff --git a/tools/pbindexdump/src/PbIndexDumpVersion.h.in b/tools/pbindexdump/src/PbIndexDumpVersion.h.in

new file mode 100644 (file)

index 0000000..040cd8a
--- /dev/null
+++ b/tools/pbindexdump/src/PbIndexDumpVersion.h.in
@@ -0,0 +1,14 @@
+// Author: Derek Barnett
+
+#ifndef PBINDEXDUMPVERSION_H
+#define PBINDEXDUMPVERSION_H
+
+#include <string>
+
+namespace pbindexdump {
+
+const std::string Version = std::string("@PbIndexDump_VERSION@");
+
+} // namespace pbindexdump
+
+#endif // PBINDEXDUMPVERSION_H
diff --git a/tools/pbindexdump/src/Settings.h b/tools/pbindexdump/src/Settings.h

new file mode 100644 (file)

index 0000000..dabf88e
--- /dev/null
+++ b/tools/pbindexdump/src/Settings.h
@@ -0,0 +1,26 @@
+// Author: Derek Barnett
+
+#ifndef SETTINGS_H
+#define SETTINGS_H
+
+#include <string>
+#include <vector>
+
+namespace pbindexdump {
+
+class Settings
+{
+public:
+    // defaults: "json" output format, indent level 4, raw JSON layout switched off
+    Settings() : format_("json"), jsonIndentLevel_(4), jsonRaw_(false) {}
+
+    std::string inputPbiFilename_;
+    std::string format_;
+    int jsonIndentLevel_;
+    bool jsonRaw_;
+    std::vector<std::string> errors_;
+};
+
+}  // namespace pbindexdump
+
+#endif  // SETTINGS_H
diff --git a/tools/pbindexdump/src/json.hpp b/tools/pbindexdump/src/json.hpp

new file mode 100644 (file)

index 0000000..cc32e6d
--- /dev/null
+++ b/tools/pbindexdump/src/json.hpp
@@ -0,0 +1,7296 @@
+/*!
+@mainpage
+
+These pages contain the API documentation of JSON for Modern C++, a C++11
+header-only JSON class.
+
+Class @ref nlohmann::basic_json is a good entry point for the documentation.
+
+@copyright The code is licensed under the [MIT
+           License](http://opensource.org/licenses/MIT):
+           <br>
+           Copyright &copy; 2013-2015 Niels Lohmann.
+           <br>
+           Permission is hereby granted, free of charge, to any person
+           obtaining a copy of this software and associated documentation files
+           (the "Software"), to deal in the Software without restriction,
+           including without limitation the rights to use, copy, modify, merge,
+           publish, distribute, sublicense, and/or sell copies of the Software,
+           and to permit persons to whom the Software is furnished to do so,
+           subject to the following conditions:
+           <br>
+           The above copyright notice and this permission notice shall be
+           included in all copies or substantial portions of the Software.
+           <br>
+           THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+           EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+           MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+           NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+           BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+           ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+           CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+           SOFTWARE.
+
+@author [Niels Lohmann](http://nlohmann.me)
+@see https://github.com/nlohmann/json to download the source code
+*/
+
+#ifndef NLOHMANN_JSON_HPP
+#define NLOHMANN_JSON_HPP
+
+#include <algorithm>
+#include <array>
+#include <ciso646>
+#include <cmath>
+#include <cstdio>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <initializer_list>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <map>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+// enable ssize_t on MinGW
+#ifdef __GNUC__
+    #ifdef __MINGW32__
+        #include <sys/types.h>
+    #endif
+#endif
+
+// enable ssize_t for MSVC
+#ifdef _MSC_VER
+    #include <basetsd.h>
+    using ssize_t = SSIZE_T;
+#endif
+
+/*!
+@brief namespace for Niels Lohmann
+@see https://github.com/nlohmann
+*/
+namespace nlohmann
+{
+
+
+/*!
+@brief unnamed namespace with internal helper functions
+*/
+namespace
+{
+/*!
+@brief Helper to determine whether there's a mapped_type for T.
+@sa http://stackoverflow.com/a/7728728/266378
+*/
+template<typename T>
+struct has_mapped_type
+{
+  private:
+    template<typename C> static char test(typename C::mapped_type*);
+    template<typename C> static int  test(...);
+  public:
+    enum { value = sizeof(test<T>(0)) == sizeof(char) };
+};
+
+/// "equality" comparison for floating point numbers
+template<typename T>
+static bool approx(const T a, const T b)
+{
+    return not (a > b or a < b);
+}
+}
+
+/*!
+@brief a class to store JSON values
+
+@tparam ObjectType type for JSON objects (@c std::map by default; will be used
+in @ref object_t)
+@tparam ArrayType type for JSON arrays (@c std::vector by default; will be used
+in @ref array_t)
+@tparam StringType type for JSON strings and object keys (@c std::string by
+default; will be used in @ref string_t)
+@tparam BooleanType type for JSON booleans (@c `bool` by default; will be used
+in @ref boolean_t)
+@tparam NumberIntegerType type for JSON integer numbers (@c `int64_t` by
+default; will be used in @ref number_integer_t)
+@tparam NumberFloatType type for JSON floating-point numbers (@c `double` by
+default; will be used in @ref number_float_t)
+@tparam AllocatorType type of the allocator to use (@c `std::allocator` by
+default)
+
+@requirement The class satisfies the following concept requirements:
+- Basic
+ - [DefaultConstructible](http://en.cppreference.com/w/cpp/concept/DefaultConstructible):
+   JSON values can be default constructed. The result will be a JSON null value.
+ - [MoveConstructible](http://en.cppreference.com/w/cpp/concept/MoveConstructible):
+   A JSON value can be constructed from an rvalue argument.
+ - [CopyConstructible](http://en.cppreference.com/w/cpp/concept/CopyConstructible):
+   A JSON value can be copy-constructed from an lvalue expression.
+ - [MoveAssignable](http://en.cppreference.com/w/cpp/concept/MoveAssignable):
+   A JSON value can be assigned from an rvalue argument.
+ - [CopyAssignable](http://en.cppreference.com/w/cpp/concept/CopyAssignable):
+   A JSON value can be copy-assigned from an lvalue expression.
+ - [Destructible](http://en.cppreference.com/w/cpp/concept/Destructible):
+   JSON values can be destructed.
+- Layout
+ - [StandardLayoutType](http://en.cppreference.com/w/cpp/concept/StandardLayoutType):
+   JSON values have
+   [standard layout](http://en.cppreference.com/w/cpp/language/data_members#Standard_layout):
+   All non-static data members are private and standard layout types, the class
+   has no virtual functions or (virtual) base classes.
+- Library-wide
+ - [EqualityComparable](http://en.cppreference.com/w/cpp/concept/EqualityComparable):
+   JSON values can be compared with `==`, see @ref
+   operator==(const_reference,const_reference).
+ - [LessThanComparable](http://en.cppreference.com/w/cpp/concept/LessThanComparable):
+   JSON values can be compared with `<`, see @ref
+   operator<(const_reference,const_reference).
+ - [Swappable](http://en.cppreference.com/w/cpp/concept/Swappable):
+   Any JSON lvalue or rvalue can be swapped with any lvalue or rvalue of
+   other compatible types, using unqualified function call @ref swap().
+ - [NullablePointer](http://en.cppreference.com/w/cpp/concept/NullablePointer):
+   JSON values can be compared against `std::nullptr_t` objects which are used
+   to model the `null` value.
+- Container
+ - [Container](http://en.cppreference.com/w/cpp/concept/Container):
+   JSON values can be used like STL containers and provide iterator access.
+ - [ReversibleContainer](http://en.cppreference.com/w/cpp/concept/ReversibleContainer);
+   JSON values can be used like STL containers and provide reverse iterator
+   access.
+
+@internal
+@note ObjectType trick from http://stackoverflow.com/a/9860911
+@endinternal
+
+@see RFC 7159 <http://rfc7159.net/rfc7159>
+*/
+template <
+    template<typename U, typename V, typename... Args> class ObjectType = std::map,
+    template<typename U, typename... Args> class ArrayType = std::vector,
+    class StringType = std::string,
+    class BooleanType = bool,
+    class NumberIntegerType = int64_t,
+    class NumberFloatType = double,
+    template<typename U> class AllocatorType = std::allocator
+    >
+class basic_json
+{
+  private:
+    /// workaround type for MSVC
+    using basic_json_t = basic_json<ObjectType,
+          ArrayType,
+          StringType,
+          BooleanType,
+          NumberIntegerType,
+          NumberFloatType,
+          AllocatorType>;
+
+  public:
+
+    /////////////////////
+    // container types //
+    /////////////////////
+
+    /// @name container types
+    /// @{
+
+    /// the type of elements in a basic_json container (JSON values contain JSON values)
+    using value_type = basic_json;
+
+    /// the type of an element reference
+    using reference = value_type&;
+
+    /// the type of an element const reference
+    using const_reference = const value_type&;
+
+    /// a type to represent differences between iterators
+    using difference_type = std::ptrdiff_t;
+
+    /// a type to represent container sizes
+    using size_type = std::size_t;
+
+    /// the allocator type (AllocatorType instantiated for basic_json elements)
+    using allocator_type = AllocatorType<basic_json>;
+
+    /// the type of an element pointer (taken from the allocator's traits)
+    using pointer = typename std::allocator_traits<allocator_type>::pointer;
+    /// the type of an element const pointer (taken from the allocator's traits)
+    using const_pointer = typename std::allocator_traits<allocator_type>::const_pointer;
+
+    // forward declaration of the adapter used by reverse_iterator and const_reverse_iterator
+    template<typename Base> class json_reverse_iterator;
+
+    /// an iterator for a basic_json container
+    class iterator;
+    /// a const iterator for a basic_json container
+    class const_iterator;
+    /// a reverse iterator for a basic_json container
+    using reverse_iterator = json_reverse_iterator<typename basic_json::iterator>;
+    /// a const reverse iterator for a basic_json container
+    using const_reverse_iterator = json_reverse_iterator<typename basic_json::const_iterator>;
+
+    /// @}
+
+
+    /*!
+    @brief returns the allocator associated with the container
+    */
+    static auto get_allocator() -> allocator_type
+    {
+        // a freshly value-initialized allocator is handed back on every call
+        return allocator_type();
+    }
+
+
+    ///////////////////////////
+    // JSON value data types //
+    ///////////////////////////
+
+    /// @name JSON value data types
+    /// @{
+
+    /*!
+    @brief a type for an object
+
+    [RFC 7159](http://rfc7159.net/rfc7159) describes JSON objects as follows:
+    > An object is an unordered collection of zero or more name/value pairs,
+    > where a name is a string and a value is a string, number, boolean, null,
+    > object, or array.
+
+    To store objects in C++, a type is defined by the template parameters @a
+    ObjectType which chooses the container (e.g., `std::map` or
+    `std::unordered_map`), @a StringType which chooses the type of the keys or
+    names, and @a AllocatorType which chooses the allocator to use.
+
+    #### Default type
+
+    With the default values for @a ObjectType (`std::map`), @a StringType
+    (`std::string`), and @a AllocatorType (`std::allocator`), the default value
+    for @a object_t is:
+
+    @code {.cpp}
+    std::map<
+      std::string, // key_type
+      basic_json, // value_type
+      std::less<std::string>, // key_compare
+      std::allocator<std::pair<const std::string, basic_json>> // allocator_type
+    >
+    @endcode
+
+    #### Behavior
+
+    The choice of @a object_t influences the behavior of the JSON class. With
+    the default type, objects have the following behavior:
+
+    - When all names are unique, objects will be interoperable in the sense
+      that all software implementations receiving that object will agree on the
+      name-value mappings.
+    - When the names within an object are not unique, later stored name/value
+      pairs overwrite previously stored name/value pairs, leaving the used
+      names unique. For instance, `{"key": 1}` and `{"key": 2, "key": 1}` will
+      be treated as equal and both stored as `{"key": 1}`.
+    - Internally, name/value pairs are stored in lexicographical order of the
+      names. Objects will also be serialized (see @ref dump) in this order. For
+      instance, `{"b": 1, "a": 2}` and `{"a": 2, "b": 1}` will be stored and
+      serialized as `{"a": 2, "b": 1}`.
+    - When comparing objects, the order of the name/value pairs is irrelevant.
+      This makes objects interoperable in the sense that they will not be
+      affected by these differences. For instance, `{"b": 1, "a": 2}` and
+      `{"a": 2, "b": 1}` will be treated as equal.
+
+    #### Limits
+
+    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
+    > An implementation may set limits on the maximum depth of nesting.
+
+    In this class, the object's limit of nesting is not constrained explicitly.
+    However, a maximum depth of nesting may be introduced by the compiler or
+    runtime environment. A theoretical limit can be queried by calling the @ref
+    max_size function of a JSON object.
+
+    #### Storage
+
+    Objects are stored as pointers in a `basic_json` type. That is, for any
+    access to object values, a pointer of type `object_t*` must be dereferenced.
+
+    @sa array_t
+    */
+    using object_t = ObjectType<
+        StringType,                                               // key (name) type
+        basic_json,                                               // mapped (value) type
+        std::less<StringType>,                                    // key comparison
+        AllocatorType<std::pair<const StringType, basic_json>>>;  // allocator for the key/value pairs
+
+    /*!
+    @brief a type for an array
+
+    [RFC 7159](http://rfc7159.net/rfc7159) describes JSON arrays as follows:
+    > An array is an ordered sequence of zero or more values.
+
+    To store arrays in C++, a type is defined by the template parameters @a
+    ArrayType which chooses the container (e.g., `std::vector` or `std::list`)
+    and @a AllocatorType which chooses the allocator to use.
+
+    #### Default type
+
+    With the default values for @a ArrayType (`std::vector`) and @a
+    AllocatorType (`std::allocator`), the default value for @a array_t is:
+
+    @code {.cpp}
+    std::vector<
+      basic_json, // value_type
+      std::allocator<basic_json> // allocator_type
+    >
+    @endcode
+
+    #### Limits
+
+    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
+    > An implementation may set limits on the maximum depth of nesting.
+
+    In this class, the array's limit of nesting is not constrained explicitly.
+    However, a maximum depth of nesting may be introduced by the compiler or
+    runtime environment. A theoretical limit can be queried by calling the @ref
+    max_size function of a JSON array.
+
+    #### Storage
+
+    Arrays are stored as pointers in a `basic_json` type. That is, for any
+    access to array values, a pointer of type `array_t*` must be dereferenced.
+    */
+    using array_t = ArrayType<basic_json, AllocatorType<basic_json>>;
+
+    /*!
+    @brief a type for a string
+
+    [RFC 7159](http://rfc7159.net/rfc7159) describes JSON strings as follows:
+    > A string is a sequence of zero or more Unicode characters.
+
+    To store strings in C++, a type is defined by the template parameter @a
+    StringType which chooses the container (e.g., `std::string`) to use.
+
+    Unicode values are split by the JSON class into byte-sized characters
+    during deserialization.
+
+    #### Default type
+
+    With the default values for @a StringType (`std::string`), the default
+    value for @a string_t is:
+
+    @code {.cpp}
+    std::string
+    @endcode
+
+    #### String comparison
+
+    [RFC 7159](http://rfc7159.net/rfc7159) states:
+    > Software implementations are typically required to test names of object
+    > members for equality. Implementations that transform the textual
+    > representation into sequences of Unicode code units and then perform the
+    > comparison numerically, code unit by code unit, are interoperable in the
+    > sense that implementations will agree in all cases on equality or
+    > inequality of two strings. For example, implementations that compare
+    > strings with escaped characters unconverted may incorrectly find that
+    > `"a\\b"` and `"a\u005Cb"` are not equal.
+
+    This implementation is interoperable as it does compare strings code unit
+    by code unit.
+
+    #### Storage
+
+    String values are stored as pointers in a `basic_json` type. That is, for
+    any access to string values, a pointer of type `string_t*` must be
+    dereferenced.
+    */
+    using string_t = StringType;
+
+    /*!
+    @brief a type for a boolean
+
+    [RFC 7159](http://rfc7159.net/rfc7159) implicitly describes a boolean as a
+    type which differentiates the two literals `true` and `false`.
+
+    To store booleans in C++, a type is defined by the template parameter @a
+    BooleanType which chooses the type to use.
+
+    #### Default type
+
+    With the default values for @a BooleanType (`bool`), the default value for
+    @a boolean_t is:
+
+    @code {.cpp}
+    bool
+    @endcode
+
+    #### Storage
+
+    Boolean values are stored directly inside a `basic_json` type.
+    */
+    using boolean_t = BooleanType;
+
+    /*!
+    @brief a type for a number (integer)
+
+    [RFC 7159](http://rfc7159.net/rfc7159) describes numbers as follows:
+    > The representation of numbers is similar to that used in most programming
+    > languages. A number is represented in base 10 using decimal digits. It
+    > contains an integer component that may be prefixed with an optional minus
+    > sign, which may be followed by a fraction part and/or an exponent part.
+    > Leading zeros are not allowed. (...) Numeric values that cannot be
+    > represented in the grammar below (such as Infinity and NaN) are not
+    > permitted.
+
+    This description includes both integer and floating-point numbers. However,
+    C++ allows more precise storage if it is known whether the number is an
+    integer or a floating-point number. Therefore, two different types, @ref
+    number_integer_t and @ref number_float_t are used.
+
+    To store integer numbers in C++, a type is defined by the template
+    parameter @a NumberIntegerType which chooses the type to use.
+
+    #### Default type
+
+    With the default values for @a NumberIntegerType (`int64_t`), the default
+    value for @a number_integer_t is:
+
+    @code {.cpp}
+    int64_t
+    @endcode
+
+    #### Default behavior
+
+    - The restriction about leading zeros is not enforced in C++. Instead,
+      leading zeros in integer literals lead to an interpretation as octal
+      number. Internally, the value will be stored as decimal number. For
+      instance, the C++ integer literal `010` will be serialized to `8`. During
+      deserialization, leading zeros yield an error.
+    - Not-a-number (NaN) values will be serialized to `null`.
+
+    #### Limits
+
+    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
+    > An implementation may set limits on the range and precision of numbers.
+
+    When the default type is used, the maximal integer number that can be
+    stored is `9223372036854775807` (INT64_MAX) and the minimal integer number
+    that can be stored is `-9223372036854775808` (INT64_MIN). Integer numbers
+    that are out of range will yield over/underflow when used in a constructor.
+    During deserialization, integer numbers that are too large or too small
+    will automatically be stored as @ref number_float_t.
+
+    [RFC 7159](http://rfc7159.net/rfc7159) further states:
+    > Note that when such software is used, numbers that are integers and are
+    > in the range \f$[-2^{53}+1, 2^{53}-1]\f$ are interoperable in the sense
+    > that implementations will agree exactly on their numeric values.
+
+    As this range is a subrange of the exactly supported range [INT64_MIN,
+    INT64_MAX], this class's integer type is interoperable.
+
+    #### Storage
+
+    Integer number values are stored directly inside a `basic_json` type.
+    */
+    using number_integer_t = NumberIntegerType;
+
+    /*!
+    @brief a type for a number (floating-point)
+
+    [RFC 7159](http://rfc7159.net/rfc7159) describes numbers as follows:
+    > The representation of numbers is similar to that used in most programming
+    > languages. A number is represented in base 10 using decimal digits. It
+    > contains an integer component that may be prefixed with an optional minus
+    > sign, which may be followed by a fraction part and/or an exponent part.
+    > Leading zeros are not allowed. (...) Numeric values that cannot be
+    > represented in the grammar below (such as Infinity and NaN) are not
+    > permitted.
+
+    This description includes both integer and floating-point numbers. However,
+    C++ allows more precise storage if it is known whether the number is an
+    integer or a floating-point number. Therefore, two different types, @ref
+    number_integer_t and @ref number_float_t are used.
+
+    To store floating-point numbers in C++, a type is defined by the template
+    parameter @a NumberFloatType which chooses the type to use.
+
+    #### Default type
+
+    With the default values for @a NumberFloatType (`double`), the default
+    value for @a number_float_t is:
+
+    @code {.cpp}
+    double
+    @endcode
+
+    #### Default behavior
+
+    - The restriction about leading zeros is not enforced in C++. Instead,
+      leading zeros in floating-point literals will be ignored. Internally, the
+      value will be stored as decimal number. For instance, the C++
+      floating-point literal `01.2` will be serialized to `1.2`. During
+      deserialization, leading zeros yield an error.
+    - Not-a-number (NaN) values will be serialized to `null`.
+
+    #### Limits
+
+    [RFC 7159](http://rfc7159.net/rfc7159) states:
+    > This specification allows implementations to set limits on the range and
+    > precision of numbers accepted. Since software that implements IEEE
+    > 754-2008 binary64 (double precision) numbers is generally available and
+    > widely used, good interoperability can be achieved by implementations that
+    > expect no more precision or range than these provide, in the sense that
+    > implementations will approximate JSON numbers within the expected
+    > precision.
+
+    This implementation does exactly follow this approach, as it uses double
+    precision floating-point numbers. Note values smaller than
+    `-1.79769313486232e+308` and values greater than `1.79769313486232e+308`
+    will be stored as NaN internally and be serialized to `null`.
+
+    #### Storage
+
+    Floating-point number values are stored directly inside a `basic_json` type.
+    */
+    using number_float_t = NumberFloatType;
+
+    /// @}
+
+
+    ///////////////////////////
+    // JSON type enumeration //
+    ///////////////////////////
+
+    /*!
+    @brief the JSON type enumeration
+
+    This enumeration collects the different JSON types. It is internally used
+    to distinguish the stored values, and the functions is_null, is_object,
+    is_array, is_string, is_boolean, is_number, and is_discarded rely on it.
+    */
+    enum class value_t : uint8_t
+    {
+        null,           ///< null value
+        object,         ///< object (unordered set of name/value pairs)
+        array,          ///< array (ordered collection of values)
+        string,         ///< string value
+        boolean,        ///< boolean value
+        number_integer, ///< number value (integer)
+        number_float,   ///< number value (floating-point)
+        discarded       ///< discarded by the parser callback function
+    };
+
+
+  private:
+    /// helper for exception-safe object creation
+    template<typename T, typename... Args>
+    static T* create( Args&& ... args )
+    {
+        AllocatorType<T> alloc;
+        auto deleter = [&](T * object)
+        {
+            alloc.deallocate(object, 1);
+        };
+        std::unique_ptr<T, decltype(deleter)> object(alloc.allocate(1), deleter);
+        alloc.construct(object.get(), std::forward<Args>(args)...);
+        return object.release();
+    }
+
+    ////////////////////////
+    // JSON value storage //
+    ////////////////////////
+
+    /// a JSON value
+    union json_value
+    {
+        /// object (stored with pointer to save storage)
+        object_t* object;
+        /// array (stored with pointer to save storage)
+        array_t* array;
+        /// string (stored with pointer to save storage)
+        string_t* string;
+        /// boolean
+        boolean_t boolean;
+        /// number (integer)
+        number_integer_t number_integer;
+        /// number (floating-point)
+        number_float_t number_float;
+
+        /// default constructor (for null values)
+        json_value() noexcept = default;
+        /// constructor for booleans
+        json_value(boolean_t v) noexcept : boolean(v) {}
+        /// constructor for numbers (integer)
+        json_value(number_integer_t v) noexcept : number_integer(v) {}
+        /// constructor for numbers (floating-point)
+        json_value(number_float_t v) noexcept : number_float(v) {}
+        /// constructor for empty values of a given type
+        json_value(value_t t)
+        {
+            switch (t)
+            {
+                case (value_t::null):
+                case (value_t::discarded):
+                {
+                    break;
+                }
+
+                case (value_t::object):
+                {
+                    object = create<object_t>();
+                    break;
+                }
+
+                case (value_t::array):
+                {
+                    array = create<array_t>();
+                    break;
+                }
+
+                case (value_t::string):
+                {
+                    string = create<string_t>("");
+                    break;
+                }
+
+                case (value_t::boolean):
+                {
+                    boolean = boolean_t(false);
+                    break;
+                }
+
+                case (value_t::number_integer):
+                {
+                    number_integer = number_integer_t(0);
+                    break;
+                }
+
+                case (value_t::number_float):
+                {
+                    number_float = number_float_t(0.0);
+                    break;
+                }
+            }
+        }
+
+        /// constructor for strings
+        json_value(const string_t& value)
+        {
+            string = create<string_t>(value);
+        }
+
+        /// constructor for objects
+        json_value(const object_t& value)
+        {
+            object = create<object_t>(value);
+        }
+
+        /// constructor for arrays
+        json_value(const array_t& value)
+        {
+            array = create<array_t>(value);
+        }
+    };
+
+
+  public:
+    //////////////////////////
+    // JSON parser callback //
+    //////////////////////////
+
+    /*!
+    @brief JSON callback events
+
+    This enumeration lists the parser events that can trigger calling a
+    callback function of type @ref parser_callback_t during parsing.
+    */
+    enum class parse_event_t : uint8_t
+    {
+        /// the parser read `{` and started to process a JSON object
+        object_start,
+        /// the parser read `}` and finished processing a JSON object
+        object_end,
+        /// the parser read `[` and started to process a JSON array
+        array_start,
+        /// the parser read `]` and finished processing a JSON array
+        array_end,
+        /// the parser read the key of a name/value pair in an object
+        key,
+        /// the parser finished reading a JSON value
+        value
+    };
+
+    /*!
+    @brief per-element parser callback type
+
+    With a parser callback function, the result of parsing a JSON text can be
+    influenced. When passed to @ref parse(std::istream&, parser_callback_t) or
+    @ref parse(const string_t&, parser_callback_t), it is called on certain
+    events (passed as @ref parse_event_t via parameter @a event) with a set
+    recursion depth @a depth and context JSON value @a parsed. The return value
+    of the callback function is a boolean indicating whether the element that
+    emitted the callback shall be kept or not.
+
+    We distinguish six scenarios (determined by the event type) in which the
+    callback function can be called. The following table describes the values
+    of the parameters @a depth, @a event, and @a parsed.
+
+    parameter @a event | description | parameter @a depth | parameter @a parsed
+    ------------------ | ----------- | ------------------ | -------------------
+    parse_event_t::object_start | the parser read `{` and started to process a JSON object | depth of the parent of the JSON object | a JSON value with type discarded
+    parse_event_t::key | the parser read a key of a value in an object | depth of the currently parsed JSON object | a JSON string containing the key
+    parse_event_t::object_end | the parser read `}` and finished processing a JSON object | depth of the parent of the JSON object | the parsed JSON object
+    parse_event_t::array_start | the parser read `[` and started to process a JSON array | depth of the parent of the JSON array | a JSON value with type discarded
+    parse_event_t::array_end | the parser read `]` and finished processing a JSON array | depth of the parent of the JSON array | the parsed JSON array
+    parse_event_t::value | the parser finished reading a JSON value | depth of the value | the parsed JSON value
+
+    Discarding a value (i.e., returning `false`) has different effects depending on the
+    context in which the function was called:
+
+    - Discarded values in structured types are skipped. That is, the parser
+      will behave as if the discarded value was never read.
+    - In case a value outside a structured type is skipped, it is replaced with
+      `null`. This case happens if the top-level element is skipped.
+
+    @param[in] depth   the depth of the recursion during parsing
+
+    @param[in] event   an event of type parse_event_t indicating the context in
+    which the callback function has been called
+
+    @param[in,out] parsed  the current intermediate parse result; note that
+    writing to this value has no effect for parse_event_t::key events
+
+    @return Whether the JSON value which called the function during parsing
+    should be kept (`true`) or not (`false`). In the latter case, it is either
+    skipped completely or replaced by an empty discarded object.
+
+    @sa @ref parse(std::istream&, parser_callback_t) or
+    @ref parse(const string_t&, parser_callback_t) for examples
+    */
+    using parser_callback_t =
+        std::function<bool(int depth, parse_event_t event, basic_json& parsed)>;
+
+
+    //////////////////
+    // constructors //
+    //////////////////
+
+    /*!
+    @brief create an empty value with a given type
+
+    Create an empty JSON value with a given type. The value will be default
+    initialized with an empty value which depends on the type:
+
+    Value type  | initial value
+    ----------- | -------------
+    null        | `null`
+    boolean     | `false`
+    string      | `""`
+    number      | `0`
+    object      | `{}`
+    array       | `[]`
+
+    @param[in] value  the type of the value to create
+
+    @complexity Constant.
+
+    @throw std::bad_alloc if allocation for object, array, or string value
+    fails
+
+    @liveexample{The following code shows the constructor for different @ref
+    value_t values,basic_json__value_t}
+    */
+    basic_json(const value_t value)
+        : m_type(value)
+        , m_value(value)  // json_value(value_t) sets up the empty payload for this type
+    {
+    }
+
+    /*!
+    @brief create a null object (implicitly)
+
+    Create a `null` JSON value. This is the implicit version of the `null`
+    value constructor as it takes no parameters.
+
+    @complexity Constant.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is constant.
+    - As postcondition, it holds: `basic_json().empty() == true`.
+
+    @liveexample{The following code shows the constructor for a `null` JSON
+    value.,basic_json}
+
+    @sa basic_json(std::nullptr_t)
+    */
+    basic_json() noexcept = default;  // compiler-generated and declared non-throwing
+
+    /*!
+    @brief create a null object (explicitly)
+
+    Create a `null` JSON value. This is the explicit version of the `null`
+    value constructor as it takes a null pointer as parameter. It allows to
+    create `null` values by explicitly assigning a @c nullptr to a JSON value.
+    The passed null pointer itself is not read - it is only used to choose the
+    right constructor.
+
+    @complexity Constant.
+
+    @liveexample{The following code shows the constructor with null pointer
+    parameter.,basic_json__nullptr_t}
+
+    @sa basic_json()
+    */
+    basic_json(std::nullptr_t) noexcept
+        : basic_json(value_t::null)  // delegate; the pointer argument itself is never read
+    {
+    }
+
+    /*!
+    @brief create an object (explicit)
+
+    Create an object JSON value with a given content.
+
+    @param[in] value  a value for the object
+
+    @complexity Linear in the size of the passed @a value.
+
+    @throw std::bad_alloc if allocation for object value fails
+
+    @liveexample{The following code shows the constructor with an @ref object_t
+    parameter.,basic_json__object_t}
+
+    @sa basic_json(const CompatibleObjectType&)
+    */
+    basic_json(const object_t& value)
+        : m_type(value_t::object)
+        , m_value(value)  // json_value(const object_t&) stores a copy of the object
+    {
+    }
+
+    /*!
+    @brief create an object (implicit)
+
+    Create an object JSON value with a given content. This constructor allows
+    any type that can be used to construct values of type @ref object_t.
+    Examples include the types `std::map` and `std::unordered_map`.
+
+    @tparam CompatibleObjectType an object type whose `key_type` and
+    `value_type` is compatible to @ref object_t
+
+    @param[in] value  a value for the object
+
+    @complexity Linear in the size of the passed @a value.
+
+    @throw std::bad_alloc if allocation for object value fails
+
+    @liveexample{The following code shows the constructor with several
+    compatible object type parameters.,basic_json__CompatibleObjectType}
+
+    @sa basic_json(const object_t&)
+    */
+    template <class CompatibleObjectType, typename
+              std::enable_if<
+                  std::is_constructible<typename object_t::key_type, typename CompatibleObjectType::key_type>::value and
+                  std::is_constructible<basic_json, typename CompatibleObjectType::mapped_type>::value, int>::type
+              = 0>
+    basic_json(const CompatibleObjectType& value)
+        : m_type(value_t::object)
+    {
+        using std::begin;
+        using std::end;
+        m_value.object = create<object_t>(begin(value), end(value));
+    }
+
+    /*!
+    @brief create an array (explicit)
+
+    Create an array JSON value with a given content.
+
+    @param[in] value  a value for the array
+
+    @complexity Linear in the size of the passed @a value.
+
+    @throw std::bad_alloc if allocation for array value fails
+
+    @liveexample{The following code shows the constructor with an @ref array_t
+    parameter.,basic_json__array_t}
+
+    @sa basic_json(const CompatibleArrayType&)
+    */
+    basic_json(const array_t& value)
+        : m_type(value_t::array)
+        , m_value(value)  // json_value(const array_t&) stores a copy of the array
+    {
+    }
+
+    /*!
+    @brief create an array (implicit)
+
+    Create an array JSON value with a given content. This constructor allows
+    any type that can be used to construct values of type @ref array_t.
+    Examples include the types `std::vector`, `std::list`, and `std::set`.
+
+    @tparam CompatibleArrayType an object type whose `value_type` is compatible
+    to @ref array_t
+
+    @param[in] value  a value for the array
+
+    @complexity Linear in the size of the passed @a value.
+
+    @throw std::bad_alloc if allocation for array value fails
+
+    @liveexample{The following code shows the constructor with several
+    compatible array type parameters.,basic_json__CompatibleArrayType}
+
+    @sa basic_json(const array_t&)
+    */
+    template <class CompatibleArrayType, typename
+              std::enable_if<
+                  not std::is_same<CompatibleArrayType, typename basic_json_t::iterator>::value and
+                  not std::is_same<CompatibleArrayType, typename basic_json_t::const_iterator>::value and
+                  not std::is_same<CompatibleArrayType, typename basic_json_t::reverse_iterator>::value and
+                  not std::is_same<CompatibleArrayType, typename basic_json_t::const_reverse_iterator>::value and
+                  not std::is_same<CompatibleArrayType, typename array_t::iterator>::value and
+                  not std::is_same<CompatibleArrayType, typename array_t::const_iterator>::value and
+                  std::is_constructible<basic_json, typename CompatibleArrayType::value_type>::value, int>::type
+              = 0>
+    basic_json(const CompatibleArrayType& value)
+        : m_type(value_t::array)
+    {
+        using std::begin;
+        using std::end;
+        m_value.array = create<array_t>(begin(value), end(value));
+    }
+
+    /*!
+    @brief create a string (explicit)
+
+    Create a string JSON value with a given content.
+
+    @param[in] value  a value for the string
+
+    @complexity Linear in the size of the passed @a value.
+
+    @throw std::bad_alloc if allocation for string value fails
+
+    @liveexample{The following code shows the constructor with an @ref string_t
+    parameter.,basic_json__string_t}
+
+    @sa basic_json(const typename string_t::value_type*)
+    @sa basic_json(const CompatibleStringType&)
+    */
+    basic_json(const string_t& value)
+        : m_type(value_t::string)
+        , m_value(value)  // json_value(const string_t&) stores a copy of the string
+    {
+    }
+
+    /*!
+    @brief create a string (explicit)
+
+    Create a string JSON value with a given content.
+
+    @param[in] value  a literal value for the string
+
+    @complexity Linear in the size of the passed @a value.
+
+    @throw std::bad_alloc if allocation for string value fails
+
+    @liveexample{The following code shows the constructor with string literal
+    parameter.,basic_json__string_t_value_type}
+
+    @sa basic_json(const string_t&)
+    @sa basic_json(const CompatibleStringType&)
+    */
+    basic_json(const typename string_t::value_type* value)
+        : basic_json(string_t(value))  // wrap the character pointer, then delegate to the string_t constructor
+    {
+    }
+
+    /*!
+    @brief create a string (implicit)
+
+    Create a string JSON value with a given content.
+
+    @param[in] value  a value for the string
+
+    @tparam CompatibleStringType a string type which is compatible to @ref
+    string_t
+
+    @complexity Linear in the size of the passed @a value.
+
+    @throw std::bad_alloc if allocation for string value fails
+
+    @liveexample{The following code shows the construction of a string value
+    from a compatible type.,basic_json__CompatibleStringType}
+
+    @sa basic_json(const string_t&)
+    */
+    template <class CompatibleStringType,
+              typename std::enable_if<
+                  std::is_constructible<string_t, CompatibleStringType>::value,
+                  int>::type = 0>
+    basic_json(const CompatibleStringType& value)
+        : basic_json(string_t(value))  // convert to string_t, then delegate to the string_t constructor
+    {
+    }
+
+    /*!
+    @brief create a boolean (explicit)
+
+    Creates a JSON boolean type from a given value.
+
+    @param[in] value  a boolean value to store
+
+    @complexity Constant.
+
+    @liveexample{The example below demonstrates boolean
+    values.,basic_json__boolean_t}
+    */
+    basic_json(boolean_t value)
+        : m_type(value_t::boolean)
+        , m_value(value)  // stored in place via json_value(boolean_t)
+    {
+    }
+
+    /*!
+    @brief create an integer number (explicit)
+
+    Create an integer number JSON value with a given content.
+
+    @tparam T  helper type to compare number_integer_t and int (not visible in
+    the interface).
+
+    @param[in] value  an integer to create a JSON number from
+
+    @note This constructor would have the same signature as @ref
+    basic_json(const int value), so we need to switch this one off in case
+    number_integer_t is the same as int. This is done via the helper type @a T.
+
+    @complexity Constant.
+
+    @liveexample{The example below shows the construction of a JSON integer
+    number value.,basic_json__number_integer_t}
+
+    @sa basic_json(const int)
+    */
+    template<typename T,  // NOTE(review): T is used only in the enable_if below and has no default,
+             typename std::enable_if<  // so it cannot be deduced from a constructor argument;
+                 not (std::is_same<T, int>::value)  // confirm whether this overload is ever selected
+                 and std::is_same<T, number_integer_t>::value
+                 , int>::type = 0>
+    basic_json(const number_integer_t value)
+        : m_type(value_t::number_integer), m_value(value)
+    {}
+
+    /*!
+    @brief create an integer number from an enum type (explicit)
+
+    Create an integer number JSON value with a given content.
+
+    @param[in] value  an integer to create a JSON number from
+
+    @note This constructor allows to pass enums directly to a constructor. As
+    C++ has no way of specifying the type of an anonymous enum explicitly, we
+    can only rely on the fact that such values implicitly convert to int. As
+    int may already be the same type of number_integer_t, we may need to switch
+    off the constructor @ref basic_json(const number_integer_t).
+
+    @complexity Constant.
+
+    @liveexample{The example below shows the construction of a JSON integer
+    number value from an anonymous enum.,basic_json__const_int}
+
+    @sa basic_json(const number_integer_t)
+    */
+    basic_json(const int value)
+        : m_type(value_t::number_integer)
+        , m_value(static_cast<number_integer_t>(value))  // convert the int to the configured integer type
+    {
+    }
+
+    /*!
+    @brief create an integer number (implicit)
+
+    Create an integer number JSON value with a given content. This constructor
+    allows any type that can be used to construct values of type @ref
+    number_integer_t. Examples may include the types `int`, `int32_t`, or
+    `short`.
+
+    @tparam CompatibleNumberIntegerType an integer type which is compatible to
+    @ref number_integer_t.
+
+    @param[in] value  an integer to create a JSON number from
+
+    @complexity Constant.
+
+    @liveexample{The example below shows the construction of several JSON
+    integer number values from compatible
+    types.,basic_json__CompatibleIntegerNumberType}
+
+    @sa basic_json(const number_integer_t)
+    */
+    template<typename CompatibleNumberIntegerType, typename
+             std::enable_if<
+                 std::is_constructible<number_integer_t, CompatibleNumberIntegerType>::value and
+                 std::numeric_limits<CompatibleNumberIntegerType>::is_integer, CompatibleNumberIntegerType>::type
+             = 0>
+    basic_json(const CompatibleNumberIntegerType value) noexcept
+        : m_type(value_t::number_integer),
+          m_value(static_cast<number_integer_t>(value))
+    {}
+
+    /*!
+    @brief create a floating-point number (explicit)
+
+    Create a floating-point number JSON value with a given content.
+
+    @param[in] value  a floating-point value to create a JSON number from
+
+    @note RFC 7159 <http://www.rfc-editor.org/rfc/rfc7159.txt>, section 6
+    disallows NaN values:
+    > Numeric values that cannot be represented in the grammar below (such
+    > as Infinity and NaN) are not permitted.
+    In case the parameter @a value is not a number, a JSON null value is
+    created instead.
+
+    @complexity Constant.
+
+    @liveexample{The following example creates several floating-point
+    values.,basic_json__number_float_t}
+    */
+    basic_json(const number_float_t value)
+        : m_type(value_t::number_float),
+          m_value(value)
+    {
+        // finite numbers are stored as given
+        if (std::isfinite(value))
+        {
+            return;
+        }
+
+        // infinity and NaN cannot be stored as a number: fall back to the
+        // JSON null value by resetting both payload and type tag
+        m_value = json_value();
+        m_type = value_t::null;
+    }
+
+    /*!
+    @brief create a floating-point number (implicit)
+
+    Create a floating-point number JSON value with a given content. This
+    constructor allows any type that can be used to construct values of type
+    @ref number_float_t. Examples may include the types `float`.
+
+    @tparam CompatibleNumberFloatType a floating-point type which is compatible
+    to @ref number_float_t.
+
+    @param[in] value  a floating-point to create a JSON number from
+
+    @note RFC 7159 <http://www.rfc-editor.org/rfc/rfc7159.txt>, section 6
+    disallows NaN values:
+    > Numeric values that cannot be represented in the grammar below (such
+    > as Infinity and NaN) are not permitted.
+    In case the parameter @a value is not a number, a JSON null value is
+    created instead.
+
+    @complexity Constant.
+
+    @liveexample{The example below shows the construction of several JSON
+    floating-point number values from compatible
+    types.,basic_json__CompatibleNumberFloatType}
+
+    @sa basic_json(const number_float_t)
+    */
+    template<typename CompatibleNumberFloatType, typename = typename
+             std::enable_if<
+                 std::is_constructible<number_float_t, CompatibleNumberFloatType>::value and
+                 std::is_floating_point<CompatibleNumberFloatType>::value>::type
+             >
+    basic_json(const CompatibleNumberFloatType value) noexcept
+        : basic_json(number_float_t(value))
+    {}
+
+    /*!
+    @brief create a container (array or object) from an initializer list
+
+    Creates a JSON value of type array or object from the passed initializer
+    list @a init. In case @a type_deduction is `true` (default), the type of
+    the JSON value to be created is deduced from the initializer list @a init
+    according to the following rules:
+
+    1. If the list is empty, an empty JSON object value `{}` is created.
+    2. If the list consists of pairs whose first element is a string, a JSON
+    object value is created where the first elements of the pairs are treated
+    as keys and the second elements as values.
+    3. In all other cases, an array is created.
+
+    The rules aim to create the best fit between a C++ initializer list and
+    JSON values. The rationale is as follows:
+
+    1. The empty initializer list is written as `{}` which is exactly an empty
+    JSON object.
+    2. C++ has no way of describing mapped types other than as a list of
+    pairs. As JSON requires that keys must be of type string, rule 2 is the
+    weakest constraint one can pose on initializer lists to interpret them as
+    an object.
+    3. In all other cases, the initializer list could not be interpreted as
+    JSON object type, so interpreting it as JSON array type is safe.
+
+    With the rules described above, the following JSON values cannot be
+    expressed by an initializer list:
+
+    - the empty array (`[]`): use @ref array(std::initializer_list<basic_json>)
+      with an empty initializer list in this case
+    - arrays whose elements satisfy rule 2: use @ref
+      array(std::initializer_list<basic_json>) with the same initializer list
+      in this case
+
+    @note When used without parentheses around an empty initializer list, @ref
+    basic_json() is called instead of this function, yielding the JSON null
+    value.
+
+    @param[in] init  initializer list with JSON values
+
+    @param[in] type_deduction internal parameter; when set to `true`, the type
+    of the JSON value is deduced from the initializer list @a init; when set
+    to `false`, the type provided via @a manual_type is forced. This mode is
+    used by the functions @ref array(std::initializer_list<basic_json>) and
+    @ref object(std::initializer_list<basic_json>).
+
+    @param[in] manual_type internal parameter; when @a type_deduction is set to
+    `false`, the created JSON value will use the provided type (only @ref
+    value_t::array and @ref value_t::object are valid); when @a type_deduction
+    is set to `true`, this parameter has no effect
+
+    @throw std::domain_error if @a type_deduction is `false`, @a manual_type is
+    `value_t::object`, but @a init contains an element which is not a pair
+    whose first element is a string
+
+    @complexity Linear in the size of the initializer list @a init.
+
+    @liveexample{The example below shows how JSON values are created from
+    initializer lists,basic_json__list_init_t}
+
+    @sa basic_json array(std::initializer_list<basic_json>) - create a JSON
+    array value from an initializer list
+    @sa basic_json object(std::initializer_list<basic_json>) - create a JSON
+    object value from an initializer list
+    */
+    basic_json(std::initializer_list<basic_json> init,
+               bool type_deduction = true,
+               value_t manual_type = value_t::array)
+    {
+        // the initializer list could describe an object
+        bool init_is_object = true;
+
+        // check if each element is an array with two elements whose first element
+        // is a string
+        for (const auto& element : init)
+        {
+            if (element.m_type != value_t::array or element.size() != 2
+                    or element[0].m_type != value_t::string)
+            {
+                // we found an element that makes it impossible to use the
+                // initializer list as object
+                init_is_object = false;
+                break;
+            }
+        }
+
+        // adjust type if type deduction is not wanted
+        if (not type_deduction)
+        {
+            // if array is wanted, do not create an object though possible
+            if (manual_type == value_t::array)
+            {
+                init_is_object = false;
+            }
+
+            // if object is wanted but impossible, throw an exception
+            if (manual_type == value_t::object and not init_is_object)
+            {
+                throw std::domain_error("cannot create object from initializer list");
+            }
+        }
+
+        if (init_is_object)
+        {
+            // the initializer list is a list of pairs -> create object
+            m_type = value_t::object;
+            m_value = value_t::object;
+
+            for (auto& element : init)
+            {
+                m_value.object->emplace(std::move(*(element[0].m_value.string)), std::move(element[1]));
+            }
+        }
+        else
+        {
+            // the initializer list describes an array -> create array
+            m_type = value_t::array;
+            m_value.array = create<array_t>(std::move(init));
+        }
+    }
+
+    /*!
+    @brief explicitly create an array from an initializer list
+
+    Creates a JSON array value from a given initializer list. That is, given a
+    list of values `a, b, c`, creates the JSON value `[a, b, c]`. If the
+    initializer list is empty, the empty array `[]` is created.
+
+    @note This function is only needed to express two edge cases that cannot be
+    realized with the initializer list constructor (@ref
+    basic_json(std::initializer_list<basic_json>, bool, value_t)). These cases
+    are:
+    1. creating an array whose elements are all pairs whose first element is a
+    string - in this case, the initializer list constructor would create an
+    object, taking the first elements as keys
+    2. creating an empty array - passing the empty initializer list to the
+    initializer list constructor yields an empty object
+
+    @param[in] init  initializer list with JSON values to create an array from
+    (optional)
+
+    @return JSON array value
+
+    @complexity Linear in the size of @a init.
+
+    @liveexample{The following code shows an example for the @ref array
+    function.,array}
+
+    @sa basic_json(std::initializer_list<basic_json>, bool, value_t) - create a
+    JSON value from an initializer list
+    @sa basic_json object(std::initializer_list<basic_json>) - create a JSON
+    object value from an initializer list
+    */
+    static basic_json array(std::initializer_list<basic_json> init =
+                                std::initializer_list<basic_json>())
+    {
+        // switch off type deduction and force the array type
+        const bool deduce_type = false;
+        return basic_json(init, deduce_type, value_t::array);
+    }
+
+    /*!
+    @brief explicitly create an object from an initializer list
+
+    Creates a JSON object value from a given initializer list. The initializer
+    list's elements must be pairs, and their first elements must be strings. If
+    the initializer list is empty, the empty object `{}` is created.
+
+    @note This function is only added for symmetry reasons. In contrast to the
+    related function @ref basic_json array(std::initializer_list<basic_json>),
+    there are no cases which can only be expressed by this function. That is,
+    any initializer list @a init can also be passed to the initializer list
+    constructor @ref basic_json(std::initializer_list<basic_json>, bool,
+    value_t).
+
+    @param[in] init  initializer list to create an object from (optional)
+
+    @return JSON object value
+
+    @throw std::domain_error if @a init contains an element that is not a pair
+    whose first element is a string; thrown by @ref
+    basic_json(std::initializer_list<basic_json>, bool, value_t)
+
+    @complexity Linear in the size of @a init.
+
+    @liveexample{The following code shows an example for the @ref object
+    function.,object}
+
+    @sa basic_json(std::initializer_list<basic_json>, bool, value_t) - create a
+    JSON value from an initializer list
+    @sa basic_json array(std::initializer_list<basic_json>) - create a JSON
+    array value from an initializer list
+    */
+    static basic_json object(std::initializer_list<basic_json> init =
+                                 std::initializer_list<basic_json>())
+    {
+        // switch off type deduction and force the object type; the called
+        // constructor throws std::domain_error if init cannot form an object
+        const bool deduce_type = false;
+        return basic_json(init, deduce_type, value_t::object);
+    }
+
+    /*!
+    @brief construct an array with count copies of given value
+
+    Constructs a JSON array value by creating @a cnt copies of a passed
+    value. In case @a cnt is `0`, an empty array is created. As postcondition,
+    `std::distance(begin(),end()) == cnt` holds.
+
+    @param[in] cnt  the number of JSON copies of @a value to create
+    @param[in] value  the JSON value to copy
+
+    @complexity Linear in @a cnt.
+
+    @liveexample{The following code shows examples for the @ref
+    basic_json(size_type\, const basic_json&)
+    constructor.,basic_json__size_type_basic_json}
+    */
+    basic_json(size_type cnt, const basic_json& value)
+        : m_type(value_t::array)
+    {
+        // forward both arguments to create<array_t>, i.e. request an array
+        // built from (cnt, value), and store the result as the array payload
+        m_value.array = create<array_t>(cnt, value);
+    }
+
+    /*!
+    @brief construct a JSON container given an iterator range
+
+    Constructs the JSON value with the contents of the range `[first, last)`.
+    The semantics depends on the different types a JSON value can have:
+    - In case of primitive types (number, boolean, or string), @a first must
+      be `begin()` and @a last must be `end()`. In this case, the value is
+      copied. Otherwise, std::out_of_range is thrown.
+    - In case of structured types (array, object), the constructor behaves
+      as similar versions for `std::vector`.
+    - In case of a null type, std::domain_error is thrown.
+
+    @tparam InputIT an input iterator type (@ref iterator or @ref
+    const_iterator)
+
+    @param[in] first begin of the range to copy from (included)
+    @param[in] last end of the range to copy from (excluded)
+
+    @throw std::domain_error if iterators are not compatible; that is, do not
+    belong to the same JSON value
+    @throw std::out_of_range if iterators are for a primitive type (number,
+    boolean, or string) where an out of range error can be detected easily
+    @throw std::bad_alloc if allocation for object, array, or string fails
+    @throw std::domain_error if called with a null value
+
+    @complexity Linear in distance between @a first and @a last.
+
+    @liveexample{The example below shows several ways to create JSON values by
+    specifying a subrange with iterators.,basic_json__InputIt_InputIt}
+    */
+    template <class InputIT, typename
+              std::enable_if<
+                  std::is_same<InputIT, typename basic_json_t::iterator>::value or
+                  std::is_same<InputIT, typename basic_json_t::const_iterator>::value
+                  , int>::type
+              = 0>
+    basic_json(InputIT first, InputIT last) : m_type(first.m_object->m_type)
+    {
+        // make sure iterator fits the current value
+        if (first.m_object != last.m_object)
+        {
+            throw std::domain_error("iterators are not compatible");
+        }
+
+        // check if iterator range is complete for primitive values
+        switch (m_type)
+        {
+            case value_t::number_integer:
+            case value_t::number_float:
+            case value_t::boolean:
+            case value_t::string:
+            {
+                if (not first.m_it.primitive_iterator.is_begin() or not last.m_it.primitive_iterator.is_end())
+                {
+                    throw std::out_of_range("iterators out of range");
+                }
+                break;
+            }
+
+            default:
+            {
+                break;
+            }
+        }
+
+        switch (m_type)
+        {
+            case value_t::number_integer:
+            {
+                m_value.number_integer = first.m_object->m_value.number_integer;
+                break;
+            }
+
+            case value_t::number_float:
+            {
+                m_value.number_float = first.m_object->m_value.number_float;
+                break;
+            }
+
+            case value_t::boolean:
+            {
+                m_value.boolean = first.m_object->m_value.boolean;
+                break;
+            }
+
+            case value_t::string:
+            {
+                m_value = *first.m_object->m_value.string;
+                break;
+            }
+
+            case value_t::object:
+            {
+                m_value.object = create<object_t>(first.m_it.object_iterator, last.m_it.object_iterator);
+                break;
+            }
+
+            case value_t::array:
+            {
+                m_value.array = create<array_t>(first.m_it.array_iterator, last.m_it.array_iterator);
+                break;
+            }
+
+            default:
+            {
+                throw std::domain_error("cannot use construct with iterators from " + first.m_object->type_name());
+            }
+        }
+    }
+
+    ///////////////////////////////////////
+    // other constructors and destructor //
+    ///////////////////////////////////////
+
+    /*!
+    @brief copy constructor
+
+    Creates a copy of a given JSON value.
+
+    @param[in] other  the JSON value to copy
+
+    @complexity Linear in the size of @a other.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is linear.
+    - As postcondition, it holds: `other == basic_json(other)`.
+
+    @throw std::bad_alloc if allocation for object, array, or string fails.
+
+    @liveexample{The following code shows an example for the copy
+    constructor.,basic_json__basic_json}
+    */
+    basic_json(const basic_json& other)
+        : m_type(other.m_type)
+    {
+        switch (m_type)
+        {
+            case (value_t::null):
+            case (value_t::discarded):
+            {
+                break;
+            }
+
+            case (value_t::object):
+            {
+                m_value = *other.m_value.object;
+                break;
+            }
+
+            case (value_t::array):
+            {
+                m_value = *other.m_value.array;
+                break;
+            }
+
+            case (value_t::string):
+            {
+                m_value = *other.m_value.string;
+                break;
+            }
+
+            case (value_t::boolean):
+            {
+                m_value = other.m_value.boolean;
+                break;
+            }
+
+            case (value_t::number_integer):
+            {
+                m_value = other.m_value.number_integer;
+                break;
+            }
+
+            case (value_t::number_float):
+            {
+                m_value = other.m_value.number_float;
+                break;
+            }
+        }
+    }
+
+    /*!
+    @brief move constructor
+
+    Move constructor. Constructs a JSON value with the contents of the given
+    value @a other using move semantics. It "steals" the resources from @a
+    other and leaves it as JSON null value.
+
+    @param[in,out] other  value to move to this object
+
+    @post @a other is a JSON null value
+
+    @complexity Constant.
+
+    @liveexample{The code below shows the move constructor explicitly called
+    via std::move.,basic_json__moveconstructor}
+    */
+    basic_json(basic_json&& other) noexcept
+        : m_type(other.m_type),
+          m_value(std::move(other.m_value))
+    {
+        // the payload now belongs to this object: reset other to JSON null
+        other.m_value = {};
+        other.m_type = value_t::null;
+    }
+
+    /*!
+    @brief copy assignment
+
+    Copy assignment operator. Copies a JSON value via the "copy and swap"
+    strategy: It is expressed in terms of the copy constructor, destructor, and
+    the swap() member function.
+
+    @param[in] other  value to copy from
+
+    @complexity Linear.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is linear.
+
+    @liveexample{The code below shows an example for the copy assignment. It
+    creates a copy of value `a` which is then swapped with `b`. Finally\, the
+    copy of `a` (which is the null value after the swap) is
+    destroyed.,basic_json__copyassignment}
+    */
+    reference& operator=(basic_json other) noexcept (
+        std::is_nothrow_move_constructible<value_t>::value and
+        std::is_nothrow_move_assignable<value_t>::value and
+        std::is_nothrow_move_constructible<json_value>::value and
+        std::is_nothrow_move_assignable<json_value>::value
+    )
+    {
+        using std::swap;
+        swap(m_type, other.m_type);
+        swap(m_value, other.m_value);
+        return *this;
+    }
+
+    /*!
+    @brief destructor
+
+    Destroys the JSON value and frees all allocated memory.
+
+    @complexity Linear.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is linear.
+    - All stored elements are destroyed and all memory is freed.
+    */
+    ~basic_json()
+    {
+        switch (m_type)
+        {
+            case (value_t::object):
+            {
+                AllocatorType<object_t> alloc;
+                alloc.destroy(m_value.object);
+                alloc.deallocate(m_value.object, 1);
+                break;
+            }
+
+            case (value_t::array):
+            {
+                AllocatorType<array_t> alloc;
+                alloc.destroy(m_value.array);
+                alloc.deallocate(m_value.array, 1);
+                break;
+            }
+
+            case (value_t::string):
+            {
+                AllocatorType<string_t> alloc;
+                alloc.destroy(m_value.string);
+                alloc.deallocate(m_value.string, 1);
+                break;
+            }
+
+            default:
+            {
+                // all other types need no specific destructor
+                break;
+            }
+        }
+    }
+
+
+  public:
+    ///////////////////////
+    // object inspection //
+    ///////////////////////
+
+    /// @name object inspection
+    /// @{
+
+    /*!
+    @brief serialization
+
+    Serialization function for JSON values. The function tries to mimic
+    Python's @p json.dumps() function, and currently supports its @p indent
+    parameter.
+
+    @param[in] indent if indent is nonnegative, then array elements and object
+    members will be pretty-printed with that indent level. An indent level of 0
+    will only insert newlines. -1 (the default) selects the most compact
+    representation
+
+    @return string containing the serialization of the JSON value
+
+    @complexity Linear.
+
+    @liveexample{The following example shows the effect of different @a indent
+    parameters on the result of the serialization.,dump}
+
+    @see https://docs.python.org/2/library/json.html#json.dump
+    */
+    string_t dump(const int indent = -1) const
+    {
+        std::ostringstream ss;
+
+        if (indent >= 0)
+        {
+            dump(ss, true, static_cast<unsigned int>(indent));
+        }
+        else
+        {
+            dump(ss, false, 0);
+        }
+
+        return ss.str();
+    }
+
+    /*!
+    @brief return the type of the JSON value (explicit)
+
+    Return the type of the JSON value as a value from the @ref value_t
+    enumeration.
+
+    @return the type of the JSON value
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref type() for all JSON
+    types.,type}
+    */
+    value_t type() const noexcept
+    {
+        // the type tag stored in the value is the answer
+        const value_t current_type = m_type;
+        return current_type;
+    }
+
+    /*!
+    @brief return whether type is primitive
+
+    This function returns true iff the JSON type is primitive (string, number,
+    boolean, or null).
+
+    @return `true` if type is primitive (string, number, boolean, or null),
+    `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_primitive for all JSON
+    types.,is_primitive}
+    */
+    bool is_primitive() const noexcept
+    {
+        // null and string first, then boolean and the numeric types
+        if (is_null() or is_string())
+        {
+            return true;
+        }
+
+        return is_boolean() or is_number();
+    }
+
+    /*!
+    @brief return whether type is structured
+
+    This function returns true iff the JSON type is structured (array or
+    object).
+
+    @return `true` if type is structured (array or object), `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_structured for all JSON
+    types.,is_structured}
+    */
+    bool is_structured() const noexcept
+    {
+        // structured means array or object
+        if (is_array())
+        {
+            return true;
+        }
+
+        return is_object();
+    }
+
+    /*!
+    @brief return whether value is null
+
+    This function returns true iff the JSON value is null.
+
+    @return `true` if type is null, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_null for all JSON
+    types.,is_null}
+    */
+    bool is_null() const noexcept
+    {
+        // compare the stored type tag against the null tag
+        const bool holds_null = (m_type == value_t::null);
+        return holds_null;
+    }
+
+    /*!
+    @brief return whether value is a boolean
+
+    This function returns true iff the JSON value is a boolean.
+
+    @return `true` if type is boolean, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_boolean for all JSON
+    types.,is_boolean}
+    */
+    bool is_boolean() const noexcept
+    {
+        // compare the stored type tag against the boolean tag
+        const bool holds_boolean = (m_type == value_t::boolean);
+        return holds_boolean;
+    }
+
+    /*!
+    @brief return whether value is a number
+
+    This function returns true iff the JSON value is a number. This includes
+    both integer and floating-point values.
+
+    @return `true` if type is number, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_number for all JSON
+    types.,is_number}
+    */
+    bool is_number() const noexcept
+    {
+        // a number is either an integer or a floating-point value
+        if (is_number_integer())
+        {
+            return true;
+        }
+
+        return is_number_float();
+    }
+
+    /*!
+    @brief return whether value is an integer number
+
+    This function returns true iff the JSON value is an integer number. This
+    excludes floating-point values.
+
+    @return `true` if type is an integer number, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_number_integer for all
+    JSON types.,is_number_integer}
+    */
+    bool is_number_integer() const noexcept
+    {
+        // compare the stored type tag against the integer number tag
+        const bool holds_integer = (m_type == value_t::number_integer);
+        return holds_integer;
+    }
+
+    /*!
+    @brief return whether value is a floating-point number
+
+    This function returns true iff the JSON value is a floating-point number.
+    This excludes integer values.
+
+    @return `true` if type is a floating-point number, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_number_float for all
+    JSON types.,is_number_float}
+    */
+    bool is_number_float() const noexcept
+    {
+        // compare the stored type tag against the floating-point number tag
+        const bool holds_float = (m_type == value_t::number_float);
+        return holds_float;
+    }
+
+    /*!
+    @brief return whether value is an object
+
+    This function returns true iff the JSON value is an object.
+
+    @return `true` if type is object, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_object for all JSON
+    types.,is_object}
+    */
+    bool is_object() const noexcept
+    {
+        // compare the stored type tag against the object tag
+        const bool holds_object = (m_type == value_t::object);
+        return holds_object;
+    }
+
+    /*!
+    @brief return whether value is an array
+
+    This function returns true iff the JSON value is an array.
+
+    @return `true` if type is array, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_array for all JSON
+    types.,is_array}
+    */
+    bool is_array() const noexcept
+    {
+        // compare the stored type tag against the array tag
+        const bool holds_array = (m_type == value_t::array);
+        return holds_array;
+    }
+
+    /*!
+    @brief return whether value is a string
+
+    This function returns true iff the JSON value is a string.
+
+    @return `true` if type is string, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_string for all JSON
+    types.,is_string}
+    */
+    bool is_string() const noexcept
+    {
+        // compare the stored type tag against the string tag
+        const bool holds_string = (m_type == value_t::string);
+        return holds_string;
+    }
+
+    /*!
+    @brief return whether value is discarded
+
+    This function returns true iff the JSON value was discarded during parsing
+    with a callback function (see @ref parser_callback_t).
+
+    @note This function will always return `false` for JSON values after parsing.
+    That is, discarded values can only occur during parsing, but will be
+    removed when inside a structured value or replaced by null in other cases.
+
+    @return `true` if type is discarded, `false` otherwise.
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies @ref is_discarded for all JSON
+    types.,is_discarded}
+    */
+    bool is_discarded() const noexcept
+    {
+        // compare the stored type tag against the discarded tag
+        const bool was_discarded = (m_type == value_t::discarded);
+        return was_discarded;
+    }
+
+    /*!
+    @brief return the type of the JSON value (implicit)
+
+    Implicitly return the type of the JSON value as a value from the @ref
+    value_t enumeration.
+
+    @return the type of the JSON value
+
+    @complexity Constant.
+
+    @liveexample{The following code exemplifies the value_t operator for all
+    JSON types.,operator__value_t}
+    */
+    operator value_t() const noexcept
+    {
+        // the implicit conversion yields the same result as type()
+        return type();
+    }
+
+    /// @}
+
+  private:
+    //////////////////
+    // value access //
+    //////////////////
+
+    /// get an object (explicit)
+    template <class T, typename
+              std::enable_if<
+                  std::is_convertible<typename object_t::key_type, typename T::key_type>::value and
+                  std::is_convertible<basic_json_t, typename T::mapped_type>::value
+                  , int>::type = 0>
+    T get_impl(T*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::object):
+            {
+                return T(m_value.object->begin(), m_value.object->end());
+            }
+            default:
+            {
+                throw std::domain_error("type must be object, but is " + type_name());
+            }
+        }
+    }
+
+    /// get an object (explicit)
+    object_t get_impl(object_t*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::object):
+            {
+                return *(m_value.object);
+            }
+            default:
+            {
+                throw std::domain_error("type must be object, but is " + type_name());
+            }
+        }
+    }
+
+    /// get an array (explicit)
+    template <class T, typename
+              std::enable_if<
+                  std::is_convertible<basic_json_t, typename T::value_type>::value and
+                  not std::is_same<basic_json_t, typename T::value_type>::value and
+                  not std::is_arithmetic<T>::value and
+                  not std::is_convertible<std::string, T>::value and
+                  not has_mapped_type<T>::value
+                  , int>::type = 0>
+    T get_impl(T*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::array):
+            {
+                T to_vector;
+                std::transform(m_value.array->begin(), m_value.array->end(),
+                               std::inserter(to_vector, to_vector.end()), [](basic_json i)
+                {
+                    return i.get<typename T::value_type>();
+                });
+                return to_vector;
+            }
+            default:
+            {
+                throw std::domain_error("type must be array, but is " + type_name());
+            }
+        }
+    }
+
+    /// get an array (explicit)
+    template <class T, typename
+              std::enable_if<
+                  std::is_convertible<basic_json_t, T>::value and
+                  not std::is_same<basic_json_t, T>::value
+                  , int>::type = 0>
+    std::vector<T> get_impl(std::vector<T>*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::array):
+            {
+                std::vector<T> to_vector;
+                to_vector.reserve(m_value.array->size());
+                std::transform(m_value.array->begin(), m_value.array->end(),
+                               std::inserter(to_vector, to_vector.end()), [](basic_json i)
+                {
+                    return i.get<T>();
+                });
+                return to_vector;
+            }
+            default:
+            {
+                throw std::domain_error("type must be array, but is " + type_name());
+            }
+        }
+    }
+
+    /// get an array (explicit)
+    template <class T, typename
+              std::enable_if<
+                  std::is_same<basic_json, typename T::value_type>::value and
+                  not has_mapped_type<T>::value
+                  , int>::type = 0>
+    T get_impl(T*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::array):
+            {
+                return T(m_value.array->begin(), m_value.array->end());
+            }
+            default:
+            {
+                throw std::domain_error("type must be array, but is " + type_name());
+            }
+        }
+    }
+
+    /// get an array (explicit)
+    array_t get_impl(array_t*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::array):
+            {
+                return *(m_value.array);
+            }
+            default:
+            {
+                throw std::domain_error("type must be array, but is " + type_name());
+            }
+        }
+    }
+
+    /// get a string (explicit)
+    template <typename T, typename
+              std::enable_if<
+                  std::is_convertible<string_t, T>::value
+                  , int>::type = 0>
+    T get_impl(T*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::string):
+            {
+                return *m_value.string;
+            }
+            default:
+            {
+                throw std::domain_error("type must be string, but is " + type_name());
+            }
+        }
+    }
+
+    /// get a number (explicit)
+    template<typename T, typename
+             std::enable_if<
+                 std::is_arithmetic<T>::value
+                 , int>::type = 0>
+    T get_impl(T*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::number_integer):
+            {
+                return static_cast<T>(m_value.number_integer);
+            }
+            case (value_t::number_float):
+            {
+                return static_cast<T>(m_value.number_float);
+            }
+            default:
+            {
+                throw std::domain_error("type must be number, but is " + type_name());
+            }
+        }
+    }
+
+    /// get a boolean (explicit)
+    boolean_t get_impl(boolean_t*) const
+    {
+        switch (m_type)
+        {
+            case (value_t::boolean):
+            {
+                return m_value.boolean;
+            }
+            default:
+            {
+                throw std::domain_error("type must be boolean, but is " + type_name());
+            }
+        }
+    }
+
+    /// get a pointer to the value (object)
+    object_t* get_impl_ptr(object_t*) noexcept
+    {
+        return is_object() ? m_value.object : nullptr;
+    }
+
+    /// get a pointer to the value (object)
+    const object_t* get_impl_ptr(const object_t*) const noexcept
+    {
+        return is_object() ? m_value.object : nullptr;
+    }
+
+    /// get a pointer to the value (array)
+    array_t* get_impl_ptr(array_t*) noexcept
+    {
+        return is_array() ? m_value.array : nullptr;
+    }
+
+    /// get a pointer to the value (array)
+    const array_t* get_impl_ptr(const array_t*) const noexcept
+    {
+        return is_array() ? m_value.array : nullptr;
+    }
+
+    /// get a pointer to the value (string)
+    string_t* get_impl_ptr(string_t*) noexcept
+    {
+        return is_string() ? m_value.string : nullptr;
+    }
+
+    /// get a pointer to the value (string)
+    const string_t* get_impl_ptr(const string_t*) const noexcept
+    {
+        return is_string() ? m_value.string : nullptr;
+    }
+
+    /// get a pointer to the value (boolean)
+    boolean_t* get_impl_ptr(boolean_t*) noexcept
+    {
+        return is_boolean() ? &m_value.boolean : nullptr;
+    }
+
+    /// get a pointer to the value (boolean)
+    const boolean_t* get_impl_ptr(const boolean_t*) const noexcept
+    {
+        return is_boolean() ? &m_value.boolean : nullptr;
+    }
+
+    /// get a pointer to the value (integer number)
+    number_integer_t* get_impl_ptr(number_integer_t*) noexcept
+    {
+        return is_number_integer() ? &m_value.number_integer : nullptr;
+    }
+
+    /// get a pointer to the value (integer number)
+    const number_integer_t* get_impl_ptr(const number_integer_t*) const noexcept
+    {
+        return is_number_integer() ? &m_value.number_integer : nullptr;
+    }
+
+    /// get a pointer to the value (floating-point number)
+    number_float_t* get_impl_ptr(number_float_t*) noexcept
+    {
+        return is_number_float() ? &m_value.number_float : nullptr;
+    }
+
+    /// get a pointer to the value (floating-point number)
+    const number_float_t* get_impl_ptr(const number_float_t*) const noexcept
+    {
+        return is_number_float() ? &m_value.number_float : nullptr;
+    }
+
+  public:
+
+    /// @name value access
+    /// @{
+
+    /*!
+    @brief get a value (explicit)
+
+    Explicit type conversion between the JSON value and a compatible value.
+
+    @tparam ValueType non-pointer type compatible to the JSON value, for
+    instance `int` for JSON integer numbers, `bool` for JSON booleans, or
+    `std::vector` types for JSON arrays
+
+    @return copy of the JSON value, converted to type @a ValueType
+
+    @throw std::domain_error in case passed type @a ValueType is incompatible
+    to JSON
+
+    @complexity Linear in the size of the JSON value.
+
+    @liveexample{The example below shows several conversions from JSON values
+    to other types. There are a few things to note: (1) Floating-point numbers can
+    be converted to integers\, (2) A JSON array can be converted to a standard
+    `std::vector<short>`\, (3) A JSON object can be converted to C++
+    associative containers such as `std::unordered_map<std::string\,
+    json>`.,get__ValueType_const}
+
+    @internal
+    The idea of using a casted null pointer to choose the correct
+    implementation is from <http://stackoverflow.com/a/8315197/266378>.
+    @endinternal
+
+    @sa @ref operator ValueType() const for implicit conversion
+    @sa @ref get() for pointer-member access
+    */
+    template<typename ValueType, typename
+             std::enable_if<
+                 not std::is_pointer<ValueType>::value
+                 , int>::type = 0>
+    ValueType get() const
+    {
+        // a typed null pointer is passed purely as an overload selector;
+        // it picks the get_impl() matching ValueType and is never dereferenced
+        ValueType* const selector = nullptr;
+        return get_impl(selector);
+    }
+
+    /*!
+    @brief get a pointer value (explicit)
+
+    Explicit pointer access to the internally stored JSON value. No copies are
+    made.
+
+    @warning Writing data to the pointee of the result yields an undefined
+    state.
+
+    @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref
+    object_t, @ref string_t, @ref boolean_t, @ref number_integer_t, or @ref
+    number_float_t.
+
+    @return pointer to the internally stored JSON value if the requested pointer
+    type @a PointerType fits to the JSON value; `nullptr` otherwise
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how pointers to internal values of a
+    JSON value can be requested. Note that no type conversions are made and a
+    `nullptr` is returned if the value and the requested pointer type do not
+    match.,get__PointerType}
+
+    @sa @ref get_ptr() for explicit pointer-member access
+    */
+    template<typename PointerType, typename
+             std::enable_if<
+                 std::is_pointer<PointerType>::value
+                 , int>::type = 0>
+    PointerType get() noexcept
+    {
+        // pointer access is implemented once, in get_ptr()
+        return this->template get_ptr<PointerType>();
+    }
+
+    /*!
+    @brief get a pointer value (explicit)
+    @copydoc get()
+    */
+    template<typename PointerType, typename
+             std::enable_if<
+                 std::is_pointer<PointerType>::value
+                 , int>::type = 0>
+    const PointerType get() const noexcept
+    {
+        // pointer access is implemented once, in get_ptr() const
+        return this->template get_ptr<PointerType>();
+    }
+
+    /*!
+    @brief get a pointer value (implicit)
+
+    Implicit pointer access to the internally stored JSON value. No copies are
+    made.
+
+    @warning Writing data to the pointee of the result yields an undefined
+    state.
+
+    @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref
+    object_t, @ref string_t, @ref boolean_t, @ref number_integer_t, or @ref
+    number_float_t.
+
+    @return pointer to the internally stored JSON value if the requested pointer
+    type @a PointerType fits to the JSON value; `nullptr` otherwise
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how pointers to internal values of a
+    JSON value can be requested. Note that no type conversions are made and a
+    `nullptr` is returned if the value and the requested pointer type do not
+    match.,get_ptr}
+    */
+    template<typename PointerType, typename
+             std::enable_if<
+                 std::is_pointer<PointerType>::value
+                 , int>::type = 0>
+    PointerType get_ptr() noexcept
+    {
+        // a typed null pointer selects the matching get_impl_ptr() overload;
+        // it is only an overload tag and is never dereferenced
+        const PointerType selector = nullptr;
+        return get_impl_ptr(selector);
+    }
+
+    /*!
+    @brief get a pointer value (implicit)
+    @copydoc get_ptr()
+    */
+    template<typename PointerType, typename
+             std::enable_if<
+                 std::is_pointer<PointerType>::value
+                 and std::is_const<PointerType>::value
+                 , int>::type = 0>
+    const PointerType get_ptr() const noexcept
+    {
+        // delegate the call to get_impl_ptr<>() const
+        return get_impl_ptr(static_cast<const PointerType>(nullptr));
+    }
+
+    /*!
+    @brief get a value (implicit)
+
+    Implicit type conversion between the JSON value and a compatible value. The
+    call is realized by calling @ref get() const.
+
+    @tparam ValueType non-pointer type compatible to the JSON value, for
+    instance `int` for JSON integer numbers, `bool` for JSON booleans, or
+    `std::vector` types for JSON arrays
+
+    @return copy of the JSON value, converted to type @a ValueType
+
+    @throw std::domain_error in case passed type @a ValueType is incompatible
+    to JSON, thrown by @ref get() const
+
+    @complexity Linear in the size of the JSON value.
+
+    @liveexample{The example below shows several conversions from JSON values
+    to other types. There are a few things to note: (1) Floating-point numbers can
+    be converted to integers\, (2) A JSON array can be converted to a standard
+    `std::vector<short>`\, (3) A JSON object can be converted to C++
+    associative containers such as `std::unordered_map<std::string\,
+    json>`.,operator__ValueType}
+    */
+    template<typename ValueType, typename
+             std::enable_if<
+                 not std::is_pointer<ValueType>::value
+                 , int>::type = 0>
+    operator ValueType() const
+    {
+        // the implicit conversion simply forwards to the explicit get<>() const
+        return this->template get<ValueType>();
+    }
+
+    /// @}
+
+
+    ////////////////////
+    // element access //
+    ////////////////////
+
+    /// @name element access
+    /// @{
+
+    /*!
+    @brief access specified array element with bounds checking
+
+    Returns a reference to the element at specified location @a idx, with
+    bounds checking.
+
+    @param[in] idx  index of the element to access
+
+    @return reference to the element at index @a idx
+
+    @throw std::domain_error if JSON is not an array
+    @throw std::out_of_range if the index @a idx is out of range of the array;
+    that is, `idx >= size()`
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how array elements can be read and
+    written using at.,at__size_type}
+    */
+    reference at(size_type idx)
+    {
+        // for arrays, forward to the underlying container's own at()
+        if (m_type == value_t::array)
+        {
+            return m_value.array->at(idx);
+        }
+
+        // at only works for arrays
+        throw std::domain_error("cannot use at() with " + type_name());
+    }
+
+    /*!
+    @brief access specified array element with bounds checking
+
+    Returns a const reference to the element at specified location @a idx, with
+    bounds checking.
+
+    @param[in] idx  index of the element to access
+
+    @return const reference to the element at index @a idx
+
+    @throw std::domain_error if JSON is not an array
+    @throw std::out_of_range if the index @a idx is out of range of the array;
+    that is, `idx >= size()`
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how array elements can be read using
+    at.,at__size_type_const}
+    */
+    const_reference at(size_type idx) const
+    {
+        // for arrays, forward to the underlying container's own at()
+        if (m_type == value_t::array)
+        {
+            return m_value.array->at(idx);
+        }
+
+        // at only works for arrays
+        throw std::domain_error("cannot use at() with " + type_name());
+    }
+
+    /*!
+    @brief access specified object element with bounds checking
+
+    Returns a reference to the element with specified key @a key, with
+    bounds checking.
+
+    @param[in] key  key of the element to access
+
+    @return reference to the element at key @a key
+
+    @throw std::domain_error if JSON is not an object
+    @throw std::out_of_range if the key @a key is not stored in the object;
+    that is, `find(key) == end()`
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be read and
+    written using at.,at__object_t_key_type}
+    */
+    reference at(const typename object_t::key_type& key)
+    {
+        // for objects, forward to the underlying container's own at()
+        if (m_type == value_t::object)
+        {
+            return m_value.object->at(key);
+        }
+
+        // at only works for objects
+        throw std::domain_error("cannot use at() with " + type_name());
+    }
+
+    /*!
+    @brief access specified object element with bounds checking
+
+    Returns a const reference to the element with specified key @a key, with
+    bounds checking.
+
+    @param[in] key  key of the element to access
+
+    @return const reference to the element at key @a key
+
+    @throw std::domain_error if JSON is not an object
+    @throw std::out_of_range if the key @a key is not stored in the object;
+    that is, `find(key) == end()`
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be read using
+    at.,at__object_t_key_type_const}
+    */
+    const_reference at(const typename object_t::key_type& key) const
+    {
+        // for objects, forward to the underlying container's own at()
+        if (m_type == value_t::object)
+        {
+            return m_value.object->at(key);
+        }
+
+        // at only works for objects
+        throw std::domain_error("cannot use at() with " + type_name());
+    }
+
+    /*!
+    @brief access specified array element
+
+    Returns a reference to the element at specified location @a idx.
+
+    @note If @a idx is beyond the range of the array (i.e., `idx >= size()`),
+    then the array is silently filled up with `null` values to make `idx` a
+    valid reference to the last stored element.
+
+    @param[in] idx  index of the element to access
+
+    @return reference to the element at index @a idx
+
+    @throw std::domain_error if JSON is not an array or null
+
+    @complexity Constant if @a idx is in the range of the array. Otherwise
+    linear in `idx - size()`.
+
+    @liveexample{The example below shows how array elements can be read and
+    written using [] operator. Note the addition of `null`
+    values.,operatorarray__size_type}
+    */
+    reference operator[](size_type idx)
+    {
+        // implicitly convert null to object
+        if (m_type == value_t::null)
+        {
+            m_type = value_t::array;
+            m_value.array = create<array_t>();
+        }
+
+        // [] only works for arrays
+        if (m_type != value_t::array)
+        {
+            throw std::domain_error("cannot use operator[] with " + type_name());
+        }
+
+        for (size_t i = m_value.array->size(); i <= idx; ++i)
+        {
+            m_value.array->push_back(basic_json());
+        }
+
+        return m_value.array->operator[](idx);
+    }
+
+    /*!
+    @brief access specified array element
+
+    Returns a const reference to the element at specified location @a idx.
+
+    @param[in] idx  index of the element to access
+
+    @return const reference to the element at index @a idx
+
+    @throw std::domain_error if JSON is not an array
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how array elements can be read using
+    the [] operator.,operatorarray__size_type_const}
+    */
+    const_reference operator[](size_type idx) const
+    {
+        // unchecked element access, forwarded to the underlying array
+        if (m_type == value_t::array)
+        {
+            return m_value.array->operator[](idx);
+        }
+
+        // [] with an index only works for arrays
+        throw std::domain_error("cannot use operator[] with " + type_name());
+    }
+
+    /*!
+    @brief access specified object element
+
+    Returns a reference to the element with specified key @a key.
+
+    @note If @a key is not found in the object, then it is silently added to
+    the object and filled with a `null` value to make `key` a valid reference.
+    In case the value was `null` before, it is converted to an object.
+
+    @param[in] key  key of the element to access
+
+    @return reference to the element at key @a key
+
+    @throw std::domain_error if JSON is not an object or null
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be read and
+    written using the [] operator.,operatorarray__key_type}
+    */
+    reference operator[](const typename object_t::key_type& key)
+    {
+        // implicitly convert null to object
+        if (m_type == value_t::null)
+        {
+            m_type = value_t::object;
+            m_value.object = create<object_t>();
+        }
+
+        // [] only works for objects
+        if (m_type != value_t::object)
+        {
+            throw std::domain_error("cannot use operator[] with " + type_name());
+        }
+
+        return m_value.object->operator[](key);
+    }
+
+    /*!
+    @brief access specified object element
+
+    Returns a const reference to the element with specified key @a key.
+
+    @param[in] key  key of the element to access
+
+    @return const reference to the element at key @a key
+
+    @throw std::domain_error if JSON is not an object
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be read using
+    the [] operator.,operatorarray__key_type_const}
+    */
+    const_reference operator[](const typename object_t::key_type& key) const
+    {
+        // [] with a key only works for objects; unlike the non-const
+        // overload, a null value is NOT converted and is rejected as well
+        if (m_type != value_t::object)
+        {
+            throw std::domain_error("cannot use operator[] with " + type_name());
+        }
+
+        // NOTE(review): m_value.object is a pointer to a non-const object_t,
+        // so this presumably resolves to the container's non-const
+        // operator[]; with a std::map-like object_t that inserts a null entry
+        // for a missing key even though this member is const. Confirm whether
+        // that is intended; find()/at() would avoid the hidden mutation.
+        return m_value.object->operator[](key);
+    }
+
+    /*!
+    @brief access specified object element
+
+    Returns a reference to the element with specified key @a key.
+
+    @note If @a key is not found in the object, then it is silently added to
+    the object and filled with a `null` value to make `key` a valid reference.
+    In case the value was `null` before, it is converted to an object.
+
+    @note This function is required for compatibility reasons with Clang.
+
+    @param[in] key  key of the element to access
+
+    @return reference to the element at key @a key
+
+    @throw std::domain_error if JSON is not an object or null
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be read and
+    written using the [] operator.,operatorarray__key_type}
+    */
+    template<typename T, std::size_t n>
+    reference operator[](const T (&key)[n])
+    {
+        // implicitly convert null to object
+        if (m_type == value_t::null)
+        {
+            m_type = value_t::object;
+            m_value = value_t::object;
+        }
+
+        // at only works for objects
+        if (m_type != value_t::object)
+        {
+            throw std::domain_error("cannot use operator[] with " + type_name());
+        }
+
+        return m_value.object->operator[](key);
+    }
+
+    /*!
+    @brief access specified object element
+
+    Returns a const reference to the element with specified key @a key.
+
+    @note This function is required for compatibility reasons with Clang.
+
+    @param[in] key  key of the element to access
+
+    @return const reference to the element at key @a key
+
+    @throw std::domain_error if JSON is not an object
+
+    @complexity Logarithmic in the size of the container.
+
+    @liveexample{The example below shows how object elements can be read using
+    the [] operator.,operatorarray__key_type_const}
+    */
+    template<typename T, std::size_t n>
+    const_reference operator[](const T (&key)[n]) const
+    {
+        // [] with a key only works for objects; unlike the non-const
+        // overload, a null value is NOT converted and is rejected as well
+        if (m_type != value_t::object)
+        {
+            throw std::domain_error("cannot use operator[] with " + type_name());
+        }
+
+        // NOTE(review): m_value.object is a pointer to a non-const object_t,
+        // so this presumably resolves to the container's non-const
+        // operator[]; with a std::map-like object_t that inserts a null entry
+        // for a missing key even though this member is const. Confirm whether
+        // that is intended; find()/at() would avoid the hidden mutation.
+        return m_value.object->operator[](key);
+    }
+
+    /*!
+    @brief access the first element
+
+    Returns a reference to the first element in the container. For a JSON
+    container `c`, the expression `c.front()` is equivalent to `*c.begin()`.
+
+    @return In case of a structured type (array or object), a reference to the
+    first element is returned. In case of number, string, or boolean values, a
+    reference to the value is returned.
+
+    @complexity Constant.
+
+    @note Calling `front` on an empty container is undefined.
+
+    @throw std::out_of_range when called on null value
+
+    @liveexample{The following code shows an example for @ref front.,front}
+    */
+    reference front()
+    {
+        // the first element is whatever the begin iterator designates
+        auto first = begin();
+        return *first;
+    }
+
+    /*!
+    @copydoc basic_json::front()
+    */
+    const_reference front() const
+    {
+        // the first element is whatever the const begin iterator designates
+        auto first = cbegin();
+        return *first;
+    }
+
+    /*!
+    @brief access the last element
+
+    Returns a reference to the last element in the container. For a JSON
+    container `c`, the expression `c.back()` is equivalent to `{ auto tmp =
+    c.end(); --tmp; return *tmp; }`.
+
+    @return In case of a structured type (array or object), a reference to the
+    last element is returned. In case of number, string, or boolean values, a
+    reference to the value is returned.
+
+    @complexity Constant.
+
+    @note Calling `back` on an empty container is undefined.
+
+    @throw std::out_of_range when called on null value.
+
+    @liveexample{The following code shows an example for @ref back.,back}
+    */
+    reference back()
+    {
+        // step back once from the past-the-end position to reach the last element
+        auto last = end();
+        --last;
+        return *last;
+    }
+
+    /*!
+    @copydoc basic_json::back()
+    */
+    const_reference back() const
+    {
+        // step back once from the past-the-end position to reach the last element
+        auto last = cend();
+        --last;
+        return *last;
+    }
+
+    /*!
+    @brief remove element given an iterator
+
+    Removes the element specified by iterator @a pos. Invalidates iterators and
+    references at or after the point of the erase, including the end()
+    iterator. The iterator @a pos must be valid and dereferenceable. Thus the
+    end() iterator (which is valid, but is not dereferenceable) cannot be used
+    as a value for @a pos.
+
+    If called on a primitive type other than null, the resulting JSON value
+    will be `null`.
+
+    @param[in] pos iterator to the element to remove
+    @return Iterator following the last removed element. If the iterator @a pos
+    refers to the last element, the end() iterator is returned.
+
+    @tparam InteratorType an @ref iterator or @ref const_iterator
+
+    @throw std::domain_error if called on a `null` value
+    @throw std::domain_error if called on an iterator which does not belong to
+    the current JSON value
+    @throw std::out_of_range if called on a primitive type with invalid iterator
+    (i.e., any iterator which is not begin())
+
+    @complexity The complexity depends on the type:
+    - objects: amortized constant
+    - arrays: linear in distance between pos and the end of the container
+    - strings: linear in the length of the string
+    - other types: constant
+
+    @liveexample{The example shows the result of erase for different JSON
+    types.,erase__IteratorType}
+    */
+    template <class InteratorType, typename
+              std::enable_if<
+                  std::is_same<InteratorType, typename basic_json_t::iterator>::value or
+                  std::is_same<InteratorType, typename basic_json_t::const_iterator>::value
+                  , int>::type
+              = 0>
+    InteratorType erase(InteratorType pos)
+    {
+        // make sure iterator fits the current value
+        if (this != pos.m_object)
+        {
+            throw std::domain_error("iterator does not fit current value");
+        }
+
+        // start from end(); the object/array cases below overwrite the
+        // matching internal iterator with the one returned by the container
+        InteratorType result = end();
+
+        switch (m_type)
+        {
+            // primitive values: erasing the single value turns this into null
+            case value_t::number_integer:
+            case value_t::number_float:
+            case value_t::boolean:
+            case value_t::string:
+            {
+                // only the begin position designates the value
+                if (not pos.m_it.primitive_iterator.is_begin())
+                {
+                    throw std::out_of_range("iterator out of range");
+                }
+
+                // strings are held through a pointer and must be released
+                // before the value is retagged as null
+                if (m_type == value_t::string)
+                {
+                    delete m_value.string;
+                    m_value.string = nullptr;
+                }
+
+                m_type = value_t::null;
+                break;
+            }
+
+            case value_t::object:
+            {
+                // forward to the object container's erase
+                result.m_it.object_iterator = m_value.object->erase(pos.m_it.object_iterator);
+                break;
+            }
+
+            case value_t::array:
+            {
+                // forward to the array container's erase
+                result.m_it.array_iterator = m_value.array->erase(pos.m_it.array_iterator);
+                break;
+            }
+
+            // any remaining type (e.g. null) has nothing to erase
+            default:
+            {
+                throw std::domain_error("cannot use erase() with " + type_name());
+            }
+        }
+
+        return result;
+    }
+
+    /*!
+    @brief remove elements given an iterator range
+
+    Removes the element specified by the range `[first; last)`. Invalidates
+    iterators and references at or after the point of the erase, including the
+    end() iterator. The iterator @a first does not need to be dereferenceable
+    if `first == last`: erasing an empty range is a no-op.
+
+    If called on a primitive type other than null, the resulting JSON value
+    will be `null`.
+
+    @param[in] first iterator to the beginning of the range to remove
+    @param[in] last iterator past the end of the range to remove
+    @return Iterator following the last removed element. If @a last is the
+    end() iterator, the end() iterator is returned.
+
+    @tparam InteratorType an @ref iterator or @ref const_iterator
+
+    @throw std::domain_error if called on a `null` value
+    @throw std::domain_error if called on iterators which do not belong to
+    the current JSON value
+    @throw std::out_of_range if called on a primitive type with invalid iterators
+    (i.e., if `first != begin()` or `last != end()`)
+
+    @complexity The complexity depends on the type:
+    - objects: `log(size()) + std::distance(first, last)`
+    - arrays: linear in the distance between @a first and @a last, plus linear
+      in the distance between @a last and end of the container
+    - strings: linear in the length of the string
+    - other types: constant
+
+    @liveexample{The example shows the result of erase for different JSON
+    types.,erase__IteratorType_IteratorType}
+    */
+    template <class InteratorType, typename
+              std::enable_if<
+                  std::is_same<InteratorType, typename basic_json_t::iterator>::value or
+                  std::is_same<InteratorType, typename basic_json_t::const_iterator>::value
+                  , int>::type
+              = 0>
+    InteratorType erase(InteratorType first, InteratorType last)
+    {
+        // make sure both iterators fit the current value
+        if (this != first.m_object or this != last.m_object)
+        {
+            throw std::domain_error("iterators do not fit current value");
+        }
+
+        // start from end(); the object/array cases below overwrite the
+        // matching internal iterator with the one returned by the container
+        InteratorType result = end();
+
+        switch (m_type)
+        {
+            // primitive values: erasing the single value turns this into null
+            case value_t::number_integer:
+            case value_t::number_float:
+            case value_t::boolean:
+            case value_t::string:
+            {
+                // the only accepted range is exactly [begin, end)
+                if (not first.m_it.primitive_iterator.is_begin() or not last.m_it.primitive_iterator.is_end())
+                {
+                    throw std::out_of_range("iterators out of range");
+                }
+
+                // strings are held through a pointer and must be released
+                // before the value is retagged as null
+                if (m_type == value_t::string)
+                {
+                    delete m_value.string;
+                    m_value.string = nullptr;
+                }
+
+                m_type = value_t::null;
+                break;
+            }
+
+            case value_t::object:
+            {
+                // forward the range to the object container's erase
+                result.m_it.object_iterator = m_value.object->erase(first.m_it.object_iterator,
+                                              last.m_it.object_iterator);
+                break;
+            }
+
+            case value_t::array:
+            {
+                // forward the range to the array container's erase
+                result.m_it.array_iterator = m_value.array->erase(first.m_it.array_iterator,
+                                             last.m_it.array_iterator);
+                break;
+            }
+
+            // any remaining type (e.g. null) has nothing to erase
+            default:
+            {
+                throw std::domain_error("cannot use erase with " + type_name());
+            }
+        }
+
+        return result;
+    }
+
+    /*!
+    @brief remove element from a JSON object given a key
+
+    Removes elements from a JSON object with the key value @a key.
+
+    @param[in] key value of the elements to remove
+
+    @return Number of elements removed. If ObjectType is the default `std::map`
+    type, the return value will always be `0` (@a key was not found) or `1` (@a
+    key was found).
+
+    @throw std::domain_error when called on a type other than JSON object
+
+    @complexity `log(size()) + count(key)`
+
+    @liveexample{The example shows the effect of erase.,erase__key_type}
+    */
+    size_type erase(const typename object_t::key_type& key)
+    {
+        // for objects, forward to the container and report its removal count
+        if (m_type == value_t::object)
+        {
+            return m_value.object->erase(key);
+        }
+
+        // this erase only works for objects
+        throw std::domain_error("cannot use erase() with " + type_name());
+    }
+
+    /*!
+    @brief remove element from a JSON array given an index
+
+    Removes element from a JSON array at the index @a idx.
+
+    @param[in] idx index of the element to remove
+
+    @throw std::domain_error when called on a type other than JSON array
+    @throw std::out_of_range when `idx >= size()`
+
+    @complexity Linear in distance between @a idx and the end of the container.
+
+    @liveexample{The example shows the effect of erase.,erase__size_type}
+    */
+    void erase(const size_type idx)
+    {
+        // this erase only works for arrays
+        if (m_type != value_t::array)
+        {
+            throw std::domain_error("cannot use erase() with " + type_name());
+        }
+
+        if (idx >= size())
+        {
+            throw std::out_of_range("index out of range");
+        }
+
+        m_value.array->erase(m_value.array->begin() + static_cast<difference_type>(idx));
+    }
+
+    /*!
+    @brief find an element in a JSON object
+
+    Finds an element in a JSON object with key equivalent to @a key. If the
+    element is not found or the JSON value is not an object, end() is returned.
+
+    @param[in] key key value of the element to search for
+
+    @return Iterator to an element with key equivalent to @a key. If no such
+    element is found, past-the-end (see end()) iterator is returned.
+
+    @complexity Logarithmic in the size of the JSON object.
+
+    @liveexample{The example shows how find is used.,find__key_type}
+    */
+    iterator find(typename object_t::key_type key)
+    {
+        // start from the past-the-end iterator; it is returned unchanged for non-objects
+        iterator found = end();
+
+        if (m_type != value_t::object)
+        {
+            return found;
+        }
+
+        // delegate the lookup to the underlying object container
+        found.m_it.object_iterator = m_value.object->find(key);
+        return found;
+    }
+
+    /*!
+    @brief find an element in a JSON object
+    @copydoc find(typename object_t::key_type)
+    */
+    const_iterator find(typename object_t::key_type key) const
+    {
+        // start from the past-the-end iterator; it is returned unchanged for non-objects
+        const_iterator found = cend();
+
+        if (m_type != value_t::object)
+        {
+            return found;
+        }
+
+        // delegate the lookup to the underlying object container
+        found.m_it.object_iterator = m_value.object->find(key);
+        return found;
+    }
+
+    /*!
+    @brief returns the number of occurrences of a key in a JSON object
+
+    Returns the number of elements with key @a key. If ObjectType is the
+    default `std::map` type, the return value will always be `0` (@a key was
+    not found) or `1` (@a key was found).
+
+    @param[in] key key value of the element to count
+
+    @return Number of elements with key @a key. If the JSON value is not an
+    object, the return value will be `0`.
+
+    @complexity Logarithmic in the size of the JSON object.
+
+    @liveexample{The example shows how count is used.,count}
+    */
+    size_type count(typename object_t::key_type key) const
+    {
+        // only JSON objects are searched; every other type reports zero matches
+        if (m_type != value_t::object)
+        {
+            return 0;
+        }
+
+        return m_value.object->count(key);
+    }
+
+    /// @}
+
+
+    ///////////////
+    // iterators //
+    ///////////////
+
+    /// @name iterators
+    /// @{
+
+    /*!
+    @brief returns an iterator to the first element
+
+    Returns an iterator to the first element.
+
+    @image html range-begin-end.svg "Illustration from cppreference.com"
+
+    @return iterator to the first element
+
+    @complexity Constant.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is constant.
+
+    @liveexample{The following code shows an example for @ref begin.,begin}
+    */
+    iterator begin()
+    {
+        // bind an iterator to this value, then position it at the first element
+        iterator first(this);
+        first.set_begin();
+        return first;
+    }
+
+    /*!
+    @copydoc basic_json::cbegin()
+    */
+    const_iterator begin() const
+    {
+        // the const overload yields exactly what cbegin() yields
+        const_iterator first = cbegin();
+        return first;
+    }
+
+    /*!
+    @brief returns a const iterator to the first element
+
+    Returns a const iterator to the first element.
+
+    @image html range-begin-end.svg "Illustration from cppreference.com"
+
+    @return const iterator to the first element
+
+    @complexity Constant.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is constant.
+    - Has the semantics of `const_cast<const basic_json&>(*this).begin()`.
+
+    @liveexample{The following code shows an example for @ref cbegin.,cbegin}
+    */
+    const_iterator cbegin() const
+    {
+        // bind a const iterator to this value, then position it at the first element
+        const_iterator first(this);
+        first.set_begin();
+        return first;
+    }
+
+    /*!
+    @brief returns an iterator to one past the last element
+
+    Returns an iterator to one past the last element.
+
+    @image html range-begin-end.svg "Illustration from cppreference.com"
+
+    @return iterator one past the last element
+
+    @complexity Constant.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is constant.
+
+    @liveexample{The following code shows an example for @ref end.,end}
+    */
+    iterator end()
+    {
+        // bind an iterator to this value, then position it one past the last element
+        iterator past_last(this);
+        past_last.set_end();
+        return past_last;
+    }
+
+    /*!
+    @copydoc basic_json::cend()
+    */
+    const_iterator end() const
+    {
+        // the const overload yields exactly what cend() yields
+        const_iterator past_last = cend();
+        return past_last;
+    }
+
+    /*!
+    @brief returns a const iterator to one past the last element
+
+    Returns a const iterator to one past the last element.
+
+    @image html range-begin-end.svg "Illustration from cppreference.com"
+
+    @return const iterator one past the last element
+
+    @complexity Constant.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is constant.
+    - Has the semantics of `const_cast<const basic_json&>(*this).end()`.
+
+    @liveexample{The following code shows an example for @ref cend.,cend}
+    */
+    const_iterator cend() const
+    {
+        // bind a const iterator to this value, then position it one past the last element
+        const_iterator past_last(this);
+        past_last.set_end();
+        return past_last;
+    }
+
+    /*!
+    @brief returns an iterator to the reverse-beginning
+
+    Returns an iterator to the reverse-beginning; that is, the last element.
+
+    @image html range-rbegin-rend.svg "Illustration from cppreference.com"
+
+    @complexity Constant.
+
+    @requirement This function satisfies the ReversibleContainer requirements:
+    - The complexity is constant.
+    - Has the semantics of `reverse_iterator(end())`.
+
+    @liveexample{The following code shows an example for @ref rbegin.,rbegin}
+    */
+    reverse_iterator rbegin()
+    {
+        // reverse traversal starts from a reverse iterator built on end()
+        reverse_iterator from_back(end());
+        return from_back;
+    }
+
+    /*!
+    @copydoc basic_json::crbegin()
+    */
+    const_reverse_iterator rbegin() const
+    {
+        // the const overload yields exactly what crbegin() yields
+        const_reverse_iterator from_back = crbegin();
+        return from_back;
+    }
+
+    /*!
+    @brief returns an iterator to the reverse-end
+
+    Returns an iterator to the reverse-end; that is, one before the first
+    element.
+
+    @image html range-rbegin-rend.svg "Illustration from cppreference.com"
+
+    @complexity Constant.
+
+    @requirement This function satisfies the ReversibleContainer requirements:
+    - The complexity is constant.
+    - Has the semantics of `reverse_iterator(begin())`.
+
+    @liveexample{The following code shows an example for @ref rend.,rend}
+    */
+    reverse_iterator rend()
+    {
+        // reverse traversal stops at a reverse iterator built on begin()
+        reverse_iterator before_first(begin());
+        return before_first;
+    }
+
+    /*!
+    @copydoc basic_json::crend()
+    */
+    const_reverse_iterator rend() const
+    {
+        // the const overload yields exactly what crend() yields
+        const_reverse_iterator before_first = crend();
+        return before_first;
+    }
+
+    /*!
+    @brief returns a const reverse iterator to the last element
+
+    Returns a const iterator to the reverse-beginning; that is, the last
+    element.
+
+    @image html range-rbegin-rend.svg "Illustration from cppreference.com"
+
+    @complexity Constant.
+
+    @requirement This function satisfies the ReversibleContainer requirements:
+    - The complexity is constant.
+    - Has the semantics of `const_cast<const basic_json&>(*this).rbegin()`.
+
+    @liveexample{The following code shows an example for @ref crbegin.,crbegin}
+    */
+    const_reverse_iterator crbegin() const
+    {
+        // const reverse traversal starts from a reverse iterator built on cend()
+        const_reverse_iterator from_back(cend());
+        return from_back;
+    }
+
+    /*!
+    @brief returns a const reverse iterator to one before the first
+
+    Returns a const reverse iterator to the reverse-end; that is, one before
+    the first element.
+
+    @image html range-rbegin-rend.svg "Illustration from cppreference.com"
+
+    @complexity Constant.
+
+    @requirement This function satisfies the ReversibleContainer requirements:
+    - The complexity is constant.
+    - Has the semantics of `const_cast<const basic_json&>(*this).rend()`.
+
+    @liveexample{The following code shows an example for @ref crend.,crend}
+    */
+    const_reverse_iterator crend() const
+    {
+        // const reverse traversal stops at a reverse iterator built on cbegin()
+        const_reverse_iterator before_first(cbegin());
+        return before_first;
+    }
+
+    /// @}
+
+
+    //////////////
+    // capacity //
+    //////////////
+
+    /// @name capacity
+    /// @{
+
+    /*!
+    @brief checks whether the container is empty
+
+    Checks if a JSON value has no elements.
+
+    @return The return value depends on the different types and is
+            defined as follows:
+            Value type  | return value
+            ----------- | -------------
+            null        | @c true
+            boolean     | @c false
+            string      | @c false
+            number      | @c false
+            object      | result of function object_t::empty()
+            array       | result of function array_t::empty()
+
+    @complexity Constant, as long as @ref array_t and @ref object_t satisfy the
+                Container concept; that is, their empty() functions have
+                constant complexity.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is constant.
+    - Has the semantics of `begin() == end()`.
+
+    @liveexample{The following code uses @ref empty to check if a @ref json
+    object contains any elements.,empty}
+    */
+    bool empty() const noexcept
+    {
+        switch (m_type)
+        {
+            case (value_t::null):
+            {
+                return true;
+            }
+
+            case (value_t::array):
+            {
+                return m_value.array->empty();
+            }
+
+            case (value_t::object):
+            {
+                return m_value.object->empty();
+            }
+
+            default:
+            {
+                // all other types are nonempty
+                return false;
+            }
+        }
+    }
+
+    /*!
+    @brief returns the number of elements
+
+    Returns the number of elements in a JSON value.
+
+    @return The return value depends on the different types and is
+            defined as follows:
+            Value type  | return value
+            ----------- | -------------
+            null        | @c 0
+            boolean     | @c 1
+            string      | @c 1
+            number      | @c 1
+            object      | result of function object_t::size()
+            array       | result of function array_t::size()
+
+    @complexity Constant, as long as @ref array_t and @ref object_t satisfy the
+                Container concept; that is, their size() functions have
+                constant complexity.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is constant.
+    - Has the semantics of `std::distance(begin(), end())`.
+
+    @liveexample{The following code calls @ref size on the different value
+    types.,size}
+    */
+    size_type size() const noexcept
+    {
+        switch (m_type)
+        {
+            case (value_t::null):
+            {
+                return 0;
+            }
+
+            case (value_t::array):
+            {
+                return m_value.array->size();
+            }
+
+            case (value_t::object):
+            {
+                return m_value.object->size();
+            }
+
+            default:
+            {
+                // all other types have size 1
+                return 1;
+            }
+        }
+    }
+
+    /*!
+    @brief returns the maximum possible number of elements
+
+    Returns the maximum number of elements a JSON value is able to hold due to
+    system or library implementation limitations, i.e. `std::distance(begin(),
+    end())` for the JSON value.
+
+    @return The return value depends on the different types and is
+            defined as follows:
+            Value type  | return value
+            ----------- | -------------
+            null        | @c 0 (same as size())
+            boolean     | @c 1 (same as size())
+            string      | @c 1 (same as size())
+            number      | @c 1 (same as size())
+            object      | result of function object_t::max_size()
+            array       | result of function array_t::max_size()
+
+    @complexity Constant, as long as @ref array_t and @ref object_t satisfy the
+                Container concept; that is, their max_size() functions have
+                constant complexity.
+
+    @requirement This function satisfies the Container requirements:
+    - The complexity is constant.
+    - Has the semantics of returning `b.size()` where `b` is the largest
+      possible JSON value.
+
+    @liveexample{The following code calls @ref max_size on the different value
+    types. Note the output is implementation specific.,max_size}
+    */
+    size_type max_size() const noexcept
+    {
+        switch (m_type)
+        {
+            case (value_t::array):
+            {
+                return m_value.array->max_size();
+            }
+
+            case (value_t::object):
+            {
+                return m_value.object->max_size();
+            }
+
+            default:
+            {
+                // all other types have max_size() == size()
+                return size();
+            }
+        }
+    }
+
+    /// @}
+
+
+    ///////////////
+    // modifiers //
+    ///////////////
+
+    /// @name modifiers
+    /// @{
+
+    /*!
+    @brief clears the contents
+
+    Clears the content of a JSON value and resets it to the default value as
+    if @ref basic_json(value_t) would have been called:
+
+    Value type  | initial value
+    ----------- | -------------
+    null        | `null`
+    boolean     | `false`
+    string      | `""`
+    number      | `0`
+    object      | `{}`
+    array       | `[]`
+
+    @note Floating-point numbers are set to `0.0` which will be serialized to
+    `0`. The value type remains @ref number_float_t.
+
+    @complexity Linear in the size of the JSON value.
+
+    @liveexample{The example below shows the effect of @ref clear to different
+    JSON types.,clear}
+    */
+    void clear() noexcept
+    {
+        switch (m_type)
+        {
+            case (value_t::null):
+            case (value_t::discarded):
+            {
+                break;
+            }
+
+            case (value_t::number_integer):
+            {
+                m_value.number_integer = 0;
+                break;
+            }
+
+            case (value_t::number_float):
+            {
+                m_value.number_float = 0.0;
+                break;
+            }
+
+            case (value_t::boolean):
+            {
+                m_value.boolean = false;
+                break;
+            }
+
+            case (value_t::string):
+            {
+                m_value.string->clear();
+                break;
+            }
+
+            case (value_t::array):
+            {
+                m_value.array->clear();
+                break;
+            }
+
+            case (value_t::object):
+            {
+                m_value.object->clear();
+                break;
+            }
+        }
+    }
+
+    /*!
+    @brief add an object to an array
+
+    Appends the given element @a value to the end of the JSON value. If the
+    function is called on a JSON null value, an empty array is created before
+    appending @a value.
+
+    @param value the value to add to the JSON array
+
+    @throw std::domain_error when called on a type other than JSON array or null
+
+    @complexity Amortized constant.
+
+    @liveexample{The example shows how `push_back` and `+=` can be used to add
+    elements to a JSON array. Note how the `null` value was silently converted
+    to a JSON array.,push_back}
+    */
+    void push_back(basic_json&& value)
+    {
+        // push_back only works for null objects or arrays
+        if (not(m_type == value_t::null or m_type == value_t::array))
+        {
+            throw std::domain_error("cannot use push_back() with " + type_name());
+        }
+
+        // transform null object into an array
+        if (m_type == value_t::null)
+        {
+            m_type = value_t::array;
+            m_value = value_t::array;
+        }
+
+        // add element to array (move semantics)
+        m_value.array->push_back(std::move(value));
+        // invalidate object
+        value.m_type = value_t::null;
+    }
+
+    /*!
+    @brief add an object to an array
+    @copydoc push_back(basic_json&&)
+    */
+    reference operator+=(basic_json&& value)
+    {
+        // pass the argument on as an rvalue so that push_back can move from it
+        this->push_back(std::move(value));
+
+        // return this value itself
+        return *this;
+    }
+
+    /*!
+    @brief add an object to an array
+    @copydoc push_back(basic_json&&)
+    */
+    void push_back(const basic_json& value)
+    {
+        // push_back only works for null objects or arrays
+        if (not(m_type == value_t::null or m_type == value_t::array))
+        {
+            throw std::domain_error("cannot use push_back() with " + type_name());
+        }
+
+        // transform null object into an array
+        if (m_type == value_t::null)
+        {
+            m_type = value_t::array;
+            m_value = value_t::array;
+        }
+
+        // add element to array
+        m_value.array->push_back(value);
+    }
+
+    /*!
+    @brief add an object to an array
+    @copydoc push_back(basic_json&&)
+    */
+    reference operator+=(const basic_json& value)
+    {
+        // appending is delegated to the copying push_back overload
+        this->push_back(value);
+
+        // return this value itself
+        return *this;
+    }
+
+    /*!
+    @brief add an object to an object
+
+    Inserts the given element @a value to the JSON object. If the function is
+    called on a JSON null value, an empty object is created before inserting @a
+    value.
+
+    @param[in] value the value to add to the JSON object
+
+    @throw std::domain_error when called on a type other than JSON object or
+    null
+
+    @complexity Logarithmic in the size of the container, O(log(`size()`)).
+
+    @liveexample{The example shows how `push_back` and `+=` can be used to add
+    elements to a JSON object. Note how the `null` value was silently converted
+    to a JSON object.,push_back__object_t__value}
+    */
+    void push_back(const typename object_t::value_type& value)
+    {
+        // push_back only works for null objects or objects
+        if (not(m_type == value_t::null or m_type == value_t::object))
+        {
+            throw std::domain_error("cannot use push_back() with " + type_name());
+        }
+
+        // transform null object into an object
+        if (m_type == value_t::null)
+        {
+            m_type = value_t::object;
+            m_value = value_t::object;
+        }
+
+        // add element to array
+        m_value.object->insert(value);
+    }
+
+    /*!
+    @brief add an object to an object
+    @copydoc push_back(const typename object_t::value_type&)
+    */
+    reference operator+=(const typename object_t::value_type& value)
+    {
+        // storing the key/value pair is delegated to the matching push_back overload
+        this->push_back(value);
+
+        // the result is obtained by looking up the pair's key in this value
+        return this->operator[](value.first);
+    }
+
+    /*!
+    @brief inserts element
+
+    Inserts element @a value before iterator @a pos.
+
+    @param[in] pos iterator before which the content will be inserted; may be
+    the end() iterator
+    @param[in] value element to insert
+    @return iterator pointing to the inserted @a value.
+
+    @throw std::domain_error if called on JSON values other than arrays
+    @throw std::domain_error if @a pos is not an iterator of *this
+
+    @complexity Constant plus linear in the distance between pos and end of the
+    container.
+
+    @liveexample{The example shows how insert is used.,insert}
+    */
+    iterator insert(const_iterator pos, const basic_json& value)
+    {
+        // insert only works for arrays
+        if (m_type != value_t::array)
+        {
+            throw std::domain_error("cannot use insert() with " + type_name());
+        }
+
+        // check if iterator pos fits to this JSON value
+        if (pos.m_object != this)
+        {
+            throw std::domain_error("iterator does not fit current value");
+        }
+
+        // insert to array and return iterator
+        iterator result(this);
+        result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator, value);
+        return result;
+    }
+
+    /*!
+    @brief inserts element
+    @copydoc insert(const_iterator, const basic_json&)
+    */
+    iterator insert(const_iterator pos, basic_json&& value)
+    {
+        // Inside this function `value` is an lvalue, so forwarding it as-is would
+        // select the copying overload. The checks of that overload are repeated
+        // here so that the element can be moved into the array instead.
+
+        // insert only works for arrays
+        if (m_type != value_t::array)
+        {
+            throw std::domain_error("cannot use insert() with " + type_name());
+        }
+
+        // check if iterator pos fits to this JSON value
+        if (pos.m_object != this)
+        {
+            throw std::domain_error("iterator does not fit current value");
+        }
+
+        // insert to array (move semantics) and return iterator
+        iterator result(this);
+        result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator,
+                                     std::move(value));
+        // invalidate object, as push_back(basic_json&&) does after moving from its argument
+        value.m_type = value_t::null;
+        return result;
+    }
+
+    /*!
+    @brief inserts elements
+
+    Inserts @a cnt copies of @a value before iterator @a pos.
+
+    @param[in] pos iterator before which the content will be inserted; may be
+    the end() iterator
+    @param[in] cnt number of copies of @a value to insert
+    @param[in] value element to insert
+    @return iterator pointing to the first element inserted, or @a pos if
+    `cnt==0`
+
+    @throw std::domain_error if called on JSON values other than arrays
+    @throw std::domain_error if @a pos is not an iterator of *this
+
+    @complexity Linear in @a cnt plus linear in the distance between @a pos
+    and end of the container.
+
+    @liveexample{The example shows how insert is used.,insert__count}
+    */
+    iterator insert(const_iterator pos, size_type cnt, const basic_json& value)
+    {
+        // insert only works for arrays
+        if (m_type != value_t::array)
+        {
+            throw std::domain_error("cannot use insert() with " + type_name());
+        }
+
+        // check if iterator pos fits to this JSON value
+        if (pos.m_object != this)
+        {
+            throw std::domain_error("iterator does not fit current value");
+        }
+
+        // insert to array and return iterator
+        iterator result(this);
+        result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator, cnt, value);
+        return result;
+    }
+
+    /*!
+    @brief inserts elements
+
+    Inserts elements from range `[first, last)` before iterator @a pos.
+
+    @param[in] pos iterator before which the content will be inserted; may be
+    the end() iterator
+    @param[in] first begin of the range of elements to insert
+    @param[in] last end of the range of elements to insert
+
+    @throw std::domain_error if called on JSON values other than arrays
+    @throw std::domain_error if @a pos is not an iterator of *this
+    @throw std::domain_error if @a first and @a last do not belong to the same
+    JSON value
+    @throw std::domain_error if @a first or @a last are iterators into
+    container for which insert is called
+    @return iterator pointing to the first element inserted, or @a pos if
+    `first==last`
+
+    @complexity Linear in `std::distance(first, last)` plus linear in the
+    distance between @a pos and end of the container.
+
+    @liveexample{The example shows how insert is used.,insert__range}
+    */
+    iterator insert(const_iterator pos, const_iterator first, const_iterator last)
+    {
+        // insert only works for arrays
+        if (m_type != value_t::array)
+        {
+            throw std::domain_error("cannot use insert() with " + type_name());
+        }
+
+        // check if iterator pos fits to this JSON value
+        if (pos.m_object != this)
+        {
+            throw std::domain_error("iterator does not fit current value");
+        }
+
+        if (first.m_object != last.m_object)
+        {
+            throw std::domain_error("iterators does not fit");
+        }
+
+        if (first.m_object == this or last.m_object == this)
+        {
+            throw std::domain_error("passed iterators may not belong to container");
+        }
+
+        // insert to array and return iterator
+        iterator result(this);
+        result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator,
+                                     first.m_it.array_iterator, last.m_it.array_iterator);
+        return result;
+    }
+
+    /*!
+    @brief inserts elements
+
+    Inserts elements from initializer list @a ilist before iterator @a pos.
+
+    @param[in] pos iterator before which the content will be inserted; may be
+    the end() iterator
+    @param[in] ilist initializer list to insert the values from
+
+    @throw std::domain_error if called on JSON values other than arrays
+    @throw std::domain_error if @a pos is not an iterator of *this
+    @return iterator pointing to the first element inserted, or @a pos if
+    `ilist` is empty
+
+    @complexity Linear in `ilist.size()` plus linear in the distance between @a
+    pos and end of the container.
+
+    @liveexample{The example shows how insert is used.,insert__ilist}
+    */
+    iterator insert(const_iterator pos, std::initializer_list<basic_json> ilist)
+    {
+        // insert only works for arrays
+        if (m_type != value_t::array)
+        {
+            throw std::domain_error("cannot use insert() with " + type_name());
+        }
+
+        // check if iterator pos fits to this JSON value
+        if (pos.m_object != this)
+        {
+            throw std::domain_error("iterator does not fit current value");
+        }
+
+        // insert to array and return iterator
+        iterator result(this);
+        result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator, ilist);
+        return result;
+    }
+
+    /*!
+    @brief exchanges the values
+
+    Exchanges the contents of the JSON value with those of @a other. Does not
+    invoke any move, copy, or swap operations on individual elements. All
+    iterators and references remain valid. The past-the-end iterator is
+    invalidated.
+
+    @param[in,out] other JSON value to exchange the contents with
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how JSON arrays can be
+    swapped.,swap__reference}
+    */
+    void swap(reference other) noexcept (
+        std::is_nothrow_move_constructible<value_t>::value and
+        std::is_nothrow_move_assignable<value_t>::value and
+        std::is_nothrow_move_constructible<json_value>::value and
+        std::is_nothrow_move_assignable<json_value>::value
+    )
+    {
+        // a JSON value consists of a payload and a type tag; exchange both members
+        std::swap(m_value, other.m_value);
+        std::swap(m_type, other.m_type);
+    }
+
+    /*!
+    @brief exchanges the values
+
+    Exchanges the contents of a JSON array with those of @a other. Does not
+    invoke any move, copy, or swap operations on individual elements. All
+    iterators and references remain valid. The past-the-end iterator is
+    invalidated.
+
+    @param[in,out] other array to exchange the contents with
+
+    @throw std::domain_error when JSON value is not an array
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how JSON values can be
+    swapped.,swap__array_t}
+    */
+    void swap(array_t& other)
+    {
+        // exchanging with an array_t requires this value to hold an array
+        if (m_type == value_t::array)
+        {
+            std::swap(*(m_value.array), other);
+            return;
+        }
+
+        throw std::domain_error("cannot use swap() with " + type_name());
+    }
+
+    /*!
+    @brief exchanges the values
+
+    Exchanges the contents of a JSON object with those of @a other. Does not
+    invoke any move, copy, or swap operations on individual elements. All
+    iterators and references remain valid. The past-the-end iterator is
+    invalidated.
+
+    @param[in,out] other object to exchange the contents with
+
+    @throw std::domain_error when JSON value is not an object
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how JSON values can be
+    swapped.,swap__object_t}
+    */
+    void swap(object_t& other)
+    {
+        // exchanging with an object_t requires this value to hold an object
+        if (m_type == value_t::object)
+        {
+            std::swap(*(m_value.object), other);
+            return;
+        }
+
+        throw std::domain_error("cannot use swap() with " + type_name());
+    }
+
+    /*!
+    @brief exchanges the values
+
+    Exchanges the contents of a JSON string with those of @a other. Does not
+    invoke any move, copy, or swap operations on individual elements. All
+    iterators and references remain valid. The past-the-end iterator is
+    invalidated.
+
+    @param[in,out] other string to exchange the contents with
+
+    @throw std::domain_error when JSON value is not a string
+
+    @complexity Constant.
+
+    @liveexample{The example below shows how JSON values can be
+    swapped.,swap__string_t}
+    */
+    void swap(string_t& other)
+    {
+        // exchanging with a string_t requires this value to hold a string
+        if (m_type == value_t::string)
+        {
+            std::swap(*(m_value.string), other);
+            return;
+        }
+
+        throw std::domain_error("cannot use swap() with " + type_name());
+    }
+
+    /// @}
+
+
+    //////////////////////////////////////////
+    // lexicographical comparison operators //
+    //////////////////////////////////////////
+
+    /// @name lexicographical comparison operators
+    /// @{
+
+  private:
+    /*!
+    @brief comparison operator for JSON types
+
+    Returns an ordering that is similar to Python:
+    - order: null < boolean < number < object < array < string
+    - furthermore, each type is not smaller than itself
+    */
+    friend bool operator<(const value_t lhs, const value_t rhs)
+    {
+        static constexpr const std::array<uint8_t, 7> order = {{
+                0, // null
+                3, // object
+                4, // array
+                5, // string
+                1, // boolean
+                2, // integer
+                2  // float
+            }
+        };
+
+        // discarded values are not comparable
+        if (lhs == value_t::discarded or rhs == value_t::discarded)
+        {
+            return false;
+        }
+
+        return order[static_cast<std::size_t>(lhs)] < order[static_cast<std::size_t>(rhs)];
+    }
+
+  public:
+    /*!
+    @brief comparison: equal
+
+    Compares two JSON values for equality according to the following rules:
+    - Two JSON values are equal if (1) they are from the same type and (2)
+      their stored values are the same.
+    - Integer and floating-point numbers are automatically converted before
+      comparison. Floating-point numbers are compared indirectly: two
+      floating-point numbers `f1` and `f2` are considered equal if neither
+      `f1 > f2` nor `f2 > f1` holds.
+    - Two JSON null values are equal.
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether the values @a lhs and @a rhs are equal
+
+    @complexity Linear.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__equal}
+    */
+    friend bool operator==(const_reference lhs, const_reference rhs) noexcept
+    {
+        const auto lhs_type = lhs.type();
+        const auto rhs_type = rhs.type();
+
+        if (lhs_type == rhs_type)
+        {
+            switch (lhs_type)
+            {
+                case (value_t::array):
+                    return *lhs.m_value.array == *rhs.m_value.array;
+                case (value_t::object):
+                    return *lhs.m_value.object == *rhs.m_value.object;
+                case (value_t::null):
+                    return true;
+                case (value_t::string):
+                    return *lhs.m_value.string == *rhs.m_value.string;
+                case (value_t::boolean):
+                    return lhs.m_value.boolean == rhs.m_value.boolean;
+                case (value_t::number_integer):
+                    return lhs.m_value.number_integer == rhs.m_value.number_integer;
+                case (value_t::number_float):
+                    return approx(lhs.m_value.number_float, rhs.m_value.number_float);
+                case (value_t::discarded):
+                    return false;
+            }
+        }
+        else if (lhs_type == value_t::number_integer and rhs_type == value_t::number_float)
+        {
+            return approx(static_cast<number_float_t>(lhs.m_value.number_integer),
+                          rhs.m_value.number_float);
+        }
+        else if (lhs_type == value_t::number_float and rhs_type == value_t::number_integer)
+        {
+            return approx(lhs.m_value.number_float,
+                          static_cast<number_float_t>(rhs.m_value.number_integer));
+        }
+        return false;
+    }
+
+    /*!
+    @brief comparison: equal
+
+    The function compares the given JSON value against a null pointer. As the
+    null pointer can be used to initialize a JSON value to null, a comparison
+    of JSON value @a v with a null pointer should be equivalent to calling
+    `v.is_null()`.
+
+    @param[in] v  JSON value to consider
+    @return whether @a v is null
+
+    @complexity Constant.
+
+    @liveexample{The example compares several JSON types to the null pointer.
+    ,operator__equal__nullptr_t}
+    */
+    friend bool operator==(const_reference v, std::nullptr_t) noexcept
+    {
+        // a JSON value equals the null pointer exactly when it is null
+        const bool holds_null = v.is_null();
+        return holds_null;
+    }
+
+    /*!
+    @brief comparison: equal
+    @copydoc operator==(const_reference, std::nullptr_t)
+    */
+    friend bool operator==(std::nullptr_t, const_reference v) noexcept
+    {
+        // operand order does not matter: only the null-ness of v is checked
+        const bool holds_null = v.is_null();
+        return holds_null;
+    }
+
+    /*!
+    @brief comparison: not equal
+
+    Compares two JSON values for inequality by calculating `not (lhs == rhs)`.
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether the values @a lhs and @a rhs are not equal
+
+    @complexity Linear.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__notequal}
+    */
+    friend bool operator!=(const_reference lhs, const_reference rhs) noexcept
+    {
+        // inequality is defined as the negation of operator==
+        const bool values_equal = (lhs == rhs);
+        return not values_equal;
+    }
+
+    /*!
+    @brief comparison: not equal
+
+    The function compares the given JSON value against a null pointer. As the
+    null pointer can be used to initialize a JSON value to null, a comparison
+    of JSON value @a v with a null pointer should be equivalent to calling
+    `not v.is_null()`.
+
+    @param[in] v  JSON value to consider
+    @return whether @a v is not null
+
+    @complexity Constant.
+
+    @liveexample{The example compares several JSON types to the null pointer.
+    ,operator__notequal__nullptr_t}
+    */
+    friend bool operator!=(const_reference v, std::nullptr_t) noexcept
+    {
+        // a JSON value differs from the null pointer exactly when it is not null
+        const bool holds_null = v.is_null();
+        return not holds_null;
+    }
+
+    /*!
+    @brief comparison: not equal
+    @copydoc operator!=(const_reference, std::nullptr_t)
+    */
+    friend bool operator!=(std::nullptr_t, const_reference v) noexcept
+    {
+        // operand order does not matter: only the null-ness of v is checked
+        const bool holds_null = v.is_null();
+        return not holds_null;
+    }
+
+    /*!
+    @brief comparison: less than
+
+    Compares whether one JSON value @a lhs is less than another JSON value @a
+    rhs according to the following rules:
+    - If @a lhs and @a rhs have the same type, the values are compared using
+      the default `<` operator.
+    - Integer and floating-point numbers are automatically converted before
+      comparison
+    - In case @a lhs and @a rhs have different types, the values are ignored
+      and the order of the types is considered, see
+      @ref operator<(const value_t, const value_t).
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether @a lhs is less than @a rhs
+
+    @complexity Linear.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__less}
+    */
+    friend bool operator<(const_reference lhs, const_reference rhs) noexcept
+    {
+        const auto lhs_type = lhs.type();
+        const auto rhs_type = rhs.type();
+
+        if (lhs_type == rhs_type)
+        {
+            switch (lhs_type)
+            {
+                case (value_t::array):
+                    return *lhs.m_value.array < *rhs.m_value.array;
+                case (value_t::object):
+                    return *lhs.m_value.object < *rhs.m_value.object;
+                case (value_t::null):
+                    return false;
+                case (value_t::string):
+                    return *lhs.m_value.string < *rhs.m_value.string;
+                case (value_t::boolean):
+                    return lhs.m_value.boolean < rhs.m_value.boolean;
+                case (value_t::number_integer):
+                    return lhs.m_value.number_integer < rhs.m_value.number_integer;
+                case (value_t::number_float):
+                    return lhs.m_value.number_float < rhs.m_value.number_float;
+                case (value_t::discarded):
+                    return false;
+            }
+        }
+        else if (lhs_type == value_t::number_integer and rhs_type == value_t::number_float)
+        {
+            return static_cast<number_float_t>(lhs.m_value.number_integer) <
+                   rhs.m_value.number_float;
+        }
+        else if (lhs_type == value_t::number_float and rhs_type == value_t::number_integer)
+        {
+            return lhs.m_value.number_float <
+                   static_cast<number_float_t>(rhs.m_value.number_integer);
+        }
+
+        // We only reach this line if we cannot compare values. In that case,
+        // we compare types. Note we have to call the operator explicitly,
+        // because MSVC has problems otherwise.
+        return operator<(lhs_type, rhs_type);
+    }
+
+    /*!
+    @brief comparison: less than or equal
+
+    Compares whether one JSON value @a lhs is less than or equal to another
+    JSON value by calculating `not (rhs < lhs)`.
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether @a lhs is less than or equal to @a rhs
+
+    @complexity Linear.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__lessequal}
+    */
+    friend bool operator<=(const_reference lhs, const_reference rhs) noexcept
+    {
+        // lhs <= rhs holds exactly when rhs is not less than lhs
+        const bool rhs_is_less = (rhs < lhs);
+        return not rhs_is_less;
+    }
+
+    /*!
+    @brief comparison: greater than
+
+    Compares whether one JSON value @a lhs is greater than another
+    JSON value by calculating `not (lhs <= rhs)`.
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether @a lhs is greater than @a rhs
+
+    @complexity Linear.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__greater}
+    */
+    friend bool operator>(const_reference lhs, const_reference rhs) noexcept
+    {
+        // lhs > rhs holds exactly when lhs <= rhs does not
+        const bool lhs_is_less_or_equal = (lhs <= rhs);
+        return not lhs_is_less_or_equal;
+    }
+
+    /*!
+    @brief comparison: greater than or equal
+
+    Compares whether one JSON value @a lhs is greater than or equal to another
+    JSON value by calculating `not (lhs < rhs)`.
+
+    @param[in] lhs  first JSON value to consider
+    @param[in] rhs  second JSON value to consider
+    @return whether @a lhs is greater than or equal to @a rhs
+
+    @complexity Linear.
+
+    @liveexample{The example demonstrates comparing several JSON
+    types.,operator__greaterequal}
+    */
+    friend bool operator>=(const_reference lhs, const_reference rhs) noexcept
+    {
+        // lhs >= rhs holds exactly when lhs is not less than rhs
+        const bool lhs_is_less = (lhs < rhs);
+        return not lhs_is_less;
+    }
+
+    /// @}
+
+
+    ///////////////////
+    // serialization //
+    ///////////////////
+
+    /// @name serialization
+    /// @{
+
+    /*!
+    @brief serialize to stream
+
+    Serialize the given JSON value @a j to the output stream @a o. The JSON
+    value will be serialized using the @ref dump member function. The
+    indentation of the output can be controlled with the member variable
+    `width` of the output stream @a o. For instance, using the manipulator
+    `std::setw(4)` on @a o sets the indentation level to `4` and the
+    serialization result is the same as calling `dump(4)`.
+
+    @param[in,out] o  stream to serialize to
+    @param[in] j  JSON value to serialize
+
+    @return the stream @a o
+
+    @complexity Linear.
+
+    @liveexample{The example below shows the serialization with different
+    parameters to `width` to adjust the indentation level.,operator_serialize}
+    */
+    friend std::ostream& operator<<(std::ostream& o, const basic_json& j)
+    {
+        // read width member and use it as indentation parameter if nonzero
+        const bool pretty_print = (o.width() > 0);
+        const auto indentation = (pretty_print ? o.width() : 0);
+
+        // reset width to 0 for subsequent calls to this stream
+        o.width(0);
+
+        // do the actual serialization
+        j.dump(o, pretty_print, static_cast<unsigned int>(indentation));
+        return o;
+    }
+
+    /*!
+    @brief serialize to stream
+    @copydoc operator<<(std::ostream&, const basic_json&)
+    */
+    friend std::ostream& operator>>(const basic_json& j, std::ostream& o)
+    {
+        // same serialization as `o << j`, only with the operands swapped
+        o << j;
+        return o;
+    }
+
+    /// @}
+
+
+    /////////////////////
+    // deserialization //
+    /////////////////////
+
+    /// @name deserialization
+    /// @{
+
+    /*!
+    @brief deserialize from string
+
+    @param[in] s  string to read a serialized JSON value from
+    @param[in] cb a parser callback function of type @ref parser_callback_t
+    which is used to control the deserialization by filtering unwanted values
+    (optional)
+
+    @return result of the deserialization
+
+    @complexity Linear in the length of the input. The parser is a predictive
+    LL(1) parser. The complexity can be higher if the parser callback function
+    @a cb has a super-linear complexity.
+
+    @liveexample{The example below demonstrates the parse function with and
+    without callback function.,parse__string__parser_callback_t}
+
+    @sa parse(std::istream&, parser_callback_t) for a version that reads from
+    an input stream
+    */
+    static basic_json parse(const string_t& s, parser_callback_t cb = nullptr)
+    {
+        // build a parser over the string (with the optional callback) and run it
+        parser reader(s, cb);
+        return reader.parse();
+    }
+
+    /*!
+    @brief deserialize from stream
+
+    @param[in,out] i  stream to read a serialized JSON value from
+    @param[in] cb a parser callback function of type @ref parser_callback_t
+    which is used to control the deserialization by filtering unwanted values
+    (optional)
+
+    @return result of the deserialization
+
+    @complexity Linear in the length of the input. The parser is a predictive
+    LL(1) parser. The complexity can be higher if the parser callback function
+    @a cb has a super-linear complexity.
+
+    @liveexample{The example below demonstrates the parse function with and
+    without callback function.,parse__istream__parser_callback_t}
+
+    @sa parse(const string_t&, parser_callback_t) for a version that reads
+    from a string
+    */
+    static basic_json parse(std::istream& i, parser_callback_t cb = nullptr)
+    {
+        // build a parser over the stream (with the optional callback) and run it
+        parser reader(i, cb);
+        return reader.parse();
+    }
+
+    static basic_json parse(std::istream&& i, parser_callback_t cb = nullptr)
+    {
+        // overload accepting a temporary stream; inside the function `i` is a
+        // named object and is handed to the parser like any other stream
+        parser reader(i, cb);
+        return reader.parse();
+    }
+
+    /*!
+    @brief deserialize from stream
+
+    Deserializes an input stream to a JSON value.
+
+    @param[in,out] i  input stream to read a serialized JSON value from
+    @param[in,out] j  JSON value to write the deserialized input to
+
+    @throw std::invalid_argument in case of parse errors
+
+    @complexity Linear in the length of the input. The parser is a predictive
+    LL(1) parser.
+
+    @liveexample{The example below shows how a JSON value is constructed by
+    reading a serialization from a stream.,operator_deserialize}
+
+    @sa parse(std::istream&, parser_callback_t) for a variant with a parser
+    callback function to filter values while parsing
+    */
+    friend std::istream& operator<<(basic_json& j, std::istream& i)
+    {
+        // parse the stream without a callback and store the result in j
+        parser reader(i);
+        j = reader.parse();
+        return i;
+    }
+
+    /*!
+    @brief deserialize from stream
+    @copydoc operator<<(basic_json&, std::istream&)
+    */
+    friend std::istream& operator>>(std::istream& i, basic_json& j)
+    {
+        // parse the stream without a callback and store the result in j
+        parser reader(i);
+        j = reader.parse();
+        return i;
+    }
+
+    /// @}
+
+
+  private:
+    ///////////////////////////
+    // convenience functions //
+    ///////////////////////////
+
+    /// return the type as string
+    string_t type_name() const
+    {
+        switch (m_type)
+        {
+            case (value_t::null):
+            {
+                return "null";
+            }
+
+            case (value_t::object):
+            {
+                return "object";
+            }
+
+            case (value_t::array):
+            {
+                return "array";
+            }
+
+            case (value_t::string):
+            {
+                return "string";
+            }
+
+            case (value_t::boolean):
+            {
+                return "boolean";
+            }
+
+            case (value_t::discarded):
+            {
+                return "discarded";
+            }
+
+            default:
+            {
+                return "number";
+            }
+        }
+    }
+
+    /*!
+    @brief calculates the extra space to escape a JSON string
+
+    @param[in] s  the string to escape
+    @return the number of characters required to escape string @a s
+
+    @complexity Linear in the length of string @a s.
+    */
+    static std::size_t extra_space(const string_t& s) noexcept
+    {
+        std::size_t result = 0;
+
+        for (const auto& c : s)
+        {
+            switch (c)
+            {
+                case '"':
+                case '\\':
+                case '\b':
+                case '\f':
+                case '\n':
+                case '\r':
+                case '\t':
+                {
+                    // from c (1 byte) to \x (2 bytes)
+                    result += 1;
+                    break;
+                }
+
+                default:
+                {
+                    if (c >= 0x00 and c <= 0x1f)
+                    {
+                        // from c (1 byte) to \uxxxx (6 bytes)
+                        result += 5;
+                    }
+                    break;
+                }
+            }
+        }
+
+        return result;
+    }
+
+    /*!
+    @brief escape a string
+
+    Escape a string by replacing certain special characters by a sequence of an
+    escape character (backslash) and another character and other control
+    characters by a sequence of "\u" followed by a four-digit hex
+    representation.
+
+    @param[in] s  the string to escape
+    @return  the escaped string
+
+    @complexity Linear in the length of string @a s.
+    */
+    static string_t escape_string(const string_t& s) noexcept
+    {
+        const auto space = extra_space(s);
+        if (space == 0)
+        {
+            return s;
+        }
+
+        // create a result string of necessary size
+        string_t result(s.size() + space, '\\');
+        std::size_t pos = 0;
+
+        for (const auto& c : s)
+        {
+            switch (c)
+            {
+                // quotation mark (0x22)
+                case '"':
+                {
+                    result[pos + 1] = '"';
+                    pos += 2;
+                    break;
+                }
+
+                // reverse solidus (0x5c)
+                case '\\':
+                {
+                    // nothing to change
+                    pos += 2;
+                    break;
+                }
+
+                // backspace (0x08)
+                case '\b':
+                {
+                    result[pos + 1] = 'b';
+                    pos += 2;
+                    break;
+                }
+
+                // formfeed (0x0c)
+                case '\f':
+                {
+                    result[pos + 1] = 'f';
+                    pos += 2;
+                    break;
+                }
+
+                // newline (0x0a)
+                case '\n':
+                {
+                    result[pos + 1] = 'n';
+                    pos += 2;
+                    break;
+                }
+
+                // carriage return (0x0d)
+                case '\r':
+                {
+                    result[pos + 1] = 'r';
+                    pos += 2;
+                    break;
+                }
+
+                // horizontal tab (0x09)
+                case '\t':
+                {
+                    result[pos + 1] = 't';
+                    pos += 2;
+                    break;
+                }
+
+                default:
+                {
+                    if (c >= 0x00 and c <= 0x1f)
+                    {
+                        // print character c as \uxxxx
+                        sprintf(&result[pos + 1], "u%04x", int(c));
+                        pos += 6;
+                        // overwrite trailing null character
+                        result[pos] = '\\';
+                    }
+                    else
+                    {
+                        // all other characters are added as-is
+                        result[pos++] = c;
+                    }
+                    break;
+                }
+            }
+        }
+
+        return result;
+    }
+
+    /*!
+    @brief internal implementation of the serialization function
+
+    This function is called by the public member function dump and organizes
+    the serializaion internally. The indentation level is propagated as
+    additional parameter. In case of arrays and objects, the function is called
+    recursively. Note that
+
+    - strings and object keys are escaped using escape_string()
+    - integer numbers are converted implictly via operator<<
+    - floating-point numbers are converted to a string using "%g" format
+
+    @param[out] o              stream to write to
+    @param[in] pretty_print    whether the output shall be pretty-printed
+    @param[in] indent_step     the indent level
+    @param[in] current_indent  the current indent level (only used internally)
+    */
+    void dump(std::ostream& o, const bool pretty_print, const unsigned int indent_step,
+              const unsigned int current_indent = 0) const
+    {
+        // variable to hold indentation for recursive calls
+        unsigned int new_indent = current_indent;
+
+        switch (m_type)
+        {
+            case (value_t::object):
+            {
+                if (m_value.object->empty())
+                {
+                    o << "{}";
+                    return;
+                }
+
+                o << "{";
+
+                // increase indentation
+                if (pretty_print)
+                {
+                    new_indent += indent_step;
+                    o << "\n";
+                }
+
+                for (auto i = m_value.object->cbegin(); i != m_value.object->cend(); ++i)
+                {
+                    if (i != m_value.object->cbegin())
+                    {
+                        o << (pretty_print ? ",\n" : ",");
+                    }
+                    o << string_t(new_indent, ' ') << "\""
+                      << escape_string(i->first) << "\":"
+                      << (pretty_print ? " " : "");
+                    i->second.dump(o, pretty_print, indent_step, new_indent);
+                }
+
+                // decrease indentation
+                if (pretty_print)
+                {
+                    new_indent -= indent_step;
+                    o << "\n";
+                }
+
+                o << string_t(new_indent, ' ') + "}";
+                return;
+            }
+
+            case (value_t::array):
+            {
+                if (m_value.array->empty())
+                {
+                    o << "[]";
+                    return;
+                }
+
+                o << "[";
+
+                // increase indentation
+                if (pretty_print)
+                {
+                    new_indent += indent_step;
+                    o << "\n";
+                }
+
+                for (auto i = m_value.array->cbegin(); i != m_value.array->cend(); ++i)
+                {
+                    if (i != m_value.array->cbegin())
+                    {
+                        o << (pretty_print ? ",\n" : ",");
+                    }
+                    o << string_t(new_indent, ' ');
+                    i->dump(o, pretty_print, indent_step, new_indent);
+                }
+
+                // decrease indentation
+                if (pretty_print)
+                {
+                    new_indent -= indent_step;
+                    o << "\n";
+                }
+
+                o << string_t(new_indent, ' ') << "]";
+                return;
+            }
+
+            case (value_t::string):
+            {
+                o << string_t("\"") << escape_string(*m_value.string) << "\"";
+                return;
+            }
+
+            case (value_t::boolean):
+            {
+                o << (m_value.boolean ? "true" : "false");
+                return;
+            }
+
+            case (value_t::number_integer):
+            {
+                o << m_value.number_integer;
+                return;
+            }
+
+            case (value_t::number_float):
+            {
+                // 15 digits of precision allows round-trip IEEE 754
+                // string->double->string; to be safe, we read this value from
+                // std::numeric_limits<number_float_t>::digits10
+                o << std::setprecision(std::numeric_limits<number_float_t>::digits10) << m_value.number_float;
+                return;
+            }
+
+            case (value_t::discarded):
+            {
+                o << "<discarded>";
+                return;
+            }
+
+            default:
+            {
+                o << "null";
+                return;
+            }
+        }
+    }
+
+  private:
+    //////////////////////
+    // member variables //
+    //////////////////////
+
+    /// the type of the current element
+    value_t m_type = value_t::null;
+
+    /// the value of the current element
+    json_value m_value = {};
+
+
+  private:
+    ///////////////
+    // iterators //
+    ///////////////
+
+    /*!
+    @brief an iterator for primitive JSON types
+
+    This class models an iterator for primitive JSON types (boolean, number,
+    string). Its only purpose is to allow the iterator/const_iterator classes
+    to "iterate" over primitive values. Internally, the iterator is modeled by
+    a `difference_type` variable. Value begin_value (`0`) models the begin,
+    end_value (`1`) models past the end.
+    */
+    class primitive_iterator_t
+    {
+      public:
+        /// set iterator to a defined beginning
+        void set_begin()
+        {
+            m_it = begin_value;
+        }
+
+        /// set iterator to a defined past the end
+        void set_end()
+        {
+            m_it = end_value;
+        }
+
+        /// return whether the iterator can be dereferenced
+        bool is_begin() const
+        {
+            return (m_it == begin_value);
+        }
+
+        /// return whether the iterator is at end
+        bool is_end() const
+        {
+            return (m_it == end_value);
+        }
+
+        /// return reference to the value to change and compare
+        operator difference_type& ()
+        {
+            return m_it;
+        }
+
+        /// return value to compare
+        operator difference_type () const
+        {
+            return m_it;
+        }
+
+      private:
+        static constexpr difference_type begin_value = 0;
+        static constexpr difference_type end_value = begin_value + 1;
+
+        /// iterator as signed integer type
+        difference_type m_it = std::numeric_limits<std::ptrdiff_t>::min();
+    };
+
+    /*!
+    @brief an iterator value
+
+    @note This structure could easily be a union, but MSVC currently does not
+    allow unions members with complex constructors, see
+    https://github.com/nlohmann/json/pull/105.
+    */
+    struct internal_iterator
+    {
+        /// iterator for JSON objects
+        typename object_t::iterator object_iterator;
+        /// iterator for JSON arrays
+        typename array_t::iterator array_iterator;
+        /// generic iterator for all other types
+        primitive_iterator_t primitive_iterator;
+
+        /// create an uninitialized internal_iterator
+        internal_iterator()
+            : object_iterator(), array_iterator(), primitive_iterator()
+        {}
+    };
+
+  public:
+    /*!
+    @brief a const random access iterator for the @ref basic_json class
+
+    This class implements a const iterator for the @ref basic_json class. From
+    this class, the @ref iterator class is derived.
+
+    @requirement The class satisfies the following concept requirements:
+    - [RandomAccessIterator](http://en.cppreference.com/w/cpp/concept/RandomAccessIterator):
+      The iterator that can be moved to point (forward and backward) to any
+      element in constant time.
+    */
+    class const_iterator : public std::iterator<std::random_access_iterator_tag, const basic_json>
+    {
+        /// allow basic_json to access private members
+        friend class basic_json;
+
+      public:
+        /// the type of the values when the iterator is dereferenced
+        using value_type = typename basic_json::value_type;
+        /// a type to represent differences between iterators
+        using difference_type = typename basic_json::difference_type;
+        /// defines a pointer to the type iterated over (value_type)
+        using pointer = typename basic_json::const_pointer;
+        /// defines a reference to the type iterated over (value_type)
+        using reference = typename basic_json::const_reference;
+        /// the category of the iterator
+        using iterator_category = std::bidirectional_iterator_tag;
+
+        /// default constructor
+        const_iterator() = default;
+
+        /// constructor for a given JSON instance
+        const_iterator(pointer object) : m_object(object)
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    m_it.object_iterator = typename object_t::iterator();
+                    break;
+                }
+                case (basic_json::value_t::array):
+                {
+                    m_it.array_iterator = typename array_t::iterator();
+                    break;
+                }
+                default:
+                {
+                    m_it.primitive_iterator = primitive_iterator_t();
+                    break;
+                }
+            }
+        }
+
+        /// copy constructor given a nonconst iterator
+        const_iterator(const iterator& other) : m_object(other.m_object)
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    m_it.object_iterator = other.m_it.object_iterator;
+                    break;
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    m_it.array_iterator = other.m_it.array_iterator;
+                    break;
+                }
+
+                default:
+                {
+                    m_it.primitive_iterator = other.m_it.primitive_iterator;
+                    break;
+                }
+            }
+        }
+
+        /// copy constructor
+        const_iterator(const const_iterator& other) noexcept
+            : m_object(other.m_object), m_it(other.m_it)
+        {}
+
+        /// copy assignment
+        const_iterator& operator=(const_iterator other) noexcept(
+            std::is_nothrow_move_constructible<pointer>::value and
+            std::is_nothrow_move_assignable<pointer>::value and
+            std::is_nothrow_move_constructible<internal_iterator>::value and
+            std::is_nothrow_move_assignable<internal_iterator>::value
+        )
+        {
+            std::swap(m_object, other.m_object);
+            std::swap(m_it, other.m_it);
+            return *this;
+        }
+
+      private:
+        /// set the iterator to the first value
+        void set_begin()
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    m_it.object_iterator = m_object->m_value.object->begin();
+                    break;
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    m_it.array_iterator = m_object->m_value.array->begin();
+                    break;
+                }
+
+                case (basic_json::value_t::null):
+                {
+                    // set to end so begin()==end() is true: null is empty
+                    m_it.primitive_iterator.set_end();
+                    break;
+                }
+
+                default:
+                {
+                    m_it.primitive_iterator.set_begin();
+                    break;
+                }
+            }
+        }
+
+        /// set the iterator past the last value
+        void set_end()
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    m_it.object_iterator = m_object->m_value.object->end();
+                    break;
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    m_it.array_iterator = m_object->m_value.array->end();
+                    break;
+                }
+
+                default:
+                {
+                    m_it.primitive_iterator.set_end();
+                    break;
+                }
+            }
+        }
+
+      public:
+        /// return a reference to the value pointed to by the iterator
+        reference operator*() const
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    return m_it.object_iterator->second;
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    return *m_it.array_iterator;
+                }
+
+                case (basic_json::value_t::null):
+                {
+                    throw std::out_of_range("cannot get value");
+                }
+
+                default:
+                {
+                    if (m_it.primitive_iterator.is_begin())
+                    {
+                        return *m_object;
+                    }
+                    else
+                    {
+                        throw std::out_of_range("cannot get value");
+                    }
+                }
+            }
+        }
+
+        /// dereference the iterator
+        pointer operator->() const
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    return &(m_it.object_iterator->second);
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    return &*m_it.array_iterator;
+                }
+
+                default:
+                {
+                    if (m_it.primitive_iterator.is_begin())
+                    {
+                        return m_object;
+                    }
+                    else
+                    {
+                        throw std::out_of_range("cannot get value");
+                    }
+                }
+            }
+        }
+
+        /// post-increment (it++)
+        const_iterator operator++(int)
+        {
+            auto result = *this;
+            ++(*this);
+
+            return result;
+        }
+
+        /// pre-increment (++it)
+        const_iterator& operator++()
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    ++m_it.object_iterator;
+                    break;
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    ++m_it.array_iterator;
+                    break;
+                }
+
+                default:
+                {
+                    ++m_it.primitive_iterator;
+                    break;
+                }
+            }
+
+            return *this;
+        }
+
+        /// post-decrement (it--): step back and hand back the previous position
+        const_iterator operator--(int)
+        {
+            const_iterator previous(*this);
+            operator--();
+            return previous;
+        }
+
+        /// pre-decrement (--it)
+        const_iterator& operator--()
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    --m_it.object_iterator;
+                    break;
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    --m_it.array_iterator;
+                    break;
+                }
+
+                default:
+                {
+                    --m_it.primitive_iterator;
+                    break;
+                }
+            }
+
+            return *this;
+        }
+
+        /// comparison: equal
+        bool operator==(const const_iterator& other) const
+        {
+            // if objects are not the same, the comparison is undefined
+            if (m_object != other.m_object)
+            {
+                throw std::domain_error("cannot compare iterators of different containers");
+            }
+
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    return (m_it.object_iterator == other.m_it.object_iterator);
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    return (m_it.array_iterator == other.m_it.array_iterator);
+                }
+
+                default:
+                {
+                    return (m_it.primitive_iterator == other.m_it.primitive_iterator);
+                }
+            }
+        }
+
+        /// comparison: not equal (negation of operator==, including its exception)
+        bool operator!=(const const_iterator& other) const
+        {
+            const bool equal = operator==(other);
+            return not equal;
+        }
+
+        /// comparison: smaller
+        bool operator<(const const_iterator& other) const
+        {
+            // if objects are not the same, the comparison is undefined
+            if (m_object != other.m_object)
+            {
+                throw std::domain_error("cannot compare iterators of different containers");
+            }
+
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    throw std::domain_error("cannot use operator< for object iterators");
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    return (m_it.array_iterator < other.m_it.array_iterator);
+                }
+
+                default:
+                {
+                    return (m_it.primitive_iterator < other.m_it.primitive_iterator);
+                }
+            }
+        }
+
+        /// comparison: less than or equal (true unless @a other is smaller than this)
+        bool operator<=(const const_iterator& other) const
+        {
+            const bool other_is_smaller = other.operator<(*this);
+            return not other_is_smaller;
+        }
+
+        /// comparison: greater than (negation of operator<=)
+        bool operator>(const const_iterator& other) const
+        {
+            const bool less_or_equal = operator<=(other);
+            return not less_or_equal;
+        }
+
+        /// comparison: greater than or equal (negation of operator<)
+        bool operator>=(const const_iterator& other) const
+        {
+            const bool smaller = operator<(other);
+            return not smaller;
+        }
+
+        /// add to iterator
+        const_iterator& operator+=(difference_type i)
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    throw std::domain_error("cannot use operator+= for object iterators");
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    m_it.array_iterator += i;
+                    break;
+                }
+
+                default:
+                {
+                    m_it.primitive_iterator += i;
+                    break;
+                }
+            }
+
+            return *this;
+        }
+
+        /// subtract from iterator: implemented as an addition of the negated offset
+        const_iterator& operator-=(difference_type i)
+        {
+            const auto negated = -i;
+            return operator+=(negated);
+        }
+
+        /*!
+        @brief add to iterator
+
+        Works on a copy, so the function is const and can be used on constant
+        iterator objects as well.
+
+        @param[in] i  number of positions to advance (may be negative)
+        @return a copy of this iterator moved by @a i positions
+        @throw std::domain_error if the iterator belongs to an object (raised
+               by operator+=)
+        */
+        const_iterator operator+(difference_type i) const
+        {
+            auto result = *this;
+            result += i;
+            return result;
+        }
+
+        /*!
+        @brief subtract from iterator
+
+        Works on a copy, so the function is const and can be used on constant
+        iterator objects as well.
+
+        @param[in] i  number of positions to step back (may be negative)
+        @return a copy of this iterator moved back by @a i positions
+        @throw std::domain_error if the iterator belongs to an object (raised
+               by operator+=, which operator-= forwards to)
+        */
+        const_iterator operator-(difference_type i) const
+        {
+            auto result = *this;
+            result -= i;
+            return result;
+        }
+
+        /// return difference
+        difference_type operator-(const const_iterator& other) const
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    throw std::domain_error("cannot use operator- for object iterators");
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    return m_it.array_iterator - other.m_it.array_iterator;
+                }
+
+                default:
+                {
+                    return m_it.primitive_iterator - other.m_it.primitive_iterator;
+                }
+            }
+        }
+
+        /// access to successor
+        reference operator[](difference_type n) const
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    throw std::domain_error("cannot use operator[] for object iterators");
+                }
+
+                case (basic_json::value_t::array):
+                {
+                    return *(m_it.array_iterator + n);
+                }
+
+                case (basic_json::value_t::null):
+                {
+                    throw std::out_of_range("cannot get value");
+                }
+
+                default:
+                {
+                    if (m_it.primitive_iterator == -n)
+                    {
+                        return *m_object;
+                    }
+                    else
+                    {
+                        throw std::out_of_range("cannot get value");
+                    }
+                }
+            }
+        }
+
+        /// return the key of an object iterator
+        typename object_t::key_type key() const
+        {
+            switch (m_object->m_type)
+            {
+                case (basic_json::value_t::object):
+                {
+                    return m_it.object_iterator->first;
+                }
+
+                default:
+                {
+                    throw std::domain_error("cannot use key() for non-object iterators");
+                }
+            }
+        }
+
+        /// return the value of an iterator (named alternative to operator*)
+        reference value() const
+        {
+            return this->operator*();
+        }
+
+      private:
+        /// associated JSON instance
+        pointer m_object = nullptr;
+        /// the actual iterator of the associated instance
+        internal_iterator m_it = internal_iterator();
+    };
+
+    /*!
+    @brief a mutable random access iterator for the @ref basic_json class
+
+    @requirement The class satisfies the following concept requirements:
+    - [RandomAccessIterator](http://en.cppreference.com/w/cpp/concept/RandomAccessIterator):
+      The iterator that can be moved to point (forward and backward) to any
+      element in constant time.
+    - [OutputIterator](http://en.cppreference.com/w/cpp/concept/OutputIterator):
+      It is possible to write to the pointed-to element.
+    */
+    class iterator : public const_iterator
+    {
+      public:
+        using base_iterator = const_iterator;
+        using pointer = typename basic_json::pointer;
+        using reference = typename basic_json::reference;
+
+        /// default constructor
+        iterator() = default;
+
+        /// constructor for a given JSON instance
+        iterator(pointer object) noexcept : base_iterator(object)
+        {}
+
+        /// copy constructor
+        iterator(const iterator& other) noexcept
+            : base_iterator(other)
+        {}
+
+        /// copy assignment
+        iterator& operator=(iterator other) noexcept(
+            std::is_nothrow_move_constructible<pointer>::value and
+            std::is_nothrow_move_assignable<pointer>::value and
+            std::is_nothrow_move_constructible<internal_iterator>::value and
+            std::is_nothrow_move_assignable<internal_iterator>::value
+        )
+        {
+            base_iterator::operator=(other);
+            return *this;
+        }
+
+        /// return a reference to the value pointed to by the iterator
+        reference operator*()
+        {
+            return const_cast<reference>(base_iterator::operator*());
+        }
+
+        /// dereference the iterator
+        pointer operator->()
+        {
+            return const_cast<pointer>(base_iterator::operator->());
+        }
+
+        /// post-increment (it++)
+        iterator operator++(int)
+        {
+            iterator result = *this;
+            base_iterator::operator++();
+            return result;
+        }
+
+        /// pre-increment (++it)
+        iterator& operator++()
+        {
+            base_iterator::operator++();
+            return *this;
+        }
+
+        /// post-decrement (it--)
+        iterator operator--(int)
+        {
+            iterator result = *this;
+            base_iterator::operator--();
+            return result;
+        }
+
+        /// pre-decrement (--it)
+        iterator& operator--()
+        {
+            base_iterator::operator--();
+            return *this;
+        }
+
+        /// add to iterator
+        iterator& operator+=(difference_type i)
+        {
+            base_iterator::operator+=(i);
+            return *this;
+        }
+
+        /// subtract from iterator
+        iterator& operator-=(difference_type i)
+        {
+            base_iterator::operator-=(i);
+            return *this;
+        }
+
+        /// add to iterator
+        iterator operator+(difference_type i)
+        {
+            auto result = *this;
+            result += i;
+            return result;
+        }
+
+        /// subtract from iterator
+        iterator operator-(difference_type i)
+        {
+            auto result = *this;
+            result -= i;
+            return result;
+        }
+
+        difference_type operator-(const iterator& other) const
+        {
+            return base_iterator::operator-(other);
+        }
+
+        /// access to successor
+        reference operator[](difference_type n) const
+        {
+            return const_cast<reference>(base_iterator::operator[](n));
+        }
+
+        /// return the value of an iterator
+        reference value() const
+        {
+            return const_cast<reference>(base_iterator::value());
+        }
+    };
+
+    /*!
+    @brief a template for a reverse iterator class
+
+    @tparam Base the base iterator type to reverse. Valid types are @ref
+    iterator (to create @ref reverse_iterator) and @ref const_iterator (to
+    create @ref const_reverse_iterator).
+
+    @requirement The class satisfies the following concept requirements:
+    - [RandomAccessIterator](http://en.cppreference.com/w/cpp/concept/RandomAccessIterator):
+      The iterator that can be moved to point (forward and backward) to any
+      element in constant time.
+    - [OutputIterator](http://en.cppreference.com/w/cpp/concept/OutputIterator):
+      It is possible to write to the pointed-to element (only if @a Base is
+      @ref iterator).
+    */
+    template<typename Base>
+    class json_reverse_iterator : public std::reverse_iterator<Base>
+    {
+      public:
+        /// shortcut to the reverse iterator adaptor
+        using base_iterator = std::reverse_iterator<Base>;
+        /// the reference type for the pointed-to element
+        using reference = typename Base::reference;
+
+        /// create reverse iterator from iterator
+        json_reverse_iterator(const typename base_iterator::iterator_type& it)
+            : base_iterator(it) {}
+
+        /// create reverse iterator from base class
+        json_reverse_iterator(const base_iterator& it) : base_iterator(it) {}
+
+        /// post-increment (it++)
+        json_reverse_iterator operator++(int)
+        {
+            return base_iterator::operator++(1);
+        }
+
+        /// pre-increment (++it)
+        json_reverse_iterator& operator++()
+        {
+            base_iterator::operator++();
+            return *this;
+        }
+
+        /// post-decrement (it--)
+        json_reverse_iterator operator--(int)
+        {
+            return base_iterator::operator--(1);
+        }
+
+        /// pre-decrement (--it)
+        json_reverse_iterator& operator--()
+        {
+            base_iterator::operator--();
+            return *this;
+        }
+
+        /// add to iterator
+        json_reverse_iterator& operator+=(difference_type i)
+        {
+            base_iterator::operator+=(i);
+            return *this;
+        }
+
+        /// add to iterator
+        json_reverse_iterator operator+(difference_type i) const
+        {
+            auto result = *this;
+            result += i;
+            return result;
+        }
+
+        /// subtract from iterator
+        json_reverse_iterator operator-(difference_type i) const
+        {
+            auto result = *this;
+            result -= i;
+            return result;
+        }
+
+        /// return difference
+        difference_type operator-(const json_reverse_iterator& other) const
+        {
+            return this->base() - other.base();
+        }
+
+        /// access to successor
+        reference operator[](difference_type n) const
+        {
+            return *(this->operator+(n));
+        }
+
+        /// return the key of an object iterator
+        typename object_t::key_type key() const
+        {
+            auto it = --this->base();
+            return it.key();
+        }
+
+        /// return the value of an iterator
+        reference value() const
+        {
+            auto it = --this->base();
+            return it.operator * ();
+        }
+    };
+
+    /*!
+    @brief wrapper to access iterator member functions in range-based for
+
+    This class allows to access @ref key() and @ref value() during range-based
+    for loops. In these loops, a reference to the JSON values is returned, so
+    there is no access to the underlying iterator.
+    */
+    class iterator_wrapper
+    {
+      private:
+        /// the container to iterate
+        basic_json& container;
+        /// the type of the iterator to use while iteration
+        using json_iterator = decltype(std::begin(container));
+
+        /// internal iterator wrapper
+        class iterator_wrapper_internal
+        {
+          private:
+            /// the iterator
+            json_iterator anchor;
+            /// an index for arrays
+            size_t array_index = 0;
+
+          public:
+            /// construct wrapper given an iterator
+            iterator_wrapper_internal(json_iterator i) : anchor(i)
+            {}
+
+            /// dereference operator (needed for range-based for)
+            iterator_wrapper_internal& operator*()
+            {
+                return *this;
+            }
+
+            /// increment operator (needed for range-based for)
+            iterator_wrapper_internal& operator++()
+            {
+                ++anchor;
+                ++array_index;
+
+                return *this;
+            }
+
+            /// inequality operator (needed for range-based for)
+            bool operator!= (const iterator_wrapper_internal& o)
+            {
+                return anchor != o.anchor;
+            }
+
+            /// stream operator
+            friend std::ostream& operator<<(std::ostream& o, const iterator_wrapper_internal& w)
+            {
+                return o << w.value();
+            }
+
+            /// return key of the iterator
+            typename basic_json::string_t key() const
+            {
+                switch (anchor.m_object->type())
+                {
+                    /// use integer array index as key
+                    case (value_t::array):
+                    {
+                        return std::to_string(array_index);
+                    }
+
+                    /// use key from the object
+                    case (value_t::object):
+                    {
+                        return anchor.key();
+                    }
+
+                    /// use an empty key for all primitive types
+                    default:
+                    {
+                        return "";
+                    }
+                }
+            }
+
+            /// return value of the iterator
+            typename json_iterator::reference value() const
+            {
+                return anchor.value();
+            }
+        };
+
+      public:
+        /// construct iterator wrapper from a container
+        iterator_wrapper(basic_json& cont)
+            : container(cont)
+        {}
+
+        /// return iterator begin (needed for range-based for)
+        iterator_wrapper_internal begin()
+        {
+            return iterator_wrapper_internal(container.begin());
+        }
+
+        /// return iterator end (needed for range-based for)
+        iterator_wrapper_internal end()
+        {
+            return iterator_wrapper_internal(container.end());
+        }
+    };
+
+  private:
+    //////////////////////
+    // lexer and parser //
+    //////////////////////
+
+    /*!
+    @brief lexical analysis
+
+    This class organizes the lexical analysis during JSON deserialization. The
+    core of it is a scanner generated by re2c <http://re2c.org> that processes
+    a buffer and recognizes tokens according to RFC 7159.
+    */
+    class lexer
+    {
+      public:
+        /// token types for the parser; this is the result type of scan(), and
+        /// token_type_name() maps each value to a printable name
+        enum class token_type
+        {
+            uninitialized,    ///< indicating the scanner is uninitialized
+            literal_true,     ///< the "true" literal
+            literal_false,    ///< the "false" literal
+            literal_null,     ///< the "null" literal
+            value_string,     ///< a string - use get_string() for actual value
+            value_number,     ///< a number - use get_number() for actual value
+            begin_array,      ///< the character for array begin "["
+            begin_object,     ///< the character for object begin "{"
+            end_array,        ///< the character for array end "]"
+            end_object,       ///< the character for object end "}"
+            name_separator,   ///< the name separator ":"
+            value_separator,  ///< the value separator ","
+            parse_error,      ///< indicating a parse error
+            end_of_input      ///< indicating the end of the input buffer
+        };
+
+        /// the char type to use in the lexer
+        using lexer_char_t = unsigned char;
+
+        /// constructor with a given buffer
+        explicit lexer(const string_t& s) noexcept
+            : m_stream(nullptr), m_buffer(s)
+        {
+            m_content = reinterpret_cast<const lexer_char_t*>(s.c_str());
+            m_start = m_cursor = m_content;
+            m_limit = m_content + s.size();
+        }
+        explicit lexer(std::istream* s) noexcept
+            : m_stream(s), m_buffer()
+        {
+            getline(*m_stream, m_buffer);
+            m_content = reinterpret_cast<const lexer_char_t*>(m_buffer.c_str());
+            m_start = m_cursor = m_content;
+            m_limit = m_content + m_buffer.size();
+        }
+
+        /// default constructor
+        lexer() = default;
+
+        // switch off unwanted functions
+        lexer(const lexer&) = delete;
+        lexer operator=(const lexer&) = delete;
+
+        /*!
+        @brief create a string from a Unicode code point
+
+        @param[in] codepoint1  the code point (can be high surrogate)
+        @param[in] codepoint2  the code point (can be low surrogate or 0)
+        @return string representation of the code point
+        @throw std::out_of_range if code point is >0x10ffff
+        @throw std::invalid_argument if the low surrogate is invalid
+
+        @see <http://en.wikipedia.org/wiki/UTF-8#Sample_code>
+        */
+        static string_t to_unicode(const std::size_t codepoint1,
+                                   const std::size_t codepoint2 = 0)
+        {
+            string_t result;
+
+            // calculate the codepoint from the given code points
+            std::size_t codepoint = codepoint1;
+
+            // check if codepoint1 is a high surrogate
+            if (codepoint1 >= 0xD800 and codepoint1 <= 0xDBFF)
+            {
+                // check if codepoint2 is a low surrogate
+                if (codepoint2 >= 0xDC00 and codepoint2 <= 0xDFFF)
+                {
+                    codepoint =
+                        // high surrogate occupies the most significant 22 bits
+                        (codepoint1 << 10)
+                        // low surrogate occupies the least significant 15 bits
+                        + codepoint2
+                        // there is still the 0xD800, 0xDC00 and 0x10000 noise
+                        // in the result so we have to substract with:
+                        // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
+                        - 0x35FDC00;
+                }
+                else
+                {
+                    throw std::invalid_argument("missing or wrong low surrogate");
+                }
+            }
+
+            if (codepoint < 0x80)
+            {
+                // 1-byte characters: 0xxxxxxx (ASCII)
+                result.append(1, static_cast<typename string_t::value_type>(codepoint));
+            }
+            else if (codepoint <= 0x7ff)
+            {
+                // 2-byte characters: 110xxxxx 10xxxxxx
+                result.append(1, static_cast<typename string_t::value_type>(0xC0 | ((codepoint >> 6) & 0x1F)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
+            }
+            else if (codepoint <= 0xffff)
+            {
+                // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
+                result.append(1, static_cast<typename string_t::value_type>(0xE0 | ((codepoint >> 12) & 0x0F)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
+            }
+            else if (codepoint <= 0x10ffff)
+            {
+                // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                result.append(1, static_cast<typename string_t::value_type>(0xF0 | ((codepoint >> 18) & 0x07)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 12) & 0x3F)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
+            }
+            else
+            {
+                throw std::out_of_range("code points above 0x10FFFF are invalid");
+            }
+
+            return result;
+        }
+
+        /// return name of values of type token_type
+        static std::string token_type_name(token_type t)
+        {
+            switch (t)
+            {
+                case (token_type::uninitialized):
+                    return "<uninitialized>";
+                case (token_type::literal_true):
+                    return "true literal";
+                case (token_type::literal_false):
+                    return "false literal";
+                case (token_type::literal_null):
+                    return "null literal";
+                case (token_type::value_string):
+                    return "string literal";
+                case (token_type::value_number):
+                    return "number literal";
+                case (token_type::begin_array):
+                    return "[";
+                case (token_type::begin_object):
+                    return "{";
+                case (token_type::end_array):
+                    return "]";
+                case (token_type::end_object):
+                    return "}";
+                case (token_type::name_separator):
+                    return ":";
+                case (token_type::value_separator):
+                    return ",";
+                case (token_type::end_of_input):
+                    return "<end of input>";
+                default:
+                    return "<parse error>";
+            }
+        }
+
+        /*!
+        This function implements a scanner for JSON. It is specified using
+        regular expressions that try to follow RFC 7159 as close as possible.
+        These regular expressions are then translated into a deterministic
+        finite automaton (DFA) by the tool re2c <http://re2c.org>. As a result,
+        the translated code for this function consists of a large block of code
+        with goto jumps.
+
+        @return the class of the next token read from the buffer
+        */
+        token_type scan() noexcept
+        {
+            // pointer for backtracking information
+            m_marker = nullptr;
+
+            // remember the begin of the token
+            m_start = m_cursor;
+
+
+            {
+                lexer_char_t yych;
+                unsigned int yyaccept = 0;
+                static const unsigned char yybm[] =
+                {
+                    0,   0,   0,   0,   0,   0,   0,   0,
+                    0,  32,  32,   0,   0,  32,   0,   0,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    96,  64,   0,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    192, 192, 192, 192, 192, 192, 192, 192,
+                    192, 192,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,   0,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                    64,  64,  64,  64,  64,  64,  64,  64,
+                };
+
+                if ((m_limit - m_cursor) < 5)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+                if (yych <= '9')
+                {
+                    if (yych <= ' ')
+                    {
+                        if (yych <= '\n')
+                        {
+                            if (yych <= 0x00)
+                            {
+                                goto basic_json_parser_27;
+                            }
+                            if (yych <= 0x08)
+                            {
+                                goto basic_json_parser_29;
+                            }
+                            if (yych >= '\n')
+                            {
+                                goto basic_json_parser_4;
+                            }
+                        }
+                        else
+                        {
+                            if (yych == '\r')
+                            {
+                                goto basic_json_parser_2;
+                            }
+                            if (yych <= 0x1F)
+                            {
+                                goto basic_json_parser_29;
+                            }
+                        }
+                    }
+                    else
+                    {
+                        if (yych <= ',')
+                        {
+                            if (yych == '"')
+                            {
+                                goto basic_json_parser_26;
+                            }
+                            if (yych <= '+')
+                            {
+                                goto basic_json_parser_29;
+                            }
+                            goto basic_json_parser_14;
+                        }
+                        else
+                        {
+                            if (yych <= '-')
+                            {
+                                goto basic_json_parser_22;
+                            }
+                            if (yych <= '/')
+                            {
+                                goto basic_json_parser_29;
+                            }
+                            if (yych <= '0')
+                            {
+                                goto basic_json_parser_23;
+                            }
+                            goto basic_json_parser_25;
+                        }
+                    }
+                }
+                else
+                {
+                    if (yych <= 'm')
+                    {
+                        if (yych <= '\\')
+                        {
+                            if (yych <= ':')
+                            {
+                                goto basic_json_parser_16;
+                            }
+                            if (yych == '[')
+                            {
+                                goto basic_json_parser_6;
+                            }
+                            goto basic_json_parser_29;
+                        }
+                        else
+                        {
+                            if (yych <= ']')
+                            {
+                                goto basic_json_parser_8;
+                            }
+                            if (yych == 'f')
+                            {
+                                goto basic_json_parser_21;
+                            }
+                            goto basic_json_parser_29;
+                        }
+                    }
+                    else
+                    {
+                        if (yych <= 'z')
+                        {
+                            if (yych <= 'n')
+                            {
+                                goto basic_json_parser_18;
+                            }
+                            if (yych == 't')
+                            {
+                                goto basic_json_parser_20;
+                            }
+                            goto basic_json_parser_29;
+                        }
+                        else
+                        {
+                            if (yych <= '{')
+                            {
+                                goto basic_json_parser_10;
+                            }
+                            if (yych == '}')
+                            {
+                                goto basic_json_parser_12;
+                            }
+                            goto basic_json_parser_29;
+                        }
+                    }
+                }
+basic_json_parser_2:
+                ++m_cursor;
+                yych = *m_cursor;
+                goto basic_json_parser_5;
+basic_json_parser_3:
+                {
+                    return scan();
+                }
+basic_json_parser_4:
+                ++m_cursor;
+                if (m_limit <= m_cursor)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+basic_json_parser_5:
+                if (yybm[0 + yych] & 32)
+                {
+                    goto basic_json_parser_4;
+                }
+                goto basic_json_parser_3;
+basic_json_parser_6:
+                ++m_cursor;
+                {
+                    return token_type::begin_array;
+                }
+basic_json_parser_8:
+                ++m_cursor;
+                {
+                    return token_type::end_array;
+                }
+basic_json_parser_10:
+                ++m_cursor;
+                {
+                    return token_type::begin_object;
+                }
+basic_json_parser_12:
+                ++m_cursor;
+                {
+                    return token_type::end_object;
+                }
+basic_json_parser_14:
+                ++m_cursor;
+                {
+                    return token_type::value_separator;
+                }
+basic_json_parser_16:
+                ++m_cursor;
+                {
+                    return token_type::name_separator;
+                }
+basic_json_parser_18:
+                yyaccept = 0;
+                yych = *(m_marker = ++m_cursor);
+                if (yych == 'u')
+                {
+                    goto basic_json_parser_59;
+                }
+basic_json_parser_19:
+                {
+                    return token_type::parse_error;
+                }
+basic_json_parser_20:
+                yyaccept = 0;
+                yych = *(m_marker = ++m_cursor);
+                if (yych == 'r')
+                {
+                    goto basic_json_parser_55;
+                }
+                goto basic_json_parser_19;
+basic_json_parser_21:
+                yyaccept = 0;
+                yych = *(m_marker = ++m_cursor);
+                if (yych == 'a')
+                {
+                    goto basic_json_parser_50;
+                }
+                goto basic_json_parser_19;
+basic_json_parser_22:
+                yych = *++m_cursor;
+                if (yych <= '/')
+                {
+                    goto basic_json_parser_19;
+                }
+                if (yych <= '0')
+                {
+                    goto basic_json_parser_49;
+                }
+                if (yych <= '9')
+                {
+                    goto basic_json_parser_40;
+                }
+                goto basic_json_parser_19;
+basic_json_parser_23:
+                yyaccept = 1;
+                yych = *(m_marker = ++m_cursor);
+                if (yych <= 'D')
+                {
+                    if (yych == '.')
+                    {
+                        goto basic_json_parser_42;
+                    }
+                }
+                else
+                {
+                    if (yych <= 'E')
+                    {
+                        goto basic_json_parser_43;
+                    }
+                    if (yych == 'e')
+                    {
+                        goto basic_json_parser_43;
+                    }
+                }
+basic_json_parser_24:
+                {
+                    return token_type::value_number;
+                }
+basic_json_parser_25:
+                yyaccept = 1;
+                yych = *(m_marker = ++m_cursor);
+                goto basic_json_parser_41;
+basic_json_parser_26:
+                yyaccept = 0;
+                yych = *(m_marker = ++m_cursor);
+                if (yych <= 0x0F)
+                {
+                    goto basic_json_parser_19;
+                }
+                goto basic_json_parser_31;
+basic_json_parser_27:
+                ++m_cursor;
+                {
+                    return token_type::end_of_input;
+                }
+basic_json_parser_29:
+                yych = *++m_cursor;
+                goto basic_json_parser_19;
+basic_json_parser_30:
+                ++m_cursor;
+                if (m_limit <= m_cursor)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+basic_json_parser_31:
+                if (yybm[0 + yych] & 64)
+                {
+                    goto basic_json_parser_30;
+                }
+                if (yych <= 0x0F)
+                {
+                    goto basic_json_parser_32;
+                }
+                if (yych <= '"')
+                {
+                    goto basic_json_parser_34;
+                }
+                goto basic_json_parser_33;
+basic_json_parser_32:
+                m_cursor = m_marker;
+                if (yyaccept == 0)
+                {
+                    goto basic_json_parser_19;
+                }
+                else
+                {
+                    goto basic_json_parser_24;
+                }
+basic_json_parser_33:
+                ++m_cursor;
+                if (m_limit <= m_cursor)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+                if (yych <= 'e')
+                {
+                    if (yych <= '/')
+                    {
+                        if (yych == '"')
+                        {
+                            goto basic_json_parser_30;
+                        }
+                        if (yych <= '.')
+                        {
+                            goto basic_json_parser_32;
+                        }
+                        goto basic_json_parser_30;
+                    }
+                    else
+                    {
+                        if (yych <= '\\')
+                        {
+                            if (yych <= '[')
+                            {
+                                goto basic_json_parser_32;
+                            }
+                            goto basic_json_parser_30;
+                        }
+                        else
+                        {
+                            if (yych == 'b')
+                            {
+                                goto basic_json_parser_30;
+                            }
+                            goto basic_json_parser_32;
+                        }
+                    }
+                }
+                else
+                {
+                    if (yych <= 'q')
+                    {
+                        if (yych <= 'f')
+                        {
+                            goto basic_json_parser_30;
+                        }
+                        if (yych == 'n')
+                        {
+                            goto basic_json_parser_30;
+                        }
+                        goto basic_json_parser_32;
+                    }
+                    else
+                    {
+                        if (yych <= 's')
+                        {
+                            if (yych <= 'r')
+                            {
+                                goto basic_json_parser_30;
+                            }
+                            goto basic_json_parser_32;
+                        }
+                        else
+                        {
+                            if (yych <= 't')
+                            {
+                                goto basic_json_parser_30;
+                            }
+                            if (yych <= 'u')
+                            {
+                                goto basic_json_parser_36;
+                            }
+                            goto basic_json_parser_32;
+                        }
+                    }
+                }
+basic_json_parser_34:
+                ++m_cursor;
+                {
+                    return token_type::value_string;
+                }
+basic_json_parser_36:
+                ++m_cursor;
+                if (m_limit <= m_cursor)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+                if (yych <= '@')
+                {
+                    if (yych <= '/')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych >= ':')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                }
+                else
+                {
+                    if (yych <= 'F')
+                    {
+                        goto basic_json_parser_37;
+                    }
+                    if (yych <= '`')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych >= 'g')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                }
+basic_json_parser_37:
+                ++m_cursor;
+                if (m_limit <= m_cursor)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+                if (yych <= '@')
+                {
+                    if (yych <= '/')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych >= ':')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                }
+                else
+                {
+                    if (yych <= 'F')
+                    {
+                        goto basic_json_parser_38;
+                    }
+                    if (yych <= '`')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych >= 'g')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                }
+basic_json_parser_38:
+                ++m_cursor;
+                if (m_limit <= m_cursor)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+                if (yych <= '@')
+                {
+                    if (yych <= '/')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych >= ':')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                }
+                else
+                {
+                    if (yych <= 'F')
+                    {
+                        goto basic_json_parser_39;
+                    }
+                    if (yych <= '`')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych >= 'g')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                }
+basic_json_parser_39:
+                ++m_cursor;
+                if (m_limit <= m_cursor)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+                if (yych <= '@')
+                {
+                    if (yych <= '/')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych <= '9')
+                    {
+                        goto basic_json_parser_30;
+                    }
+                    goto basic_json_parser_32;
+                }
+                else
+                {
+                    if (yych <= 'F')
+                    {
+                        goto basic_json_parser_30;
+                    }
+                    if (yych <= '`')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych <= 'f')
+                    {
+                        goto basic_json_parser_30;
+                    }
+                    goto basic_json_parser_32;
+                }
+basic_json_parser_40:
+                yyaccept = 1;
+                m_marker = ++m_cursor;
+                if ((m_limit - m_cursor) < 3)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+basic_json_parser_41:
+                if (yybm[0 + yych] & 128)
+                {
+                    goto basic_json_parser_40;
+                }
+                if (yych <= 'D')
+                {
+                    if (yych != '.')
+                    {
+                        goto basic_json_parser_24;
+                    }
+                }
+                else
+                {
+                    if (yych <= 'E')
+                    {
+                        goto basic_json_parser_43;
+                    }
+                    if (yych == 'e')
+                    {
+                        goto basic_json_parser_43;
+                    }
+                    goto basic_json_parser_24;
+                }
+basic_json_parser_42:
+                yych = *++m_cursor;
+                if (yych <= '/')
+                {
+                    goto basic_json_parser_32;
+                }
+                if (yych <= '9')
+                {
+                    goto basic_json_parser_47;
+                }
+                goto basic_json_parser_32;
+basic_json_parser_43:
+                yych = *++m_cursor;
+                if (yych <= ',')
+                {
+                    if (yych != '+')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                }
+                else
+                {
+                    if (yych <= '-')
+                    {
+                        goto basic_json_parser_44;
+                    }
+                    if (yych <= '/')
+                    {
+                        goto basic_json_parser_32;
+                    }
+                    if (yych <= '9')
+                    {
+                        goto basic_json_parser_45;
+                    }
+                    goto basic_json_parser_32;
+                }
+basic_json_parser_44:
+                yych = *++m_cursor;
+                if (yych <= '/')
+                {
+                    goto basic_json_parser_32;
+                }
+                if (yych >= ':')
+                {
+                    goto basic_json_parser_32;
+                }
+basic_json_parser_45:
+                ++m_cursor;
+                if (m_limit <= m_cursor)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+                if (yych <= '/')
+                {
+                    goto basic_json_parser_24;
+                }
+                if (yych <= '9')
+                {
+                    goto basic_json_parser_45;
+                }
+                goto basic_json_parser_24;
+basic_json_parser_47:
+                yyaccept = 1;
+                m_marker = ++m_cursor;
+                if ((m_limit - m_cursor) < 3)
+                {
+                    yyfill();    // LCOV_EXCL_LINE;
+                }
+                yych = *m_cursor;
+                if (yych <= 'D')
+                {
+                    if (yych <= '/')
+                    {
+                        goto basic_json_parser_24;
+                    }
+                    if (yych <= '9')
+                    {
+                        goto basic_json_parser_47;
+                    }
+                    goto basic_json_parser_24;
+                }
+                else
+                {
+                    if (yych <= 'E')
+                    {
+                        goto basic_json_parser_43;
+                    }
+                    if (yych == 'e')
+                    {
+                        goto basic_json_parser_43;
+                    }
+                    goto basic_json_parser_24;
+                }
+basic_json_parser_49:
+                yyaccept = 1;
+                yych = *(m_marker = ++m_cursor);
+                if (yych <= 'D')
+                {
+                    if (yych == '.')
+                    {
+                        goto basic_json_parser_42;
+                    }
+                    goto basic_json_parser_24;
+                }
+                else
+                {
+                    if (yych <= 'E')
+                    {
+                        goto basic_json_parser_43;
+                    }
+                    if (yych == 'e')
+                    {
+                        goto basic_json_parser_43;
+                    }
+                    goto basic_json_parser_24;
+                }
+basic_json_parser_50:
+                yych = *++m_cursor;
+                if (yych != 'l')
+                {
+                    goto basic_json_parser_32;
+                }
+                yych = *++m_cursor;
+                if (yych != 's')
+                {
+                    goto basic_json_parser_32;
+                }
+                yych = *++m_cursor;
+                if (yych != 'e')
+                {
+                    goto basic_json_parser_32;
+                }
+                ++m_cursor;
+                {
+                    return token_type::literal_false;
+                }
+basic_json_parser_55:
+                yych = *++m_cursor;
+                if (yych != 'u')
+                {
+                    goto basic_json_parser_32;
+                }
+                yych = *++m_cursor;
+                if (yych != 'e')
+                {
+                    goto basic_json_parser_32;
+                }
+                ++m_cursor;
+                {
+                    return token_type::literal_true;
+                }
+basic_json_parser_59:
+                yych = *++m_cursor;
+                if (yych != 'l')
+                {
+                    goto basic_json_parser_32;
+                }
+                yych = *++m_cursor;
+                if (yych != 'l')
+                {
+                    goto basic_json_parser_32;
+                }
+                ++m_cursor;
+                {
+                    return token_type::literal_null;
+                }
+            }
+
+
+        }
+
+        /// append data from the stream to the internal buffer
+        void yyfill() noexcept
+        {
+            if (not m_stream or not * m_stream)
+            {
+                return;
+            }
+
+            const ssize_t offset_start = m_start - m_content;
+            const ssize_t offset_marker = m_marker - m_start;
+            const ssize_t offset_cursor = m_cursor - m_start;
+
+            m_buffer.erase(0, static_cast<size_t>(offset_start));
+            std::string line;
+            std::getline(*m_stream, line);
+            m_buffer += "\n" + line; // add line with newline symbol
+
+            m_content = reinterpret_cast<const lexer_char_t*>(m_buffer.c_str());
+            m_start  = m_content;
+            m_marker = m_start + offset_marker;
+            m_cursor = m_start + offset_cursor;
+            m_limit  = m_start + m_buffer.size() - 1;
+        }
+
+        /// return the text of the last read token, i.e. a copy of the
+        /// characters in the range [m_start, m_cursor)
+        string_t get_token() const noexcept
+        {
+            using char_pointer_t = typename string_t::const_pointer;
+
+            const auto token_begin = reinterpret_cast<char_pointer_t>(m_start);
+            const auto token_length = static_cast<size_t>(m_cursor - m_start);
+
+            return string_t(token_begin, token_length);
+        }
+
+        /*!
+        @brief return string value for string tokens
+
+        The function iterates the characters between the opening and closing
+        quotes of the string value. The complete string is the range
+        [m_start,m_cursor). Consequently, we iterate from m_start+1 to
+        m_cursor-1.
+
+        We differentiate two cases:
+
+        1. Escaped characters. In this case, a new character is constructed
+           according to the nature of the escape. Some escapes create new
+           characters (e.g., @c "\\n" is replaced by @c "\n"), some are copied
+           as is (e.g., @c "\\\\"). Furthermore, Unicode escapes of the shape
+           @c "\\uxxxx" need special care. In this case, to_unicode takes care
+           of the construction of the values.
+        2. Unescaped characters are copied as is.
+
+        @return string value of current token without opening and closing quotes
+        @throw std::out_of_range if to_unicode fails
+        */
+        string_t get_string() const
+        {
+            string_t result;
+            result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
+
+            // iterate the result between the quotes
+            for (const lexer_char_t* i = m_start + 1; i < m_cursor - 1; ++i)
+            {
+                // process escaped characters
+                if (*i == '\\')
+                {
+                    // read next character
+                    ++i;
+
+                    switch (*i)
+                    {
+                        // the default escapes
+                        case 't':
+                        {
+                            result += "\t";
+                            break;
+                        }
+                        case 'b':
+                        {
+                            result += "\b";
+                            break;
+                        }
+                        case 'f':
+                        {
+                            result += "\f";
+                            break;
+                        }
+                        case 'n':
+                        {
+                            result += "\n";
+                            break;
+                        }
+                        case 'r':
+                        {
+                            result += "\r";
+                            break;
+                        }
+                        case '\\':
+                        {
+                            result += "\\";
+                            break;
+                        }
+                        case '/':
+                        {
+                            result += "/";
+                            break;
+                        }
+                        case '"':
+                        {
+                            result += "\"";
+                            break;
+                        }
+
+                        // unicode
+                        case 'u':
+                        {
+                            // get code xxxx from uxxxx
+                            auto codepoint = std::strtoul(std::string(reinterpret_cast<typename string_t::const_pointer>(i + 1),
+                                                          4).c_str(), nullptr, 16);
+
+                            // check if codepoint is a high surrogate
+                            if (codepoint >= 0xD800 and codepoint <= 0xDBFF)
+                            {
+                                // make sure there is a subsequent unicode
+                                if ((i + 6 >= m_limit) or * (i + 5) != '\\' or * (i + 6) != 'u')
+                                {
+                                    throw std::invalid_argument("missing low surrogate");
+                                }
+
+                                // get code yyyy from uxxxx\uyyyy
+                                auto codepoint2 = std::strtoul(std::string(reinterpret_cast<typename string_t::const_pointer>
+                                                               (i + 7), 4).c_str(), nullptr, 16);
+                                result += to_unicode(codepoint, codepoint2);
+                                // skip the next 11 characters (xxxx\uyyyy)
+                                i += 11;
+                            }
+                            else
+                            {
+                                // add unicode character(s)
+                                result += to_unicode(codepoint);
+                                // skip the next four characters (xxxx)
+                                i += 4;
+                            }
+                            break;
+                        }
+                    }
+                }
+                else
+                {
+                    // all other characters are just copied to the end of the
+                    // string
+                    result.append(1, static_cast<typename string_t::value_type>(*i));
+                }
+            }
+
+            return result;
+        }
+
+        /*!
+        @brief return number value for number tokens
+
+        This function translates the last token into a floating point number.
+        The pointer m_start points to the beginning of the parsed number. We
+        pass this pointer to std::strtod which sets endptr to the first
+        character past the converted number. If this pointer is not the same as
+        m_cursor, then either more or less characters have been used during the
+        comparison. This can happen for inputs like "01" which will be treated
+        like number 0 followed by number 1.
+
+        @return the result of the number conversion or NAN if the conversion
+        read past the current token. The latter case needs to be treated by the
+        caller function.
+
+        @throw std::range_error if passed value is out of range
+        */
+        long double get_number() const
+        {
+            // conversion
+            typename string_t::value_type* endptr;
+            const auto float_val = std::strtold(reinterpret_cast<typename string_t::const_pointer>(m_start),
+                                                &endptr);
+
+            // return float_val if the whole number was translated and NAN
+            // otherwise
+            return (reinterpret_cast<lexer_char_t*>(endptr) == m_cursor) ? float_val : static_cast<long double>(NAN);
+        }
+
+      private:
+        /// optional input stream; may be null, in which case yyfill() returns
+        /// without reading anything
+        std::istream* m_stream;
+        /// the buffer holding the text being lexed; yyfill() appends each line
+        /// read from the stream to it
+        string_t m_buffer;
+        /// the buffer pointer (yyfill() sets it to m_buffer.c_str())
+        const lexer_char_t* m_content = nullptr;
+        /// pointer to the beginning of the current symbol
+        const lexer_char_t* m_start = nullptr;
+        /// pointer for backtracking information (the scanner resets m_cursor
+        /// to it when a longer match fails)
+        const lexer_char_t* m_marker = nullptr;
+        /// pointer to the current symbol; once a token has been read, the
+        /// token text is the range [m_start, m_cursor)
+        const lexer_char_t* m_cursor = nullptr;
+        /// pointer to the end of the buffer (yyfill() sets it to the last
+        /// character of m_buffer)
+        const lexer_char_t* m_limit = nullptr;
+    };
+
+    /*!
+    @brief syntax analysis
+    */
+    class parser
+    {
+      public:
+        /*!
+        @brief constructor for strings
+
+        @param[in] s   text to parse; handed to the lexer
+        @param[in] cb  optional callback consulted during parsing; pass
+                       nullptr (the default) to parse without a callback
+
+        The first token is read immediately, so last_token is valid as soon
+        as construction has finished.
+        */
+        parser(const string_t& s, parser_callback_t cb = nullptr)
+            : callback(std::move(cb)), m_lexer(s) // cb is a by-value sink: move, don't copy
+        {
+            // read first token
+            get_token();
+        }
+
+        /*!
+        @brief a parser reading from an input stream
+
+        @param[in,out] _is  stream to read from; its address is handed to the
+                            lexer, so the stream must stay alive while this
+                            parser is in use
+        @param[in] cb       optional callback consulted during parsing; pass
+                            nullptr (the default) to parse without a callback
+
+        The first token is read immediately, so last_token is valid as soon
+        as construction has finished.
+        */
+        parser(std::istream& _is, parser_callback_t cb = nullptr)
+            : callback(std::move(cb)), m_lexer(&_is) // cb is a by-value sink: move, don't copy
+        {
+            // read first token
+            get_token();
+        }
+
+        /// public parser interface
+        basic_json parse()
+        {
+            basic_json result = parse_internal(true);
+
+            expect(lexer::token_type::end_of_input);
+
+            // return parser result and replace it with null in case the
+            // top-level value was discarded by the callback function
+            return result.is_discarded() ? basic_json() : result;
+        }
+
+      private:
+        /// the actual parser
+        basic_json parse_internal(bool keep)
+        {
+            auto result = basic_json(value_t::discarded);
+
+            switch (last_token)
+            {
+                case (lexer::token_type::begin_object):
+                {
+                    if (keep and (not callback or (keep = callback(depth++, parse_event_t::object_start, result))))
+                    {
+                        // explicitly set result to object to cope with {}
+                        result.m_type = value_t::object;
+                        result.m_value = json_value(value_t::object);
+                    }
+
+                    // read next token
+                    get_token();
+
+                    // closing } -> we are done
+                    if (last_token == lexer::token_type::end_object)
+                    {
+                        get_token();
+                        if (keep and callback and not callback(--depth, parse_event_t::object_end, result))
+                        {
+                            result = basic_json(value_t::discarded);
+                        }
+                        return result;
+                    }
+
+                    // no comma is expected here
+                    unexpect(lexer::token_type::value_separator);
+
+                    // otherwise: parse key-value pairs
+                    do
+                    {
+                        // ugly, but could be fixed with loop reorganization
+                        if (last_token == lexer::token_type::value_separator)
+                        {
+                            get_token();
+                        }
+
+                        // store key
+                        expect(lexer::token_type::value_string);
+                        const auto key = m_lexer.get_string();
+
+                        bool keep_tag = false;
+                        if (keep)
+                        {
+                            if (callback)
+                            {
+                                basic_json k(key);
+                                keep_tag = callback(depth, parse_event_t::key, k);
+                            }
+                            else
+                            {
+                                keep_tag = true;
+                            }
+                        }
+
+                        // parse separator (:)
+                        get_token();
+                        expect(lexer::token_type::name_separator);
+
+                        // parse and add value
+                        get_token();
+                        auto value = parse_internal(keep);
+                        if (keep and keep_tag and not value.is_discarded())
+                        {
+                            result[key] = std::move(value);
+                        }
+                    }
+                    while (last_token == lexer::token_type::value_separator);
+
+                    // closing }
+                    expect(lexer::token_type::end_object);
+                    get_token();
+                    if (keep and callback and not callback(--depth, parse_event_t::object_end, result))
+                    {
+                        result = basic_json(value_t::discarded);
+                    }
+
+                    return result;
+                }
+
+                case (lexer::token_type::begin_array):
+                {
+                    if (keep and (not callback or (keep = callback(depth++, parse_event_t::array_start, result))))
+                    {
+                        // explicitly set result to array to cope with []
+                        result.m_type = value_t::array;
+                        result.m_value = json_value(value_t::array);
+                    }
+
+                    // read next token
+                    get_token();
+
+                    // closing ] -> we are done
+                    if (last_token == lexer::token_type::end_array)
+                    {
+                        get_token();
+                        if (callback and not callback(--depth, parse_event_t::array_end, result))
+                        {
+                            result = basic_json(value_t::discarded);
+                        }
+                        return result;
+                    }
+
+                    // no comma is expected here
+                    unexpect(lexer::token_type::value_separator);
+
+                    // otherwise: parse values
+                    do
+                    {
+                        // ugly, but could be fixed with loop reorganization
+                        if (last_token == lexer::token_type::value_separator)
+                        {
+                            get_token();
+                        }
+
+                        // parse value
+                        auto value = parse_internal(keep);
+                        if (keep and not value.is_discarded())
+                        {
+                            result.push_back(std::move(value));
+                        }
+                    }
+                    while (last_token == lexer::token_type::value_separator);
+
+                    // closing ]
+                    expect(lexer::token_type::end_array);
+                    get_token();
+                    if (keep and callback and not callback(--depth, parse_event_t::array_end, result))
+                    {
+                        result = basic_json(value_t::discarded);
+                    }
+
+                    return result;
+                }
+
+                case (lexer::token_type::literal_null):
+                {
+                    get_token();
+                    result.m_type = value_t::null;
+                    break;
+                }
+
+                case (lexer::token_type::value_string):
+                {
+                    const auto s = m_lexer.get_string();
+                    get_token();
+                    result = basic_json(s);
+                    break;
+                }
+
+                case (lexer::token_type::literal_true):
+                {
+                    get_token();
+                    result.m_type = value_t::boolean;
+                    result.m_value = true;
+                    break;
+                }
+
+                case (lexer::token_type::literal_false):
+                {
+                    get_token();
+                    result.m_type = value_t::boolean;
+                    result.m_value = false;
+                    break;
+                }
+
+                case (lexer::token_type::value_number):
+                {
+                    auto float_val = m_lexer.get_number();
+
+                    // NAN is returned if token could not be translated
+                    // completely
+                    if (std::isnan(float_val))
+                    {
+                        throw std::invalid_argument(std::string("parse error - ") +
+                                                    m_lexer.get_token() + " is not a number");
+                    }
+
+                    get_token();
+
+                    // check if conversion loses precision
+                    const auto int_val = static_cast<number_integer_t>(float_val);
+                    if (approx(float_val, static_cast<long double>(int_val)))
+                    {
+                        // we would not lose precision -> return int
+                        result.m_type = value_t::number_integer;
+                        result.m_value = int_val;
+                    }
+                    else
+                    {
+                        // we would lose precision -> return float
+                        result.m_type = value_t::number_float;
+                        result.m_value = static_cast<number_float_t>(float_val);
+                    }
+                    break;
+                }
+
+                default:
+                {
+                    // the last token was unexpected
+                    unexpect(last_token);
+                }
+            }
+
+            if (keep and callback and not callback(depth, parse_event_t::value, result))
+            {
+                result = basic_json(value_t::discarded);
+            }
+            return result;
+        }
+
+        /// scan the next token with the lexer, remember it in last_token,
+        /// and return it
+        typename lexer::token_type get_token()
+        {
+            return (last_token = m_lexer.scan());
+        }
+
+        /// throw std::invalid_argument unless the last read token is @a t
+        void expect(typename lexer::token_type t) const
+        {
+            if (last_token == t)
+            {
+                return;
+            }
+            throw std::invalid_argument(
+                std::string("parse error - unexpected \'") + m_lexer.get_token() + "\' (" +
+                lexer::token_type_name(last_token) + "); expected " + lexer::token_type_name(t));
+        }
+
+        /// throw std::invalid_argument if the last read token is @a t
+        void unexpect(typename lexer::token_type t) const
+        {
+            if (last_token != t)
+            {
+                return;
+            }
+            throw std::invalid_argument(
+                std::string("parse error - unexpected \'") + m_lexer.get_token() + "\' (" +
+                lexer::token_type_name(last_token) + ")");
+        }
+
+      private:
+        /// current level of recursion
+        int depth = 0;
+        /// callback function
+        parser_callback_t callback;
+        /// the type of the last read token
+        typename lexer::token_type last_token = lexer::token_type::uninitialized;
+        /// the lexer
+        lexer m_lexer;
+    };
+};
+
+
+/////////////
+// presets //
+/////////////
+
+/*!
+@brief default JSON class
+
+This type is the default specialization of the @ref basic_json class which uses
+the standard template types.
+*/
+using json = basic_json<>;
+}
+
+
+/////////////////////////
+// nonmember functions //
+/////////////////////////
+
+// specialization of std::swap, and std::hash
+namespace std
+{
+/*!
+@brief exchanges the values of two JSON objects (explicit specialization of std::swap for nlohmann::json)
+*/
+template <>
+inline void swap(nlohmann::json& j1,
+                 nlohmann::json& j2) noexcept(  // non-throwing only if both traits below hold
+                     is_nothrow_move_constructible<nlohmann::json>::value and
+                     is_nothrow_move_assignable<nlohmann::json>::value
+                 )
+{
+    j1.swap(j2);  // delegate to the member swap
+}
+
+/// std::hash specialization so that JSON values can be hashed
+template <>
+struct hash<nlohmann::json>
+{
+    /// hash a JSON value by hashing its serialized text
+    std::size_t operator()(const nlohmann::json& j) const
+    {
+        // naive approach: serialize with dump(), then apply the hash of the
+        // library's string type to that text
+        return hash<nlohmann::json::string_t>()(j.dump());
+    }
+};
+}
+
+/*!
+@brief user-defined string literal for JSON values
+
+This operator implements a user-defined string literal for JSON objects. It can
+be used by adding \p "_json" to a string literal and returns a JSON object if
+no parse error occurred. The literal's length argument is unnamed and unused.
+
+@param[in] s  a string representation of a JSON object; passed to nlohmann::json::parse
+@return the JSON object produced by nlohmann::json::parse
+*/
+inline nlohmann::json operator "" _json(const char* s, std::size_t)
+{
+    return nlohmann::json::parse(s);  // only the pointer is forwarded, not the length
+}
+
+#endif
diff --git a/tools/pbindexdump/src/main.cpp b/tools/pbindexdump/src/main.cpp

new file mode 100644 (file)

index 0000000..5d1df25
--- /dev/null
+++ b/tools/pbindexdump/src/main.cpp
@@ -0,0 +1,108 @@
+// Author: Derek Barnett
+
+#include <cassert>
+#include <cstddef>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "../common/OptionParser.h"
+
+#include "PbIndexDump.h"
+#include "PbIndexDumpVersion.h"
+#include "Settings.h"
+
+// Builds the pbindexdump settings from argv; invalid input is reported via settings.errors_.
+static pbindexdump::Settings fromCommandLine(optparse::OptionParser& parser, int argc, char* argv[])
+{
+    const optparse::Values options = parser.parse_args(argc, argv);
+    pbindexdump::Settings settings;
+
+    // input: no positional argument -> stdin, one -> that file, more -> error
+    const std::vector<std::string> positionalArgs = parser.args();
+    switch (positionalArgs.size()) {
+        case 0:
+            settings.inputPbiFilename_ = "-";
+            break;
+        case 1:
+            settings.inputPbiFilename_ = positionalArgs.front();
+            break;
+        default:
+            settings.errors_.emplace_back(
+                "pbindexdump does not support more than one input file per run");
+    }
+
+    // output format: only overridden when --format was given
+    if (options.is_set("format")) settings.format_ = options["format"];
+
+    // JSON options: applied for JSON output, rejected for any other format
+    const bool hasIndentLevel = options.is_set("json_indent_level");
+    const bool hasRaw = options.is_set("json_raw");
+    if (settings.format_ == "json") {
+        if (hasIndentLevel) settings.jsonIndentLevel_ = options.get("json_indent_level");
+        if (hasRaw) settings.jsonRaw_ = options.get("json_raw");
+    } else if (hasIndentLevel || hasRaw) {
+        settings.errors_.emplace_back("JSON formatting options not valid on non-JSON output");
+    }
+    return settings;
+}
+
+// Entry point: declares the CLI, builds Settings from it, and runs the dump.
+// Exit code is EXIT_FAILURE for invalid options or a std::exception from the run.
+int main(int argc, char* argv[])
+{
+    // setup help & options
+    optparse::OptionParser parser;
+    parser.description("pbindexdump prints a human-readable view of PBI data to stdout.");
+    parser.prog("pbindexdump");
+    parser.usage("pbindexdump [options] [input]");
+    parser.version(pbindexdump::Version);
+    parser.add_version_option(true);
+    parser.add_help_option(true);
+
+    auto ioGroup = optparse::OptionGroup(parser, "Input/Output");
+    ioGroup.add_option("").dest("input").metavar("input").help(
+        "Input PBI file. If not provided, stdin will be used as input.");
+    ioGroup.add_option("--format")
+        .dest("format")
+        .metavar("STRING")
+        .help(
+            "Output format, one of:\n"
+            "    json, cpp\n\n"
+            "json: pretty-printed JSON [default]\n\n"
+            "cpp: copy/paste-able C++ code that can be used to construct the"
+            " equivalent PacBio::BAM::PbiRawData object");
+    parser.add_option_group(ioGroup);
+
+    auto jsonGroup = optparse::OptionGroup(parser, "JSON Formatting");
+    jsonGroup.add_option("--json-indent-level").dest("json_indent_level").metavar("INT").help(
+        "JSON indent level [4]");
+    jsonGroup.add_option("--json-raw")
+        .dest("json_raw")
+        .action("store_true")
+        .help(
+            "Prints fields in a manner that more closely reflects the PBI"
+            " file format - presenting data as per-field columns, not"
+            " per-record objects.");
+    parser.add_option_group(jsonGroup);
+
+    // parse command line for settings
+    const pbindexdump::Settings settings = fromCommandLine(parser, argc, argv);
+    if (!settings.errors_.empty()) {
+        std::cerr << std::endl;
+        for (const auto& e : settings.errors_)  // by reference: no per-element copy
+            std::cerr << "ERROR: " << e << std::endl;
+        std::cerr << std::endl;
+        parser.print_help();
+        return EXIT_FAILURE;
+    }
+
+    // run tool
+    try {
+        pbindexdump::PbIndexDump::Run(settings);
+        return EXIT_SUCCESS;
+    } catch (const std::exception& e) {
+        std::cerr << "ERROR: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/tools/pbmerge/CMakeLists.txt b/tools/pbmerge/CMakeLists.txt

new file mode 100644 (file)

index 0000000..fa9a906
--- /dev/null
+++ b/tools/pbmerge/CMakeLists.txt
@@ -0,0 +1,72 @@
+
+set(PbmergeSrcDir ${PacBioBAM_ToolsDir}/pbmerge/src)
+
+# create version header: PbMergeVersion.h.in is configured into ${GeneratedDir} with PbMerge_VERSION
+set(PbMerge_VERSION ${PacBioBAM_VERSION})
+configure_file(
+    ${PbmergeSrcDir}/PbMergeVersion.h.in ${GeneratedDir}/PbMergeVersion.h @ONLY
+)
+
+# list source files (shared tool sources from ${ToolsCommonDir} plus pbmerge's own main.cpp)
+set(PBMERGE_SOURCES
+    ${ToolsCommonDir}/BamFileMerger.h
+    ${ToolsCommonDir}/OptionParser.cpp
+    ${PbmergeSrcDir}/main.cpp
+)
+
+# build pbmerge executable (create_pbbam_tool is presumably provided by the PbbamTool module)
+include(PbbamTool)
+create_pbbam_tool(
+    TARGET  pbmerge
+    SOURCES ${PBMERGE_SOURCES}
+)
+
+# cram tests: only set up when tests are built and auto-validation is off
+if (PacBioBAM_build_tests)
+    if(PacBioBAM_auto_validate)
+        # skip for now til we clean up merge tests under autovalidate, too
+    else()
+
+        configure_file(
+            ${PacBioBAM_CramTestsDir}/pbmerge_pacbio_ordering.t.in
+            ${GeneratedDir}/pbmerge_pacbio_ordering.t
+            @ONLY
+        )
+ 
+        configure_file(
+            ${PacBioBAM_CramTestsDir}/pbmerge_aligned_ordering.t.in
+            ${GeneratedDir}/pbmerge_aligned_ordering.t
+            @ONLY
+        )
+
+        configure_file(
+            ${PacBioBAM_CramTestsDir}/pbmerge_mixed_ordering.t.in
+            ${GeneratedDir}/pbmerge_mixed_ordering.t
+            @ONLY
+        )
+
+        configure_file(
+            ${PacBioBAM_CramTestsDir}/pbmerge_dataset.t.in
+            ${GeneratedDir}/pbmerge_dataset.t
+            @ONLY
+        )
+
+        configure_file(
+            ${PacBioBAM_CramTestsDir}/pbmerge_fofn.t.in
+            ${GeneratedDir}/pbmerge_fofn.t
+            @ONLY
+        )
+
+        add_test(  # one CTest entry that runs cram.py over all configured .t files
+            NAME pbmerge_CramTests
+            WORKING_DIRECTORY ${PacBioBAM_TestsDir}/scripts
+            COMMAND "python" cram.py
+                ${GeneratedDir}/pbmerge_pacbio_ordering.t
+                ${GeneratedDir}/pbmerge_aligned_ordering.t
+                ${GeneratedDir}/pbmerge_mixed_ordering.t
+                ${GeneratedDir}/pbmerge_dataset.t
+                ${GeneratedDir}/pbmerge_fofn.t
+        )
+
+    endif()
+endif()
diff --git a/tools/pbmerge/src/PbMergeVersion.h.in b/tools/pbmerge/src/PbMergeVersion.h.in

new file mode 100644 (file)

index 0000000..526b463
--- /dev/null
+++ b/tools/pbmerge/src/PbMergeVersion.h.in
@@ -0,0 +1,14 @@
+// Author: Derek Barnett
+
+#ifndef PBMERGEVERSION_H
+#define PBMERGEVERSION_H
+
+#include <string>
+
+namespace pbmerge {
+
+// pbmerge version string; the placeholder is filled in when CMake configures this header
+const std::string Version("@PbMerge_VERSION@");
+} // namespace pbmerge
+
+#endif // PBMERGEVERSION_H
diff --git a/tools/pbmerge/src/main.cpp b/tools/pbmerge/src/main.cpp

new file mode 100644 (file)

index 0000000..403a7f2
--- /dev/null
+++ b/tools/pbmerge/src/main.cpp
@@ -0,0 +1,130 @@
+// Author: Derek Barnett
+
+#include <cassert>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "../common/BamFileMerger.h"
+#include "../common/OptionParser.h"
+#include "PbMergeVersion.h"
+
+namespace pbmerge {
+
+// Command-line settings for pbmerge. The default constructor is private, so
+// FromCommandLine() is the way to obtain an instance.
+class Settings
+{
+public:
+    // Builds the settings from argv; invalid input is reported via errors_.
+    static Settings FromCommandLine(optparse::OptionParser& parser, int argc, char* argv[])
+    {
+        pbmerge::Settings settings;
+        const optparse::Values options = parser.parse_args(argc, argv);
+
+        // input
+        const std::vector<std::string> positionalArgs = parser.args();
+        if (positionalArgs.empty())
+            settings.errors_.push_back("at least one input file must be specified");
+        else
+            settings.inputFilenames_ = positionalArgs;
+
+        // output
+        if (options.is_set("output"))
+            settings.outputFilename_ = options["output"];
+        else
+            settings.outputFilename_ = "-";  // stdout
+
+        // PBI? always skipped when writing to stdout, otherwise on unless --no-pbi is set
+        if (settings.outputFilename_ == "-")
+            settings.createPbi_ = false;
+        else if (options.is_set("no_pbi"))
+            settings.createPbi_ = !options.get("no_pbi");  // user-disabled
+        else
+            settings.createPbi_ = true;  // not specified, go ahead and generate by default
+
+        return settings;
+    }
+
+    std::vector<std::string> inputFilenames_;
+    std::string outputFilename_;
+    bool createPbi_ = true;  // initialized so the flag never holds an indeterminate value
+    std::vector<std::string> errors_;
+
+private:
+    Settings() {}
+};
+
+}  // namespace pbmerge
+
+// Entry point: declares the CLI, builds Settings from it, and merges the inputs.
+// Exit code is EXIT_FAILURE when the settings carry errors or a std::exception
+// escapes the merge, EXIT_SUCCESS otherwise.
+int main(int argc, char* argv[])
+{
+    // setup help & options
+    optparse::OptionParser parser;
+    parser.description(
+        "pbmerge merges PacBio BAM files. If the input is DataSetXML, "
+        "any filters will be applied. If no output filename is specified, "
+        "new BAM will be written to stdout.");
+    parser.prog("pbmerge");
+    parser.usage("pbmerge [options] [-o <out.bam>] <INPUT>");
+    parser.version(pbmerge::Version);
+    parser.add_version_option(true);
+    parser.add_help_option(true);
+
+    auto ioGroup = optparse::OptionGroup(parser, "Input/Output");
+    ioGroup.add_option("-o").dest("output").metavar("output").help("Output BAM filename. ");
+    ioGroup.add_option("--no-pbi")
+        .dest("no_pbi")
+        .action("store_true")
+        .help(
+            "Set this option to skip PBI index file creation. PBI creation is "
+            "automatically skipped if no output filename is provided.");
+    ioGroup.add_option("").dest("input").metavar("INPUT").help(
+        "Input may be one of:\n"
+        "    DataSetXML, list of BAM files, or FOFN\n\n"
+        "    fofn: pbmerge -o merged.bam bams.fofn\n\n"
+        "    bams: pbmerge -o merged.bam 1.bam 2.bam 3.bam\n\n"
+        "    xml:  pbmerge -o merged.bam foo.subreadset.xml\n\n");
+    parser.add_option_group(ioGroup);
+
+    // parse command line for settings
+    const pbmerge::Settings settings = pbmerge::Settings::FromCommandLine(parser, argc, argv);
+    if (!settings.errors_.empty()) {
+        // report every collected problem, then show the usage text
+        std::cerr << std::endl;
+        for (const auto& e : settings.errors_)  // by reference: no per-element copy
+            std::cerr << "ERROR: " << e << std::endl;
+        std::cerr << std::endl;
+        parser.print_help();
+        return EXIT_FAILURE;
+    }
+
+    // run tool
+    try {
+        // setup our @PG entry to add to header
+        PacBio::BAM::ProgramInfo mergeProgram;
+        mergeProgram.Id(std::string("pbmerge-") + pbmerge::Version)
+            .Name("pbmerge")
+            .Version(pbmerge::Version);
+
+        // one input is handed over as a single filename, several as the whole list
+        PacBio::BAM::DataSet dataset;
+        if (settings.inputFilenames_.size() == 1)
+            dataset = PacBio::BAM::DataSet(settings.inputFilenames_.front());
+        else
+            dataset = PacBio::BAM::DataSet(settings.inputFilenames_);
+
+        // hand dataset, output name, @PG entry and PBI flag to the merger
+        PacBio::BAM::common::BamFileMerger::Merge(dataset, settings.outputFilename_, mergeProgram,
+                                                  settings.createPbi_);
+
+        return EXIT_SUCCESS;
+    } catch (const std::exception& e) {
+        std::cerr << "ERROR: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+}
author	Andreas Tille <tille@debian.org>
	Wed, 10 Oct 2018 10:45:02 +0000 (11:45 +0100)
committer	Andreas Tille <tille@debian.org>
	Wed, 10 Oct 2018 10:45:02 +0000 (11:45 +0100)